namespace-Pt committed
Commit cc68800
1 Parent(s): 0a05167

Upload folder using huggingface_hub
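The commit message indicates the files were pushed with the huggingface_hub library. A minimal sketch of that workflow, under the assumption that `HfApi.upload_folder` was the call used (the local folder path is illustrative, not taken from this commit):

```python
# Sketch: pushing a local folder to the Hub, matching the commit message above.
# Assumes you are already authenticated (e.g. via `huggingface-cli login`).
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    repo_id="namespace-Pt/Llama-3-8B-Instruct-80K-QLoRA",  # repo shown on this page
    folder_path="./Llama-3-8B-Instruct-80K-QLoRA",         # illustrative local path
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```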

Files changed (1)
README.md +7 -4
README.md CHANGED
@@ -72,15 +72,19 @@ from peft import PeftModel
 
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 peft_id = "namespace-Pt/Llama-3-8B-Instruct-80K-QLoRA"
+
 torch_dtype = torch.bfloat16
 # place the model on GPU
 device_map = {"": "cuda"}
 
 tokenizer = AutoTokenizer.from_pretrained(model_id)
+
 base_model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.bfloat16,
     device_map=device_map,
+    attn_implementation="flash_attention_2",
+
     # NOTE: expand rope base
     rope_theta=200e6,
     max_position_embeddings=81920,
@@ -92,7 +96,6 @@ model = PeftModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     device_map=device_map,
 )
-
 # NOTE: merge LoRA weights
 model = model.merge_and_unload().eval()
 
@@ -100,9 +103,9 @@ with torch.no_grad():
     # short context
     messages = [{"role": "user", "content": "Tell me about yourself."}]
     inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda")
-    outputs = model.generate(**inputs, max_new_tokens=50)
+    outputs = model.generate(**inputs, max_new_tokens=50)[:, inputs["input_ids"].shape[1]:]
     print(f"Input Length: {inputs['input_ids'].shape[1]}")
-    print(f"Output: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")
+    print(f"Output: {tokenizer.decode(outputs[0])}")
 
     # long context
     with open("data/narrativeqa.json", encoding="utf-8") as f:
@@ -113,5 +116,5 @@ with torch.no_grad():
     print("*"*20)
     print(f"Input Length: {inputs['input_ids'].shape[1]}")
     print(f"Answers: {example['answer']}")
-    print(f"Prediction: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")
+    print(f"Prediction: {tokenizer.decode(outputs[0])}")
 ```
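Beyond the added `attn_implementation="flash_attention_2"` argument (which selects FlashAttention-2 kernels and requires the flash-attn package), the main change in this commit is slicing the `model.generate` output: for a decoder-only model, `generate` returns the prompt tokens followed by the newly generated ones, and the updated snippet keeps only the positions after `input_ids.shape[1]` before decoding. A minimal, self-contained sketch of that slicing pattern (the small model id and prompt below are illustrative, not taken from this repository):

```python
# Sketch of the prompt-stripping pattern introduced by this commit.
# NOTE: "gpt2" is used only to keep the example small and runnable on CPU.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

inputs = tokenizer("Tell me about yourself.", return_tensors="pt")
with torch.no_grad():
    # generate() returns prompt tokens + completion tokens for decoder-only models
    outputs = model.generate(**inputs, max_new_tokens=20)

# keep only the newly generated tokens, as the updated README does
new_tokens = outputs[:, inputs["input_ids"].shape[1]:]
print(tokenizer.decode(new_tokens[0]))
```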