namespace-Pt committed
Commit cc68800
1 Parent(s): 0a05167
Upload folder using huggingface_hub
README.md CHANGED
@@ -72,15 +72,19 @@ from peft import PeftModel
 
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 peft_id = "namespace-Pt/Llama-3-8B-Instruct-80K-QLoRA"
+
 torch_dtype = torch.bfloat16
 # place the model on GPU
 device_map = {"": "cuda"}
 
 tokenizer = AutoTokenizer.from_pretrained(model_id)
+
 base_model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.bfloat16,
     device_map=device_map,
+    attn_implementation="flash_attention_2",
+
     # NOTE: expand rope base
     rope_theta=200e6,
     max_position_embeddings=81920,
@@ -92,7 +96,6 @@ model = PeftModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     device_map=device_map,
 )
-
 # NOTE: merge LoRA weights
 model = model.merge_and_unload().eval()
 
@@ -100,9 +103,9 @@ with torch.no_grad():
     # short context
     messages = [{"role": "user", "content": "Tell me about yourself."}]
     inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda")
-    outputs = model.generate(**inputs, max_new_tokens=50)
+    outputs = model.generate(**inputs, max_new_tokens=50)[:, inputs["input_ids"].shape[1]:]
     print(f"Input Length: {inputs['input_ids'].shape[1]}")
-    print(f"Output: {tokenizer.decode(outputs[0]
+    print(f"Output: {tokenizer.decode(outputs[0])}")
 
     # long context
     with open("data/narrativeqa.json", encoding="utf-8") as f:
@@ -113,5 +116,5 @@ with torch.no_grad():
     print("*"*20)
     print(f"Input Length: {inputs['input_ids'].shape[1]}")
     print(f"Answers: {example['answer']}")
-    print(f"Prediction: {tokenizer.decode(outputs[0]
+    print(f"Prediction: {tokenizer.decode(outputs[0])}")
 ```