codys12 committed
Commit faf8f3f
1 Parent(s): 8824f88
Files changed (1)
  1. app.py +11 -5
app.py CHANGED
@@ -19,8 +19,9 @@ DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = 4096
 
 if torch.cuda.is_available():
-    model_id = "mistralai/Mistral-7B-Instruct-v0.1"
-    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
+    model_id = "codys12/MergeLlama-7b"
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
+    model.cuda()
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 
@@ -35,11 +36,16 @@ def generate(
     repetition_penalty: float = 1.2,
 ) -> Iterator[str]:
     conversation = []
+    current_input = ""
     for user, assistant in chat_history:
-        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-    conversation.append({"role": "user", "content": message})
+        input += user
+        input += assistant
+
+    current_input += message
+
+    device = "cuda:0"
+    inputs_ids = tokenizer(message, return_tensors="pt").to(device)
 
-    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to("cuda")
     if len(input_ids) > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[-MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning("Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
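The first hunk swaps the Mistral checkpoint for codys12/MergeLlama-7b and replaces Accelerate's device_map="auto" placement with an explicit whole-model move to the GPU. A minimal sketch of the resulting load path, assuming a single-GPU Space:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

if torch.cuda.is_available():
    model_id = "codys12/MergeLlama-7b"
    # float16 weights are materialized on CPU first, then moved to the GPU in one
    # step; the removed device_map="auto" would instead let Accelerate choose
    # placement (and shard across devices) at load time.
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
    model.cuda()  # equivalent to model.to("cuda"), i.e. the default CUDA device
    tokenizer = AutoTokenizer.from_pretrained(model_id)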
 
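As committed, the second hunk appears to contain a few slips: the loop accumulates into the built-in name input rather than current_input; the tokenizer output is bound to inputs_ids while the length check below still reads input_ids; only the latest message, not the accumulated history, is tokenized; len() on the returned BatchEncoding counts its keys rather than tokens; and the warning string is missing its f-prefix, so {MAX_INPUT_TOKEN_LENGTH} is never interpolated. A minimal sketch of what the change appears to intend, with those slips corrected (build_inputs is a hypothetical helper name, not part of the commit):

import gradio as gr

MAX_INPUT_TOKEN_LENGTH = 4096  # module-level constant in app.py

def build_inputs(message, chat_history, tokenizer, device="cuda:0"):
    # Flatten the (user, assistant) pairs plus the new message into one string,
    # which is what the committed code seems to be building toward.
    current_input = ""
    for user, assistant in chat_history:
        current_input += user
        current_input += assistant
    current_input += message

    # tokenizer(...) returns a BatchEncoding; take .input_ids (shape [1, seq_len])
    # before moving the tensor to the GPU.
    input_ids = tokenizer(current_input, return_tensors="pt").input_ids.to(device)

    # Trim from the left along the sequence dimension, keeping the newest tokens.
    if input_ids.shape[-1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    return input_ids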