asadmasad committed
Commit 64439b2 (1 parent: 7731594)

Update handler.py

Files changed (1):
  1. handler.py (+6 -17)
handler.py CHANGED
@@ -1,31 +1,19 @@
 from typing import Any, Dict

 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
-
-# from peft import PeftConfig, PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer


 class EndpointHandler:
     def __init__(self, path=""):
         # load model and processor from path
         self.tokenizer = AutoTokenizer.from_pretrained(path)
-        # try:
-        #     config = AutoConfig.from_pretrained(path)
         model = AutoModelForCausalLM.from_pretrained(
             path,
-            # return_dict=True,
-            # load_in_8bit=True,
             device_map="auto",
             torch_dtype=torch.float16,
             trust_remote_code=True,
         )
-        # model.resize_token_embeddings(len(self.tokenizer))
-        # model = PeftModel.from_pretrained(model, path)
-        # except Exception:
-        #     model = AutoModelForCausalLM.from_pretrained(
-        #         path, device_map="auto", load_in_8bit=True, torch_dtype=torch.float16, trust_remote_code=True
-        #     )
         self.model = model
         self.device = "cuda" if torch.cuda.is_available() else "cpu"

@@ -33,17 +21,18 @@ class EndpointHandler:
         # process input
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", None)
+        messages = [{'role': 'user', 'content': inputs}]

         # preprocess
-        inputs = self.tokenizer(f"User: {inputs}\n\n", return_tensors="pt").to(self.device)
+        inputs = self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(self.device)

         # pass inputs with all kwargs in data
         if parameters is not None:
-            outputs = self.model.generate(**inputs, max_new_tokens=880, **parameters)
+            outputs = self.model.generate(inputs, num_return_sequences=1, eos_token_id=self.tokenizer.eos_token_id, **parameters)  # , max_new_tokens=880
         else:
-            outputs = self.model.generate(**inputs, max_new_tokens=880)
+            outputs = self.model.generate(inputs, num_return_sequences=1, eos_token_id=self.tokenizer.eos_token_id)  # , max_new_tokens=880

         # postprocess the prediction
-        prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        prediction = self.tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)

         return [{"generated_text": prediction}]
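
The substantive change in the request path is the switch from the hand-rolled "User: ..." prompt to the tokenizer's chat template. apply_chat_template(..., return_tensors="pt") returns a plain tensor of input ids rather than the BatchEncoding dict produced by self.tokenizer(...), which is why generate now takes inputs positionally and why decoding slices the prompt off with outputs[0][len(inputs[0]):]. A minimal sketch of that flow outside the handler, assuming any chat-capable checkpoint; the model id below is a placeholder, not the repository this endpoint serves:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Placeholder chat model; stands in for the repository this handler serves.
    model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

    messages = [{"role": "user", "content": "What does this endpoint do?"}]

    # apply_chat_template returns a tensor of token ids (not a BatchEncoding),
    # so it is passed to generate() positionally instead of with **inputs.
    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
    output_ids = model.generate(input_ids, max_new_tokens=64, eos_token_id=tokenizer.eos_token_id)

    # Drop the echoed prompt before decoding, mirroring outputs[0][len(inputs[0]):] in the handler.
    reply = tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)
    print(reply)

Decoding only the newly generated tokens also means the response no longer echoes the prompt, which the previous tokenizer.decode(outputs[0], ...) did.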
 
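For completeness, a quick local smoke test of the updated handler. This assumes the usual custom-handler entry point __call__(self, data) (not touched by this commit) wraps the second hunk above, that handler.py sits in a checkout of the model repository (the "." path below), and that the example parameters are purely illustrative:

    from handler import EndpointHandler

    # Path points at the model repository checkout containing handler.py and the weights.
    handler = EndpointHandler(path=".")

    payload = {
        "inputs": "Summarize what this endpoint does in one sentence.",
        # Optional; forwarded to model.generate via **parameters.
        "parameters": {"max_new_tokens": 128, "do_sample": False},
    }

    result = handler(payload)  # -> [{"generated_text": "..."}]
    print(result[0]["generated_text"])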