yinsong1986 committed
Commit 0ae4abd
Parent: ae1084a

Update README.md

Files changed (1)
  1. README.md (+23, -17)
README.md CHANGED
````diff
@@ -160,6 +160,10 @@ hub = {
     'HF_MODEL_ID':'amazon/MistralLite',
     'HF_TASK':'text-generation',
     'SM_NUM_GPUS':'1',
+    "MAX_INPUT_LENGTH": '16000',
+    "MAX_TOTAL_TOKENS": '16384',
+    "MAX_BATCH_PREFILL_TOKENS": '16384',
+    "MAX_BATCH_TOTAL_TOKENS": '16384',
 }
 
 model = HuggingFaceModel(
@@ -171,7 +175,8 @@ model = HuggingFaceModel(
 predictor = model.deploy(
   initial_instance_count=1,
   instance_type="ml.g5.2xlarge",
-  endpoint_name=model_name
+  endpoint_name=model_name,
+
 )
 ```
 
@@ -185,10 +190,10 @@ input_data = {
     "do_sample": False,
     "max_new_tokens": 400,
     "return_full_text": False,
-    "typical_p": 0.2,
-    "temperature":None,
-    "truncate":None,
-    "seed": 1,
+    #"typical_p": 0.2,
+    #"temperature":None,
+    #"truncate":None,
+    #"seed": 1,
   }
 }
 result = predictor.predict(input_data)[0]["generated_text"]
@@ -215,12 +220,12 @@ parameters = {
     "do_sample": False,
     "max_new_tokens": 400,
     "return_full_text": False,
-    "typical_p": 0.2,
-    "temperature":None,
-    "truncate":None,
-    "seed": 1,
-}
-endpoint_name = "MistralLite-2023-10-16-09-45-58"
+    #"typical_p": 0.2,
+    #"temperature":None,
+    #"truncate":None,
+    #"seed": 1,
+}
+endpoint_name = predictor.endpoint_name
 prompt = "<|prompter|>What are the main challenges to support a long context for LLM?</s><|assistant|>"
 result = call_endpoint(client, prompt, endpoint_name, parameters)
 print(result)
@@ -236,7 +241,7 @@ Example Docker parameters:
 ```shell
 docker run -d --gpus all --shm-size 1g -p 443:80 -v $(pwd)/models:/data ghcr.io/huggingface/text-generation-inference:1.1.0 \
       --model-id amazon/MistralLite \
-      --max-input-length 8192 \
+      --max-input-length 16000 \
       --max-total-tokens 16384 \
       --max-batch-prefill-tokens 16384 \
       --trust-remote-code
@@ -263,16 +268,17 @@ def invoke_tgi(prompt,
                print_stream=True,
                assist_role=True):
     if (assist_role):
-        prompt = f"<|prompter|>{prompt}<|/s|><|assistant|>"
+        prompt = f"<|prompter|>{prompt}</s><|assistant|>"
     output = ""
     for response in tgi_client.generate_stream(
         prompt,
         do_sample=False,
         max_new_tokens=max_new_tokens,
-        temperature=None,
-        truncate=None,
-        seed=random_seed,
-        typical_p=0.2,
+        return_full_text=False,
+        #temperature=None,
+        #truncate=None,
+        #seed=random_seed,
+        #typical_p=0.2,
     ):
         if hasattr(response, "token"):
             if not response.token.special:
````
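For readers skimming the diff: the first three hunks raise the TGI context settings on the SageMaker endpoint to roughly 16k input tokens and switch the example generation parameters to plain greedy decoding (typical_p, temperature, truncate and seed are now commented out). A minimal sketch of calling an already-deployed endpoint with the post-commit payload is below; re-attaching via HuggingFacePredictor and the endpoint name are assumptions for illustration, not part of the README.

```python
# Sketch only: re-attach to an existing MistralLite endpoint and send the
# post-commit payload. The endpoint name below is a hypothetical placeholder.
from sagemaker.huggingface import HuggingFacePredictor

predictor = HuggingFacePredictor(endpoint_name="MistralLite")  # placeholder name

input_data = {
    "inputs": "<|prompter|>What are the main challenges to support a long context for LLM?</s><|assistant|>",
    "parameters": {
        "do_sample": False,
        "max_new_tokens": 400,
        "return_full_text": False,
        # typical_p / temperature / truncate / seed stay commented out after
        # this commit, so TGI falls back to its greedy-decoding defaults.
    },
}

# TGI returns a JSON list with one {"generated_text": ...} element.
result = predictor.predict(input_data)[0]["generated_text"]
print(result)
```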
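The fourth hunk stops hard-coding the endpoint name and reads it from predictor.endpoint_name, then passes it to a call_endpoint helper that this diff does not show. A rough sketch of what such a helper can look like against the SageMaker runtime is given below; the helper body is an assumption for illustration, not the README's exact implementation.

```python
# Illustrative sketch of a call_endpoint-style helper; the README defines its
# own version elsewhere, so treat this body as an assumption.
import json

import boto3


def call_endpoint(client, prompt, endpoint_name, parameters):
    # Same payload shape as predictor.predict(): the prompt under "inputs",
    # the generation settings under "parameters".
    payload = {"inputs": prompt, "parameters": parameters}
    response = client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/json",
        Body=json.dumps(payload),
    )
    # TGI returns a JSON list with a single {"generated_text": ...} element.
    return json.loads(response["Body"].read())[0]["generated_text"]


client = boto3.client("sagemaker-runtime")
parameters = {
    "do_sample": False,
    "max_new_tokens": 400,
    "return_full_text": False,
}
endpoint_name = "MistralLite"  # placeholder; the README now uses predictor.endpoint_name
prompt = "<|prompter|>What are the main challenges to support a long context for LLM?</s><|assistant|>"
print(call_endpoint(client, prompt, endpoint_name, parameters))
```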
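The last two hunks raise the container's --max-input-length to 16000 and fix the prompt template in invoke_tgi (the end-of-turn token is </s>, not <|/s|>), again disabling the sampling parameters. A minimal sketch of streaming from that local container with the text_generation client, using only the post-commit parameters, is shown below; the host, port and timeout are assumptions based on the docker port mapping above.

```python
# Sketch: stream tokens from the local TGI container started by the docker
# command in the diff. Host/port/timeout are assumptions (host port 443 is
# mapped to the container's plain-HTTP port 80).
from text_generation import Client

tgi_client = Client("http://127.0.0.1:443", timeout=60)

prompt = "<|prompter|>What are the main challenges to support a long context for LLM?</s><|assistant|>"

output = ""
for response in tgi_client.generate_stream(
    prompt,
    do_sample=False,          # greedy decoding, matching the updated README
    max_new_tokens=400,
    return_full_text=False,   # added in this commit: don't echo the prompt
):
    # Skip special tokens such as </s>, as the README's invoke_tgi does.
    if hasattr(response, "token") and not response.token.special:
        output += response.token.text
print(output)
```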