ari9dam committed
Commit 5eb35d4 • 1 Parent(s): bc393d0

gpu 8bit inference

Files changed (2):
  1. app.py +8 -5
  2. requirements.txt +4 -2
app.py CHANGED
@@ -1,7 +1,7 @@
 import os
 from threading import Thread
 from typing import Iterator
-
+import spaces
 import gradio as gr
 import torch
 import transformers
@@ -11,8 +11,9 @@ MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
+
 model_id = "microsoft/Orca-2-13b"
-model = transformers.AutoModelForCausalLM.from_pretrained(model_id)
+model = transformers.AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)
 
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_fast=False)
 
@@ -21,15 +22,17 @@ user_message = "How can you determine if a restaurant is popular among locals or
 
 DESCRIPTION = """
 # Orca-2 13B
-This Space demonstrates model [Orca-2-13B](https://huggingface.co/microsoft/Orca-2-13B) by Microsoft, a Llama 2 derivative with 13B parameters fine-tuned for single-turn instructions. This Space is running on Inference Endpoints using the text-generation-inference library. If you want to run your own service, you can also [deploy the model on Inference Endpoints](https://ui.endpoints.huggingface.co/).
+This Space demonstrates model [Orca-2-13B](https://huggingface.co/microsoft/Orca-2-13B) by Microsoft, a Llama 2 derivative with 13B parameters fine-tuned for single-turn instructions. This Space is <b>running 8-bit inference with greedy decoding</b>.
 
 The system message is set to be the cautious system message:
 You are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.
-Feel free to modify it in the additional input section. The demo uses greedy decoding.
+Feel free to modify it in the additional input section.
 
 🔎 For more details about the Orca family of models take a look [at our blog post](https://msft.it/6042iGtzK).
 🔨 Looking for lighter versions of Orca-2? 🐇 Check out the [7B Chat model](https://huggingface.co/spaces/huggingface-projects/Orca-2-7b). Note: Orca 2 is licensed under the [Microsoft Research License](LICENSE). Llama 2 is licensed under the [LLAMA 2 Community License](https://ai.meta.com/llama/license/).
 """
+if not torch.cuda.is_available():
+    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
 
 # Function to combine system message and user
 def to_prompt(conversations):
@@ -43,7 +46,7 @@ def to_prompt(conversations):
     inputs = tokenizer(prompt, return_tensors='pt').input_ids
     return inputs
 
-
+@spaces.GPU
 def generate(
     message: str,
     chat_history: list[tuple[str, str]],
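Taken together, the app.py changes load the model with bitsandbytes 8-bit quantization and gate generation behind the ZeroGPU decorator. Below is a minimal sketch of how these pieces fit; the explicit `BitsAndBytesConfig`, the `device_map` setting, the reconstructed `to_prompt` body (based on the ChatML-style template documented on the Orca-2 model card), and the simplified `generate` are illustrative assumptions, not the Space's actual code.

```python
import spaces
import torch
import transformers

model_id = "microsoft/Orca-2-13b"

# Explicit form of the commit's load_in_8bit=True shortcut (assumed
# equivalent): weights are quantized to int8 by bitsandbytes on the GPU.
quant_config = transformers.BitsAndBytesConfig(load_in_8bit=True)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant_config,
    device_map="auto",  # requires accelerate; maps layers onto the GPU
)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_fast=False)


def to_prompt(conversations: list[dict[str, str]]) -> torch.Tensor:
    # Hypothetical reconstruction of the truncated helper, using the
    # ChatML-style template from the Orca-2 model card.
    prompt = ""
    for turn in conversations:
        prompt += f"<|im_start|>{turn['role']}\n{turn['content']}<|im_end|>\n"
    prompt += "<|im_start|>assistant"
    return tokenizer(prompt, return_tensors="pt").input_ids


@spaces.GPU  # on ZeroGPU Spaces, a GPU is attached only while this runs
def generate(message: str, system_message: str, max_new_tokens: int = 256) -> str:
    input_ids = to_prompt(
        [
            {"role": "system", "content": system_message},
            {"role": "user", "content": message},
        ]
    ).to(model.device)
    # Greedy decoding (do_sample=False), matching the Space's description.
    output_ids = model.generate(
        input_ids, max_new_tokens=max_new_tokens, do_sample=False
    )
    return tokenizer.decode(
        output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True
    )
```

At int8 the 13B weights take roughly 13 GB instead of about 26 GB in fp16, which is what lets the demo fit on a single mid-range inference GPU.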
requirements.txt CHANGED
@@ -4,6 +4,7 @@ altair==5.1.2
 annotated-types==0.6.0
 anyio==3.7.1
 attrs==23.1.0
+bitsandbytes==0.41.1
 certifi==2023.11.17
 charset-normalizer==3.3.2
 click==8.1.7
@@ -56,6 +57,7 @@ requests==2.31.0
 rich==13.7.0
 rpds-py==0.13.1
 safetensors==0.4.0
+scipy==1.11.4
 semantic-version==2.10.0
 sentencepiece==0.1.99
 shellingham==1.5.4
@@ -67,7 +69,7 @@ sympy==1.12
 tokenizers==0.13.3
 tomlkit==0.12.0
 toolz==0.12.0
-torch
+torch --index-url https://download.pytorch.org/whl/cu118
 tqdm==4.66.1
 transformers==4.33.1
 triton==2.1.0
@@ -77,4 +79,4 @@ tzdata==2023.3
 urllib3==2.1.0
 uvicorn==0.24.0.post1
 websockets==11.0.3
-zipp==3.17.0
+zipp==3.17.0
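The dependency changes track the code: bitsandbytes provides the int8 kernels behind `load_in_8bit=True`, scipy appears to be pinned because bitsandbytes at this version imports it, and torch is now pulled from the CUDA 11.8 wheel index. A hypothetical sanity check (not part of the commit) for verifying the stack on the target machine:

```python
import torch
import bitsandbytes  # bitsandbytes checks the CUDA setup at import time

# The cu118 torch wheel should report CUDA 11.8 on a GPU machine; on CPU,
# the Space shows its "does not work on CPU" notice instead.
print(torch.__version__, torch.version.cuda)      # e.g. "2.1.0+cu118", "11.8"
print("CUDA available:", torch.cuda.is_available())
print("bitsandbytes:", bitsandbytes.__version__)  # pinned to 0.41.1
```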