Eiad Gomaa committed
Commit 6f6da11 • 1 Parent(s): 5ab0078

new model2

Files changed:
- app.py +14 -15
- requirements.txt +2 -1
app.py CHANGED

@@ -9,6 +9,12 @@ import logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Display installation instructions if needed
+st.sidebar.write("### Required Packages")
+st.sidebar.code("""
+pip install transformers torch streamlit
+""")
+
 @st.cache_resource
 def load_model():
     """Load model and tokenizer with caching"""
@@ -16,13 +22,10 @@ def load_model():
     st.spinner("Loading model... This may take a few minutes")
     logger.info("Starting model loading...")
 
-    #
+    # Basic model loading without device map
     model = AutoModelForCausalLM.from_pretrained(
         "NousResearch/Llama-3.2-1B",
-
-        device_map="auto",  # Automatically handle device placement
-        low_cpu_mem_usage=True,
-        torch_dtype=torch.float32 if not torch.cuda.is_available() else torch.float16
+        torch_dtype=torch.float32  # Use float32 for CPU
     )
 
     tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-3.2-1B")
@@ -62,7 +65,7 @@ def generate_response_with_timeout(model, tokenizer, prompt, timeout_seconds=30)
         padding=True,
         truncation=True,
         max_length=256  # Reduced for CPU
-        )
+    )
 
     start_time = time.time()
 
@@ -81,8 +84,7 @@ def generate_response_with_timeout(model, tokenizer, prompt, timeout_seconds=30)
         top_k=40,
         repetition_penalty=1.5,  # Increased repetition penalty
         no_repeat_ngram_size=3,  # Prevent 3-gram repetitions
-        early_stopping=True,
-        length_penalty=1.0
+        early_stopping=True
     )
 
     generation_time = time.time() - start_time
@@ -113,13 +115,10 @@ with st.sidebar:
     # Device and memory information
     device = "GPU" if torch.cuda.is_available() else "CPU"
     st.write(f"Running on: {device}")
-
-
-
-
-    import psutil
-    st.write(f"CPU Memory Usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")
-    st.write("⚠️ Running on CPU - Responses may be slow")
+
+    # Warning for CPU usage
+    if not torch.cuda.is_available():
+        st.warning("⚠️ Running on CPU - Responses may be very slow. Consider using a GPU or a smaller model.")
 
     # Model settings
     st.write("### Model Settings")
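For context, the new load path is a plain CPU float32 load with no device_map. Below is a minimal standalone sketch of that pattern, assuming transformers and torch are installed and the NousResearch/Llama-3.2-1B checkpoint is reachable; the prompt and max_new_tokens value are illustrative, not taken from the commit:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "NousResearch/Llama-3.2-1B"

# Plain CPU load: float32 weights, no device_map / accelerate involvement.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Tokenize with the same truncation cap the app uses for CPU.
inputs = tokenizer("Hello, how are you?", return_tensors="pt",
                   truncation=True, max_length=256)

# Sampling settings mirroring the diff's repetition controls.
outputs = model.generate(
    **inputs,
    max_new_tokens=64,
    do_sample=True,
    top_k=40,
    repetition_penalty=1.5,
    no_repeat_ngram_size=3,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

One caveat worth flagging: in transformers, early_stopping only affects beam search, so alongside sampling parameters like top_k it is effectively a no-op, which is why the sketch leaves it out.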
requirements.txt CHANGED

@@ -2,4 +2,5 @@ streamlit
 transformers
 torch  # If your model requires PyTorch
 # or
-tensorflow
+tensorflow
+accelerate
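A note on the new pin: accelerate is the package transformers relies on for loading options like device_map="auto" and low_cpu_mem_usage=True, so keeping it installed leaves that path available even though this commit drops those options from app.py. A quick, hypothetical sanity check that the pinned packages import cleanly:

import importlib

# Packages this Space expects; tensorflow is listed as an optional alternative backend.
for pkg in ("streamlit", "transformers", "torch", "accelerate"):
    try:
        importlib.import_module(pkg)
        print(f"{pkg}: OK")
    except ImportError as err:
        print(f"{pkg}: missing ({err})")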