Eiad Gomaa committed on
Commit 6f6da11
1 Parent(s): 5ab0078

new model2

Files changed (2):
  1. app.py +14 -15
  2. requirements.txt +2 -1
app.py CHANGED
@@ -9,6 +9,12 @@ import logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Display installation instructions if needed
+st.sidebar.write("### Required Packages")
+st.sidebar.code("""
+pip install transformers torch streamlit
+""")
+
 @st.cache_resource
 def load_model():
     """Load model and tokenizer with caching"""
@@ -16,13 +22,10 @@ def load_model():
     st.spinner("Loading model... This may take a few minutes")
     logger.info("Starting model loading...")
 
-    # Load with 8-bit quantization for CPU
+    # Basic model loading without device map
     model = AutoModelForCausalLM.from_pretrained(
         "NousResearch/Llama-3.2-1B",
-        load_in_8bit=True,  # Use 8-bit quantization
-        device_map="auto",  # Automatically handle device placement
-        low_cpu_mem_usage=True,
-        torch_dtype=torch.float32 if not torch.cuda.is_available() else torch.float16
+        torch_dtype=torch.float32  # Use float32 for CPU
     )
 
     tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-3.2-1B")
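
Note: this hunk replaces the 8-bit, device-mapped load (which would also require bitsandbytes and accelerate) with a plain full-precision CPU load. A minimal sketch of load_model as it likely stands after this commit, reconstructed from the hunks above (the imports and the return value are assumptions, and st.spinner normally needs to be used as a context manager to actually display):

import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

@st.cache_resource
def load_model():
    """Load model and tokenizer with caching"""
    st.spinner("Loading model... This may take a few minutes")
    # Plain CPU load: no quantization, no device_map, full float32 weights.
    model = AutoModelForCausalLM.from_pretrained(
        "NousResearch/Llama-3.2-1B",
        torch_dtype=torch.float32,  # Use float32 for CPU
    )
    tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-3.2-1B")
    return model, tokenizer  # return order is an assumption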
@@ -62,7 +65,7 @@ def generate_response_with_timeout(model, tokenizer, prompt, timeout_seconds=30)
         padding=True,
         truncation=True,
         max_length=256  # Reduced for CPU
-    ).to(model.device)
+    )
 
     start_time = time.time()
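
Dropping .to(model.device) is consistent with the loader change: without device_map the model stays on the default CPU device, so the tokenized inputs are already colocated with it. The call presumably now reads as follows (the variable name and return_tensors="pt" are assumptions; generate() needs tensors rather than Python lists):

inputs = tokenizer(
    prompt,
    return_tensors="pt",  # assumed; not shown in the diff
    padding=True,
    truncation=True,
    max_length=256  # Reduced for CPU
)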
 
@@ -81,8 +84,7 @@ def generate_response_with_timeout(model, tokenizer, prompt, timeout_seconds=30)
         top_k=40,
         repetition_penalty=1.5,  # Increased repetition penalty
         no_repeat_ngram_size=3,  # Prevent 3-gram repetitions
-        early_stopping=True,
-        length_penalty=1.0
+        early_stopping=True
     )
 
     generation_time = time.time() - start_time
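
One detail worth noting: in transformers, early_stopping and length_penalty are beam-search controls, so with sampling-style settings like top_k and no num_beams they have no effect (newer transformers versions emit a warning about this), which may be why length_penalty was dropped here. A sketch of the generate call using only the parameters visible in the diff; the remaining arguments are assumptions:

output_ids = model.generate(
    **inputs,
    max_new_tokens=128,      # assumed cap, not shown in the diff
    do_sample=True,          # assumed, given the sampling-style knobs below
    top_k=40,
    repetition_penalty=1.5,  # Increased repetition penalty
    no_repeat_ngram_size=3,  # Prevent 3-gram repetitions
    early_stopping=True      # only meaningful with beam search (num_beams > 1)
)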
@@ -113,13 +115,10 @@ with st.sidebar:
     # Device and memory information
     device = "GPU" if torch.cuda.is_available() else "CPU"
     st.write(f"Running on: {device}")
-    if torch.cuda.is_available():
-        st.write(f"GPU: {torch.cuda.get_device_name(0)}")
-        st.write(f"Memory Usage: {torch.cuda.memory_allocated()/1024**2:.2f} MB")
-    else:
-        import psutil
-        st.write(f"CPU Memory Usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")
-        st.write("⚠️ Running on CPU - Responses may be slow")
+
+    # Warning for CPU usage
+    if not torch.cuda.is_available():
+        st.warning("⚠️ Running on CPU - Responses may be very slow. Consider using a GPU or a smaller model.")
 
     # Model settings
     st.write("### Model Settings")
 
requirements.txt CHANGED
@@ -2,4 +2,5 @@ streamlit
 transformers
 torch # If your model requires PyTorch
 # or
-tensorflow
+tensorflow
+accelerate
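
On the requirements side, accelerate is the package transformers relies on for device_map and offloaded loading, so after this commit (which removes device_map="auto" and the 8-bit path) it is likely optional; torch and tensorflow also remain listed as alternatives, of which only one is needed. A quick sanity check that the declared packages resolve in the current environment:

import importlib

# Try to import each dependency declared in requirements.txt.
for pkg in ("streamlit", "transformers", "torch", "accelerate"):
    try:
        importlib.import_module(pkg)
        print(f"{pkg}: OK")
    except ImportError:
        print(f"{pkg}: missing (pip install {pkg})")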