datascientist22 committed
Commit 4ee92ad
1 Parent(s): eaf4e19

Update app.py

Files changed (1)
  1. app.py +8 -8
app.py CHANGED
@@ -2,16 +2,16 @@ import streamlit as st
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 
-# Load the tokenizer and model for CPU (avoid bitsandbytes quantization)
+# Load the tokenizer and model for CPU without bitsandbytes
 tokenizer = AutoTokenizer.from_pretrained("MohamedMotaz/Examination-llama-8b-4bit")
+
+# Load the model in full precision, explicitly avoiding 8-bit quantization
 model = AutoModelForCausalLM.from_pretrained(
-    "MohamedMotaz/Examination-llama-8b-4bit",
-    torch_dtype=torch.float32  # Use float32 to avoid 8-bit quantization
+    "MohamedMotaz/Examination-llama-8b-4bit",
+    torch_dtype=torch.float32,  # Ensure it uses full precision (float32)
+    device_map="cpu",           # Force the model to run on the CPU
 )
 
-# Ensure the model runs on CPU
-model = model.to("cpu")
-
 # App Title
 st.title("Exam Corrector: Automated Grading with LLama 8b Model (CPU)")
 
@@ -32,8 +32,8 @@ if st.button("Grade Answer"):
     inputs = f"Model Answer: {model_answer}\n\nStudent Answer: {student_answer}\n\nResponse:"
 
     # Tokenize the inputs using PyTorch tensors
-    input_ids = tokenizer(inputs, return_tensors="pt").input_ids.to("cpu")
-
+    input_ids = tokenizer(inputs, return_tensors="pt").input_ids
+
     # Generate the response using the model (PyTorch, CPU-based)
     with torch.no_grad():
         outputs = model.generate(input_ids, max_length=200)
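
Why this change works: tensors produced by tokenizer(..., return_tensors="pt") live on the CPU by default, and device_map="cpu" places every model weight on the CPU at load time, so the explicit .to("cpu") calls that were removed were effectively redundant. A quick way to confirm the placement after loading (an illustrative snippet, not part of app.py):

print(next(model.parameters()).device)                          # expected: cpu
print(tokenizer("test", return_tensors="pt").input_ids.device)  # expected: cpu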
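
For context, here is a minimal sketch of how the updated app.py fits together end to end. The diff does not show lines 18-31 of the file, so the st.text_area widgets, the tokenizer.decode call, and the st.write display below are assumptions about those hidden lines, not the exact code in the Space:

import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "MohamedMotaz/Examination-llama-8b-4bit"

# Load the tokenizer and the model once, pinned to the CPU in full precision
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # full precision, no bitsandbytes quantization
    device_map="cpu",           # keep all weights on the CPU
)

# App Title
st.title("Exam Corrector: Automated Grading with LLama 8b Model (CPU)")

# Assumed input widgets (not visible in this diff)
model_answer = st.text_area("Model Answer")
student_answer = st.text_area("Student Answer")

if st.button("Grade Answer"):
    inputs = f"Model Answer: {model_answer}\n\nStudent Answer: {student_answer}\n\nResponse:"

    # Tokenize the inputs; the tensors are already on the CPU
    input_ids = tokenizer(inputs, return_tensors="pt").input_ids

    # Generate the response on the CPU without tracking gradients
    with torch.no_grad():
        outputs = model.generate(input_ids, max_length=200)

    # Assumed decoding and display step (not visible in this diff)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    st.write(response)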