Commit df61c4d by Waseem7711
Parent(s): 1386537

Update app.py

Files changed (1):
  app.py +27 -62

app.py CHANGED
@@ -1,49 +1,3 @@
- Hugging Face's logo
- Hugging Face
- Search models, datasets, users...
- Models
- Datasets
- Spaces
- Posts
- Docs
- Pricing
-
-
-
- Spaces:
-
- Waseem7711
- /
- RAG_Chat_Bot
-
-
- like
- 0
-
- App
- Files
- Community
- Settings
- RAG_Chat_Bot
- /
- app.py
-
- Waseem7711's picture
- Waseem7711
- Update app.py
- 43c74e3
- verified
- 12 minutes ago
- raw
-
- Copy download link
- history
- blame
- edit
- delete
- No virus
-
- 2.21 kB
  import streamlit as st
  from transformers import AutoTokenizer, AutoModelForCausalLM
  import torch
@@ -52,6 +6,7 @@ import fitz  # PyMuPDF for PDF handling
  # Load the model and tokenizer
  @st.cache_resource
  def load_model():
+     # Load the tokenizer and model
      tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
      model = AutoModelForCausalLM.from_pretrained(
          "himmeow/vi-gemma-2b-RAG",
@@ -64,6 +19,7 @@ def load_model():
 
  # Function to extract text from PDF
  def extract_text_from_pdf(pdf_file):
+     # Extract text from the uploaded PDF file using PyMuPDF
      doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
      text = ""
      for page_num in range(doc.page_count):
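The loop body is cut off by the hunk. With PyMuPDF the usual pattern is to load each page and append its get_text() output; a minimal sketch of the same idea against a file on disk (the path-based variant is illustrative, not the app's code):

import fitz  # PyMuPDF

def extract_text_from_path(path):
    # Open a PDF from disk instead of an uploaded stream
    doc = fitz.open(path)
    text = ""
    for page_num in range(doc.page_count):
        text += doc.load_page(page_num).get_text()
    doc.close()
    return text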
@@ -73,45 +29,54 @@ def extract_text_from_pdf(pdf_file):
 
  # Function to generate response from model
  def generate_response(input_text, query, tokenizer, model):
-     prompt = """
+     # Format the input prompt for the model
+     prompt = f"""
      ### Instruction and Input:
      Based on the following context/document:
-     {}
-     Please answer the question: {}
+     {input_text}
+     Please answer the question: {query}
+
      ### Response:
-     {}
      """
-     formatted_input = prompt.format(input_text, query, " ")
-     input_ids = tokenizer(formatted_input, return_tensors="pt")
+     input_ids = tokenizer(prompt, return_tensors="pt")
      if torch.cuda.is_available():
          input_ids = input_ids.to("cuda")
+     # Generate a response from the model
      outputs = model.generate(
          **input_ids,
          max_new_tokens=500,
          no_repeat_ngram_size=5
      )
+     # Decode the generated output into readable text
      return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
- # Streamlit app
+ # Streamlit app main function
  def main():
      st.title("PDF Question Answering with vi-gemma-2b-RAG")
-
+
+     # File uploader widget for PDF files
      pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
-
+
      if pdf_file is not None:
          with st.spinner("Reading the PDF..."):
+             # Extract text from the uploaded PDF
              pdf_text = extract_text_from_pdf(pdf_file)
 
          st.text_area("Extracted Text", pdf_text, height=300)
-
+
+         # Text input for the user's question
          query = st.text_input("Enter your question:")
-
+
          if st.button("Get Answer"):
-             with st.spinner("Generating response..."):
-                 tokenizer, model = load_model()
-                 response = generate_response(pdf_text, query, tokenizer, model)
-                 st.text_area("Response", response, height=200)
+             if query.strip() == "":
+                 st.warning("Please enter a question.")
+             else:
+                 with st.spinner("Generating response..."):
+                     # Load the model and tokenizer
+                     tokenizer, model = load_model()
+                     # Generate the response using the model
+                     response = generate_response(pdf_text, query, tokenizer, model)
+                     st.text_area("Response", response, height=200)
 
  if __name__ == "__main__":
      main()
-
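With the f-string prompt in place, the generation path can be smoke-tested outside Streamlit. A minimal sketch, assuming app.py is importable and its dependencies are installed:

# Exercise the prompt/generation path without the UI
from app import load_model, generate_response

tokenizer, model = load_model()
context = "Gemma is a family of lightweight open models."  # stand-in document text
print(generate_response(context, "What is Gemma?", tokenizer, model))

Note that tokenizer.decode(outputs[0], ...) returns the prompt followed by the completion, so the printed text repeats the context; slicing the output tensor past the input length would isolate just the answer. The app itself is served with streamlit run app.py.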