datascientist22 committed on
Commit
5e9dd30
β€’
1 Parent(s): 6feb14e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -55
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import streamlit as st
 
2
  from transformers import AutoTokenizer, AutoModelForCausalLM
3
  import torch
4
- from PyPDF2 import PdfReader
5
 
6
  # Initialize the tokenizer and model from the saved checkpoint
7
  tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
@@ -15,71 +15,97 @@ model = AutoModelForCausalLM.from_pretrained(
15
  if torch.cuda.is_available():
16
  model.to("cuda")
17
 
18
- # Function to extract text from PDF
19
- def extract_text_from_pdf(pdf_path):
20
- pdf_text = ""
21
- with open(pdf_path, "rb") as file:
22
- reader = PdfReader(file)
23
- for page_num in range(len(reader.pages)):
24
- page = reader.pages[page_num]
25
- text = page.extract_text()
26
- pdf_text += text + "\n"
27
- return pdf_text
28
-
29
- # Streamlit app
30
- st.write("**Created by: Engr. Hamesh Raj** [LinkedIn](https://www.linkedin.com/in/datascientisthameshraj/)")
31
-
32
- st.title("πŸ“„ PDF Question Answering")
33
-
34
- # Sidebar for PDF upload
35
- uploaded_file = st.sidebar.file_uploader("Upload a PDF file", type="pdf")
36
-
37
- if uploaded_file is not None:
38
- # Extract text from the uploaded PDF
39
- pdf_text = extract_text_from_pdf(uploaded_file)
40
-
41
- st.text_area("Extracted PDF Text", pdf_text, height=200)
42
-
43
- # Input field for the user's question
44
- user_query = st.text_input("Enter your question:")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- # Display the submit button below the input field
47
- if st.button("Submit") and user_query:
48
- # Format the input text
49
- input_text = f"{user_query}\n\n### Response:\n"
50
 
51
- # Encode the input text into input ids
52
- input_ids = tokenizer(input_text, return_tensors="pt")
53
 
54
  # Use GPU for input ids if available
55
  if torch.cuda.is_available():
56
  input_ids = input_ids.to("cuda")
57
 
58
- # Generate text using the model
59
  outputs = model.generate(
60
  **input_ids,
61
- max_new_tokens=150, # Limit the number of tokens generated
62
- no_repeat_ngram_size=5, # Prevent repetition of 5-gram phrases
63
  )
64
 
65
- # Decode and print the results
66
- answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
67
 
68
- # Display question and answer
69
- st.write(f"**Q{len(st.session_state) + 1}: {user_query}**")
70
- st.write(f"**A{len(st.session_state) + 1}: {answer.strip()}**")
71
-
72
- # Store in session state for chat history
73
- if "history" not in st.session_state:
74
- st.session_state.history = []
75
-
76
- st.session_state.history.append({
77
- "question": user_query,
78
- "answer": answer.strip()
79
- })
80
 
81
  # Display chat history
82
- if "history" in st.session_state:
83
- for i, qa in enumerate(st.session_state.history):
84
- st.write(f"**Q{i + 1}: {qa['question']}**")
85
- st.write(f"**A{i + 1}: {qa['answer']}**")
 
 
1
  import streamlit as st
2
+ from PyPDF2 import PdfReader
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import torch
 
5
 
6
  # Initialize the tokenizer and model from the saved checkpoint
7
  tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
 
15
# Prefer the GPU when CUDA is available; the model stays on CPU otherwise.
_device = "cuda" if torch.cuda.is_available() else None
if _device is not None:
    model.to(_device)
17
 
18
# ---------------------------------------------------------------------------
# Streamlit page layout and theming
# ---------------------------------------------------------------------------
st.set_page_config(page_title="RAG PDF Chatbot", layout="wide")

# Sidebar: file-upload widget plus creator credit.
st.sidebar.title("πŸ“ PDF Upload")
uploaded_files = st.sidebar.file_uploader(
    "Upload PDF files",
    type=["pdf"],
    accept_multiple_files=True,
)

# Gradient background for the sidebar, injected as raw CSS.
_sidebar_css = """
<style>
.sidebar .sidebar-content {
background: linear-gradient(135deg, #ff9a9e, #fad0c4 40%, #fad0c4 60%, #ff9a9e);
color: white;
}
</style>
"""
st.sidebar.markdown(_sidebar_css, unsafe_allow_html=True)

st.sidebar.markdown(
    "### Created by: [Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)"
)

# Centered, colored main title.
st.markdown(
    "<h1 style='text-align: center; color: #ff6f61;'>πŸ“œ RAG PDF Chatbot</h1>",
    unsafe_allow_html=True,
)

# Gradient background for the main content area.
_page_css = """
<style>
body {
background: linear-gradient(135deg, #89f7fe 0%, #66a6ff 100%);
}
</style>
"""
st.markdown(_page_css, unsafe_allow_html=True)

# Query input and submit control.
query = st.text_input("Enter your query here:")
submit_button = st.button("Submit")

# Per-session chat transcript; survives Streamlit reruns.
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
60
+
61
# Function to extract text from PDF files
def extract_text_from_pdfs(files):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        files: iterable of file-like objects readable by ``PyPDF2.PdfReader``
            (e.g. Streamlit ``UploadedFile`` instances).

    Returns:
        str: all page texts joined in order, one newline appended per page.
    """
    parts = []
    for uploaded_file in files:
        reader = PdfReader(uploaded_file)
        for page in reader.pages:
            # extract_text() may return None for image-only/empty pages;
            # substitute "" so concatenation never raises TypeError.
            parts.append((page.extract_text() or "") + "\n")
    # join() avoids the quadratic cost of repeated string +=.
    return "".join(parts)
69
+
70
# ---------------------------------------------------------------------------
# Query handling: build a RAG prompt from the uploaded PDFs and generate an
# answer with the causal-LM checkpoint loaded above.
# ---------------------------------------------------------------------------
if submit_button and query:
    if not uploaded_files:
        # Guard: with no PDFs there is no context to answer from, and the
        # prompt below would otherwise be undefined — tell the user why
        # nothing happened instead of failing silently.
        st.warning("Please upload at least one PDF file before submitting a query.")
    else:
        # Concatenated text of every page of every uploaded PDF.
        pdf_text = extract_text_from_pdfs(uploaded_files)

        # Prompt template used by this checkpoint: instruction + context,
        # then a "### Response:" marker for the model to complete.
        prompt = f"""
### Instruction and Input:
Based on the following context/document:
{pdf_text}
Please answer the question: {query}

### Response:
"""

        # Tokenize the prompt; returns a BatchEncoding holding input_ids
        # and attention_mask (not just ids, hence the name).
        model_inputs = tokenizer(prompt, return_tensors="pt")

        # Keep the inputs on the same device as the model.
        if torch.cuda.is_available():
            model_inputs = model_inputs.to("cuda")

        # Generate the response.
        outputs = model.generate(
            **model_inputs,
            max_new_tokens=500,      # cap the length of the generated answer
            no_repeat_ngram_size=5,  # suppress verbatim 5-gram repetition
        )

        # NOTE(review): decoding outputs[0] in full echoes the whole prompt
        # before the answer; slicing off the prompt tokens would display
        # only the reply — confirm desired behavior before changing.
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Record the turn for the transcript rendered below.
        st.session_state.chat_history.append((query, response))
 
 
 
 
 
 
 
 
 
 
105
 
106
# ---------------------------------------------------------------------------
# Chat transcript: render every (question, answer) pair from this session,
# oldest first. Iterating an empty list simply renders nothing.
# ---------------------------------------------------------------------------
for turn, (question, answer) in enumerate(st.session_state.chat_history, start=1):
    st.markdown(f"**Question {turn}:** {question}")
    st.markdown(f"**Answer:** {answer}")
    st.write("---")