datascientist22 committed on
Commit
5a93818
•
1 Parent(s): e93e1aa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -77
app.py CHANGED
@@ -1,95 +1,67 @@
1
  import streamlit as st
2
- from PyPDF2 import PdfReader
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import torch
 
5
 
6
- # Cache the model and tokenizer loading
7
- @st.cache_resource
8
- def load_model():
9
- tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
10
- model = AutoModelForCausalLM.from_pretrained(
11
- "himmeow/vi-gemma-2b-RAG",
12
- device_map="auto",
13
- torch_dtype=torch.bfloat16
14
- )
15
- if torch.cuda.is_available():
16
- model.to("cuda")
17
- return tokenizer, model
18
-
19
- # Cache the text extraction from PDFs
20
- @st.cache_data
21
- def extract_text_from_pdfs(files):
22
- pdf_text = ""
23
- for file in files:
24
  reader = PdfReader(file)
25
  for page_num in range(len(reader.pages)):
26
  page = reader.pages[page_num]
27
- pdf_text += page.extract_text() + "\n"
28
- return pdf_text
29
-
30
- # Load the model and tokenizer
31
- tokenizer, model = load_model()
32
-
33
- # Sidebar for PDF file upload
34
- st.sidebar.title("📂 Upload PDFs")
35
- uploaded_files = st.sidebar.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
36
-
37
- # Initialize session state
38
- if "history" not in st.session_state:
39
- st.session_state.history = []
40
 
41
- # Extract text from PDFs and maintain session state
42
- if uploaded_files:
43
- if "pdf_text" not in st.session_state:
44
- st.session_state.pdf_text = extract_text_from_pdfs(uploaded_files)
45
 
46
- # Main interface
47
- st.title("💬 RAG PDF Chatbot")
48
- st.markdown("Ask questions based on the uploaded PDF documents.")
49
 
50
- # Input for user query
51
- query = st.text_input("Enter your question:")
 
52
 
53
- # Process and respond to user query
54
- if st.button("Submit"):
55
- if uploaded_files and query:
56
- with st.spinner("Generating response..."):
57
- # Prepare the input data
58
- prompt = """
59
- ### Instruction and Input:
60
- Based on the following context/document:
61
- {}
62
- Please answer the question: {}
63
 
64
- ### Response:
65
- """.format(st.session_state.pdf_text, query)
66
 
67
- # Encode the input text
68
- input_ids = tokenizer(prompt, return_tensors="pt")
 
69
 
70
- # Use GPU for input ids if available
71
- if torch.cuda.is_available():
72
- input_ids = input_ids.to("cuda")
73
 
74
- # Generate text using the model
75
- outputs = model.generate(
76
- **input_ids,
77
- max_new_tokens=500, # Limit tokens to speed up generation
78
- no_repeat_ngram_size=3, # Avoid repetition
79
- do_sample=True, # Sampling for variability
80
- temperature=0.7 # Control randomness
81
- )
82
 
83
- # Decode and display the results
84
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
85
- st.session_state.history.append({"question": query, "answer": response})
 
 
 
86
 
87
- # Display chat history
88
- if st.session_state.history:
89
- for i, qa in enumerate(reversed(st.session_state.history), 1):
90
- st.markdown(f"**Q{i}:** {qa['question']}")
91
- st.markdown(f"**A{i}:** {qa['answer']}")
92
 
93
- # Footer with author information
94
- st.sidebar.markdown("### Created by: [Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)")
95
- st.sidebar.markdown("## 🗂️ RAG PDF Chatbot")
 
1
  import streamlit as st
 
2
  from transformers import AutoTokenizer, AutoModelForCausalLM
3
  import torch
4
+ from PyPDF2 import PdfReader
5
 
6
+ # Initialize the tokenizer and model from the saved checkpoint
7
+ tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
8
+ model = AutoModelForCausalLM.from_pretrained(
9
+ "himmeow/vi-gemma-2b-RAG",
10
+ device_map="auto",
11
+ torch_dtype=torch.bfloat16
12
+ )
13
+
14
+ # Use GPU if available
15
+ if torch.cuda.is_available():
16
+ model.to("cuda")
17
+
18
+ # Function to extract text from PDF
19
+ def extract_text_from_pdf(pdf_path):
20
+ pdf_Text = ""
21
+ with open(pdf_path, "rb") as file:
 
 
22
  reader = PdfReader(file)
23
  for page_num in range(len(reader.pages)):
24
  page = reader.pages[page_num]
25
+ text = page.extract_text()
26
+ pdf_Text += text + "\n"
27
+ return pdf_Text
 
 
 
 
 
 
 
 
 
 
28
 
29
+ # Streamlit app
30
+ st.title("📄 PDF Question Answering")
 
 
31
 
32
+ # Sidebar for PDF upload
33
+ uploaded_file = st.sidebar.file_uploader("Upload a PDF file", type="pdf")
 
34
 
35
+ if uploaded_file is not None:
36
+ # Extract text from the uploaded PDF
37
+ pdf_text = extract_text_from_pdf(uploaded_file)
38
 
39
+ st.text_area("Extracted PDF Text", pdf_text, height=200)
 
 
 
 
 
 
 
 
 
40
 
41
+ # Input field for the user's question
42
+ user_query = st.text_input("Enter your question:")
43
 
44
+ if st.button("Submit") and user_query:
45
+ # Format the input text
46
+ input_text = f"{user_query}\n\n### Response:\n"
47
 
48
+ # Encode the input text into input ids
49
+ input_ids = tokenizer(input_text, return_tensors="pt")
 
50
 
51
+ # Use GPU for input ids if available
52
+ if torch.cuda.is_available():
53
+ input_ids = input_ids.to("cuda")
 
 
 
 
 
54
 
55
+ # Generate text using the model
56
+ outputs = model.generate(
57
+ **input_ids,
58
+ max_new_tokens=150, # Limit the number of tokens generated
59
+ no_repeat_ngram_size=5, # Prevent repetition of 5-gram phrases
60
+ )
61
 
62
+ # Decode and print the results
63
+ answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
 
64
 
65
+ # Display question and answer
66
+ st.write(f"**Q: {user_query}**")
67
+ st.write(f"**A: {answer.strip()}**")