Spaces:

datascientist22
/

rag-pdfQA-chatbot

Sleeping

App Files Files Community

datascientist22 commited on 21 days ago

Commit

5a93818

•

1 Parent(s): e93e1aa

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -77

app.py CHANGED Viewed

@@ -1,95 +1,67 @@
 import streamlit as st
-from PyPDF2 import PdfReader
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
-# Cache the model and tokenizer loading
-@st.cache_resource
-def load_model():
-    tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
-    model = AutoModelForCausalLM.from_pretrained(
-        "himmeow/vi-gemma-2b-RAG",
-        device_map="auto",
-        torch_dtype=torch.bfloat16
-    )
-    if torch.cuda.is_available():
-        model.to("cuda")
-    return tokenizer, model
-# Cache the text extraction from PDFs
-@st.cache_data
-def extract_text_from_pdfs(files):
-    pdf_text = ""
-    for file in files:
         reader = PdfReader(file)
         for page_num in range(len(reader.pages)):
             page = reader.pages[page_num]
-            pdf_text += page.extract_text() + "\n"
-    return pdf_text
-# Load the model and tokenizer
-tokenizer, model = load_model()
-# Sidebar for PDF file upload
-st.sidebar.title("📂 Upload PDFs")
-uploaded_files = st.sidebar.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
-# Initialize session state
-if "history" not in st.session_state:
-    st.session_state.history = []
-# Extract text from PDFs and maintain session state
-if uploaded_files:
-    if "pdf_text" not in st.session_state:
-        st.session_state.pdf_text = extract_text_from_pdfs(uploaded_files)
-# Main interface
-st.title("💬 RAG PDF Chatbot")
-st.markdown("Ask questions based on the uploaded PDF documents.")
-# Input for user query
-query = st.text_input("Enter your question:")
-# Process and respond to user query
-if st.button("Submit"):
-    if uploaded_files and query:
-        with st.spinner("Generating response..."):
-            # Prepare the input data
-            prompt = """
-            ### Instruction and Input:
-            Based on the following context/document:
-            {}
-            Please answer the question: {}
-            ### Response:
-            """.format(st.session_state.pdf_text, query)
-            # Encode the input text
-            input_ids = tokenizer(prompt, return_tensors="pt")
-            # Use GPU for input ids if available
-            if torch.cuda.is_available():
-                input_ids = input_ids.to("cuda")
-            # Generate text using the model
-            outputs = model.generate(
-                **input_ids,
-                max_new_tokens=500,  # Limit tokens to speed up generation
-                no_repeat_ngram_size=3,  # Avoid repetition
-                do_sample=True,  # Sampling for variability
-                temperature=0.7  # Control randomness
-            )
-            # Decode and display the results
-            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-            st.session_state.history.append({"question": query, "answer": response})
-# Display chat history
-if st.session_state.history:
-    for i, qa in enumerate(reversed(st.session_state.history), 1):
-        st.markdown(f"**Q{i}:** {qa['question']}")
-        st.markdown(f"**A{i}:** {qa['answer']}")
-# Footer with author information
-st.sidebar.markdown("### Created by: [Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)")
-st.sidebar.markdown("## 🗂️ RAG PDF Chatbot")

 import streamlit as st
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
+from PyPDF2 import PdfReader
+# Initialize the tokenizer and model from the saved checkpoint
+tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
+model = AutoModelForCausalLM.from_pretrained(
+    "himmeow/vi-gemma-2b-RAG",
+    device_map="auto",
+    torch_dtype=torch.bfloat16
+)
+# Use GPU if available
+if torch.cuda.is_available():
+    model.to("cuda")
+# Function to extract text from PDF
+def extract_text_from_pdf(pdf_path):
+    pdf_Text = ""
+    with open(pdf_path, "rb") as file:
         reader = PdfReader(file)
         for page_num in range(len(reader.pages)):
             page = reader.pages[page_num]
+            text = page.extract_text()
+            pdf_Text += text + "\n"
+    return pdf_Text
+# Streamlit app
+st.title("📄 PDF Question Answering")
+# Sidebar for PDF upload
+uploaded_file = st.sidebar.file_uploader("Upload a PDF file", type="pdf")
+if uploaded_file is not None:
+    # Extract text from the uploaded PDF
+    pdf_text = extract_text_from_pdf(uploaded_file)
+    st.text_area("Extracted PDF Text", pdf_text, height=200)
+    # Input field for the user's question
+    user_query = st.text_input("Enter your question:")
+    if st.button("Submit") and user_query:
+        # Format the input text
+        input_text = f"{user_query}\n\n### Response:\n"
+        # Encode the input text into input ids
+        input_ids = tokenizer(input_text, return_tensors="pt")
+        # Use GPU for input ids if available
+        if torch.cuda.is_available():
+            input_ids = input_ids.to("cuda")
+        # Generate text using the model
+        outputs = model.generate(
+            **input_ids,
+            max_new_tokens=150,  # Limit the number of tokens generated
+            no_repeat_ngram_size=5,  # Prevent repetition of 5-gram phrases
+        )
+        # Decode and print the results
+        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Display question and answer
+        st.write(f"**Q: {user_query}**")
+        st.write(f"**A: {answer.strip()}**")