import streamlit as st
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# App configuration
st.set_page_config(page_title="PDF Chatbot", layout="wide")
st.markdown(
    """
    <style>
    body {
        background: linear-gradient(90deg, rgba(255,224,230,1) 0%, rgba(224,255,255,1) 50%, rgba(224,240,255,1) 100%);
        color: #000;
    }
    </style>
    """,
    unsafe_allow_html=True
)

# Title and "Created by" section
st.markdown("<h1 style='text-align: center; color: #FF69B4;'>📄 PDF RAG Chatbot</h1>", unsafe_allow_html=True)
st.markdown(
    "<h4 style='text-align: center;'>Created by: <a href='https://www.linkedin.com/in/datascientisthameshraj/' style='color:#FF4500;'>Engr. Hamesh Raj</a></h4>",
    unsafe_allow_html=True
)

# Sidebar for PDF file upload
uploaded_files = st.sidebar.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)

# Query input
query = st.text_input("Ask a question about the uploaded PDFs:")

# Initialize session state to store the conversation
if "conversation" not in st.session_state:
    st.session_state.conversation = []

# Function to extract text from every page of each uploaded PDF
def extract_text_from_pdfs(files):
    pdf_text = ""
    for file in files:
        reader = PdfReader(file)
        for page in reader.pages:
            # extract_text() can return None for pages without a text layer
            pdf_text += (page.extract_text() or "") + "\n"
    return pdf_text

# Load model and tokenizer once and cache them across Streamlit reruns
@st.cache_resource
def load_model():
    tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
    model = AutoModelForCausalLM.from_pretrained(
        "himmeow/vi-gemma-2b-RAG",
        device_map="auto",  # places the model on the GPU automatically when one is available, so no explicit .to("cuda") is needed
        torch_dtype=torch.bfloat16
    )
    return tokenizer, model

# Process the uploaded PDFs and respond to the user query
if st.button("Submit"):
    if uploaded_files and query:
        pdf_text = extract_text_from_pdfs(uploaded_files)
        tokenizer, model = load_model()

        prompt = """
### Instruction and Input:
Based on the following context/document:
{}
Please answer the question: {}
### Response:
{}
"""
        input_text = prompt.format(pdf_text, query, " ")

        input_ids = tokenizer(input_text, return_tensors="pt")
        if torch.cuda.is_available():
            input_ids = input_ids.to("cuda")

        outputs = model.generate(
            **input_ids,
            max_new_tokens=500,
            no_repeat_ngram_size=5,
        )
        # Note: decode() returns the prompt followed by the generated answer
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Store the conversation, newest first
        st.session_state.conversation.insert(0, {"question": query, "answer": answer})
    else:
        st.warning("Please upload at least one PDF and enter a question before submitting.")

# Display conversation
if st.session_state.conversation:
    st.markdown("## Previous Conversations")
    for qa in st.session_state.conversation:
        st.markdown(f"**Q: {qa['question']}**")
        st.markdown(f"**A: {qa['answer']}**")
        st.markdown("---")
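
# To try the app locally (assuming this file is saved as app.py; the package list below is
# inferred from the imports above and from device_map="auto" requiring accelerate, not pinned by the source):
#   pip install streamlit PyPDF2 transformers torch accelerate
#   streamlit run app.py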