Spaces:

datascientist22
/

rag-pdfQA-chatbot

Sleeping

App Files Files Community

rag-pdfQA-chatbot / app.py

datascientist22

Update app.py

249d7ba verified 2 months ago

raw

history blame

2.46 kB

	import streamlit as st
	from PyPDF2 import PdfReader
	from transformers import AutoTokenizer, AutoModelForCausalLM
	import torch
	from io import BytesIO

	# Initialize the tokenizer and model
	tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
	model = AutoModelForCausalLM.from_pretrained(
	"himmeow/vi-gemma-2b-RAG",
	device_map="auto",
	torch_dtype=torch.bfloat16
	)

	# Use GPU if available
	if torch.cuda.is_available():
	model.to("cuda")

	# Streamlit app layout
	st.set_page_config(page_title="📄 PDF Query App", page_icon=":book:", layout="wide")
	st.title("📄 PDF Query App")
	st.sidebar.title("Upload File and Query")

	# Sidebar: File Upload
	uploaded_file = st.sidebar.file_uploader("Upload your PDF file", type="pdf")

	# Sidebar: Query Input
	query = st.sidebar.text_input("Enter your query:")

	# Sidebar: Submit Button
	if st.sidebar.button("Submit"):
	if uploaded_file and query:
	# Read the PDF file
	pdf_text = ""
	with BytesIO(uploaded_file.read()) as file:
	reader = PdfReader(file)
	for page_num in range(len(reader.pages)):
	page = reader.pages[page_num]
	text = page.extract_text()
	pdf_text += text + "\n"

	# Define the prompt format for the model
	prompt = """
	### Instruction and Input:
	Based on the following context/document:
	{}
	Please answer the question: {}

	### Response:
	{}
	"""

	# Format the input text
	input_text = prompt.format(pdf_text, query, " ")

	# Encode the input text into input ids
	input_ids = tokenizer(input_text, return_tensors="pt")

	# Use GPU for input ids if available
	if torch.cuda.is_available():
	input_ids = input_ids.to("cuda")

	# Generate text using the model
	outputs = model.generate(
	**input_ids,
	max_new_tokens=500, # Limit the number of tokens generated
	no_repeat_ngram_size=5, # Prevent repetition of 5-gram phrases
	)

	# Decode and display the results
	response = tokenizer.decode(outputs[0], skip_special_tokens=True)
	st.write(response)
	else:
	st.sidebar.error("Please upload a PDF file and enter a query.")

	# Footer with LinkedIn link
	st.sidebar.write("---")
	st.sidebar.write("Created by: [Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)")