import streamlit as st
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from io import BytesIO

# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
model = AutoModelForCausalLM.from_pretrained(
    "himmeow/vi-gemma-2b-RAG",
    device_map="auto",          # Automatically place the model on GPU if available
    torch_dtype=torch.float16,  # Use FP16 for faster computation if supported
)

# Streamlit app layout
st.set_page_config(page_title="📄 PDF Query App", page_icon=":book:", layout="wide")
st.title("📄 PDF Query App")
st.sidebar.title("Upload File and Query")

# Sidebar: File Upload
uploaded_file = st.sidebar.file_uploader("Upload your PDF file", type="pdf")

# Sidebar: Query Input
query = st.sidebar.text_input("Enter your query:")

# Sidebar: Submit Button
if st.sidebar.button("Submit"):
    if uploaded_file and query:
        # Read the PDF file
        pdf_text = ""
        with BytesIO(uploaded_file.read()) as file:
            reader = PdfReader(file)
            for page in reader.pages:
                text = page.extract_text()
                pdf_text += (text or "") + "\n"  # extract_text() can return None for image-only pages

        # Define the prompt format for the model
        prompt = f"""
{pdf_text}

Please answer the question: {query}
"""

        # Truncate the prompt if it's too long for the model
        max_input_length = 2048  # Adjust based on the model's max length
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_length)

        # Move the inputs to the GPU if available
        if torch.cuda.is_available():
            inputs = inputs.to("cuda")

        # Generate text using the model
        outputs = model.generate(
            **inputs,
            max_new_tokens=250,      # Reduce the number of tokens generated for faster results
            no_repeat_ngram_size=3,  # Prevent repetition
            num_beams=2,             # Use beam search with fewer beams for faster results
        )

        # Decode and display the results
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        st.write(response)
    else:
        st.sidebar.warning("Please upload a PDF file and enter a query.")
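
# Note: the app above truncates any prompt longer than max_input_length tokens,
# so the end of a long PDF is silently dropped before generation. One possible
# workaround is to split the extracted text into overlapping chunks and query
# each chunk in turn. The sketch below is only illustrative: chunk_text and its
# default sizes are assumptions, not part of the original app or the
# vi-gemma-2b-RAG model.
def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 200):
    """Split text into character-based chunks that share a small overlap."""
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        # Step forward by chunk_size - overlap so neighbouring chunks share context
        start += chunk_size - overlap
    return chunks

# Possible usage: build one prompt per chunk instead of one prompt for the whole PDF,
# e.g. `for chunk in chunk_text(pdf_text): ...`, then combine or rank the answers.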