datascientist22 committed on
Commit
62e64fb
•
1 Parent(s): 977e550

Update app.py

Files changed (1)
  1. app.py +56 -98
app.py CHANGED
@@ -3,114 +3,72 @@ from PyPDF2 import PdfReader
  from transformers import AutoTokenizer, AutoModelForCausalLM
  import torch
 
- # Set up the Streamlit app layout
- st.set_page_config(page_title="RAG PDF Chatbot", layout="wide")
-
- # Sidebar with file upload and app title with creator details
- st.sidebar.title("📁 PDF Upload")
- uploaded_files = st.sidebar.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)
-
- # Multicolor sidebar background
- st.sidebar.markdown("""
- <style>
- .sidebar .sidebar-content {
-     background: linear-gradient(135deg, #ff9a9e, #fad0c4 40%, #fad0c4 60%, #ff9a9e);
-     color: white;
- }
- </style>
- """, unsafe_allow_html=True)
-
- st.sidebar.markdown("""
- ### Created by: [Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)
- """)
-
- # Main title
- st.markdown("""
- <h1 style='text-align: center; color: #ff6f61;'>📜 RAG PDF Chatbot</h1>
- """, unsafe_allow_html=True)
-
- # Multicolor background for the main content
- st.markdown("""
- <style>
- body {
-     background: linear-gradient(135deg, #89f7fe 0%, #66a6ff 100%);
- }
- </style>
- """, unsafe_allow_html=True)
-
- # Input field for user queries
- query = st.text_input("Enter your query here:")
- submit_button = st.button("Submit")
-
- # Initialize chat history
- if 'chat_history' not in st.session_state:
-     st.session_state.chat_history = []
-
- # Load the tokenizer and model
- try:
-     tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
-     model = AutoModelForCausalLM.from_pretrained("himmeow/vi-gemma-2b-RAG")
-     device = "cuda" if torch.cuda.is_available() else "cpu"
-     model = model.to(device)
- except Exception as e:
-     st.error(f"Error loading model or tokenizer: {e}")
-     st.stop()
-
- # Function to extract text from PDF files
- def extract_text_from_pdfs(files):
-     text = ""
-     for uploaded_file in files:
-         try:
-             reader = PdfReader(uploaded_file)
-             for page in reader.pages:
-                 text += page.extract_text() + "\n"
-         except Exception as e:
-             st.error(f"Error reading PDF file: {e}")
-     return text
-
- # Handle the query submission
- if submit_button:
-     if not uploaded_files:
-         st.warning("⚠️ Please upload at least one PDF file before submitting.")
-     elif not query:
-         st.warning("⚠️ Please enter a query before submitting.")
-     else:
-         try:
-             # Extract text from uploaded PDFs
-             pdf_text = extract_text_from_pdfs(uploaded_files)
-             if not pdf_text.strip():
-                 st.warning("⚠️ No text found in the uploaded PDFs.")
-             else:
-                 # Prepare the input prompt
-                 prompt = f"""
-                 Based on the following context/document:
-                 {pdf_text}
-                 Please answer the question: {query}
-                 """
-
-                 # Encode the input text
-                 inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=tokenizer.model_max_length)
-
-                 # Generate the response
-                 outputs = model.generate(
-                     input_ids=inputs['input_ids'].to(device),
-                     max_new_tokens=500,
-                     no_repeat_ngram_size=5,
-                 )
-
-                 # Decode the response and clean it
-                 response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-                 clean_response = response.strip()
-
-                 # Update chat history
-                 st.session_state.chat_history.append((query, clean_response))
-
-         except Exception as e:
-             st.error(f"An error occurred during processing: {e}")
-
- # Display chat history
- if st.session_state.chat_history:
-     for q, a in st.session_state.chat_history:
-         st.markdown(f"**Question:** {q}")
-         st.markdown(f"**Answer:** {a}")
-         st.write("---")
+ # Initialize the tokenizer and model
+ tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
+ model = AutoModelForCausalLM.from_pretrained(
+     "himmeow/vi-gemma-2b-RAG",
+     device_map="auto",
+     torch_dtype=torch.bfloat16
+ )
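+ # device_map="auto" lets accelerate pick the device placement for the weights
+ # and torch.bfloat16 roughly halves memory versus float32, so the explicit
+ # .to("cuda") below is usually redundant on a single-GPU machine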
+
+ # Use GPU if available
+ if torch.cuda.is_available():
+     model.to("cuda")
+
+ # Streamlit app layout
+ st.set_page_config(page_title="📄 PDF Query App", page_icon=":book:", layout="wide")
+ st.title("📄 PDF Query App")
+ st.sidebar.title("Upload File and Query")
+
+ # Sidebar: File Upload
+ uploaded_file = st.sidebar.file_uploader("Upload your PDF file", type="pdf")
+
+ # Sidebar: Query Input
+ query = st.sidebar.text_input("Enter your query:")
+
+ # Handle file upload
+ if uploaded_file and query:
+     # Read the PDF file; st.file_uploader returns a file-like object that
+     # PdfReader accepts directly (open() expects a filesystem path)
+     pdf_text = ""
+     reader = PdfReader(uploaded_file)
+     for page_num in range(len(reader.pages)):
+         page = reader.pages[page_num]
+         text = page.extract_text() or ""  # guard: extract_text() may return None
+         pdf_text += text + "\n"
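+     # Note: extract_text() recovers little or nothing from scanned/image-only
+     # pages; such PDFs would need OCR, which this app does not attempt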
+
+     # Define the prompt format for the model
+     prompt = """
+     ### Instruction and Input:
+     Based on the following context/document:
+     {}
+     Please answer the question: {}
+
+     ### Response:
+     {}
+     """
+
+     # Format the input text
+     input_text = prompt.format(pdf_text, query, " ")
+
+     # Encode the input text into input ids
+     input_ids = tokenizer(input_text, return_tensors="pt")
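+     # tokenizer(...) returns a dict-like BatchEncoding (input_ids and
+     # attention_mask) that is unpacked into model.generate() below; unlike
+     # the previous version, no truncation is applied, so a very long PDF
+     # can exceed the model's context window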
+
+     # Use GPU for input ids if available
+     if torch.cuda.is_available():
+         input_ids = input_ids.to("cuda")
+
+     # Generate text using the model
+     outputs = model.generate(
+         **input_ids,
+         max_new_tokens=500,  # Limit the number of tokens generated
+         no_repeat_ngram_size=5,  # Prevent repetition of 5-gram phrases
+     )
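+     # outputs[0] holds the prompt tokens followed by the new tokens, so the
+     # decoded string echoes the full prompt before the answer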
+
+     # Decode and display the results
+     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     st.write(response)
+
+ # Footer with LinkedIn link
+ st.sidebar.write("---")
+ st.sidebar.write("Created by: [Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)")