datascientist22 committed
Commit 249d7ba
1 Parent(s): 62e64fb

Update app.py

Files changed (1)
  1. app.py +39 -35
app.py CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
 from PyPDF2 import PdfReader
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
+from io import BytesIO
 
 # Initialize the tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
@@ -26,48 +27,51 @@ uploaded_file = st.sidebar.file_uploader("Upload your PDF file", type="pdf")
 # Sidebar: Query Input
 query = st.sidebar.text_input("Enter your query:")
 
-# Handle file upload
-if uploaded_file and query:
-    # Read the PDF file
-    pdf_text = ""
-    with open(uploaded_file, "rb") as file:
-        reader = PdfReader(file)
-        for page_num in range(len(reader.pages)):
-            page = reader.pages[page_num]
-            text = page.extract_text()
-            pdf_text += text + "\n"
-
-    # Define the prompt format for the model
-    prompt = """
-    ### Instruction and Input:
-    Based on the following context/document:
-    {}
-    Please answer the question: {}
-
-    ### Response:
-    {}
-    """
-
-    # Format the input text
-    input_text = prompt.format(pdf_text, query, " ")
-
-    # Encode the input text into input ids
-    input_ids = tokenizer(input_text, return_tensors="pt")
-
-    # Use GPU for input ids if available
-    if torch.cuda.is_available():
-        input_ids = input_ids.to("cuda")
-
-    # Generate text using the model
-    outputs = model.generate(
-        **input_ids,
-        max_new_tokens=500,  # Limit the number of tokens generated
-        no_repeat_ngram_size=5,  # Prevent repetition of 5-gram phrases
-    )
-
-    # Decode and display the results
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    st.write(response)
+# Sidebar: Submit Button
+if st.sidebar.button("Submit"):
+    if uploaded_file and query:
+        # Read the PDF file
+        pdf_text = ""
+        with BytesIO(uploaded_file.read()) as file:
+            reader = PdfReader(file)
+            for page_num in range(len(reader.pages)):
+                page = reader.pages[page_num]
+                text = page.extract_text()
+                pdf_text += text + "\n"
+
+        # Define the prompt format for the model
+        prompt = """
+        ### Instruction and Input:
+        Based on the following context/document:
+        {}
+        Please answer the question: {}
+
+        ### Response:
+        {}
+        """
+
+        # Format the input text
+        input_text = prompt.format(pdf_text, query, " ")
+
+        # Encode the input text into input ids
+        input_ids = tokenizer(input_text, return_tensors="pt")
+
+        # Use GPU for input ids if available
+        if torch.cuda.is_available():
+            input_ids = input_ids.to("cuda")
+
+        # Generate text using the model
+        outputs = model.generate(
+            **input_ids,
+            max_new_tokens=500,  # Limit the number of tokens generated
+            no_repeat_ngram_size=5,  # Prevent repetition of 5-gram phrases
+        )
+
+        # Decode and display the results
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        st.write(response)
+    else:
+        st.sidebar.error("Please upload a PDF file and enter a query.")
 
 # Footer with LinkedIn link
 st.sidebar.write("---")
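Two behavior notes on this change. First, st.file_uploader returns an UploadedFile object (an in-memory, file-like buffer), not a filesystem path, so the old open(uploaded_file, "rb") call raised a TypeError; wrapping the uploaded bytes in BytesIO hands PdfReader the seekable binary stream it expects. Second, Streamlit reruns the whole script on every widget interaction, and st.sidebar.button("Submit") returns True only on the rerun triggered by its click, so generation now happens once per explicit submit instead of on every keystroke in the query box. A minimal sketch of the PDF-reading pattern in isolation (the extract_pdf_text helper and the sample.pdf path are illustrative, not part of the app):

from io import BytesIO
from PyPDF2 import PdfReader

def extract_pdf_text(data: bytes) -> str:
    """Concatenate the extracted text of every page of an in-memory PDF."""
    reader = PdfReader(BytesIO(data))  # PdfReader accepts any seekable binary stream
    # `or ""` guards pages that yield no extractable text
    return "\n".join(page.extract_text() or "" for page in reader.pages)

with open("sample.pdf", "rb") as f:  # stand-in for uploaded_file.read() in the app
    print(extract_pdf_text(f.read()))

Since UploadedFile itself supports read and seek, PdfReader(uploaded_file) would also have worked without the extra copy; the BytesIO wrap is the explicit form of the same idea.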
 
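One thing the new code leaves implicit: input_ids.to("cuda") moves only the input tensors, and model.generate will raise a device-mismatch error unless the model's weights are on the same device; the model-loading code sits outside this hunk, so that is not visible here. A sketch of the usual pattern with the same checkpoint, where the single device variable and the example prompt are illustrative assumptions, not lines from this app:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Pick one device and use it for both the weights and the inputs
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("himmeow/vi-gemma-2b-RAG")
model = AutoModelForCausalLM.from_pretrained("himmeow/vi-gemma-2b-RAG").to(device)

# BatchEncoding.to() moves every tensor in the encoding at once
inputs = tokenizer("Hello", return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=500, no_repeat_ngram_size=5)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Keeping both behind one device variable also lets the app fall back to CPU cleanly when no GPU is present.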