Luca Foppiano committed
Commit 280ee18 • 2 Parent(s): 7b33dad aeef8cf

Merge pull request #16 from lfoppiano/pdf-render

.streamlit/config.toml CHANGED
@@ -2,4 +2,7 @@
 level = "info"
 
 [browser]
-gatherUsageStats = false
+gatherUsageStats = true
+
+[ui]
+hideTopBar = true
document_qa/document_qa_engine.py CHANGED
@@ -257,7 +257,7 @@ class DocumentQAEngine:
             self.embeddings_dict[hash] = Chroma.from_texts(texts,
                                                            embedding=self.embedding_function,
                                                            metadatas=metadata,
-                                                           collection_name=hash)
+                                                           collection_name=hash)
         else:
             # if 'documents' in self.embeddings_dict[hash].get() and len(self.embeddings_dict[hash].get()['documents']) == 0:
             #     self.embeddings_dict[hash].delete(ids=self.embeddings_dict[hash].get()['ids'])
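For readers skimming the diff: `Chroma.from_texts` is LangChain's constructor for a ChromaDB vector store, and the engine caches one collection per document hash. Below is a minimal sketch of that pattern, using the legacy `langchain` import paths and an illustrative sentence-transformers model (both assumptions, not taken from this repository):

```python
# Sketch only: cache one Chroma collection per document, keyed by its hash,
# so the same file is embedded once and reused on later queries.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

embedding_function = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2")  # illustrative model
embeddings_dict = {}  # document hash -> Chroma collection


def get_or_build_collection(doc_hash, texts, metadata):
    # Build the named collection only the first time this document is seen.
    if doc_hash not in embeddings_dict:
        embeddings_dict[doc_hash] = Chroma.from_texts(texts,
                                                      embedding=embedding_function,
                                                      metadatas=metadata,
                                                      collection_name=doc_hash)
    return embeddings_dict[doc_hash]


# Usage: fetch the chunks semantically closest to a question.
# chunks = get_or_build_collection(h, texts, meta).similarity_search(question, k=4)
```

Naming the collection after the hash (the `collection_name=hash` argument this hunk touches) keeps collections from colliding when several documents are loaded in one session.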
streamlit_app.py CHANGED
@@ -1,3 +1,4 @@
+import base64
 import os
 import re
 from hashlib import blake2b
@@ -55,10 +56,14 @@ if 'uploaded' not in st.session_state:
 if 'memory' not in st.session_state:
     st.session_state['memory'] = ConversationBufferWindowMemory(k=4)
 
+if 'binary' not in st.session_state:
+    st.session_state['binary'] = None
+
 st.set_page_config(
     page_title="Scientific Document Insights Q/A",
     page_icon="📝",
     initial_sidebar_state="expanded",
+    layout="wide",
     menu_items={
         'Get Help': 'https://github.com/lfoppiano/document-qa',
         'Report a bug': "https://github.com/lfoppiano/document-qa/issues",
@@ -66,6 +71,36 @@ st.set_page_config(
     }
 )
 
+css_modify_left_column = '''
+<style>
+[data-testid="stHorizontalBlock"] > div:nth-child(1) {
+    overflow: hidden;
+    background-color: red;
+    height: 70vh;
+}
+</style>
+'''
+css_modify_right_column = '''
+<style>
+[data-testid="stHorizontalBlock"] > div:first-child {
+    background-color: red;
+    position: fixed;
+    height: 70vh;
+}
+</style>
+'''
+css_disable_scrolling_container = '''
+<style>
+[data-testid="ScrollToBottomContainer"] {
+    overflow: hidden;
+}
+</style>
+'''
+
+
+# st.markdown(css_lock_column_fixed, unsafe_allow_html=True)
+# st.markdown(css2, unsafe_allow_html=True)
+
 
 def new_file():
     st.session_state['loaded_embeddings'] = None
@@ -222,18 +257,21 @@ with st.sidebar:
         on_click=clear_memory,
         help="Clear the conversational memory. Currently implemented to retain the 4 most recent messages.")
 
-    st.title("📝 Scientific Document Insights Q/A")
-    st.subheader("Upload a scientific article in PDF, ask questions, get insights.")
+left_column, right_column = st.columns([1, 1])
 
-    st.markdown(
-        ":warning: Do not upload sensitive data. We **temporarily** store text from the uploaded PDF documents solely for the purpose of processing your request, and we **do not assume responsibility** for any subsequent use or handling of the data submitted to third-party LLMs.")
+with right_column:
+    st.title("📝 Scientific Document Insights Q/A")
+    st.subheader("Upload a scientific article in PDF, ask questions, get insights.")
 
-    uploaded_file = st.file_uploader("Upload an article",
+    st.markdown(
+        ":warning: Do not upload sensitive data. We **temporarily** store text from the uploaded PDF documents solely for the purpose of processing your request, and we **do not assume responsibility** for any subsequent use or handling of the data submitted to third-party LLMs.")
+
+    uploaded_file = st.file_uploader("Upload an article",
                                      type=("pdf", "txt"),
                                      on_change=new_file,
-                                     disabled=st.session_state['model'] is not None and st.session_state['model'] not in
-                                              st.session_state['api_keys'],
-                                     help="The full-text is extracted using Grobid.")
+                                     disabled=st.session_state['model'] is not None and st.session_state['model'] not in
+                                              st.session_state['api_keys'],
+                                     help="The full-text is extracted using Grobid.")
 
     question = st.chat_input(
         "Ask something about the article",
@@ -276,65 +314,99 @@ with st.sidebar:
    st.markdown(
        """If you switch the mode to "Embedding," the system will return specific chunks from the document that are semantically related to your query. This mode helps to test why sometimes the answers are not satisfying or incomplete. """)
 
+
+@st.cache_resource
+def get_pdf_display(binary):
+    base64_pdf = base64.b64encode(binary).decode('utf-8')
+    return F'<embed src="data:application/pdf;base64,{base64_pdf}" width="100%" height="700" type="application/pdf"></embed>'
+
+
 if uploaded_file and not st.session_state.loaded_embeddings:
     if model not in st.session_state['api_keys']:
         st.error("Before uploading a document, you must enter the API key. ")
         st.stop()
-    with st.spinner('Reading file, calling Grobid, and creating memory embeddings...'):
-        binary = uploaded_file.getvalue()
-        tmp_file = NamedTemporaryFile()
-        tmp_file.write(bytearray(binary))
-        # hash = get_file_hash(tmp_file.name)[:10]
-        st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(tmp_file.name,
-                                                                                                     chunk_size=chunk_size,
+
+    with right_column:
+        with st.spinner('Reading file, calling Grobid, and creating memory embeddings...'):
+            binary = uploaded_file.getvalue()
+            tmp_file = NamedTemporaryFile()
+            tmp_file.write(bytearray(binary))
+            st.session_state['binary'] = binary
+
+            st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(tmp_file.name,
                                                                                                          chunk_size=chunk_size,
                                                                                                          perc_overlap=0.1,
                                                                                                          include_biblio=True)
-    st.session_state['loaded_embeddings'] = True
-    st.session_state.messages = []
+            st.session_state['loaded_embeddings'] = True
+            st.session_state.messages = []
 
 # timestamp = datetime.utcnow()
 
-if st.session_state.loaded_embeddings and question and len(question) > 0 and st.session_state.doc_id:
-    for message in st.session_state.messages:
-        with st.chat_message(message["role"]):
-            if message['mode'] == "LLM":
-                st.markdown(message["content"], unsafe_allow_html=True)
-            elif message['mode'] == "Embeddings":
-                st.write(message["content"])
-    if model not in st.session_state['rqa']:
-        st.error("The API Key for the " + model + " is missing. Please add it before sending any query.")
-        st.stop()
-
-    with st.chat_message("user"):
-        st.markdown(question)
-        st.session_state.messages.append({"role": "user", "mode": mode, "content": question})
-
-    text_response = None
-    if mode == "Embeddings":
-        with st.spinner("Generating LLM response..."):
-            text_response = st.session_state['rqa'][model].query_storage(question, st.session_state.doc_id,
-                                                                         context_size=context_size)
-    elif mode == "LLM":
-        with st.spinner("Generating response..."):
-            _, text_response = st.session_state['rqa'][model].query_document(question, st.session_state.doc_id,
+with left_column:
+    if st.session_state['binary']:
+        left_column.markdown(get_pdf_display(st.session_state['binary']), unsafe_allow_html=True)
+
+with right_column:
+    # css = '''
+    # <style>
+    # [data-testid="column"] {
+    #     overflow: auto;
+    #     height: 70vh;
+    # }
+    # </style>
+    # '''
+    # st.markdown(css, unsafe_allow_html=True)
+
+    # st.markdown(
+    #     """
+    #     <script>
+    #         document.querySelectorAll('[data-testid="column"]').scrollIntoView({behavior: "smooth"});
+    #     </script>
+    #     """,
+    #     unsafe_allow_html=True,
+    # )
+
+    if st.session_state.loaded_embeddings and question and len(question) > 0 and st.session_state.doc_id:
+        for message in st.session_state.messages:
+            with st.chat_message(message["role"]):
+                if message['mode'] == "LLM":
+                    st.markdown(message["content"], unsafe_allow_html=True)
+                elif message['mode'] == "Embeddings":
+                    st.write(message["content"])
+        if model not in st.session_state['rqa']:
+            st.error("The API Key for the " + model + " is missing. Please add it before sending any query.")
+            st.stop()
+
+        with st.chat_message("user"):
+            st.markdown(question)
+            st.session_state.messages.append({"role": "user", "mode": mode, "content": question})
+
+        text_response = None
+        if mode == "Embeddings":
+            with st.spinner("Generating LLM response..."):
+                text_response = st.session_state['rqa'][model].query_storage(question, st.session_state.doc_id,
+                                                                             context_size=context_size)
+        elif mode == "LLM":
+            with st.spinner("Generating response..."):
+                _, text_response = st.session_state['rqa'][model].query_document(question, st.session_state.doc_id,
                                                                                  context_size=context_size)
 
-    if not text_response:
-        st.error("Something went wrong. Contact Luca Foppiano ([email protected]) to report the issue.")
-
-    with st.chat_message("assistant"):
-        if mode == "LLM":
-            if st.session_state['ner_processing']:
-                with st.spinner("Processing NER on LLM response..."):
-                    entities = gqa.process_single_text(text_response)
-                    decorated_text = decorate_text_with_annotations(text_response.strip(), entities)
-                    decorated_text = decorated_text.replace('class="label material"', 'style="color:green"')
-                    decorated_text = re.sub(r'class="label[^"]+"', 'style="color:orange"', decorated_text)
-                    text_response = decorated_text
-            st.markdown(text_response, unsafe_allow_html=True)
-        else:
-            st.write(text_response)
-        st.session_state.messages.append({"role": "assistant", "mode": mode, "content": text_response})
+        if not text_response:
+            st.error("Something went wrong. Contact Luca Foppiano ([email protected]) to report the issue.")
+
+        with st.chat_message("assistant"):
+            if mode == "LLM":
+                if st.session_state['ner_processing']:
+                    with st.spinner("Processing NER on LLM response..."):
+                        entities = gqa.process_single_text(text_response)
+                        decorated_text = decorate_text_with_annotations(text_response.strip(), entities)
+                        decorated_text = decorated_text.replace('class="label material"', 'style="color:green"')
+                        decorated_text = re.sub(r'class="label[^"]+"', 'style="color:orange"', decorated_text)
+                        text_response = decorated_text
+                st.markdown(text_response, unsafe_allow_html=True)
+            else:
+                st.write(text_response)
+            st.session_state.messages.append({"role": "assistant", "mode": mode, "content": text_response})
 
 # if len(st.session_state.messages) > 1:
 #     last_answer = st.session_state.messages[len(st.session_state.messages)-1]
@@ -342,5 +414,5 @@ if st.session_state.loaded_embeddings and question and len(question) > 0 and st.
 #     last_question = st.session_state.messages[len(st.session_state.messages)-2]
 #     st.session_state.memory.save_context({"input": last_question['content']}, {"output": last_answer['content']})
 
-elif st.session_state.loaded_embeddings and st.session_state.doc_id:
-    play_old_messages()
+    elif st.session_state.loaded_embeddings and st.session_state.doc_id:
+        play_old_messages()
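The heart of this pdf-render PR is `get_pdf_display` plus the two-column layout: the uploaded bytes are stashed in `st.session_state['binary']`, base64-encoded once, and rendered in the left column through an HTML `<embed>` tag. A stripped-down, self-contained sketch of the same technique follows (names are illustrative; only the `st.*` and `base64` calls mirror the diff):

```python
# Standalone sketch of the PDF preview technique introduced in this PR.
import base64

import streamlit as st

st.set_page_config(page_title="PDF preview demo", layout="wide")


@st.cache_resource
def get_pdf_display(binary: bytes) -> str:
    # Encode the raw PDF bytes as a data URI the browser can render inline.
    base64_pdf = base64.b64encode(binary).decode('utf-8')
    return (f'<embed src="data:application/pdf;base64,{base64_pdf}" '
            f'width="100%" height="700" type="application/pdf"></embed>')


left_column, right_column = st.columns([1, 1])
uploaded_file = st.sidebar.file_uploader("Upload an article", type=("pdf",))

if uploaded_file:
    # Keep the bytes in session state so the preview survives reruns.
    st.session_state['binary'] = uploaded_file.getvalue()

with left_column:
    if st.session_state.get('binary'):
        st.markdown(get_pdf_display(st.session_state['binary']),
                    unsafe_allow_html=True)

with right_column:
    st.title("Document Q/A")  # the chat UI would live here
```

Caching the encoded string with `st.cache_resource` matters because Streamlit re-executes the whole script on every interaction; without it, each chat message would re-encode the full PDF.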