Spaces:
Sleeping
Sleeping
import datetime | |
import json | |
from langchain_community.retrievers import BM25Retriever | |
from langchain_core.documents import Document | |
import streamlit as st | |
def load_docs_from_json(json_path):
    """Load paper records from a JSON file and wrap each one in a Document.

    The file must contain an iterable of mappings (a JSON array of objects),
    each providing the keys ``title``, ``abstract``, ``link``, ``authors``,
    ``submitter`` and ``date``.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        A list with one ``Document`` per paper, in file order. The page
        content is the title followed by the abstract; every other field
        (plus the title again) is stored in ``metadata``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a paper record lacks one of the required keys.
    """
    # Read explicitly as UTF-8 rather than the platform default encoding,
    # which is not UTF-8 everywhere and would break on non-ASCII titles or
    # abstracts.
    with open(json_path, encoding="utf-8") as f:
        papers = json.load(f)

    docs = []
    for paper in papers:
        # Title and abstract together form the text that gets indexed.
        page_content = f"Title: {paper['title']}\n\nAbstract: {paper['abstract']}"
        doc = Document(
            page_content=page_content,
            metadata={
                'title': paper['title'],
                'link': paper['link'],
                'authors': paper['authors'],
                'submitter': paper['submitter'],
                'date': paper['date'],
            },
        )
        docs.append(doc)
    return docs
# init
json_path = "hf_daily_papers_2023-05-04_2024-06-27.json"


@st.cache_resource
def _build_index(path):
    """Load the papers at ``path`` and build a BM25 retriever over them.

    Streamlit re-executes this script on every user interaction. Caching the
    result means the JSON file is parsed and the BM25 index is built once per
    server process instead of on every rerun.

    NOTE: the cache is keyed on ``path`` only, so edits to the file's content
    are not picked up until the cache is cleared or the app restarts.

    Args:
        path: Path of the JSON file holding the paper records.

    Returns:
        A ``(documents, retriever)`` tuple; the retriever returns the top 10
        matches per query.
    """
    documents = load_docs_from_json(path)
    bm25 = BM25Retriever.from_documents(documents)
    bm25.k = 10  # number of results returned per query
    return documents, bm25


docs, retriever = _build_index(json_path)

# Parse the date strings so the displayed range is validated against the
# expected YYYY-MM-DD format rather than compared as raw text.
dates = [datetime.datetime.strptime(doc.metadata['date'], '%Y-%m-%d') for doc in docs]
oldest_date = min(dates)
newest_date = max(dates)
# streamlit
# Page header: title, a link to the data source, corpus size and date range.
st.title("HF Daily Papers Search")
st.markdown(
    "Search papers from [HF daily papers](https://huggingface.co/papers).\n\n"
    f"Number of documents: {len(docs)}\n\n"
    f"from {oldest_date.strftime('%Y-%m-%d')} to {newest_date.strftime('%Y-%m-%d')}"
)

user_query = st.text_input("Search anything...")

# NOTE(review): the button label 'β' may be a mis-encoded glyph (e.g. a search
# icon) — confirm the intended label; it is kept unchanged here.
if st.button('β'):
    if not user_query.strip():
        # A blank query has no terms to match on, so any "hits" would be
        # meaningless; ask for input instead of querying the retriever.
        st.warning("Please enter a search query.")
    else:
        results = retriever.invoke(user_query)
        st.text(f"hit {len(results)} papers")
        for result in results:
            # One collapsed panel per paper: metadata first, then the
            # title/abstract text below a divider.
            with st.expander(label=result.metadata['title'], expanded=False):
                for k in result.metadata:
                    st.write(f"{k}: {result.metadata[k]}")
                st.divider()
                st.markdown(result.page_content)