import difflib
import json

import numpy as np
import streamlit as st
from pyserini.search.lucene import LuceneSearcher


def read_json(file_name):
    """Load *file_name* and return its parsed JSON content."""
    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)


class SearchApplication:
    """Streamlit UI for searching the awesome-ChatGPT-repositories index.

    Constructing the object renders the whole page: header, query box,
    popular-word shortcut buttons, and either the search results or
    spelling suggestions when nothing matched.
    """

    # Shortcut buttons: (label shown on the button, query submitted on click).
    POPULAR_WORDS = (
        ("Prompt", "prompt"),
        ("Chatbot", "chatbot"),
        ("Langchain", "langchain"),
        ("Extension", "extension"),
        ("LLMs", "llms"),
        ("API", "api"),
    )

    def __init__(self):
        self.title = "Awesome ChatGPT repositories search"
        self.set_page_config()
        self.searcher = self.set_searcher()

        st.header(self.title)
        col1, col2 = st.columns(2)
        with col1:
            self.query = st.text_input("Search English words", value="")
        with col2:
            # "#" is an empty markdown heading used as a vertical spacer so the
            # button lines up with the text input.
            st.write("#")
            self.search_button = st.button("🔎")
        st.caption(
            "You can search for open-source software from [1250+"
            " repositories](https://github.com/taishi-i/awesome-ChatGPT-repositories)."
        )
        st.write("#")

        candidate_words_json = read_json("candidate_words.json")
        self.candidate_words = candidate_words_json["candidate_words"]

        # Order matters: the popular-word buttons may overwrite self.query
        # before the results are rendered.
        self.show_popular_words()
        self.show_search_results()

    def set_page_config(self):
        """Configure the browser tab title, icon, and page layout."""
        st.set_page_config(
            page_title=self.title,
            page_icon="😎",
            layout="centered",
        )

    def set_searcher(self):
        """Return a Lucene searcher over the prebuilt on-disk index."""
        return LuceneSearcher("indexes/docs")

    def show_popular_words(self):
        """Render one shortcut button per popular word.

        Clicking a button replaces the current query so the subsequent
        show_search_results() call searches for that word.
        """
        st.caption("Popular words")
        columns = st.columns(len(self.POPULAR_WORDS))
        for column, (label, query) in zip(columns, self.POPULAR_WORDS):
            with column:
                if st.button(label):
                    self.query = query

    def show_search_results(self):
        """Run the search and render hits, or suggestions when nothing matched."""
        if not (self.query or self.search_button):
            return
        st.write("#")
        search_results = self.searcher.search(self.query, k=500)
        st.write(f"A total of {len(search_results)} repositories found.")
        if search_results:
            self._show_hits(search_results)
        elif self.query:
            self._show_suggestions()

    def _show_hits(self, search_results):
        """Render one card per hit, most frequently starred/seen first."""
        # result.raw is not populated here; fetch the stored document by id.
        hits = [
            json.loads(self.searcher.doc(result.docid).raw())
            for result in search_results
        ]
        for hit in sorted(hits, key=lambda x: x["freq"], reverse=True):
            st.write("---")
            st.subheader(f"[{hit['project_name']}]({hit['url']})")
            st.write(hit["description"])

            info = []
            language = hit["language"]
            info.append(language if language else "Language: Unknown")
            repo_license = hit["license"]
            info.append(repo_license if repo_license is not None else "License: Unknown")
            st.caption(" / ".join(info))

    def _show_suggestions(self):
        """Suggest the candidate words most similar to the unmatched query."""
        scores = [
            difflib.SequenceMatcher(None, self.query, word).ratio()
            for word in self.candidate_words
        ]
        num_candidate_words = 6
        indexes = np.argsort(scores)[::-1][:num_candidate_words]
        suggestions = [self.candidate_words[i] for i in indexes]
        # De-duplicate while preserving descending-similarity order.
        suggestions = sorted(set(suggestions), key=suggestions.index)

        st.caption("Suggestions")
        for i, word in enumerate(suggestions, start=1):
            st.write(f"{i}: {word}")


def main():
    SearchApplication()


if __name__ == "__main__":
    main()