|
import functools
import io
import pickle

import gradio as gr
import torch
from bs4 import BeautifulSoup
from langchain.embeddings import HuggingFaceEmbeddings
|
|
|
class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that redirects embedded torch storage loading to the CPU.

    A lookup of ``torch.storage._load_from_bytes`` is answered with a
    replacement loader that calls ``torch.load`` with
    ``map_location='cpu'``; every other lookup is delegated unchanged to
    ``pickle.Unpickler``.
    """

    def find_class(self, module, name):
        """Resolve ``module.name`` while unpickling.

        Args:
            module: Dotted module path recorded in the pickle stream.
            name: Attribute name to look up inside ``module``.

        Returns:
            A one-argument loader (raw bytes -> object) when the target is
            ``torch.storage._load_from_bytes``; otherwise whatever
            ``pickle.Unpickler.find_class`` returns.
        """
        if (module, name) != ('torch.storage', '_load_from_bytes'):
            return super().find_class(module, name)

        def load_on_cpu(raw_bytes):
            # Deserialize the raw bytes with the device remapped to the CPU.
            return torch.load(io.BytesIO(raw_bytes), map_location='cpu')

        return load_on_cpu
|
|
|
|
|
# maxsize=1: only the most recently requested model is kept alive, so a
# repeated call with the same name reuses it instead of building it again.
@functools.lru_cache(maxsize=1)
def get_hugging_face_model(model_name="mchochlov/codebert-base-cd-ft"):
    """Return the HuggingFace embedding model used to encode queries.

    The result is memoized per ``model_name``, so calling this once per
    request does not construct a new model each time.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the previously hard-coded
            ``mchochlov/codebert-base-cd-ft``.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)
|
|
|
|
|
# maxsize=1: the deserialized database is kept in memory so repeated calls
# with the same path do not re-read and re-unpickle the file.
@functools.lru_cache(maxsize=1)
def get_db(path="codesearchdb.pickle"):
    """Load the pickled search database via ``CPU_Unpickler``.

    Args:
        path: Location of the pickle file. Defaults to the previously
            hard-coded ``codesearchdb.pickle`` in the working directory.

    Returns:
        The object stored in the pickle (memoized per ``path``).

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file is not a valid pickle.
    """
    # SECURITY: unpickling can execute arbitrary code embedded in the file,
    # so `path` must only ever point at a trusted pickle.
    with open(path, "rb") as f:
        return CPU_Unpickler(f).load()
|
|
|
|
|
def get_similar_links(query, db, embeddings, k=10):
    """Collect the links found in the documents most similar to ``query``.

    Args:
        query: Text to search for.
        db: Vector store exposing
            ``similarity_search_by_vector(vector, k=...)``; each returned
            document must have an HTML string in ``page_content``.
        embeddings: Object exposing ``embed_query(text)``.
        k: Number of documents to retrieve. Defaults to 10, the value that
            was previously hard-coded.

    Returns:
        A flat list with the ``href`` of every ``<a href=...>`` tag, in the
        order the documents were returned and the tags appear. Duplicates
        are kept.
    """
    query_vector = embeddings.embed_query(query)
    similar_docs = db.similarity_search_by_vector(query_vector, k=k)

    links = []
    for doc in similar_docs:
        soup = BeautifulSoup(doc.page_content, 'html.parser')
        # href=True restricts the match to anchors that carry an href, so
        # the subscript below cannot raise KeyError.
        links.extend(anchor['href'] for anchor in soup.find_all('a', href=True))
    return links
|
|
|
|
|
def find_similar_questions(text_input):
    """Return links to questions similar to the submitted code example.

    Args:
        text_input: Code example entered by the user.

    Returns:
        The unique links, one per line, in the order ``get_similar_links``
        produced them; an empty string when ``text_input`` is empty or
        whitespace-only.
    """
    # Nothing to search for: skip the model and database work entirely.
    if not text_input or not text_input.strip():
        return ""

    embeddings = get_hugging_face_model()
    db = get_db()
    links = get_similar_links(text_input, db, embeddings)

    # dict.fromkeys drops duplicates but keeps first-seen order; set() would
    # scramble the order and make the output differ from run to run.
    return "\n".join(dict.fromkeys(links))
|
|
|
|
|
# Web UI: one multi-line text input wired to find_similar_questions, one
# text output showing the resulting links.
iface = gr.Interface(
    fn=find_similar_questions,
    # Components are taken from the top-level namespace; the `gr.inputs` /
    # `gr.outputs` namespaces are deprecated and absent from newer Gradio.
    inputs=gr.Textbox(lines=20, label="Enter a Code Example"),
    outputs=gr.Textbox(label="Similar Questions on Leetcode"),
    # NOTE(review): the glyphs around the title look like mis-encoded emoji;
    # confirm the intended characters before changing this string.
    title="π DSASearch Engine π€",
    description="Find similar questions on Leetcode based on a code example.",
    # String form of the flagging switch; the boolean `False` is deprecated.
    allow_flagging="never",
)

# Start the Gradio server (runs at import time, as before).
iface.launch()
|
|