# Source: owner-manual / InnovationHub / llm / vector_store.py
# Initial commit 8c41dd4 by chasetank (2.49 kB)
import os
import pprint
import codecs
import chardet
import gradio as gr
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain import OpenAI, ConversationChain, LLMChain, PromptTemplate
from langchain.chains.conversation.memory import ConversationalBufferWindowMemory
from EdgeGPT import Chatbot
def get_content(input_file):
    """Return the text of *input_file*, decoded with its detected encoding.

    The file is read once as bytes, ``chardet`` guesses the encoding from
    those bytes, and the same bytes are then decoded.

    Args:
        input_file: Path of the file to read.

    Returns:
        The decoded file contents as a ``str``.

    Raises:
        UnicodeDecodeError: If the bytes are not valid in the chosen encoding.
    """
    # Read the input file in binary mode
    with open(input_file, 'rb') as f:
        raw_data = f.read()

    # Detect the encoding of the file
    encoding = chardet.detect(raw_data)['encoding']

    # chardet reports None when it cannot make a guess (e.g. an empty file);
    # fall back to UTF-8 explicitly instead of a platform-dependent default.
    if encoding is None:
        encoding = 'utf-8'

    # Decode the bytes already in memory rather than opening the file again
    return raw_data.decode(encoding)
def create_docs(input_file):
    """Split the contents of *input_file* into documents.

    The file text is obtained through ``get_content`` and split with a
    ``RecursiveCharacterTextSplitter`` (chunk size 1000, no overlap, length
    measured with ``len``). The metadata passed to the splitter is
    ``{'source': <file name without directory and extension>}``.

    Args:
        input_file: Path of the file to load and split.

    Returns:
        Whatever ``create_documents`` returns for the single input text.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        length_function=len,
    )

    # Source label: the file name stripped of its directory and extension
    source_name, _extension = os.path.splitext(os.path.basename(input_file))

    content = get_content(input_file=input_file)
    return splitter.create_documents(
        texts=[content],
        metadatas=[{'source': source_name}],
    )
def get_similar_docs(query, index):
    """Return ``(summary, metadata)`` pairs for the documents matching *query*.

    Args:
        query: Search text forwarded to ``index.similarity_search``.
        index: Object exposing ``similarity_search(query=...)`` that returns
            document-like objects with a ``metadata`` attribute and either a
            ``summary`` or a ``page_content`` attribute.

    Returns:
        A list of ``(summary, metadata)`` tuples, one per returned document,
        in the order the index returned them.
    """
    def _summary(doc):
        # Use the document's own ``summary`` when it has one. Otherwise take
        # the first blank-line-separated paragraph of ``page_content``.
        # NOTE(review): newer LangChain ``Document`` classes appear to expose
        # only ``page_content`` and ``metadata`` -- confirm against the
        # installed version.
        try:
            return doc.summary
        except AttributeError:
            return doc.page_content.split('\n\n')[0]

    similar_docs = index.similarity_search(query=query)
    return [(_summary(d), d.metadata) for d in similar_docs]
def convert_to_html(similar_docs):
    """Render ``(summary, metadata)`` pairs as an HTML table.

    Args:
        similar_docs: Iterable of ``(summary, metadata)`` pairs, where
            ``metadata`` is a mapping containing a ``'source'`` key.

    Returns:
        A string holding a ``<table>`` with one row per pair; the first
        column is the summary and the second is ``metadata['source']``.

    Raises:
        KeyError: If a metadata mapping has no ``'source'`` key.
    """
    # Imported locally so the block does not rely on a module-level import.
    from html import escape

    # Cell values are document text, not markup: escape them so characters
    # such as '<' and '&' are displayed literally instead of breaking the
    # table or being interpreted as HTML.
    rows = [
        '<tr><td>' + escape(str(summary)) + '</td><td>'
        + escape(str(metadata['source'])) + '</td></tr>'
        for summary, metadata in similar_docs
    ]
    return (
        '<table><thead><th>Page Content</th><th>Source</th></thead><tbody>'
        + '\n'.join(rows)
        + '</tbody></table>'
    )
def start_ui(index):
    """Launch a Gradio interface that searches *index*.

    The interface takes a text query, looks up similar documents with
    ``get_similar_docs`` and shows them as the HTML table produced by
    ``convert_to_html``.

    Args:
        index: Vector index passed through to ``get_similar_docs``.
    """
    def query_index(query):
        """Return the HTML table of documents similar to *query*."""
        similar_docs = get_similar_docs(query=query, index=index)
        return convert_to_html(similar_docs=similar_docs)

    # Define input and output components. The top-level component classes
    # are used instead of the deprecated ``gr.inputs`` / ``gr.outputs``
    # namespaces, and the local names avoid shadowing the builtin ``input``.
    query_box = gr.Textbox(lines=2)
    results_view = gr.HTML()

    # Create interface object
    iface = gr.Interface(fn=query_index,
                         inputs=query_box,
                         outputs=results_view)

    # Launch interface
    iface.launch()