import os
import pprint
import codecs
import chardet
import gradio as gr
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain import OpenAI, ConversationChain, LLMChain, PromptTemplate
from langchain.chains.conversation.memory import ConversationalBufferWindowMemory
from EdgeGPT import Chatbot


def get_content(input_file):
    """Return the text of *input_file*, auto-detecting its encoding.

    The file is first read in binary mode so chardet can sniff the
    encoding, then re-read as text with the detected encoding.

    :param input_file: path to the file to read.
    :return: decoded file contents as a single string.
    """
    # Read raw bytes for encoding detection.
    with open(input_file, 'rb') as f:
        raw_data = f.read()
    # chardet returns {'encoding': None, ...} for empty/binary input;
    # fall back to utf-8 so codecs.open() is never handed None.
    encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
    # Re-read as text using the detected encoding.
    with codecs.open(input_file, 'r', encoding=encoding) as f:
        return f.read()


def create_docs(input_file):
    """Split the file's text into ~1000-character Documents.

    Each chunk is tagged with a 'source' metadata entry holding the
    file's base name without extension.

    :param input_file: path to the source file.
    :return: list of langchain Document chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        length_function=len,
    )
    basename = os.path.splitext(os.path.basename(input_file))[0]
    texts = get_content(input_file=input_file)
    metadatas = {'source': basename}
    return text_splitter.create_documents(texts=[texts], metadatas=[metadatas])


def get_similar_docs(query, index):
    """Run a similarity search and return (page_content, metadata) pairs.

    :param query: free-text query string.
    :param index: a langchain vector store (e.g. FAISS) exposing
        ``similarity_search``.
    :return: list of (page_content, metadata) tuples for the hits.
    """
    similar_docs = index.similarity_search(query=query)
    # BUG FIX: langchain Document has a `page_content` attribute, not
    # `summary` — the original `d.summary` raised AttributeError. The
    # "Page Content" column header in convert_to_html confirms intent.
    return [(d.page_content, d.metadata) for d in similar_docs]


def convert_to_html(similar_docs):
    """Render (page_content, metadata) pairs as an HTML results table.

    NOTE(review): the original body of this function was garbled by a
    formatting/extraction step — its HTML string literals were rendered
    as a markdown table ("Page Content | Source", "---|").  The body
    below is a reconstruction from those surviving fragments; confirm
    against the original source file.

    :param similar_docs: iterable of (page_content, metadata) tuples as
        produced by :func:`get_similar_docs`.
    :return: HTML <table> markup as a string.
    """
    rows = []
    for page_content, metadata in similar_docs:
        rows.append(
            '<tr><td>' + page_content + '</td><td>'
            + str(metadata.get('source', '')) + '</td></tr>'
        )
    return (
        '<table><tr><th>Page Content</th><th>Source</th></tr>'
        + '\n'.join(rows)
        + '</table>'
    )