import plotly.graph_objs as go
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.express as px
import numpy as np
import os
import pprint
import codecs
import chardet
import gradio as gr
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain import OpenAI, ConversationChain, LLMChain, PromptTemplate
from langchain.chains.conversation.memory import ConversationalBufferWindowMemory
from EdgeGPT import Chatbot


def get_content(input_file):
    # Read the input file in binary mode
    with open(input_file, 'rb') as f:
        raw_data = f.read()
    # Detect the encoding of the file
    result = chardet.detect(raw_data)
    encoding = result['encoding']
    # Decode the contents using the detected encoding
    with codecs.open(input_file, 'r', encoding=encoding) as f:
        raw_text = f.read()
    # Return the content of the input file
    return raw_text


def split_text(input_file, chunk_size=1000, chunk_overlap=0):
    # Split the raw text into fixed-size chunks, labelling each chunk
    # with its position in the source file (e.g. "report[3]")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    basename = os.path.basename(input_file)
    basename = os.path.splitext(basename)[0]
    raw_text = get_content(input_file=input_file)
    texts = text_splitter.split_text(text=raw_text)
    metadatas = [{"source": f"{basename}[{i}]"} for i in range(len(texts))]
    docs = text_splitter.create_documents(texts=texts, metadatas=metadatas)
    return texts, metadatas, docs


def create_docs(input_file):
    # Create a text splitter object with the default separators
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        length_function=len,
    )
    basename = os.path.basename(input_file)
    basename = os.path.splitext(basename)[0]
    texts = get_content(input_file=input_file)
    metadatas = {'source': basename}
    docs = text_splitter.create_documents(texts=[texts], metadatas=[metadatas])
    return docs


def get_similar_docs(query, index, k=5):
    # Retrieve the k most similar chunks from the vector index;
    # a LangChain Document holds its text in `page_content`
    similar_docs = index.similarity_search(query=query, k=k)
    result = [(d.page_content, d.metadata) for d in similar_docs]
    return result


def convert_to_html(similar_docs):
    # Render each (content, metadata) pair as one row of an HTML table
    result = []
    for summary, metadata in similar_docs:
        record = '<tr><td>' + summary + '</td><td>' + metadata['source'] + '</td></tr>'
        result.append(record)
    html = '<table><tr><th>Page Content</th><th>Source</th></tr>' + \
        '\n'.join(result) + '</table>'
    return html
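
# A minimal end-to-end usage sketch, not part of the original script:
# the file name "example.txt" and the sentence-transformers model below
# are placeholder assumptions, not values taken from the source.
if __name__ == "__main__":
    # Chunk the file into LangChain documents with a per-file source label
    docs = create_docs("example.txt")
    # Embed the chunks and build an in-memory FAISS index over them
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    index = FAISS.from_documents(docs, embeddings)
    # Retrieve the top matches for a query and render them as an HTML table
    hits = get_similar_docs("What is this document about?", index, k=3)
    print(convert_to_html(hits))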