ffreemt commited on
Commit
deeaab0
1 Parent(s): fe6b806

Update ingest

Browse files
Files changed (5) hide show
  1. .gitignore +3 -0
  2. README.md +1 -1
  3. app.py +229 -0
  4. requirements-dev.txt +2 -0
  5. requirements.txt +25 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .venv
2
+ db
3
+ dummy
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Localgpt
3
  emoji: 🦀
4
  colorFrom: green
5
  colorTo: red
 
1
  ---
2
+ title: localgpt
3
  emoji: 🦀
4
  colorFrom: green
5
  colorTo: red
app.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Refer to
2
+ https://huggingface.co/spaces/mikeee/docs-chat/blob/main/app.py
3
+ and https://github.com/PromtEngineer/localGPT/blob/main/ingest.py
4
+
5
+ https://python.langchain.com/en/latest/getting_started/tutorials.html
6
+ """
7
+ # pylint: disable=broad-exception-caught, unused-import
8
+ import os
9
+ import time
10
+ from pathlib import Path
11
+
12
+ # import click
13
+ # from typing import List
14
+
15
+ import gradio as gr
16
+ from charset_normalizer import detect
17
+ from langchain.docstore.document import Document
18
+ from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader
19
+
20
+ # from constants import CHROMA_SETTINGS, SOURCE_DIRECTORY, PERSIST_DIRECTORY
21
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
22
+ from langchain.text_splitter import (
23
+ CharacterTextSplitter,
24
+ RecursiveCharacterTextSplitter,
25
+ )
26
+ from langchain.vectorstores import FAISS # FAISS instead of PineCone
27
+ from langchain.vectorstores import Chroma
28
+ from loguru import logger
29
+ from PyPDF2 import PdfReader # localgpt
30
+ from chromadb.config import Settings
31
+
32
+ # from utils import xlxs_to_csv
33
+
34
+ # load possible env such as OPENAI_API_KEY
35
+ # from dotenv import load_dotenv
36
+
37
+ # load_dotenv()
38
+
39
# Fix timezone so log timestamps are consistent regardless of host locale.
os.environ["TZ"] = "Asia/Shanghai"
try:
    time.tzset()  # type: ignore # pylint: disable=no-member
except Exception:
    # Windows: time.tzset() does not exist there; best-effort only.
    logger.warning("Windows, cant run time.tzset()")

# Directory containing this file; the Chroma db is persisted next to it in ./db.
ROOT_DIRECTORY = Path(__file__).parent
PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/db"

# Define the Chroma settings: local duckdb+parquet backend, telemetry disabled.
CHROMA_SETTINGS = Settings(
    chroma_db_impl='duckdb+parquet',
    persist_directory=PERSIST_DIRECTORY,
    anonymized_telemetry=False
)
56
+
57
def load_single_document(file_path: str | Path) -> Document:
    """Load a single document from a file path (ingest.py).

    Picks a loader by file extension (.txt/.pdf/.csv, anything else is
    treated as text). On undecodable/binary input or loader failure an
    empty ``Document`` is returned instead of raising, so batch ingestion
    keeps going.

    Args:
        file_path: path to the file to load (str or Path).

    Returns:
        The first Document produced by the loader, or an empty Document
        with only the ``source`` metadata set on failure.
    """
    # The .endswith checks below require str; the hint allows Path too.
    file_path = str(file_path)
    encoding = detect(Path(file_path).read_bytes()).get("encoding", "utf-8")
    if file_path.endswith(".txt"):
        if encoding is None:
            logger.warning(
                f" {file_path}'s encoding is None "
                "Something is fishy, return empty str "
            )
            return Document(page_content='', metadata={'source': file_path})

        try:
            loader = TextLoader(file_path, encoding=encoding)
        except Exception as exc:
            logger.warning(f" {exc}, return dummy ")
            return Document(page_content='', metadata={'source': file_path})

    elif file_path.endswith(".pdf"):
        loader = PDFMinerLoader(file_path)
    elif file_path.endswith(".csv"):
        loader = CSVLoader(file_path)
    # elif file_path.endswith(".epub"):  # for epub? epub2txt unstructured
    else:
        if encoding is None:
            logger.warning(
                f" {file_path}'s encoding is None "
                "Likely binary files, return empty str "
            )
            # was `return ""` — inconsistent with the declared Document return
            return Document(page_content='', metadata={'source': file_path})

        try:
            # pass the detected encoding (the original dropped it here)
            loader = TextLoader(file_path, encoding=encoding)
        except Exception as exc:
            logger.error(f" {exc}, returning empty document")
            return Document(page_content='', metadata={'source': file_path})

    return loader.load()[0]
96
+
97
+
98
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page in every PDF (docs-chat)."""
    pages = []
    for pdf in pdf_docs:
        reader = PdfReader(pdf)
        # extract_text() per page; join once at the end instead of +=
        pages.extend(page.extract_text() for page in reader.pages)
    return "".join(pages)
106
+
107
+
108
def get_text_chunks(text):
    """Split raw text into 1000-char chunks with 200-char overlap (docs-chat)."""
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
115
+
116
+
117
def get_vectorstore(text_chunks, model_name="hkunlp/instructor-base"):
    """Build a FAISS vectorstore over text chunks (docs-chat).

    The original body reassigned ``model_name`` three times in a row
    (xl -> large -> base), leaving dead code; the effective value is now
    the default of a new keyword parameter, so callers can pick the
    embedding model while existing calls behave identically.

    Args:
        text_chunks: iterable of text strings to embed.
        model_name: HuggingFace instruct-embedding model id.

    Returns:
        A FAISS vectorstore built from the embedded chunks.
    """
    # embeddings = OpenAIEmbeddings()
    logger.info(f"Loading {model_name}")
    embeddings = HuggingFaceInstructEmbeddings(model_name=model_name)
    logger.info(f"Done loading {model_name}")

    logger.info(
        "Doing vectorstore FAISS.from_texts(texts=text_chunks, embedding=embeddings)"
    )
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    logger.info(
        "Done vectorstore FAISS.from_texts(texts=text_chunks, embedding=embeddings)"
    )

    return vectorstore
136
+
137
+
138
def greet(name):
    """Return a hello message for *name* (gradio smoke-test endpoint)."""
    logger.debug(f" name: [{name}] ")
    message = "Hello " + name + "!!"
    return message
142
+
143
+
144
def upload_files(files):
    """Ingest the uploaded files and echo their temp paths back to the UI."""
    paths = [file.name for file in files]
    logger.info(paths)

    # side effect: build and persist the Chroma db from the uploads
    ingest(paths)

    # the per-document summary from ingest() is not surfaced to the UI yet
    return paths
155
+
156
+
157
def ingest(file_paths: list[str | Path], model_name="hkunlp/instructor-base", device_type="cpu"):
    """Generate and persist a Chroma db from the given files.

    Args:
        file_paths: paths of the files to ingest (e.g. gradio temp paths such
            as 'C:\\Users\\User\\AppData\\Local\\Temp\\gradio\\...\\app.py').
        model_name: HuggingFace instruct-embedding model id.
        device_type: "cpu", "mps", or anything else (treated as cuda);
            now matched case-insensitively instead of against ['cpu', 'CPU'].

    Returns:
        A list of [file name, page_content length] pairs, one per document.
    """
    # Normalize once so "CPU", "Cpu", "MPS", ... all work.
    normalized = str(device_type).lower()
    if normalized == 'cpu':
        device = 'cpu'
    elif normalized == 'mps':
        device = 'mps'
    else:
        device = 'cuda'

    # Load each file as a langchain Document, then split into chunks.
    documents = [load_single_document(f"{file_path}") for file_path in file_paths]

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)

    logger.info(f"Loaded {len(documents)} documents ")
    logger.info(f"Split into {len(texts)} chunks of text")

    # Create embeddings on the selected device.
    embeddings = HuggingFaceInstructEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device}
    )

    db = Chroma.from_documents(
        texts, embeddings,
        persist_directory=PERSIST_DIRECTORY,
        client_settings=CHROMA_SETTINGS
    )
    db.persist()
    db = None  # drop the handle so the store is flushed/closed
    logger.info("Done ingest")

    return [[Path(doc.metadata.get("source")).name, len(doc.page_content)] for doc in documents]
198
+
199
+
200
def main1():
    """Scratch entry point: a minimal greet Interface inside a Blocks context."""
    with gr.Blocks() as demo:
        interface = gr.Interface(fn=greet, inputs="text", outputs="text")
        interface.launch()

    demo.launch()
207
+
208
+
209
def main():
    """Build and launch the gradio Blocks UI: greet demo plus file upload/ingest."""
    with gr.Blocks() as demo:
        # greet demo row
        name_box = gr.Textbox(label="Name")
        submit_btn = gr.Button("Submit")
        output_box = gr.Textbox(label="Output Box")
        submit_btn.click(fn=greet, inputs=name_box, outputs=output_box, api_name="greet")

        # upload row: files are ingested into the Chroma db on upload
        file_output = gr.File()
        upload_button = gr.UploadButton(
            "Click to upload files",
            # file_types=["*.pdf", "*.epub", "*.docx"],
            file_count="multiple"
        )
        upload_button.upload(upload_files, upload_button, file_output)

    demo.launch()
226
+
227
+
228
# Script entry point: launch the gradio Blocks UI.
if __name__ == "__main__":
    main()
requirements-dev.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ipython
2
+ pylint
requirements.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain==0.0.166
2
+ chromadb==0.3.22
3
+ llama-cpp-python==0.1.48
4
+ urllib3==1.26.6
5
+ pdfminer.six==20221105
6
+ InstructorEmbedding
7
+
8
+ # required by sentence-transformers
9
+ --extra-index-url https://download.pytorch.org/whl/cpu
10
+ torch
11
+ torchvision
12
+ sentence-transformers
13
+ faiss-cpu
14
+ huggingface_hub
15
+ transformers
16
+ protobuf==3.20.0
17
+ accelerate
18
+ bitsandbytes
19
+ click
20
+ openpyxl
21
+ loguru
22
+ gradio
23
+ charset-normalizer
24
+ PyPDF2
25
+ epub2txt