root committed on
Commit
80be8a0
1 Parent(s): 6b0d528

Add application file

Browse files
Files changed (6) hide show
  1. LICENSE +201 -0
  2. app.py +31 -0
  3. constants.py +18 -0
  4. ingest.py +57 -0
  5. requirements.txt +14 -0
  6. run_localGPT.py +88 -0
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
app.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from ingest import parse_document, create_embeddings
3
+ from run_localGPT import generate_answer
4
+
5
+ # Function to run the pipeline
6
def run_pipeline(document):
    """Run a document through parsing, embedding and answer generation.

    Args:
        document: The document to process; it is handed straight to
            ``parse_document``.

    Returns:
        Whatever ``generate_answer`` returns for the embeddings built from
        ``document``.
    """
    # NOTE(review): parse_document / create_embeddings are expected from
    # ingest and generate_answer from run_localGPT -- confirm those modules
    # actually export these names before relying on this function.
    parsed = parse_document(document)
    embeddings = create_embeddings(parsed)

    # The answer is produced from the embeddings alone; no separate question
    # is passed along.
    return generate_answer(embeddings)
14
+
15
+ # Streamlit app
16
def main():
    """Render the Streamlit page and run the pipeline when requested."""
    st.title("Local GPT Pipeline")

    # Introductory blurb shown under the title.
    intro = (
        "How does it work?\n"
        "Selecting the right local models and the power of LangChain, "
        "you can run the entire pipeline locally without any data leaving your environment, and with reasonable performance."
    )
    st.write(intro)

    # Free-text input holding the document to process.
    document = st.text_area("Document")

    # Nothing more to do until the "Run" button is clicked.
    if not st.button("Run"):
        return

    st.write("Answer:", run_pipeline(document))
29
+
30
+ if __name__ == "__main__":
31
+ main()
constants.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ # from dotenv import load_dotenv
3
+ from chromadb.config import Settings
4
+
5
+ # load_dotenv()
6
# Absolute directory that contains this file (symlinks resolved).
ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))

# Folder holding the source documents.
SOURCE_DIRECTORY = ROOT_DIRECTORY + "/SOURCE_DOCUMENTS"

# Folder in which the vector database is persisted.
PERSIST_DIRECTORY = ROOT_DIRECTORY + "/DB"

# Chroma client settings: DuckDB + Parquet backend stored under
# PERSIST_DIRECTORY, with anonymized telemetry switched off.
CHROMA_SETTINGS = Settings(
    persist_directory=PERSIST_DIRECTORY,
    chroma_db_impl='duckdb+parquet',
    anonymized_telemetry=False,
)
ingest.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import click
3
+ from typing import List
4
+
5
+ from langchain.document_loaders import TextLoader, PDFMinerLoader, CSVLoader
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain.vectorstores import Chroma
8
+ from langchain.docstore.document import Document
9
+ from constants import CHROMA_SETTINGS, SOURCE_DIRECTORY, PERSIST_DIRECTORY
10
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
11
+
12
+
13
def load_single_document(file_path: str) -> Document:
    """Load one supported file and return the first document it yields.

    The loader is chosen from the file extension, matched
    case-insensitively: ``.txt`` uses ``TextLoader`` (UTF-8), ``.pdf`` uses
    ``PDFMinerLoader`` and ``.csv`` uses ``CSVLoader``.

    Args:
        file_path: Path of the file to load.

    Returns:
        The first ``Document`` produced by the selected loader.

    Raises:
        ValueError: If the extension is not ``.txt``, ``.pdf`` or ``.csv``.
            Previously this case fell through with ``loader`` unbound and
            surfaced as an ``UnboundLocalError``.
    """
    # Compare on a lower-cased copy so "REPORT.PDF" is handled like
    # "report.pdf"; the loader still receives the path exactly as given.
    lowered = file_path.lower()

    if lowered.endswith(".txt"):
        loader = TextLoader(file_path, encoding="utf8")
    elif lowered.endswith(".pdf"):
        loader = PDFMinerLoader(file_path)
    elif lowered.endswith(".csv"):
        loader = CSVLoader(file_path)
    else:
        raise ValueError(
            f"Unsupported file type for {file_path!r}; "
            "expected a .txt, .pdf or .csv file"
        )

    # NOTE(review): only the first loaded document is kept. A loader that
    # yields several documents per file (CSVLoader presumably yields one per
    # row -- confirm) loses the rest, and an empty result raises IndexError.
    return loader.load()[0]
22
+
23
+
24
def load_documents(source_dir: str) -> List[Document]:
    """Load every supported file found directly inside ``source_dir``.

    Args:
        source_dir: Directory whose entries are scanned (not recursive).

    Returns:
        One ``Document`` per entry whose last four characters are
        ``.txt``, ``.pdf`` or ``.csv``, in ``os.listdir`` order.
    """
    supported_suffixes = ('.txt', '.pdf', '.csv')

    loaded = []
    for entry in os.listdir(source_dir):
        # Same filter as before: exact, case-sensitive match on the last
        # four characters of the entry name.
        if entry[-4:] in supported_suffixes:
            loaded.append(load_single_document(f"{source_dir}/{entry}"))
    return loaded
28
+
29
+
30
@click.command()
@click.option('--device_type', default='gpu', help='device to run on, select gpu or cpu')
def main(device_type):
    # Ingest SOURCE_DIRECTORY into a persisted Chroma store.
    # (Kept as comments rather than a docstring: click would show a
    # docstring as the command's --help text.)

    # Only 'cpu' / 'CPU' select the CPU; every other value means CUDA.
    device = 'cpu' if device_type in ('cpu', 'CPU') else 'cuda'

    # Load the documents and cut them into overlapping chunks.
    print(f"Loading documents from {SOURCE_DIRECTORY}")
    documents = load_documents(SOURCE_DIRECTORY)
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(documents)
    print(f"Loaded {len(documents)} documents from {SOURCE_DIRECTORY}")
    print(f"Split into {len(chunks)} chunks of text")

    # Instructor embedding model placed on the selected device.
    embeddings = HuggingFaceInstructEmbeddings(
        model_name="hkunlp/instructor-xl",
        model_kwargs={"device": device},
    )

    # Build the vector store from the chunks and write it to disk.
    store = Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=PERSIST_DIRECTORY,
        client_settings=CHROMA_SETTINGS,
    )
    store.persist()
    # Drop the local reference to the store once it has been persisted.
    store = None
54
+
55
+
56
+ if __name__ == "__main__":
57
+ main()
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain==0.0.166
2
+ chromadb==0.3.22
3
+ llama-cpp-python
4
+ urllib3==1.26.6
5
+ pdfminer.six==20221105
6
+ InstructorEmbedding
7
+ sentence-transformers
8
+ faiss-cpu
9
+ huggingface_hub
10
+ transformers
11
+ protobuf==3.20.0
12
+ accelerate
13
+ bitsandbytes
14
+ click
run_localGPT.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.chains import RetrievalQA
2
+ # from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
3
+ from langchain.vectorstores import Chroma
4
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
5
+ from langchain.llms import HuggingFacePipeline
6
+ from constants import CHROMA_SETTINGS, PERSIST_DIRECTORY
7
+ from transformers import LlamaTokenizer, LlamaForCausalLM, pipeline
8
+ import click
9
+
10
+ from constants import CHROMA_SETTINGS
11
+
12
def load_model():
    """
    Build the local LLM wrapped as a LangChain ``HuggingFacePipeline``.

    The model is selected by its Hugging Face id. Per the original notes,
    the first run downloads the model and later runs reuse the copy on disk.

    Returns:
        A ``HuggingFacePipeline`` around a ``text-generation`` pipeline.
    """
    model_id = "TheBloke/vicuna-7B-1.1-HF"
    tokenizer = LlamaTokenizer.from_pretrained(model_id)

    # Options left disabled; set them if your GPU supports them:
    #   load_in_8bit=True
    #   device_map='auto'
    #   torch_dtype=torch.float16   (torch is not among the visible imports)
    #   low_cpu_mem_usage=True
    model = LlamaForCausalLM.from_pretrained(model_id)

    # Generation parameters forwarded unchanged to the pipeline.
    generation_settings = {
        "max_length": 2048,
        "temperature": 0,
        "top_p": 0.95,
        "repetition_penalty": 1.15,
    }
    text_generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        **generation_settings,
    )

    return HuggingFacePipeline(pipeline=text_generator)
41
+
42
@click.command()
@click.option('--device_type', default='gpu', help='device to run on, select gpu or cpu')
def main(device_type):
    # Interactive question answering over the persisted Chroma store.
    # (Kept as comments rather than a docstring: click would show a
    # docstring as the command's --help text.)

    # Only 'cpu' / 'CPU' select the CPU; every other value means CUDA.
    device = 'cpu' if device_type in ('cpu', 'CPU') else 'cuda'
    print(f"Running on: {device}")

    # Embedding model used by the vector store.
    embeddings = HuggingFaceInstructEmbeddings(
        model_name="hkunlp/instructor-xl",
        model_kwargs={"device": device},
    )

    # Open the vector store and expose it as a retriever.
    db = Chroma(
        persist_directory=PERSIST_DIRECTORY,
        embedding_function=embeddings,
        client_settings=CHROMA_SETTINGS,
    )
    retriever = db.as_retriever()

    # Load the LLM and wire it to the retriever; source documents are
    # returned with each answer so they can be printed below.
    llm = load_model()
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )

    banner = "----------------------------------SOURCE DOCUMENTS---------------------------"

    # Keep prompting until the user types exactly "exit".
    for query in iter(lambda: input("\nEnter a query: "), "exit"):
        response = qa(query)
        answer = response['result']
        sources = response['source_documents']

        # Echo the question followed by the generated answer.
        print("\n\n> Question:")
        print(query)
        print("\n> Answer:")
        print(answer)

        # List the retrieved passages returned alongside the answer.
        print(banner)
        for source_doc in sources:
            print("\n> " + source_doc.metadata["source"] + ":")
            print(source_doc.page_content)
        print(banner)
85
+
86
+
87
+ if __name__ == "__main__":
88
+ main()