simplified finetuning
- app.py +52 -88
- backend.py +1 -1
- finetune_backend.py +23 -7
app.py
CHANGED
@@ -185,85 +185,44 @@ available_models = ['sentence-transformers/all-mpnet-base-v2',
 #%%
 models_urls = {'models/finetuned-all-mpnet-base-v2-300': "https://drive.google.com/drive/folders/1asJ37-AUv5nytLtH6hp6_bVV3_cZOXfj"}
 
-def download_model_from_Gdrive(model_name_or_path, …
-    print("Downloading model from Google Drive")
+def download_model_from_Gdrive(model_name_or_path, model_local_path):
     st.write("Downloading model from Google Drive")
     assert model_name_or_path in models_urls, f"Model {model_name_or_path} not found in models_urls"
     url = models_urls[model_name_or_path]
-    gdown.download_folder(url, output=…
-    print("Model downloaded and saved to …
+    gdown.download_folder(url, output=model_local_path, quiet=False, use_cookies=False)
+    print(f"Model downloaded from Gdrive and saved to {model_local_path} folder")
     # st.write("Model downloaded")
 
-def download_model(model_name_or_path, …
+def download_model(model_name_or_path, model_local_path):
 
     if model_name_or_path.startswith("models/"):
-        download_model_from_Gdrive(model_name_or_path, …
-        print(f"Model {model_full_path} downloaded")
-        models_urls[model_name_or_path] = model_full_path
-        # st.sidebar.write(f"Model {model_full_path} downloaded")
+        download_model_from_Gdrive(model_name_or_path, model_local_path)
 
     elif model_name_or_path.startswith("sentence-transformers/"):
-        st.sidebar.write(f"Downloading …
-        model = SentenceTransformer(model_name_or_path)
-        …
-        # if 'modelspath' not in st.session_state:
-        #     st.session_state['modelspath'] = None
-        # if st.session_state.modelspath is None:
-        #     # let's create a temp folder on the first run
-        #     persistent_dir = pathlib.Path("path/to/persistent_dir")
-        #     persistent_dir.mkdir(parents=True, exist_ok=True)
-        #     with tempfile.TemporaryDirectory() as temp_dir:
-        #         st.session_state.modelspath = temp_dir
-        #         print(f"Temporary directory created at {temp_dir}")
-        #     # the temp folder disappears with the context, but not the one we've created manually
-        # else:
-        #     temp_dir = st.session_state.modelspath
-        #     print(f"Temporary directory already exists at {temp_dir}")
-        # # st.write(os.listdir(temp_dir))
+        st.sidebar.write(f"Downloading {model_name_or_path}")
+        model = SentenceTransformer(model_name_or_path)
+        st.sidebar.write(f"Model {model_name_or_path} downloaded")
+
+        models_urls[model_name_or_path] = model_local_path
+        model.save(model_local_path)
+        # st.sidebar.write(f"Model {model_name_or_path} saved to {model_new_path}")
 
 #%%
 # for streamlit online, we must download the model from google drive
 # because github LFS doesn't work on forked repos
 def check_model(model_name_or_path):
 
-    …
-    if pathlib.Path(model_full_path).exists():
+    model_name = model_name_or_path.split('/')[-1] # remove 'sentence-transformers'
+    model_local_path = str(pathlib.Path("models") / model_name) # this creates a models folder inside /models
+
+    if pathlib.Path(model_local_path).exists():
         # let's use the model that's already there
-        print(f"Model {…
-        …
-        # but delete everything else in we are online because
-        # streamlit online has limited space (and will shut down the app if it's full)
-        if we_are_online:
-            # st.sidebar.write(f"Model {model_full_path} already exists")
-            # st.sidebar.write(f"Deleting other models")
-            dirs = os.listdir("models/models")
-            # we get only the folder name, not the full path
-            dirs.remove(model_full_path.split('/')[-1])
-            for p in dirs:
-                dirpath = pathlib.Path("models/models") / p
-                if dirpath.is_dir():
-                    shutil.rmtree(dirpath)
+        print(f"Model {model_local_path} already exists")
     else:
-        …
-        # space issues on streamlit online, let's not leave anything behind
-        # and redownload the model eveery time
-        print("Deleting models/models folder")
-        if pathlib.Path('models/models').exists():
-            shutil.rmtree("models/models") # make room, if other models are there
-            # st.sidebar.write(f"models/models folder deleted")
-
-        download_model(model_name_or_path, model_full_path)
+        # let's download the model, HF is not limited in space like Streamlit.io
+        download_model(model_name_or_path, model_local_path)
 
-    return …
+    return model_local_path
 
 #%% instantiate Weaviate client
 def get_weaviate_client(api_key, url, model_name_or_path, openai_api_key):
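The new check_model reduces the old delete-everything logic to a cache-or-download rule: reuse the model if it is already on disk, otherwise fetch it and save it under models/. A minimal standalone sketch of the same pattern, assuming the sentence-transformers package (load_or_download is an illustrative name, not part of the commit):

import pathlib
from sentence_transformers import SentenceTransformer

def load_or_download(model_name_or_path: str) -> str:
    # mirror check_model: cache under models/<name> and reuse when present
    model_name = model_name_or_path.split('/')[-1]
    local_path = pathlib.Path("models") / model_name
    if not local_path.exists():
        model = SentenceTransformer(model_name_or_path)  # pulls from the HF hub
        model.save(str(local_path))                      # cache for later runs
    return str(local_path)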
@@ -309,9 +268,9 @@ def main():
     alpha_input = st.slider(label='Alpha',min_value=0.00, max_value=1.00, value=0.40, step=0.05)
     retrieval_limit = st.slider(label='Hybrid Search Results', min_value=10, max_value=300, value=10, step=10)
 
-    hybrid_filter = st.toggle('Filter Guest', True) # i.e. look only at guests' data
+    hybrid_filter = st.toggle('Filter Search using Guest name', True) # i.e. look only at guests' data
 
-    rerank = st.toggle('…
+    rerank = st.toggle('Rerank', True)
     if rerank:
         reranker_topk = st.slider(label='Reranker Top K',min_value=1, max_value=5, value=3, step=1)
     else:
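For context on the Alpha slider: in Weaviate's hybrid search, alpha blends the keyword (BM25) score with the vector score, where 0 is pure keyword search and 1 is pure vector search. Weaviate computes the blend server-side; the function below is only an illustrative sketch of what the slider controls:

def hybrid_score(bm25_score: float, vector_score: float, alpha: float = 0.4) -> float:
    # alpha = 0.0 -> pure keyword search, alpha = 1.0 -> pure vector search
    return (1 - alpha) * bm25_score + alpha * vector_score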
@@ -327,34 +286,39 @@ def main():
     model_name_or_path = st.selectbox(label='Model Name:', options=available_models,
                                       index=available_models.index(model_default),
                                       placeholder='Select Model')
+
     st.write("Experimental and time limited 2'")
-    …
-    if …
-    …
-    elif "models/" in model_name_or_path:
-        st.write("sentence-transformers models only!")
-    else:
-        # try:
+    c1,c2 = st.columns([8,1])
+    with c1:
+        finetune_model = st.toggle('Finetune on Modal A100 GPU', False)
+    if we_are_not_online or we_are_online:
+        if finetune_model:
+            from finetune_backend import finetune
             if 'finetuned' in model_name_or_path:
                 st.write("Model already finetuned")
+            elif "models/" in model_name_or_path:
+                st.write("sentence-transformers models only!")
             else:
-                …
+                try:
+                    if 'finetuned' in model_name_or_path:
+                        st.write("Model already finetuned")
+                    else:
+                        with c2:
+                            with st.spinner(''):
+                                model_path = finetune(model_name_or_path, savemodel=True, outpath='models')
+                        with c1:
+                            # st.write(f"model_path returned = {model_path}")
+                            if model_path is not None:
+                                if model_name_or_path.split('/')[-1] not in model_path:
+                                    st.write(model_path) # a warning from finetuning in this case
+                                elif model_path not in available_models:
+                                    # finetuning generated a model, let's add it
+                                    available_models.append(model_path)
+                                    st.write("Model saved in list!")
+                except Exception:
+                    st.write("Model not found on HF or error")
+    else:
+        st.write("Finetuning not available on Streamlit online because of space limitations")
 
     model_name_or_path = check_model(model_name_or_path)
     try:
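The finetuning trigger uses a common Streamlit layout trick: the wide column holds the toggle and status messages while the narrow column hosts the spinner for the long-running remote call. A runnable sketch of just that pattern (time.sleep stands in for the finetune call):

import time
import streamlit as st

c1, c2 = st.columns([8, 1])
with c1:
    run_job = st.toggle('Run long job', False)
if run_job:
    with c2:
        with st.spinner(''):
            time.sleep(5)  # stand-in for the remote finetuning call
    with c1:
        st.write("Job finished!")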
@@ -463,7 +427,7 @@ def main():
 
         with col1:
 
-            use_reworded_query = st.toggle('…
+            use_reworded_query = st.toggle('Rewrite query with LLM', True)
             if use_reworded_query:
 
                 # let's use Llama2, and fall back on GPT3.5 if it fails
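The comment above describes a primary/fallback chain: reword the query with Llama2 first and fall back to GPT-3.5 if the call fails. A self-contained sketch of that control flow (both call_* functions are stand-ins, not the app's real clients):

def call_llama2(query: str) -> str:
    raise RuntimeError("Llama2 endpoint unavailable")  # simulate a failure

def call_gpt35(query: str) -> str:
    return f"rephrased: {query}"

def reword_query(query: str) -> str:
    # try the primary LLM, fall back to the secondary on any error
    try:
        return call_llama2(query)
    except Exception:
        return call_gpt35(query)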
backend.py
CHANGED
@@ -92,7 +92,7 @@ def encode_content_splits(content_splits,
         return emb
 
 
-@stub.function(image=vector_search, gpu="A100", timeout=…
+@stub.function(image=vector_search, gpu="A100", timeout=240,
                mounts=[modal.Mount.from_local_dir("./data",
                                                   remote_path="/root/data",
                                                   condition=lambda pth: ".json" in pth)],
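The only change here is an explicit 240-second cap on the remote GPU function; in Modal, a run that exceeds its timeout raises FunctionTimeoutError in the caller, which finetune_backend.py catches below. A hedged sketch of the decorator's shape (the stub and image here are illustrative; the real function also mounts ./data):

import modal

stub = modal.Stub("finetune-demo")  # pre-1.0 Modal API, as used in this repo
image = modal.Image.debian_slim().pip_install("sentence-transformers")

@stub.function(image=image, gpu="A100", timeout=240)
def train():
    # exceeding 240 s raises modal.exception.FunctionTimeoutError in the caller
    ...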
finetune_backend.py
CHANGED
@@ -36,8 +36,8 @@ def finetune(model: str='sentence-transformers/all-mpnet-base-v2',
         model = model.replace('/','')
         model = f"sentence-transformers/{model}"
 
-    fullpath = os.path.join(outpath, f"finetuned-{model.…
-    st.sidebar.write(f"Model …
+    fullpath = os.path.join(outpath, f"finetuned-{model.split('/')[-1]}-300")
+    # st.sidebar.write(f"Model saved in {fullpath}")
 
     if os.path.exists(fullpath):
         msg = "Model already exists!"

@@ -45,15 +45,31 @@ def finetune(model: str='sentence-transformers/all-mpnet-base-v2',
         return msg
 
     start = time.perf_counter()
-    …
+    try:
+        finetuned_model = f.remote(training_path, valid_path, model_id=model)
+    except FunctionTimeoutError:
+        return "Timeout!" # will be displayed by app.py
 
     end = time.perf_counter() - start
-    …
+
+    # with st.sidebar:
+    #     c1,c2 = st.columns([8,1])
+    #     with c1:
     st.sidebar.write(f"Finetuning with GPU lasted {end:.2f} seconds")
 
     if savemodel:
-        …
+
+        # save it as zip filess
+        # with open(fullpath+'.zip', 'wb') as file:
+        #     # Write the contents of the BytesIO object to a new file
+        #     file.write(finetuned_model.getbuffer())
+        #     print(f"Model zip file saved in {fullpath}")
+        # zipfile.ZipExtFile(finetuned_model) # to unzip
+        # import sys
+        # sys.getsizeof(zippedio)
 
+        # or save as folder directly
+        zipfile.ZipFile(finetuned_model).extractall(fullpath)
+
         print(f"Model saved in {fullpath}")
         return fullpath
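The save path assumes the remote function returns the finetuned model as an in-memory zip (a BytesIO, hence the commented getbuffer() alternative), which zipfile can extract straight to a folder. A runnable local sketch of that round trip (fake_remote_finetune stands in for f.remote):

import io
import time
import zipfile

def fake_remote_finetune() -> io.BytesIO:
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, 'w') as zf:
        zf.writestr("config.json", "{}")  # stand-in for real model files
    buf.seek(0)
    return buf

start = time.perf_counter()
blob = fake_remote_finetune()
print(f"Finetuning lasted {time.perf_counter() - start:.2f} seconds")

outdir = "models/finetuned-demo"
zipfile.ZipFile(blob).extractall(outdir)  # same call as in the commit
print(f"Model saved in {outdir}")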