JPBianchi commited on
Commit
30eb437
1 Parent(s): 88b4a61

simplified finetuning

Browse files
Files changed (3) hide show
  1. app.py +52 -88
  2. backend.py +1 -1
  3. finetune_backend.py +23 -7
app.py CHANGED
@@ -185,85 +185,44 @@ available_models = ['sentence-transformers/all-mpnet-base-v2',
185
  #%%
186
  models_urls = {'models/finetuned-all-mpnet-base-v2-300': "https://drive.google.com/drive/folders/1asJ37-AUv5nytLtH6hp6_bVV3_cZOXfj"}
187
 
188
- def download_model_from_Gdrive(model_name_or_path, model_full_path):
189
- print("Downloading model from Google Drive")
190
  st.write("Downloading model from Google Drive")
191
  assert model_name_or_path in models_urls, f"Model {model_name_or_path} not found in models_urls"
192
  url = models_urls[model_name_or_path]
193
- gdown.download_folder(url, output=model_full_path, quiet=False, use_cookies=False)
194
- print("Model downloaded and saved to models folder")
195
  # st.write("Model downloaded")
196
 
197
- def download_model(model_name_or_path, model_full_path):
198
 
199
  if model_name_or_path.startswith("models/"):
200
- download_model_from_Gdrive(model_name_or_path, model_full_path)
201
- print(f"Model {model_full_path} downloaded")
202
- models_urls[model_name_or_path] = model_full_path
203
- # st.sidebar.write(f"Model {model_full_path} downloaded")
204
 
205
  elif model_name_or_path.startswith("sentence-transformers/"):
206
- st.sidebar.write(f"Downloading Sentence Transformer model {model_name_or_path}")
207
- model = SentenceTransformer(model_name_or_path) # HF looks into its own models folder/path
208
- models_urls[model_name_or_path] = model_full_path
209
- # st.sidebar.write(f"Model {model_name_or_path} downloaded")
210
- model.save(model_full_path)
211
- # st.sidebar.write(f"Model {model_name_or_path} saved to {model_full_path}")
212
-
213
- # if 'modelspath' not in st.session_state:
214
- # st.session_state['modelspath'] = None
215
- # if st.session_state.modelspath is None:
216
- # # let's create a temp folder on the first run
217
- # persistent_dir = pathlib.Path("path/to/persistent_dir")
218
- # persistent_dir.mkdir(parents=True, exist_ok=True)
219
- # with tempfile.TemporaryDirectory() as temp_dir:
220
- # st.session_state.modelspath = temp_dir
221
- # print(f"Temporary directory created at {temp_dir}")
222
- # # the temp folder disappears with the context, but not the one we've created manually
223
- # else:
224
- # temp_dir = st.session_state.modelspath
225
- # print(f"Temporary directory already exists at {temp_dir}")
226
- # # st.write(os.listdir(temp_dir))
227
 
228
  #%%
229
  # for streamlit online, we must download the model from google drive
230
  # because github LFS doesn't work on forked repos
231
  def check_model(model_name_or_path):
232
 
233
- model_path = pathlib.Path(model_name_or_path)
234
- model_full_path = str(pathlib.Path("models") / model_path) # this creates a models folder inside /models
235
- model_full_path = model_full_path.replace("sentence-transformers/", "models/") # all are saved in models folder
236
-
237
- if pathlib.Path(model_full_path).exists():
238
  # let's use the model that's already there
239
- print(f"Model {model_full_path} already exists")
240
-
241
-
242
- # but delete everything else in we are online because
243
- # streamlit online has limited space (and will shut down the app if it's full)
244
- if we_are_online:
245
- # st.sidebar.write(f"Model {model_full_path} already exists")
246
- # st.sidebar.write(f"Deleting other models")
247
- dirs = os.listdir("models/models")
248
- # we get only the folder name, not the full path
249
- dirs.remove(model_full_path.split('/')[-1])
250
- for p in dirs:
251
- dirpath = pathlib.Path("models/models") / p
252
- if dirpath.is_dir():
253
- shutil.rmtree(dirpath)
254
  else:
255
-
256
- if we_are_online:
257
- # space issues on streamlit online, let's not leave anything behind
258
- # and redownload the model every time
259
- print("Deleting models/models folder")
260
- if pathlib.Path('models/models').exists():
261
- shutil.rmtree("models/models") # make room, if other models are there
262
- # st.sidebar.write(f"models/models folder deleted")
263
-
264
- download_model(model_name_or_path, model_full_path)
265
 
266
- return model_full_path
267
 
268
  #%% instantiate Weaviate client
269
  def get_weaviate_client(api_key, url, model_name_or_path, openai_api_key):
@@ -309,9 +268,9 @@ def main():
309
  alpha_input = st.slider(label='Alpha',min_value=0.00, max_value=1.00, value=0.40, step=0.05)
310
  retrieval_limit = st.slider(label='Hybrid Search Results', min_value=10, max_value=300, value=10, step=10)
311
 
312
- hybrid_filter = st.toggle('Filter Guest', True) # i.e. look only at guests' data
313
 
314
- rerank = st.toggle('Use Reranker', True)
315
  if rerank:
316
  reranker_topk = st.slider(label='Reranker Top K',min_value=1, max_value=5, value=3, step=1)
317
  else:
@@ -327,34 +286,39 @@ def main():
327
  model_name_or_path = st.selectbox(label='Model Name:', options=available_models,
328
  index=available_models.index(model_default),
329
  placeholder='Select Model')
330
-
331
  st.write("Experimental and time limited 2'")
332
- finetune_model = st.toggle('Finetune on Modal A100 GPU', False)
333
- if we_are_not_online or we_are_online:
334
- if finetune_model:
335
- from finetune_backend import finetune
336
- if 'finetuned' in model_name_or_path:
337
- st.write("Model already finetuned")
338
- elif "models/" in model_name_or_path:
339
- st.write("sentence-transformers models only!")
340
- else:
341
- # try:
342
  if 'finetuned' in model_name_or_path:
343
  st.write("Model already finetuned")
 
 
344
  else:
345
- model_path = finetune(model_name_or_path, savemodel=True, outpath='models')
346
- st.write(f"model_path returned = {model_path}")
347
- if model_path is not None:
348
- if model_name_or_path.split('/')[-1] not in model_path:
349
- st.write(model_path) # a warning from finetuning in this case
350
- elif model_path not in available_models:
351
- # finetuning generated a model, let's add it
352
- available_models.append(model_path)
353
- st.write("Model saved!")
354
- # except Exception:
355
- # st.write("Model not found on HF or error")
356
- else:
357
- st.write("Finetuning not available on Streamlit online because of space limitations")
 
 
 
 
 
 
 
358
 
359
  model_name_or_path = check_model(model_name_or_path)
360
  try:
@@ -463,7 +427,7 @@ def main():
463
 
464
  with col1:
465
 
466
- use_reworded_query = st.toggle('Use rewritten query', True)
467
  if use_reworded_query:
468
 
469
  # let's use Llama2, and fall back on GPT3.5 if it fails
 
185
  #%%
186
  models_urls = {'models/finetuned-all-mpnet-base-v2-300': "https://drive.google.com/drive/folders/1asJ37-AUv5nytLtH6hp6_bVV3_cZOXfj"}
187
 
188
def download_model_from_Gdrive(model_name_or_path, model_local_path):
    """Fetch a finetuned model folder from Google Drive into model_local_path.

    The Drive folder URL is looked up in the module-level models_urls
    mapping; the model must already have an entry there.
    """
    st.write("Downloading model from Google Drive")
    assert model_name_or_path in models_urls, f"Model {model_name_or_path} not found in models_urls"
    # gdown mirrors the whole shared folder into the local path
    gdown.download_folder(models_urls[model_name_or_path],
                          output=model_local_path,
                          quiet=False,
                          use_cookies=False)
    print(f"Model downloaded from Gdrive and saved to {model_local_path} folder")
195
 
196
def download_model(model_name_or_path, model_local_path):
    """Download a model into model_local_path.

    'models/...' names are finetuned models fetched from Google Drive
    (GitHub LFS does not work on forked repos); 'sentence-transformers/...'
    names are pulled from Hugging Face and saved locally.

    Raises:
        ValueError: if the name matches neither prefix. Previously this
            case fell through silently, leaving check_model() to return a
            path that was never created.
    """
    if model_name_or_path.startswith("models/"):
        # finetuned model kept on Google Drive
        download_model_from_Gdrive(model_name_or_path, model_local_path)

    elif model_name_or_path.startswith("sentence-transformers/"):
        st.sidebar.write(f"Downloading {model_name_or_path}")
        model = SentenceTransformer(model_name_or_path)
        st.sidebar.write(f"Model {model_name_or_path} downloaded")

        # remember where the local copy lives for later lookups
        models_urls[model_name_or_path] = model_local_path
        model.save(model_local_path)

    else:
        # fail loudly instead of silently skipping the download
        raise ValueError(f"Unknown model source for {model_name_or_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
  #%%
211
  # for streamlit online, we must download the model from google drive
212
  # because github LFS doesn't work on forked repos
213
def check_model(model_name_or_path):
    """Return the local path for a model, downloading it first if absent.

    Strips any namespace prefix (e.g. 'sentence-transformers/') from the
    name and keeps every model under the local 'models' directory.
    """
    # last path component only, e.g. 'sentence-transformers/x' -> 'x'
    local_dir = pathlib.Path("models") / model_name_or_path.split('/')[-1]

    if local_dir.exists():
        # reuse the copy already on disk
        print(f"Model {local_dir} already exists")
    else:
        # not cached yet, so fetch it; HF is not limited in space like Streamlit.io
        download_model(model_name_or_path, str(local_dir))

    return str(local_dir)
226
 
227
  #%% instantiate Weaviate client
228
  def get_weaviate_client(api_key, url, model_name_or_path, openai_api_key):
 
268
  alpha_input = st.slider(label='Alpha',min_value=0.00, max_value=1.00, value=0.40, step=0.05)
269
  retrieval_limit = st.slider(label='Hybrid Search Results', min_value=10, max_value=300, value=10, step=10)
270
 
271
+ hybrid_filter = st.toggle('Filter Search using Guest name', True) # i.e. look only at guests' data
272
 
273
+ rerank = st.toggle('Rerank', True)
274
  if rerank:
275
  reranker_topk = st.slider(label='Reranker Top K',min_value=1, max_value=5, value=3, step=1)
276
  else:
 
286
  model_name_or_path = st.selectbox(label='Model Name:', options=available_models,
287
  index=available_models.index(model_default),
288
  placeholder='Select Model')
289
+
290
  st.write("Experimental and time limited 2'")
291
+ c1,c2 = st.columns([8,1])
292
+ with c1:
293
+ finetune_model = st.toggle('Finetune on Modal A100 GPU', False)
294
+ if we_are_not_online or we_are_online:
295
+ if finetune_model:
296
+ from finetune_backend import finetune
 
 
 
 
297
  if 'finetuned' in model_name_or_path:
298
  st.write("Model already finetuned")
299
+ elif "models/" in model_name_or_path:
300
+ st.write("sentence-transformers models only!")
301
  else:
302
+ try:
303
+ if 'finetuned' in model_name_or_path:
304
+ st.write("Model already finetuned")
305
+ else:
306
+ with c2:
307
+ with st.spinner(''):
308
+ model_path = finetune(model_name_or_path, savemodel=True, outpath='models')
309
+ with c1:
310
+ # st.write(f"model_path returned = {model_path}")
311
+ if model_path is not None:
312
+ if model_name_or_path.split('/')[-1] not in model_path:
313
+ st.write(model_path) # a warning from finetuning in this case
314
+ elif model_path not in available_models:
315
+ # finetuning generated a model, let's add it
316
+ available_models.append(model_path)
317
+ st.write("Model saved in list!")
318
+ except Exception:
319
+ st.write("Model not found on HF or error")
320
+ else:
321
+ st.write("Finetuning not available on Streamlit online because of space limitations")
322
 
323
  model_name_or_path = check_model(model_name_or_path)
324
  try:
 
427
 
428
  with col1:
429
 
430
+ use_reworded_query = st.toggle('Rewrite query with LLM', True)
431
  if use_reworded_query:
432
 
433
  # let's use Llama2, and fall back on GPT3.5 if it fails
backend.py CHANGED
@@ -92,7 +92,7 @@ def encode_content_splits(content_splits,
92
  return emb
93
 
94
 
95
- @stub.function(image=vector_search, gpu="A100", timeout=120,
96
  mounts=[modal.Mount.from_local_dir("./data",
97
  remote_path="/root/data",
98
  condition=lambda pth: ".json" in pth)],
 
92
  return emb
93
 
94
 
95
+ @stub.function(image=vector_search, gpu="A100", timeout=240,
96
  mounts=[modal.Mount.from_local_dir("./data",
97
  remote_path="/root/data",
98
  condition=lambda pth: ".json" in pth)],
finetune_backend.py CHANGED
@@ -36,8 +36,8 @@ def finetune(model: str='sentence-transformers/all-mpnet-base-v2',
36
  model = model.replace('/','')
37
  model = f"sentence-transformers/{model}"
38
 
39
- fullpath = os.path.join(outpath, f"finetuned-{model.strip('/')[-1]}-300")
40
- st.sidebar.write(f"Model will be saved in {fullpath}")
41
 
42
  if os.path.exists(fullpath):
43
  msg = "Model already exists!"
@@ -45,15 +45,31 @@ def finetune(model: str='sentence-transformers/all-mpnet-base-v2',
45
  return msg
46
 
47
  start = time.perf_counter()
48
- finetuned_model = f.remote(training_path, valid_path, model_id=model)
 
 
 
49
 
50
  end = time.perf_counter() - start
51
- st.write(f"Finetuning with GPU lasted {end:.2f} seconds")
 
 
 
 
52
 
53
  if savemodel:
 
 
 
 
 
 
 
 
 
54
 
55
- with open(fullpath, 'wb') as file:
56
- # Write the contents of the BytesIO object to a new file
57
- file.write(finetuned_model.getbuffer())
58
  print(f"Model saved in {fullpath}")
59
  return fullpath
 
36
  model = model.replace('/','')
37
  model = f"sentence-transformers/{model}"
38
 
39
+ fullpath = os.path.join(outpath, f"finetuned-{model.split('/')[-1]}-300")
40
+ # st.sidebar.write(f"Model saved in {fullpath}")
41
 
42
  if os.path.exists(fullpath):
43
  msg = "Model already exists!"
 
45
  return msg
46
 
47
  start = time.perf_counter()
48
+ try:
49
+ finetuned_model = f.remote(training_path, valid_path, model_id=model)
50
+ except FunctionTimeoutError:
51
+ return "Timeout!" # will be displayed by app.py
52
 
53
  end = time.perf_counter() - start
54
+
55
+ # with st.sidebar:
56
+ # c1,c2 = st.columns([8,1])
57
+ # with c1:
58
+ st.sidebar.write(f"Finetuning with GPU lasted {end:.2f} seconds")
59
 
60
  if savemodel:
61
+
62
+ # save it as zip files
63
+ # with open(fullpath+'.zip', 'wb') as file:
64
+ # # Write the contents of the BytesIO object to a new file
65
+ # file.write(finetuned_model.getbuffer())
66
+ # print(f"Model zip file saved in {fullpath}")
67
+ # zipfile.ZipExtFile(finetuned_model) # to unzip
68
+ # import sys
69
+ # sys.getsizeof(zippedio)
70
 
71
+ # or save as folder directly
72
+ zipfile.ZipFile(finetuned_model).extractall(fullpath)
73
+
74
  print(f"Model saved in {fullpath}")
75
  return fullpath