simplified finetuning
- app.py +52 -88
- backend.py +1 -1
- finetune_backend.py +23 -7
app.py
CHANGED
@@ -185,85 +185,44 @@ available_models = ['sentence-transformers/all-mpnet-base-v2',
 #%%
 models_urls = {'models/finetuned-all-mpnet-base-v2-300': "https://drive.google.com/drive/folders/1asJ37-AUv5nytLtH6hp6_bVV3_cZOXfj"}
 
-def download_model_from_Gdrive(model_name_or_path, …
-    print("Downloading model from Google Drive")
+def download_model_from_Gdrive(model_name_or_path, model_local_path):
     st.write("Downloading model from Google Drive")
     assert model_name_or_path in models_urls, f"Model {model_name_or_path} not found in models_urls"
     url = models_urls[model_name_or_path]
-    gdown.download_folder(url, output=…
-    print("Model downloaded and saved to …
+    gdown.download_folder(url, output=model_local_path, quiet=False, use_cookies=False)
+    print(f"Model downloaded from Gdrive and saved to {model_local_path} folder")
     # st.write("Model downloaded")
 
-def download_model(model_name_or_path, …
+def download_model(model_name_or_path, model_local_path):
 
     if model_name_or_path.startswith("models/"):
-        download_model_from_Gdrive(model_name_or_path, …
-        print(f"Model {model_full_path} downloaded")
-        models_urls[model_name_or_path] = model_full_path
-        # st.sidebar.write(f"Model {model_full_path} downloaded")
+        download_model_from_Gdrive(model_name_or_path, model_local_path)
 
     elif model_name_or_path.startswith("sentence-transformers/"):
-        st.sidebar.write(f"Downloading …
-        model = SentenceTransformer(model_name_or_path)
-        …
-        # if 'modelspath' not in st.session_state:
-        #     st.session_state['modelspath'] = None
-        # if st.session_state.modelspath is None:
-        #     # let's create a temp folder on the first run
-        #     persistent_dir = pathlib.Path("path/to/persistent_dir")
-        #     persistent_dir.mkdir(parents=True, exist_ok=True)
-        #     with tempfile.TemporaryDirectory() as temp_dir:
-        #         st.session_state.modelspath = temp_dir
-        #         print(f"Temporary directory created at {temp_dir}")
-        #     # the temp folder disappears with the context, but not the one we've created manually
-        # else:
-        #     temp_dir = st.session_state.modelspath
-        #     print(f"Temporary directory already exists at {temp_dir}")
-        # # st.write(os.listdir(temp_dir))
+        st.sidebar.write(f"Downloading {model_name_or_path}")
+        model = SentenceTransformer(model_name_or_path)
+        st.sidebar.write(f"Model {model_name_or_path} downloaded")
+
+        models_urls[model_name_or_path] = model_local_path
+        model.save(model_local_path)
+        # st.sidebar.write(f"Model {model_name_or_path} saved to {model_new_path}")
 
 #%%
 # for streamlit online, we must download the model from google drive
 # because github LFS doesn't work on forked repos
 def check_model(model_name_or_path):
 
-    …
-    if pathlib.Path(model_full_path).exists():
+    model_name = model_name_or_path.split('/')[-1] # remove 'sentence-transformers'
+    model_local_path = str(pathlib.Path("models") / model_name) # this creates a models folder inside /models
+
+    if pathlib.Path(model_local_path).exists():
         # let's use the model that's already there
-        print(f"Model {…
-        …
-        # but delete everything else in we are online because
-        # streamlit online has limited space (and will shut down the app if it's full)
-        if we_are_online:
-            # st.sidebar.write(f"Model {model_full_path} already exists")
-            # st.sidebar.write(f"Deleting other models")
-            dirs = os.listdir("models/models")
-            # we get only the folder name, not the full path
-            dirs.remove(model_full_path.split('/')[-1])
-            for p in dirs:
-                dirpath = pathlib.Path("models/models") / p
-                if dirpath.is_dir():
-                    shutil.rmtree(dirpath)
+        print(f"Model {model_local_path} already exists")
     else:
-        …
-        # space issues on streamlit online, let's not leave anything behind
-        # and redownload the model eveery time
-        print("Deleting models/models folder")
-        if pathlib.Path('models/models').exists():
-            shutil.rmtree("models/models") # make room, if other models are there
-            # st.sidebar.write(f"models/models folder deleted")
-
-        download_model(model_name_or_path, model_full_path)
+        # let's download the model, HF is not limited in space like Streamlit.io
+        download_model(model_name_or_path, model_local_path)
 
-    return …
+    return model_local_path
 
 #%% instantiate Weaviate client
 def get_weaviate_client(api_key, url, model_name_or_path, openai_api_key):
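The new check_model reduces the old delete-everything logic to a cache-or-download rule: reuse the model if it is already on disk, otherwise fetch it and save it under models/. A minimal standalone sketch of the same pattern, assuming the sentence-transformers package (load_or_download is an illustrative name, not part of the commit):

import pathlib
from sentence_transformers import SentenceTransformer

def load_or_download(model_name_or_path: str) -> str:
    # mirror check_model: cache under models/<name> and reuse when present
    model_name = model_name_or_path.split('/')[-1]
    local_path = pathlib.Path("models") / model_name
    if not local_path.exists():
        model = SentenceTransformer(model_name_or_path)  # pulls from the HF hub
        model.save(str(local_path))                      # cache for later runs
    return str(local_path)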
@@ -309,9 +268,9 @@ def main():
     alpha_input = st.slider(label='Alpha',min_value=0.00, max_value=1.00, value=0.40, step=0.05)
     retrieval_limit = st.slider(label='Hybrid Search Results', min_value=10, max_value=300, value=10, step=10)
 
-    hybrid_filter = st.toggle('Filter Guest', True) # i.e. look only at guests' data
+    hybrid_filter = st.toggle('Filter Search using Guest name', True) # i.e. look only at guests' data
 
-    rerank = st.toggle('…
+    rerank = st.toggle('Rerank', True)
     if rerank:
         reranker_topk = st.slider(label='Reranker Top K',min_value=1, max_value=5, value=3, step=1)
     else:
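For context on the Alpha slider: in Weaviate's hybrid search, alpha blends the keyword (BM25) score with the vector score, where 0 is pure keyword search and 1 is pure vector search. Weaviate computes the blend server-side; the function below is only an illustrative sketch of what the slider controls:

def hybrid_score(bm25_score: float, vector_score: float, alpha: float = 0.4) -> float:
    # alpha = 0.0 -> pure keyword search, alpha = 1.0 -> pure vector search
    return (1 - alpha) * bm25_score + alpha * vector_score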
@@ -327,34 +286,39 @@ def main():
     model_name_or_path = st.selectbox(label='Model Name:', options=available_models,
                                       index=available_models.index(model_default),
                                       placeholder='Select Model')
+
     st.write("Experimental and time limited 2'")
-    …
-    if …
-    …
-    elif "models/" in model_name_or_path:
-        st.write("sentence-transformers models only!")
-    else:
-        # try:
+    c1,c2 = st.columns([8,1])
+    with c1:
+        finetune_model = st.toggle('Finetune on Modal A100 GPU', False)
+    if we_are_not_online or we_are_online:
+        if finetune_model:
+            from finetune_backend import finetune
             if 'finetuned' in model_name_or_path:
                 st.write("Model already finetuned")
+            elif "models/" in model_name_or_path:
+                st.write("sentence-transformers models only!")
             else:
-                …
+                try:
+                    if 'finetuned' in model_name_or_path:
+                        st.write("Model already finetuned")
+                    else:
+                        with c2:
+                            with st.spinner(''):
+                                model_path = finetune(model_name_or_path, savemodel=True, outpath='models')
+                        with c1:
+                            # st.write(f"model_path returned = {model_path}")
+                            if model_path is not None:
+                                if model_name_or_path.split('/')[-1] not in model_path:
+                                    st.write(model_path) # a warning from finetuning in this case
+                                elif model_path not in available_models:
+                                    # finetuning generated a model, let's add it
+                                    available_models.append(model_path)
+                                    st.write("Model saved in list!")
+                except Exception:
+                    st.write("Model not found on HF or error")
+    else:
+        st.write("Finetuning not available on Streamlit online because of space limitations")
 
     model_name_or_path = check_model(model_name_or_path)
     try:
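The finetuning trigger uses a common Streamlit layout trick: the wide column holds the toggle and status messages while the narrow column hosts the spinner for the long-running remote call. A runnable sketch of just that pattern (time.sleep stands in for the finetune call):

import time
import streamlit as st

c1, c2 = st.columns([8, 1])
with c1:
    run_job = st.toggle('Run long job', False)
if run_job:
    with c2:
        with st.spinner(''):
            time.sleep(5)  # stand-in for the remote finetuning call
    with c1:
        st.write("Job finished!")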
@@ -463,7 +427,7 @@ def main():
 
         with col1:
 
-            use_reworded_query = st.toggle('…
+            use_reworded_query = st.toggle('Rewrite query with LLM', True)
             if use_reworded_query:
 
                 # let's use Llama2, and fall back on GPT3.5 if it fails
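The comment above describes a primary/fallback chain: reword the query with Llama2 first and fall back to GPT-3.5 if the call fails. A self-contained sketch of that control flow (both call_* functions are stand-ins, not the app's real clients):

def call_llama2(query: str) -> str:
    raise RuntimeError("Llama2 endpoint unavailable")  # simulate a failure

def call_gpt35(query: str) -> str:
    return f"rephrased: {query}"

def reword_query(query: str) -> str:
    # try the primary LLM, fall back to the secondary on any error
    try:
        return call_llama2(query)
    except Exception:
        return call_gpt35(query)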
backend.py
CHANGED
@@ -92,7 +92,7 @@ def encode_content_splits(content_splits,
         return emb
 
 
-@stub.function(image=vector_search, gpu="A100", timeout=…
+@stub.function(image=vector_search, gpu="A100", timeout=240,
                mounts=[modal.Mount.from_local_dir("./data",
                                                   remote_path="/root/data",
                                                   condition=lambda pth: ".json" in pth)],
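The only change here is an explicit 240-second cap on the remote GPU function; in Modal, a run that exceeds its timeout raises FunctionTimeoutError in the caller, which finetune_backend.py catches below. A hedged sketch of the decorator's shape (the stub and image here are illustrative; the real function also mounts ./data):

import modal

stub = modal.Stub("finetune-demo")  # pre-1.0 Modal API, as used in this repo
image = modal.Image.debian_slim().pip_install("sentence-transformers")

@stub.function(image=image, gpu="A100", timeout=240)
def train():
    # exceeding 240 s raises modal.exception.FunctionTimeoutError in the caller
    ...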
finetune_backend.py
CHANGED
@@ -36,8 +36,8 @@ def finetune(model: str='sentence-transformers/all-mpnet-base-v2',
         model = model.replace('/','')
         model = f"sentence-transformers/{model}"
 
-    fullpath = os.path.join(outpath, f"finetuned-{model.…
-    st.sidebar.write(f"Model …
+    fullpath = os.path.join(outpath, f"finetuned-{model.split('/')[-1]}-300")
+    # st.sidebar.write(f"Model saved in {fullpath}")
 
     if os.path.exists(fullpath):
         msg = "Model already exists!"

@@ -45,15 +45,31 @@ def finetune(model: str='sentence-transformers/all-mpnet-base-v2',
         return msg
 
     start = time.perf_counter()
-    …
+    try:
+        finetuned_model = f.remote(training_path, valid_path, model_id=model)
+    except FunctionTimeoutError:
+        return "Timeout!" # will be displayed by app.py
 
     end = time.perf_counter() - start
-    …
+
+    # with st.sidebar:
+    #     c1,c2 = st.columns([8,1])
+    #     with c1:
     st.sidebar.write(f"Finetuning with GPU lasted {end:.2f} seconds")
 
     if savemodel:
-        …
+
+        # save it as zip filess
+        # with open(fullpath+'.zip', 'wb') as file:
+        #     # Write the contents of the BytesIO object to a new file
+        #     file.write(finetuned_model.getbuffer())
+        #     print(f"Model zip file saved in {fullpath}")
+        # zipfile.ZipExtFile(finetuned_model) # to unzip
+        # import sys
+        # sys.getsizeof(zippedio)
 
+        # or save as folder directly
+        zipfile.ZipFile(finetuned_model).extractall(fullpath)
+
         print(f"Model saved in {fullpath}")
         return fullpath
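The save path assumes the remote function returns the finetuned model as an in-memory zip (a BytesIO, hence the commented getbuffer() alternative), which zipfile can extract straight to a folder. A runnable local sketch of that round trip (fake_remote_finetune stands in for f.remote):

import io
import time
import zipfile

def fake_remote_finetune() -> io.BytesIO:
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, 'w') as zf:
        zf.writestr("config.json", "{}")  # stand-in for real model files
    buf.seek(0)
    return buf

start = time.perf_counter()
blob = fake_remote_finetune()
print(f"Finetuning lasted {time.perf_counter() - start:.2f} seconds")

outdir = "models/finetuned-demo"
zipfile.ZipFile(blob).extractall(outdir)  # same call as in the commit
print(f"Model saved in {outdir}")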