#%%
import os, time, io, zipfile
from preprocessing import FileIO
import shutil

import modal
import streamlit as st
from llama_index.finetuning import EmbeddingQAFinetuneDataset
from dotenv import load_dotenv, find_dotenv

env = load_dotenv(find_dotenv('env'), override=True)

#%%
# Load the prebuilt question/passage datasets used for finetuning.
training_path = 'data/training_data_300.json'
valid_path = 'data/validation_data_100.json'

training_set = EmbeddingQAFinetuneDataset.from_json(training_path)
valid_set = EmbeddingQAFinetuneDataset.from_json(valid_path)
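#%%
# Quick sanity check (an illustrative addition, not in the original script):
# an EmbeddingQAFinetuneDataset bundles three dicts: queries (id -> question),
# corpus (id -> passage), and relevant_docs (query id -> matching passage ids).
print(f"{len(training_set.queries)} training queries, "
      f"{len(training_set.corpus)} corpus passages")
print(f"{len(valid_set.queries)} validation queries")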
def finetune(model='all-mpnet-base-v2', savemodel=False, outpath='.'):
    """Finetunes a sentence-transformers model on a Modal A100 GPU.

    The remote job stores the model under /root/models on a Modal volume;
    the returned bytes can also be written to a local file.

    Args:
        model (str): the Sentence Transformers model name.
        savemodel (bool, optional): whether to save the model locally.
        outpath (str, optional): directory for the locally saved model.

    Returns:
        str: path of the saved model (when saved).
    """
    # Look up the finetuning function deployed in the Modal app.
    f = modal.Function.lookup("vector-search-project", "finetune")

    # Normalize the model id to the sentence-transformers namespace.
    if 'sentence-transformers' not in model:
        model = model.replace('/', '')
        model = f"sentence-transformers/{model}"

    # Use only the bare model name in the filename so fullpath does not
    # pick up a spurious 'sentence-transformers/' subdirectory.
    model_name = model.split('/')[-1]
    fullpath = os.path.join(outpath, f"finetuned-{model_name}-300")
    st.sidebar.write(f"Model will be saved in {fullpath}")

    if os.path.exists(fullpath):
        msg = "Model already exists!"
        print(msg)
        return msg

    # Run the finetuning job remotely and time it.
    start = time.perf_counter()
    finetuned_model = f.remote(training_path, valid_path, model_id=model)
    end = time.perf_counter() - start
    st.write(f"Finetuning with GPU lasted {end:.2f} seconds")

    if savemodel:
        # Write the contents of the returned BytesIO object to a new file.
        with open(fullpath, 'wb') as file:
            file.write(finetuned_model.getbuffer())
        print(f"Model saved in {fullpath}")
        return fullpath
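#%%
# Example usage: a sketch, not part of the original script. The 'models'
# output directory is an assumption; any writable path works.
if __name__ == '__main__':
    os.makedirs('models', exist_ok=True)  # open() in finetune() won't create dirs
    saved = finetune(model='all-mpnet-base-v2', savemodel=True, outpath='models')
    print(saved)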
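#%%
# For reference, a minimal sketch of what the deployed Modal function could
# look like server-side. This is an assumption for illustration only; the
# real "vector-search-project" app is defined elsewhere and may differ (for
# instance, in how the training JSON reaches the container). Kept commented
# out so this script stays runnable.
#
# import io, os, zipfile
# import modal
# from llama_index.finetuning import (EmbeddingQAFinetuneDataset,
#                                     SentenceTransformersFinetuneEngine)
#
# app = modal.App("vector-search-project")  # modal.Stub on older Modal versions
#
# @app.function(gpu="A100", timeout=3600)
# def finetune(training_path: str, valid_path: str, model_id: str):
#     train = EmbeddingQAFinetuneDataset.from_json(training_path)
#     valid = EmbeddingQAFinetuneDataset.from_json(valid_path)
#     engine = SentenceTransformersFinetuneEngine(
#         train, model_id=model_id,
#         model_output_path='/root/models', val_dataset=valid)
#     engine.finetune()
#     # Zip /root/models into an in-memory buffer so the caller can write
#     # it to disk with finetuned_model.getbuffer(), as above.
#     buf = io.BytesIO()
#     with zipfile.ZipFile(buf, 'w') as zf:
#         for root, _, files in os.walk('/root/models'):
#             for name in files:
#                 path = os.path.join(root, name)
#                 zf.write(path, os.path.relpath(path, '/root/models'))
#     buf.seek(0)
#     return buf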