Christina Theodoris committed
Commit: f0b6641 · Parent(s): feeecd0
Update links including unsorted example lengths file
examples/pretraining_new_model/pretrain_geneformer_w_deepspeed.py
CHANGED
@@ -99,7 +99,7 @@ subprocess.call(f"mkdir {training_output_dir}", shell=True)
 subprocess.call(f"mkdir {model_output_dir}", shell=True)
 
 
-# load gene_ensembl_id:token dictionary (e.g. https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/
+# load gene_ensembl_id:token dictionary (e.g. https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/blob/main/token_dictionary.pkl)
 with open("token_dictionary.pkl", "rb") as fp:
     token_dictionary = pickle.load(fp)
 
@@ -153,8 +153,8 @@ trainer = GeneformerPretrainer(
     args=training_args,
     # pretraining corpus (e.g. https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/genecorpus_30M_2048.dataset)
     train_dataset=load_from_disk("genecorpus_30M_2048.dataset"),
-    # file of lengths of each example cell (e.g. https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/
-    example_lengths_file="
+    # file of lengths of each example cell (e.g. https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/blob/main/genecorpus_30M_2048_lengths.pkl)
+    example_lengths_file="genecorpus_30M_2048_lengths.pkl",
     token_dictionary=token_dictionary,
 )
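For reference, a minimal sketch (not part of the commit) of how the example-lengths pickle linked above could be regenerated locally if it has not been downloaded. It assumes each tokenized example stores its gene tokens under an "input_ids" field; per the commit message, the lengths are kept in the dataset's original (unsorted) order.

import pickle
from datasets import load_from_disk

# Load the tokenized pretraining corpus from disk.
dataset = load_from_disk("genecorpus_30M_2048.dataset")

# Record the length of each example cell in dataset order (unsorted).
# Assumption: the tokenized genes for each cell are stored under "input_ids".
example_lengths = [len(example["input_ids"]) for example in dataset]

# Write the lengths file expected by GeneformerPretrainer.
with open("genecorpus_30M_2048_lengths.pkl", "wb") as fp:
    pickle.dump(example_lengths, fp)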