Commit 4bddd45
Christina Theodoris committed
Parent(s): 5a43832

add option for hyperparameter tuning to cc.validate

Files changed:
- examples/cell_classification.ipynb +3 -2
- examples/hyperparam_optimiz_for_disease_classifier.py +0 -226
- geneformer/classifier.py +235 -23
- geneformer/classifier_utils.py +21 -2
- requirements.txt +1 -0
examples/cell_classification.ipynb  CHANGED

@@ -13,7 +13,7 @@
     "id": "1792e51c-86c3-406f-be5a-273c4e4aec20",
     "metadata": {},
     "source": [
-     "### Please note that, as usual with deep learning models, we **highly** recommend tuning learning hyperparameters for all fine-tuning applications as this can significantly improve model performance. Example below uses previously optimized hyperparameters, but
+     "### Please note that, as usual with deep learning models, we **highly** recommend tuning learning hyperparameters for all fine-tuning applications as this can significantly improve model performance. Example below uses previously optimized hyperparameters, but one can optimize hyperparameters with the argument n_hyperopt_trials=n in cc.validate() where n>0 and represents the number of trials for hyperparameter optimization."
     ]
   },
   {

@@ -266,7 +266,8 @@
     "                            id_class_dict_file=f\"{output_dir}/{output_prefix}_id_class_dict.pkl\",\n",
     "                            output_directory=output_dir,\n",
     "                            output_prefix=output_prefix,\n",
-     "                            split_id_dict=train_valid_id_split_dict)"
+     "                            split_id_dict=train_valid_id_split_dict)\n",
+     "                            # to optimize hyperparameters, set n_hyperopt_trials=100 (or alternative desired # of trials)"
     ]
   },
   {
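For reference, a minimal usage sketch of the new argument from the notebook's point of view. Only the arguments visible in the diff above (id_class_dict_file, output_directory, output_prefix, split_id_dict, n_hyperopt_trials) come from this commit; the constructor call, file paths, prefixes, and the prepared_input_data_file name are placeholder assumptions, not values from this commit.

# Hypothetical sketch, not the exact notebook cell: paths, prefixes, and the
# Classifier constructor arguments below are placeholder assumptions.
from geneformer import Classifier

output_dir = "/path/to/output"        # placeholder
output_prefix = "cm_classifier_test"  # placeholder
train_valid_id_split_dict = {
    "attr_key": "individual",          # assumed split attribute
    "train": ["patient1", "patient2"],  # placeholder IDs
    "eval": ["patient3"],
}

cc = Classifier(classifier="cell", nproc=16)  # other constructor args omitted

all_metrics = cc.validate(
    model_directory="/path/to/pretrained_model/",  # placeholder path
    prepared_input_data_file=f"{output_dir}/{output_prefix}_labeled_train.dataset",  # assumed name
    id_class_dict_file=f"{output_dir}/{output_prefix}_id_class_dict.pkl",
    output_directory=output_dir,
    output_prefix=output_prefix,
    split_id_dict=train_valid_id_split_dict,
    n_hyperopt_trials=100,  # 0 (default) skips tuning; >0 runs that many Ray Tune trials
)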
examples/hyperparam_optimiz_for_disease_classifier.py  DELETED

@@ -1,226 +0,0 @@
#!/usr/bin/env python
# coding: utf-8

# hyperparameter optimization with raytune for disease classification

# imports
import os
import subprocess
GPU_NUMBER = [0,1,2,3]
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(s) for s in GPU_NUMBER])
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["CONDA_OVERRIDE_GLIBC"] = "2.56"
os.environ["LD_LIBRARY_PATH"] = "/path/to/miniconda3/lib:/path/to/sw/lib:/path/to/sw/lib"

# initiate runtime environment for raytune
import pyarrow  # must occur prior to ray import
import ray
from ray import tune
from ray.tune import ExperimentAnalysis
from ray.tune.suggest.hyperopt import HyperOptSearch
ray.shutdown()  # engage new ray session
runtime_env = {"conda": "base",
               "env_vars": {"LD_LIBRARY_PATH": "/path/to/miniconda3/lib:/path/to/sw/lib:/path/to/sw/lib"}}
ray.init(runtime_env=runtime_env)

def initialize_ray_with_check(ip_address):
    """
    Initialize Ray with a specified IP address and check its status and accessibility.

    Args:
    - ip_address (str): The IP address (with port) to initialize Ray.

    Returns:
    - bool: True if initialization was successful and dashboard is accessible, False otherwise.
    """
    try:
        ray.init(address=ip_address)
        print(ray.nodes())

        services = ray.get_webui_url()
        if not services:
            raise RuntimeError("Ray dashboard is not accessible.")
        else:
            print(f"Ray dashboard is accessible at: {services}")
            return True
    except Exception as e:
        print(f"Error initializing Ray: {e}")
        return False

# Usage:
ip = 'your_ip:xxxx'  # Replace with your actual IP address and port
if initialize_ray_with_check(ip):
    print("Ray initialized successfully.")
else:
    print("Error during Ray initialization.")

import datetime
import numpy as np
import pandas as pd
import random
import seaborn as sns; sns.set()
from collections import Counter
from datasets import load_from_disk
from scipy.stats import ranksums
from sklearn.metrics import accuracy_score
from transformers import BertForSequenceClassification
from transformers import Trainer
from transformers.training_args import TrainingArguments

from geneformer import DataCollatorForCellClassification

# number of CPU cores
num_proc=30

# load train dataset with columns:
# cell_type (annotation of each cell's type)
# disease (healthy or disease state)
# individual (unique ID for each patient)
# length (length of that cell's rank value encoding)
train_dataset=load_from_disk("/path/to/disease_train_data.dataset")

# filter dataset for given cell_type
def if_cell_type(example):
    return example["cell_type"].startswith("Cardiomyocyte")

trainset_v2 = train_dataset.filter(if_cell_type, num_proc=num_proc)

# create dictionary of disease states : label ids
target_names = ["healthy", "disease1", "disease2"]
target_name_id_dict = dict(zip(target_names,[i for i in range(len(target_names))]))

trainset_v3 = trainset_v2.rename_column("disease","label")

# change labels to numerical ids
def classes_to_ids(example):
    example["label"] = target_name_id_dict[example["label"]]
    return example

trainset_v4 = trainset_v3.map(classes_to_ids, num_proc=num_proc)

# separate into train, validation, test sets
indiv_set = set(trainset_v4["individual"])
random.seed(42)
train_indiv = random.sample(indiv_set,round(0.7*len(indiv_set)))
eval_indiv = [indiv for indiv in indiv_set if indiv not in train_indiv]
valid_indiv = random.sample(eval_indiv,round(0.5*len(eval_indiv)))
test_indiv = [indiv for indiv in eval_indiv if indiv not in valid_indiv]

def if_train(example):
    return example["individual"] in train_indiv

classifier_trainset = trainset_v4.filter(if_train,num_proc=num_proc).shuffle(seed=42)

def if_valid(example):
    return example["individual"] in valid_indiv

classifier_validset = trainset_v4.filter(if_valid,num_proc=num_proc).shuffle(seed=42)

# define output directory path
current_date = datetime.datetime.now()
datestamp = f"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}"
output_dir = f"/path/to/models/{datestamp}_geneformer_DiseaseClassifier/"

# ensure not overwriting previously saved model
saved_model_test = os.path.join(output_dir, f"pytorch_model.bin")
if os.path.isfile(saved_model_test) == True:
    raise Exception("Model already saved to this directory.")

# make output directory
subprocess.call(f'mkdir {output_dir}', shell=True)

# set training parameters
# how many pretrained layers to freeze
freeze_layers = 2
# batch size for training and eval
geneformer_batch_size = 12
# number of epochs
epochs = 1
# logging steps
logging_steps = round(len(classifier_trainset)/geneformer_batch_size/10)

# define function to initiate model
def model_init():
    model = BertForSequenceClassification.from_pretrained("/path/to/pretrained_model/",
                                                          num_labels=len(target_names),
                                                          output_attentions = False,
                                                          output_hidden_states = False)
    if freeze_layers is not None:
        modules_to_freeze = model.bert.encoder.layer[:freeze_layers]
        for module in modules_to_freeze:
            for param in module.parameters():
                param.requires_grad = False

    model = model.to("cuda:0")
    return model

# define metrics
# note: macro f1 score recommended for imbalanced multiclass classifiers
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # calculate accuracy using sklearn's function
    acc = accuracy_score(labels, preds)
    return {
      'accuracy': acc,
    }

# set training arguments
training_args = {
    "do_train": True,
    "do_eval": True,
    "evaluation_strategy": "steps",
    "eval_steps": logging_steps,
    "logging_steps": logging_steps,
    "group_by_length": True,
    "length_column_name": "length",
    "disable_tqdm": True,
    "skip_memory_metrics": True,  # memory tracker causes errors in raytune
    "per_device_train_batch_size": geneformer_batch_size,
    "per_device_eval_batch_size": geneformer_batch_size,
    "num_train_epochs": epochs,
    "load_best_model_at_end": True,
    "output_dir": output_dir,
}

training_args_init = TrainingArguments(**training_args)

# create the trainer
trainer = Trainer(
    model_init=model_init,
    args=training_args_init,
    data_collator=DataCollatorForCellClassification(),
    train_dataset=classifier_trainset,
    eval_dataset=classifier_validset,
    compute_metrics=compute_metrics,
)

# specify raytune hyperparameter search space
ray_config = {
    "num_train_epochs": tune.choice([epochs]),
    "learning_rate": tune.loguniform(1e-6, 1e-3),
    "weight_decay": tune.uniform(0.0, 0.3),
    "lr_scheduler_type": tune.choice(["linear","cosine","polynomial"]),
    "warmup_steps": tune.uniform(100, 2000),
    "seed": tune.uniform(0,100),
    "per_device_train_batch_size": tune.choice([geneformer_batch_size])
}

hyperopt_search = HyperOptSearch(
    metric="eval_accuracy", mode="max")

# optimize hyperparameters
trainer.hyperparameter_search(
    direction="maximize",
    backend="ray",
    resources_per_trial={"cpu":8,"gpu":1},
    hp_space=lambda _: ray_config,
    search_alg=hyperopt_search,
    n_trials=100,  # number of trials
    progress_reporter=tune.CLIReporter(max_report_frequency=600,
                                       sort_by_metric=True,
                                       max_progress_rows=100,
                                       mode="max",
                                       metric="eval_accuracy",
                                       metric_columns=["loss", "eval_loss", "eval_accuracy"])
)
geneformer/classifier.py  CHANGED

@@ -82,11 +82,12 @@ class Classifier:
         "training_args": {None, dict},
         "freeze_layers": {int},
         "num_crossval_splits": {0, 1, 5},
-        "
+        "split_sizes": {None, dict},
         "no_eval": {bool},
         "stratify_splits_col": {None, str},
         "forward_batch_size": {int},
         "nproc": {int},
+        "ngpu": {int},
     }

     def __init__(

@@ -99,13 +100,15 @@ class Classifier:
         max_ncells=None,
         max_ncells_per_class=None,
         training_args=None,
+        ray_config=None,
         freeze_layers=0,
         num_crossval_splits=1,
-
+        split_sizes={"train": 0.8, "valid": 0.1, "test": 0.1},
         stratify_splits_col=None,
         no_eval=False,
         forward_batch_size=100,
         nproc=4,
+        ngpu=1,
     ):
         """
         Initialize Geneformer classifier.

@@ -152,15 +155,18 @@ class Classifier:
             | Otherwise, will use the Hugging Face defaults:
             | https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
             | Note: Hyperparameter tuning is highly recommended, rather than using defaults.
+        ray_config : None, dict
+            | Training argument ranges for tuning hyperparameters with Ray.
         freeze_layers : int
            | Number of layers to freeze from fine-tuning.
            | 0: no layers will be frozen; 2: first two layers will be frozen; etc.
         num_crossval_splits : {0, 1, 5}
            | 0: train on all data without splitting
-           | 1: split data into train and eval sets by designated
-           | 5: split data into 5 folds of train and eval sets by designated
-
-
+           | 1: split data into train and eval sets by designated split_sizes["valid"]
+           | 5: split data into 5 folds of train and eval sets by designated split_sizes["valid"]
+        split_sizes : None, dict
+            | Dictionary of proportion of data to hold out for train, validation, and test sets
+            | {"train": 0.8, "valid": 0.1, "test": 0.1} if intending 80/10/10 train/valid/test split
         stratify_splits_col : None, str
            | Name of column in .dataset to be used for stratified splitting.
            | Proportion of each class in this column will be the same in the splits as in the original dataset.

@@ -171,6 +177,8 @@ class Classifier:
            | Batch size for forward pass (for evaluation, not training).
         nproc : int
            | Number of CPU processes to use.
+        ngpu : int
+            | Number of GPUs available.

         """

@@ -182,13 +190,19 @@ class Classifier:
         self.max_ncells = max_ncells
         self.max_ncells_per_class = max_ncells_per_class
         self.training_args = training_args
+        self.ray_config = ray_config
         self.freeze_layers = freeze_layers
         self.num_crossval_splits = num_crossval_splits
-        self.
+        self.split_sizes = split_sizes
+        self.train_size = self.split_sizes["train"]
+        self.valid_size = self.split_sizes["valid"]
+        self.oos_test_size = self.split_sizes["test"]
+        self.eval_size = self.valid_size / (self.train_size + self.valid_size)
         self.stratify_splits_col = stratify_splits_col
         self.no_eval = no_eval
         self.forward_batch_size = forward_batch_size
         self.nproc = nproc
+        self.ngpu = ngpu

         if self.training_args is None:
             logger.warning(

@@ -301,6 +315,9 @@ class Classifier:
                 "Gene_class_dict should contain at least 2 gene classes to classify."
             )
             raise
+        if sum(self.split_sizes.values()) != 1:
+            logger.error("Train, validation, and test proportions should sum to 1.")
+            raise

     def prepare_data(
         self,

@@ -337,6 +354,7 @@ class Classifier:
         test_size : None, float
            | Proportion of data to be saved separately and held out for test set
            | (e.g. 0.2 if intending hold out 20%)
+           | If None, will inherit from split_sizes["test"] from Classifier
            | The training set will be further split to train / validation in self.validate
            | Note: only available for CellClassifiers
         attr_to_split : None, str

@@ -356,6 +374,9 @@ class Classifier:
            | E.g. if set to 0.1, will accept trial if p >= 0.1 for all attributes in attr_to_balance
         """

+        if test_size is None:
+            test_size = self.oos_test_size
+
         # prepare data and labels for classification
         data = pu.load_and_filter(self.filter_data, self.nproc, input_data_file)

@@ -555,6 +576,7 @@ class Classifier:
         save_eval_output=True,
         predict_eval=True,
         predict_trainer=False,
+        n_hyperopt_trials=0,
     ):
         """
         (Cross-)validate cell state or gene classifier.

@@ -604,6 +626,9 @@ class Classifier:
         predict_trainer : bool
            | Whether or not to save eval predictions from trainer
            | Saves as a pickle file of trainer predictions
+        n_hyperopt_trials : int
+           | Number of trials to run for hyperparameter optimization
+           | If 0, will not optimize hyperparameters
         """

         if self.num_crossval_splits == 0:

@@ -700,14 +725,30 @@ class Classifier:
                 ]
                 eval_data = data.select(eval_indices)
                 train_data = data.select(train_indices)
-                trainer = self.train_classifier(
-                    model_directory,
-                    num_classes,
-                    train_data,
-                    eval_data,
-                    ksplit_output_dir,
-                    predict_trainer,
-                )
+                if n_hyperopt_trials == 0:
+                    trainer = self.train_classifier(
+                        model_directory,
+                        num_classes,
+                        train_data,
+                        eval_data,
+                        ksplit_output_dir,
+                        predict_trainer,
+                    )
+                else:
+                    trainer = self.hyperopt_classifier(
+                        model_directory,
+                        num_classes,
+                        train_data,
+                        eval_data,
+                        ksplit_output_dir,
+                        n_trials=n_hyperopt_trials,
+                    )
+                    if iteration_num == self.num_crossval_splits:
+                        return
+                    else:
+                        iteration_num = iteration_num + 1
+                        continue
+
                 result = self.evaluate_model(
                     trainer.model,
                     num_classes,

@@ -752,14 +793,29 @@ class Classifier:
                    self.nproc,
                 )

-                trainer = self.train_classifier(
-                    model_directory,
-                    num_classes,
-                    train_data,
-                    eval_data,
-                    ksplit_output_dir,
-                    predict_trainer,
-                )
+                if n_hyperopt_trials == 0:
+                    trainer = self.train_classifier(
+                        model_directory,
+                        num_classes,
+                        train_data,
+                        eval_data,
+                        ksplit_output_dir,
+                        predict_trainer,
+                    )
+                else:
+                    trainer = self.hyperopt_classifier(
+                        model_directory,
+                        num_classes,
+                        train_data,
+                        eval_data,
+                        ksplit_output_dir,
+                        n_trials=n_hyperopt_trials,
+                    )
+                    if iteration_num == self.num_crossval_splits:
+                        return
+                    else:
+                        iteration_num = iteration_num + 1
+                        continue
                 result = self.evaluate_model(
                     trainer.model,
                     num_classes,

@@ -810,6 +866,162 @@ class Classifier:

         return all_metrics

+    def hyperopt_classifier(
+        self,
+        model_directory,
+        num_classes,
+        train_data,
+        eval_data,
+        output_directory,
+        n_trials=100,
+    ):
+        """
+        Fine-tune model for cell state or gene classification.
+
+        **Parameters**
+
+        model_directory : Path
+            | Path to directory containing model
+        num_classes : int
+            | Number of classes for classifier
+        train_data : Dataset
+            | Loaded training .dataset input
+            | For cell classifier, labels in column "label".
+            | For gene classifier, labels in column "labels".
+        eval_data : None, Dataset
+            | (Optional) Loaded evaluation .dataset input
+            | For cell classifier, labels in column "label".
+            | For gene classifier, labels in column "labels".
+        output_directory : Path
+            | Path to directory where fine-tuned model will be saved
+        n_trials : int
+            | Number of trials to run for hyperparameter optimization
+        """
+
+        # initiate runtime environment for raytune
+        import ray
+        from ray import tune
+        from ray.tune.search.hyperopt import HyperOptSearch
+
+        ray.shutdown()  # engage new ray session
+        ray.init()
+
+        ##### Validate and prepare data #####
+        train_data, eval_data = cu.validate_and_clean_cols(
+            train_data, eval_data, self.classifier
+        )
+
+        if (self.no_eval is True) and (eval_data is not None):
+            logger.warning(
+                "no_eval set to True; hyperparameter optimization requires eval, proceeding with eval"
+            )
+
+        # ensure not overwriting previously saved model
+        saved_model_test = os.path.join(output_directory, "pytorch_model.bin")
+        if os.path.isfile(saved_model_test) is True:
+            logger.error("Model already saved to this designated output directory.")
+            raise
+        # make output directory
+        subprocess.call(f"mkdir {output_directory}", shell=True)
+
+        ##### Load model and training args #####
+        if self.classifier == "cell":
+            model_type = "CellClassifier"
+        elif self.classifier == "gene":
+            model_type = "GeneClassifier"
+
+        model = pu.load_model(model_type, num_classes, model_directory, "train")
+        def_training_args, def_freeze_layers = cu.get_default_train_args(
+            model, self.classifier, train_data, output_directory
+        )
+        del model
+
+        if self.training_args is not None:
+            def_training_args.update(self.training_args)
+        logging_steps = round(
+            len(train_data) / def_training_args["per_device_train_batch_size"] / 10
+        )
+        def_training_args["logging_steps"] = logging_steps
+        def_training_args["output_dir"] = output_directory
+        if eval_data is None:
+            def_training_args["evaluation_strategy"] = "no"
+            def_training_args["load_best_model_at_end"] = False
+        training_args_init = TrainingArguments(**def_training_args)
+
+        ##### Fine-tune the model #####
+        # define the data collator
+        if self.classifier == "cell":
+            data_collator = DataCollatorForCellClassification()
+        elif self.classifier == "gene":
+            data_collator = DataCollatorForGeneClassification()
+
+        # define function to initiate model
+        def model_init():
+            model = pu.load_model(model_type, num_classes, model_directory, "train")
+
+            if self.freeze_layers is not None:
+                def_freeze_layers = self.freeze_layers
+
+            if def_freeze_layers > 0:
+                modules_to_freeze = model.bert.encoder.layer[:def_freeze_layers]
+                for module in modules_to_freeze:
+                    for param in module.parameters():
+                        param.requires_grad = False
+
+            model = model.to("cuda:0")
+            return model
+
+        # create the trainer
+        trainer = Trainer(
+            model_init=model_init,
+            args=training_args_init,
+            data_collator=data_collator,
+            train_dataset=train_data,
+            eval_dataset=eval_data,
+            compute_metrics=cu.compute_metrics,
+        )
+
+        # specify raytune hyperparameter search space
+        if self.ray_config is None:
+            logger.warning(
+                "No ray_config provided. Proceeding with default, but ranges may need adjustment depending on model."
+            )
+            def_ray_config = {
+                "num_train_epochs": tune.choice([1]),
+                "learning_rate": tune.loguniform(1e-6, 1e-3),
+                "weight_decay": tune.uniform(0.0, 0.3),
+                "lr_scheduler_type": tune.choice(["linear", "cosine", "polynomial"]),
+                "warmup_steps": tune.uniform(100, 2000),
+                "seed": tune.uniform(0, 100),
+                "per_device_train_batch_size": tune.choice(
+                    [def_training_args["per_device_train_batch_size"]]
+                ),
+            }
+
+        hyperopt_search = HyperOptSearch(metric="eval_macro_f1", mode="max")
+
+        # optimize hyperparameters
+        trainer.hyperparameter_search(
+            direction="maximize",
+            backend="ray",
+            resources_per_trial={"cpu": int(self.nproc / self.ngpu), "gpu": 1},
+            hp_space=lambda _: def_ray_config
+            if self.ray_config is None
+            else self.ray_config,
+            search_alg=hyperopt_search,
+            n_trials=n_trials,  # number of trials
+            progress_reporter=tune.CLIReporter(
+                max_report_frequency=600,
+                sort_by_metric=True,
+                max_progress_rows=n_trials,
+                mode="max",
+                metric="eval_macro_f1",
+                metric_columns=["loss", "eval_loss", "eval_accuracy", "eval_macro_f1"],
+            ),
+        )
+
+        return trainer
+
     def train_classifier(
         self,
         model_directory,
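A short sketch of how the new constructor options introduced in this file could be combined. Only ray_config, split_sizes, ngpu, nproc, freeze_layers, num_crossval_splits, and forward_batch_size appear in this diff; the remaining arguments and concrete values are assumptions for illustration.

# Hypothetical initialization sketch using the options added in this commit.
# The ray_config keys mirror the default search space defined in hyperopt_classifier above.
from ray import tune
from geneformer import Classifier

ray_config = {
    "num_train_epochs": tune.choice([1]),
    "learning_rate": tune.loguniform(1e-6, 1e-3),
    "weight_decay": tune.uniform(0.0, 0.3),
    "lr_scheduler_type": tune.choice(["linear", "cosine", "polynomial"]),
    "warmup_steps": tune.uniform(100, 2000),
    "seed": tune.uniform(0, 100),
    "per_device_train_batch_size": tune.choice([12]),  # placeholder batch size
}

cc = Classifier(
    classifier="cell",            # assumed constructor argument
    ray_config=ray_config,        # search space for Ray Tune (None falls back to the default above)
    split_sizes={"train": 0.8, "valid": 0.1, "test": 0.1},  # proportions must sum to 1
    freeze_layers=2,
    num_crossval_splits=1,
    forward_batch_size=100,
    nproc=16,
    ngpu=1,                       # used to size Ray trials: cpu per trial = nproc / ngpu
)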
geneformer/classifier_utils.py  CHANGED

@@ -360,9 +360,23 @@ def get_num_classes(id_class_dict):
 def compute_metrics(pred):
     labels = pred.label_ids
     preds = pred.predictions.argmax(-1)
+
     # calculate accuracy and macro f1 using sklearn's function
-    acc = accuracy_score(labels, preds)
-    macro_f1 = f1_score(labels, preds, average="macro")
+    if len(labels.shape) == 1:
+        acc = accuracy_score(labels, preds)
+        macro_f1 = f1_score(labels, preds, average="macro")
+    else:
+        flat_labels = labels.flatten().tolist()
+        flat_preds = preds.flatten().tolist()
+        logit_label_paired = [
+            item for item in list(zip(flat_preds, flat_labels)) if item[1] != -100
+        ]
+        y_pred = [item[0] for item in logit_label_paired]
+        y_true = [item[1] for item in logit_label_paired]
+
+        acc = accuracy_score(y_true, y_pred)
+        macro_f1 = f1_score(y_true, y_pred, average="macro")
+
     return {"accuracy": acc, "macro_f1": macro_f1}

@@ -387,6 +401,11 @@ def get_default_train_args(model, classifier, data, output_dir):
             "per_device_train_batch_size": batch_size,
             "per_device_eval_batch_size": batch_size,
         }
+    else:
+        default_training_args = {
+            "per_device_train_batch_size": batch_size,
+            "per_device_eval_batch_size": batch_size,
+        }

     training_args = {
         "num_train_epochs": epochs,
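To illustrate the two branches of the updated compute_metrics, a self-contained toy re-implementation that mirrors the diff above. The inputs are hypothetical; the real function lives in geneformer/classifier_utils.py and receives a transformers EvalPrediction.

# Toy check of the updated metric logic: 1-D labels (cell classifier) vs.
# 2-D per-token labels with -100 padding (gene classifier).
from collections import namedtuple
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

Pred = namedtuple("Pred", ["label_ids", "predictions"])  # stand-in for EvalPrediction

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    if len(labels.shape) == 1:  # cell classifier: one label per example
        acc = accuracy_score(labels, preds)
        macro_f1 = f1_score(labels, preds, average="macro")
    else:  # gene classifier: per-token labels, -100 marks padding to ignore
        paired = [p for p in zip(preds.flatten().tolist(), labels.flatten().tolist()) if p[1] != -100]
        y_pred = [p[0] for p in paired]
        y_true = [p[1] for p in paired]
        acc = accuracy_score(y_true, y_pred)
        macro_f1 = f1_score(y_true, y_pred, average="macro")
    return {"accuracy": acc, "macro_f1": macro_f1}

# cell-style: labels shape (n_cells,)
cell_pred = Pred(np.array([0, 1, 1]), np.array([[2.0, 0.1], [0.2, 1.5], [1.0, 0.3]]))
print(compute_metrics(cell_pred))   # accuracy and macro f1 of 0.67 here

# gene-style: labels shape (n_cells, seq_len); the -100 position is excluded
gene_pred = Pred(np.array([[0, 1, -100]]), np.array([[[2.0, 0.1], [0.2, 1.5], [0.9, 0.8]]]))
print(compute_metrics(gene_pred))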
requirements.txt  CHANGED

@@ -1,5 +1,6 @@
 anndata>=0.9
 datasets>=2.12
+hyperopt>=0.2
 loompy>=3.0
 matplotlib>=3.7
 numpy>=1.23