# Spaces: mschuh / BarlowDTI (Running on Zero)
# File size: 6,356 Bytes
# 84bfd88

import sys
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import threading
from concurrent.futures import ProcessPoolExecutor, as_completed, TimeoutError
import time
import requests
import joblib
# from bio_embeddings.embed import SeqVecEmbedder, ProtTransBertBFDEmbedder, ProtTransT5XLU50Embedder
from Bio import SeqIO
import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import torch
from typing import *
from rdkit import RDLogger
RDLogger.DisableLog("rdApp.*")

from xgboost import XGBClassifier, DMatrix

from model.barlow_twins import BarlowTwins

# sys.path.append("../utils/")
from utils.sequence import uniprot2sequence, encode_sequences



class DTIModel:
    """Drug-target interaction predictor.

    Drugs (SMILES strings) are encoded as Morgan fingerprints and targets
    (amino-acid sequences) via ``encode_sequences``. The two embeddings are
    passed through ``BarlowTwins.zero_shot`` and the result is scored by an
    ``XGBClassifier``.

    Attributes:
        bt_model: The loaded ``BarlowTwins`` model.
        gbm_model: The loaded ``XGBClassifier``.
        encoder: Encoder name forwarded to ``encode_sequences``.
        smiles_cache: Maps ``(smiles, radius, bits, features)`` to the
            fingerprint array computed for exactly those settings.
        sequence_cache: Maps a sequence string to the value returned by
            ``encode_sequences`` for it.
    """

    # Column order of the DataFrames returned by the FASTA helpers. Passing it
    # explicitly keeps the schema stable even when no record was predicted.
    _FASTA_COLUMNS = ["drug", "target", "name", "description", "prediction"]

    def __init__(self, bt_model_path: str, gbm_model_path: str, encoder: str = "prost_t5"):
        """Load both models and start with empty embedding caches.

        Args:
            bt_model_path: Path handed to ``BarlowTwins.load_model``.
            gbm_model_path: Path handed to ``XGBClassifier.load_model``.
            encoder: Name of the sequence encoder passed to ``encode_sequences``.
        """
        self.bt_model = BarlowTwins()
        self.bt_model.load_model(bt_model_path)

        self.gbm_model = XGBClassifier()
        self.gbm_model.load_model(gbm_model_path)

        self.encoder = encoder

        self.smiles_cache = {}
        self.sequence_cache = {}

    def _encode_smiles(self, smiles: str, radius: int = 2, bits: int = 1024, features: bool = False):
        """Return the Morgan fingerprint of ``smiles`` as a NumPy array.

        Args:
            smiles: SMILES string, or ``None``.
            radius: Morgan fingerprint radius.
            bits: Length of the bit vector.
            features: Forwarded as ``useFeatures`` to RDKit.

        Returns:
            The fingerprint array, or ``None`` when ``smiles`` is ``None`` or
            encoding failed (the failure is printed, not raised).
        """
        if smiles is None:
            return None

        # The fingerprint depends on every parameter, so all of them are part
        # of the key; keying on the SMILES alone would hand back a fingerprint
        # computed with different settings.
        cache_key = (smiles, radius, bits, features)
        cached = self.smiles_cache.get(cache_key)
        if cached is not None:
            return cached

        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # RDKit signals an unparsable SMILES by returning None.
                raise ValueError("RDKit could not parse the SMILES string")
            fingerprint = AllChem.GetMorganFingerprintAsBitVect(
                mol,
                radius=radius,
                nBits=bits,
                useFeatures=features,
            )
            morgan = np.array(fingerprint)
        except Exception as e:
            # Best effort: report the failure and let the caller decide.
            print(f"Failed to encode SMILES: {smiles}")
            print(e)
            return None

        self.smiles_cache[cache_key] = morgan
        return morgan

    def _encode_smiles_mult(self, smiles: List[str], radius: int = 2, bits: int = 1024, features: bool = False):
        """Encode several SMILES strings and stack the results with ``np.array``.

        Entries that fail to encode are ``None`` in the intermediate list, so
        the stacked array is only a regular numeric array when every SMILES
        could be encoded.
        """
        morgan = [self._encode_smiles(s, radius, bits, features) for s in smiles]
        return np.array(morgan)

    def _encode_sequence(self, sequence: str):
        """Return the embedding of ``sequence`` produced by ``encode_sequences``.

        Args:
            sequence: Amino-acid sequence, or ``None``.

        Returns:
            Whatever ``encode_sequences([sequence], encoder=self.encoder)``
            returns, or ``None`` when ``sequence`` is ``None`` or encoding
            failed (the failure is printed, not raised).
        """
        if sequence is None:
            return None
        if sequence in self.sequence_cache:
            return self.sequence_cache[sequence]

        # Release cached GPU memory only when the encoder is about to run;
        # cache hits and None inputs do not need it.
        torch.cuda.empty_cache()
        try:
            encoded_sequence = encode_sequences([sequence], encoder=self.encoder)
        except Exception as e:
            print(f"Failed to encode sequence: {sequence}")
            print(e)
            return None

        self.sequence_cache[sequence] = encoded_sequence
        return encoded_sequence

    def _encode_sequence_mult(self, sequences: List[str]):
        """Encode several sequences and stack the results with ``np.array``."""
        seq = [self._encode_sequence(sequence) for sequence in sequences]
        return np.array(seq)

    def __predict_pair(self, drug_emb: np.ndarray, target_emb: np.ndarray, pred_leaf: bool = False):
        """Score drug/target embeddings with the BarlowTwins + XGBoost stack.

        Args:
            drug_emb: Drug fingerprint(s); a single 1-D fingerprint is treated
                as one row.
            target_emb: Target embedding(s); a 1-D embedding is treated as one
                row.
            pred_leaf: When true, return the booster's leaf predictions instead
                of class-1 probabilities.

        Returns:
            ``predict_proba(...)[:, 1]`` of the classifier, or the booster's
            ``predict(..., pred_leaf=True)`` output.

        Raises:
            ValueError: If either embedding is ``None`` (encoding failed).
        """
        if drug_emb is None or target_emb is None:
            raise ValueError("Cannot predict: the drug or target could not be encoded")

        # Normalise to (rows, features). Without this a single 1-D fingerprint
        # of length 1024 is mistaken for 1024 rows and the target is tiled
        # 1024 times.
        drug_emb = np.atleast_2d(drug_emb)
        target_emb = np.atleast_2d(target_emb)

        # Repeat the shorter side so both inputs have the same number of rows.
        # NOTE(review): this is only meaningful when the shorter side has a
        # single row, which is the case for every caller in this class.
        n_drugs, n_targets = len(drug_emb), len(target_emb)
        if n_drugs < n_targets:
            drug_emb = np.tile(drug_emb, (n_targets, 1))
        elif n_drugs > n_targets:
            target_emb = np.tile(target_emb, (n_drugs, 1))
        emb = self.bt_model.zero_shot(drug_emb, target_emb)

        if pred_leaf:
            d_emb = DMatrix(emb)
            return self.gbm_model.get_booster().predict(d_emb, pred_leaf=True)
        return self.gbm_model.predict_proba(emb)[:, 1]

    def predict(self, drug: Union[List[str], str], target: str, pred_leaf: bool = False):
        """Predict the interaction of one or more drugs with a single target.

        Args:
            drug: One SMILES string or a list of SMILES strings.
            target: Amino-acid sequence of the target.
            pred_leaf: Return leaf predictions instead of probabilities.

        Returns:
            One prediction per drug (see ``__predict_pair``).

        Raises:
            ValueError: If the single drug or the target could not be encoded.
        """
        if isinstance(drug, str):
            drug_emb = self._encode_smiles(drug)
        else:
            drug_emb = self._encode_smiles_mult(drug)
        target_emb = self._encode_sequence(target)
        return self.__predict_pair(drug_emb, target_emb, pred_leaf)

    def get_leaf_weights(self):
        """Return the booster's feature scores with ``importance_type="weight"``."""
        return self.gbm_model.get_booster().get_score(importance_type="weight")

    def _predict_record(self, drug: str, drug_emb: np.ndarray, target) -> Optional[dict]:
        """Predict one FASTA record and return its result row.

        Args:
            drug: SMILES string, stored in the row as-is.
            drug_emb: Pre-computed fingerprint of ``drug``.
            target: A record yielded by ``SeqIO.parse``.

        Returns:
            The result row, or ``None`` when the record's sequence could not be
            encoded (a message is printed).
        """
        target_emb = self._encode_sequence(str(target.seq))
        if target_emb is None:
            print(f"Skipping target {target.id}: sequence could not be encoded")
            return None
        pred = self.__predict_pair(drug_emb, target_emb, pred_leaf=False)
        return {
            "drug": drug,
            "target": target.id,
            "name": target.name,
            "description": target.description,
            "prediction": pred[0],
        }

    def _encode_fasta_drug(self, drug: str) -> np.ndarray:
        """Encode ``drug`` for the FASTA helpers, raising if that is impossible."""
        drug_emb = self._encode_smiles(drug)
        if drug_emb is None:
            raise ValueError(f"Failed to encode SMILES: {drug}")
        return drug_emb

    def _predict_fasta(self, drug: str, fasta_path: str):
        """Predict ``drug`` against every record of a FASTA file, without timeouts.

        Args:
            drug: SMILES string of the drug.
            fasta_path: Path of the FASTA file to read.

        Returns:
            A DataFrame with the columns drug, target, name, description and
            prediction; records whose sequence cannot be encoded are skipped.

        Raises:
            ValueError: If ``drug`` cannot be encoded.
        """
        drug_emb = self._encode_fasta_drug(drug)

        results = []
        for target in tqdm(SeqIO.parse(fasta_path, "fasta"), desc="Predicting targets"):
            record = self._predict_record(drug, drug_emb, target)
            if record is not None:
                results.append(record)
        return pd.DataFrame(results, columns=self._FASTA_COLUMNS)

    def predict_fasta(self, drug: str, fasta_path: str, timeout_seconds: int = 120):
        """Predict ``drug`` against every record of a FASTA file with a per-record timeout.

        Each record is processed in its own thread. A record that takes longer
        than ``timeout_seconds`` or fails is skipped with a printed message.

        Args:
            drug: SMILES string of the drug.
            fasta_path: Path of the FASTA file to read.
            timeout_seconds: Maximum time to wait for a single record.

        Returns:
            A DataFrame with the columns drug, target, name, description and
            prediction, one row per successfully predicted record.

        Raises:
            ValueError: If ``drug`` cannot be encoded.
        """
        drug_emb = self._encode_fasta_drug(drug)

        def process_target(target, sink):
            # Runs in a worker thread: report failures here, otherwise they
            # would only surface through the default thread excepthook.
            try:
                record = self._predict_record(drug, drug_emb, target)
            except Exception as e:
                print(f"Skipping target {target.id} due to error: {e}")
                return
            if record is not None:
                sink.append(record)

        results = []

        # First pass only counts the records so the progress bar has a total.
        total_records = sum(1 for _ in SeqIO.parse(fasta_path, "fasta"))

        for target in tqdm(SeqIO.parse(fasta_path, "fasta"), total=total_records, desc="Predicting targets"):
            # Each worker gets its own list, so a worker that finishes after
            # its timeout cannot add a late row to the final result.
            thread_results = []
            thread = threading.Thread(
                target=process_target,
                args=(target, thread_results),
                daemon=True,  # a stuck worker must not block interpreter exit
            )
            thread.start()
            thread.join(timeout_seconds)
            if thread.is_alive():
                # Python threads cannot be killed; the worker keeps running in
                # the background and its result is discarded.
                print(f"Skipping target {target.id} due to timeout")
                continue
            results.extend(thread_results)

        return pd.DataFrame(results, columns=self._FASTA_COLUMNS)

    def predict_uniprot(self, drug: Union[List[str], str], uniprot_id: str, pred_leaf: bool = False):
        """Predict against the sequence that ``uniprot2sequence`` returns for ``uniprot_id``.

        Args:
            drug: One SMILES string or a list of SMILES strings.
            uniprot_id: Identifier passed to ``uniprot2sequence``.
            pred_leaf: Return leaf predictions instead of probabilities.

        Returns:
            The result of ``predict`` for the looked-up sequence.
        """
        return self.predict(drug, uniprot2sequence(uniprot_id), pred_leaf)