Spaces:
Running
on
T4
Running
on
T4
import numpy as np | |
import scipy | |
import scipy.spatial | |
import string | |
import os,re | |
import random | |
import util | |
to1letter = { | |
"ALA":'A', "ARG":'R', "ASN":'N', "ASP":'D', "CYS":'C', | |
"GLN":'Q', "GLU":'E', "GLY":'G', "HIS":'H', "ILE":'I', | |
"LEU":'L', "LYS":'K', "MET":'M', "PHE":'F', "PRO":'P', | |
"SER":'S', "THR":'T', "TRP":'W', "TYR":'Y', "VAL":'V' } | |
def parse_a3m(filename): | |
'''read A3M and convert letters into integers in the 0..20 range, | |
also keep track of insertions | |
''' | |
# read A3M file line by line | |
lab,seq = [],[] # labels and sequences | |
for line in open(filename, "r"): | |
if line[0] == '>': | |
lab.append(line.split()[0][1:]) | |
seq.append("") | |
else: | |
seq[-1] += line.rstrip() | |
# parse sequences | |
msa,ins = [],[] | |
table = str.maketrans(dict.fromkeys(string.ascii_lowercase)) | |
nrow,ncol = len(seq),len(seq[0]) | |
for seqi in seq: | |
# remove lowercase letters and append to MSA | |
msa.append(seqi.translate(table)) | |
# 0 - match or gap; 1 - insertion | |
a = np.array([0 if c.isupper() or c=='-' else 1 for c in seqi]) | |
i = np.zeros((ncol)) | |
if np.sum(a) > 0: | |
# positions of insertions | |
pos = np.where(a==1)[0] | |
# shift by occurrence | |
a = pos - np.arange(pos.shape[0]) | |
# position of insertions in the cleaned sequence | |
# and their length | |
pos,num = np.unique(a, return_counts=True) | |
[pos[pos<ncol]] = num[pos<ncol] | |
# append to the matrix of insetions | |
ins.append(i) | |
# convert letters into numbers | |
alphabet = np.array(list("ARNDCQEGHILKMFPSTWYV-"), dtype='|S1').view(np.uint8) | |
msa = np.array([list(s) for s in msa], dtype='|S1').view(np.uint8) | |
for i in range(alphabet.shape[0]): | |
msa[msa == alphabet[i]] = i | |
# treat all unknown characters as gaps | |
msa[msa > 20] = 20 | |
ins = np.array(ins, dtype=np.uint8) | |
return {"msa":msa, "labels":lab, "insertions":ins} | |
def parse_pdb(filename, **kwargs): | |
'''extract xyz coords for all heavy atoms''' | |
lines = open(filename,'r').readlines() | |
return parse_pdb_lines(lines, **kwargs) | |
def parse_pdb_lines(lines, parse_hetatom=False, ignore_het_h=True): | |
# indices of residues observed in the structure | |
res = [(l[22:26],l[17:20]) for l in lines if l[:4]=="ATOM" and l[12:16].strip()=="CA"] | |
seq = [util.aa2num[r[1]] if r[1] in util.aa2num.keys() else 20 for r in res] | |
pdb_idx = [( l[21:22].strip(), int(l[22:26].strip()) ) for l in lines if l[:4]=="ATOM" and l[12:16].strip()=="CA"] # chain letter, res num | |
# 4 BB + up to 10 SC atoms | |
xyz = np.full((len(res), 27, 3), np.nan, dtype=np.float32) | |
for l in lines: | |
if l[:4] != "ATOM": | |
continue | |
chain, resNo, atom, aa = l[21:22], int(l[22:26]), ' '+l[12:16].strip().ljust(3), l[17:20] | |
idx = pdb_idx.index((chain,resNo)) | |
for i_atm, tgtatm in enumerate(util.aa2long[util.aa2num[aa]]): | |
if tgtatm is not None and tgtatm.strip() == atom.strip(): # ignore whitespace | |
xyz[idx,i_atm,:] = [float(l[30:38]), float(l[38:46]), float(l[46:54])] | |
break | |
# save atom mask | |
mask = np.logical_not(np.isnan(xyz[...,0])) | |
xyz[np.isnan(xyz[...,0])] = 0.0 | |
# remove duplicated (chain, resi) | |
new_idx = [] | |
i_unique = [] | |
for i,idx in enumerate(pdb_idx): | |
if idx not in new_idx: | |
new_idx.append(idx) | |
i_unique.append(i) | |
pdb_idx = new_idx | |
xyz = xyz[i_unique] | |
mask = mask[i_unique] | |
seq = np.array(seq)[i_unique] | |
out = {'xyz':xyz, # cartesian coordinates, [Lx14] | |
'mask':mask, # mask showing which atoms are present in the PDB file, [Lx14] | |
'idx':np.array([i[1] for i in pdb_idx]), # residue numbers in the PDB file, [L] | |
'seq':np.array(seq), # amino acid sequence, [L] | |
'pdb_idx': pdb_idx, # list of (chain letter, residue number) in the pdb file, [L] | |
} | |
# heteroatoms (ligands, etc) | |
if parse_hetatom: | |
xyz_het, info_het = [], [] | |
for l in lines: | |
if l[:6]=='HETATM' and not (ignore_het_h and l[77]=='H'): | |
info_het.append(dict( | |
idx=int(l[7:11]), | |
atom_id=l[12:16], | |
atom_type=l[77], | |
name=l[16:20] | |
)) | |
xyz_het.append([float(l[30:38]), float(l[38:46]), float(l[46:54])]) | |
out['xyz_het'] = np.array(xyz_het) | |
out['info_het'] = info_het | |
return out | |
def parse_fasta(filename): | |
''' | |
Return dict of name: seq | |
''' | |
out = {} | |
with open(filename, 'r') as f_in: | |
while True: | |
name = f_in.readline().strip()[1:] | |
seq = f_in.readline().strip() | |
if not name: break | |
out[name] = seq | |
return out | |