Spaces:
Running
on
T4
Running
on
T4
File size: 4,933 Bytes
59a9ccf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import numpy as np
import scipy
import scipy.spatial
import string
import os,re
import random
import util
to1letter = {
"ALA":'A', "ARG":'R', "ASN":'N', "ASP":'D', "CYS":'C',
"GLN":'Q', "GLU":'E', "GLY":'G', "HIS":'H', "ILE":'I',
"LEU":'L', "LYS":'K', "MET":'M', "PHE":'F', "PRO":'P',
"SER":'S', "THR":'T', "TRP":'W', "TYR":'Y', "VAL":'V' }
def parse_a3m(filename):
'''read A3M and convert letters into integers in the 0..20 range,
also keep track of insertions
'''
# read A3M file line by line
lab,seq = [],[] # labels and sequences
for line in open(filename, "r"):
if line[0] == '>':
lab.append(line.split()[0][1:])
seq.append("")
else:
seq[-1] += line.rstrip()
# parse sequences
msa,ins = [],[]
table = str.maketrans(dict.fromkeys(string.ascii_lowercase))
nrow,ncol = len(seq),len(seq[0])
for seqi in seq:
# remove lowercase letters and append to MSA
msa.append(seqi.translate(table))
# 0 - match or gap; 1 - insertion
a = np.array([0 if c.isupper() or c=='-' else 1 for c in seqi])
i = np.zeros((ncol))
if np.sum(a) > 0:
# positions of insertions
pos = np.where(a==1)[0]
# shift by occurrence
a = pos - np.arange(pos.shape[0])
# position of insertions in the cleaned sequence
# and their length
pos,num = np.unique(a, return_counts=True)
[pos[pos<ncol]] = num[pos<ncol]
# append to the matrix of insetions
ins.append(i)
# convert letters into numbers
alphabet = np.array(list("ARNDCQEGHILKMFPSTWYV-"), dtype='|S1').view(np.uint8)
msa = np.array([list(s) for s in msa], dtype='|S1').view(np.uint8)
for i in range(alphabet.shape[0]):
msa[msa == alphabet[i]] = i
# treat all unknown characters as gaps
msa[msa > 20] = 20
ins = np.array(ins, dtype=np.uint8)
return {"msa":msa, "labels":lab, "insertions":ins}
def parse_pdb(filename, **kwargs):
'''extract xyz coords for all heavy atoms'''
lines = open(filename,'r').readlines()
return parse_pdb_lines(lines, **kwargs)
def parse_pdb_lines(lines, parse_hetatom=False, ignore_het_h=True):
# indices of residues observed in the structure
res = [(l[22:26],l[17:20]) for l in lines if l[:4]=="ATOM" and l[12:16].strip()=="CA"]
seq = [util.aa2num[r[1]] if r[1] in util.aa2num.keys() else 20 for r in res]
pdb_idx = [( l[21:22].strip(), int(l[22:26].strip()) ) for l in lines if l[:4]=="ATOM" and l[12:16].strip()=="CA"] # chain letter, res num
# 4 BB + up to 10 SC atoms
xyz = np.full((len(res), 27, 3), np.nan, dtype=np.float32)
for l in lines:
if l[:4] != "ATOM":
continue
chain, resNo, atom, aa = l[21:22], int(l[22:26]), ' '+l[12:16].strip().ljust(3), l[17:20]
idx = pdb_idx.index((chain,resNo))
for i_atm, tgtatm in enumerate(util.aa2long[util.aa2num[aa]]):
if tgtatm is not None and tgtatm.strip() == atom.strip(): # ignore whitespace
xyz[idx,i_atm,:] = [float(l[30:38]), float(l[38:46]), float(l[46:54])]
break
# save atom mask
mask = np.logical_not(np.isnan(xyz[...,0]))
xyz[np.isnan(xyz[...,0])] = 0.0
# remove duplicated (chain, resi)
new_idx = []
i_unique = []
for i,idx in enumerate(pdb_idx):
if idx not in new_idx:
new_idx.append(idx)
i_unique.append(i)
pdb_idx = new_idx
xyz = xyz[i_unique]
mask = mask[i_unique]
seq = np.array(seq)[i_unique]
out = {'xyz':xyz, # cartesian coordinates, [Lx14]
'mask':mask, # mask showing which atoms are present in the PDB file, [Lx14]
'idx':np.array([i[1] for i in pdb_idx]), # residue numbers in the PDB file, [L]
'seq':np.array(seq), # amino acid sequence, [L]
'pdb_idx': pdb_idx, # list of (chain letter, residue number) in the pdb file, [L]
}
# heteroatoms (ligands, etc)
if parse_hetatom:
xyz_het, info_het = [], []
for l in lines:
if l[:6]=='HETATM' and not (ignore_het_h and l[77]=='H'):
info_het.append(dict(
idx=int(l[7:11]),
atom_id=l[12:16],
atom_type=l[77],
name=l[16:20]
))
xyz_het.append([float(l[30:38]), float(l[38:46]), float(l[46:54])])
out['xyz_het'] = np.array(xyz_het)
out['info_het'] = info_het
return out
def parse_fasta(filename):
'''
Return dict of name: seq
'''
out = {}
with open(filename, 'r') as f_in:
while True:
name = f_in.readline().strip()[1:]
seq = f_in.readline().strip()
if not name: break
out[name] = seq
return out
|