Spaces:

merle
/

PROTEIN_GENERATOR

Running on T4

PROTEIN_GENERATOR / utils /parsers_inference.py

Jacob Gershon

new b

59a9ccf over 1 year ago

4.93 kB

	import numpy as np
	import scipy
	import scipy.spatial
	import string
	import os,re
	import random
	import util


	# Lookup table: three-letter residue name -> one-letter amino acid code
	# for the 20 standard amino acids.
	to1letter = dict(zip(
	    (
	        "ALA", "ARG", "ASN", "ASP", "CYS",
	        "GLN", "GLU", "GLY", "HIS", "ILE",
	        "LEU", "LYS", "MET", "PHE", "PRO",
	        "SER", "THR", "TRP", "TYR", "VAL",
	    ),
	    "ARNDCQEGHILKMFPSTWYV",
	))


	def parse_a3m(filename):
	    '''Read an A3M alignment and convert it into integer arrays.

	    Uppercase letters and '-' are alignment columns; every other character
	    (lowercase letters in A3M) is counted as an insertion relative to the
	    first sequence. Lowercase letters are removed before encoding.

	    Args:
	        filename: path to the A3M file.

	    Returns:
	        dict with
	          "msa":        uint8 array [n_seq, n_col]; residues are encoded by
	                        their index in "ARNDCQEGHILKMFPSTWYV-", anything
	                        else becomes 20 (gap).
	          "labels":     list of record labels (first whitespace-delimited
	                        token of each header, without the leading '>').
	          "insertions": uint8 array [n_seq, len(first sequence)]; entry
	                        [s, p] is the number of inserted characters that
	                        precede aligned column p of sequence s.

	    Raises:
	        ValueError: if the file holds no records, or if sequence data
	            appears before the first '>' header.
	    '''

	    # read A3M file line by line (the context manager closes the handle)
	    lab, seq = [], []  # labels and raw (still A3M-formatted) sequences
	    with open(filename, "r") as handle:
	        for line in handle:
	            if line.startswith('>'):
	                lab.append(line.split()[0][1:])
	                seq.append("")
	            elif seq:
	                seq[-1] += line.rstrip()
	            elif line.strip():
	                raise ValueError(
	                    "%s: sequence data found before the first '>' header" % filename)

	    if not seq:
	        raise ValueError("%s: no sequences found" % filename)

	    # parse sequences
	    msa, ins = [], []
	    drop_lowercase = str.maketrans(dict.fromkeys(string.ascii_lowercase))
	    ncol = len(seq[0])

	    for seqi in seq:

	        # remove lowercase letters and append to MSA
	        msa.append(seqi.translate(drop_lowercase))

	        # 0 - match or gap; 1 - insertion
	        is_ins = np.array([0 if c.isupper() or c == '-' else 1 for c in seqi])
	        counts = np.zeros(ncol)

	        if np.sum(is_ins) > 0:
	            # positions of insertions in the raw sequence
	            pos = np.where(is_ins == 1)[0]

	            # shift by occurrence: every inserted character of one run maps
	            # onto the index of the aligned column that follows the run
	            shifted = pos - np.arange(pos.shape[0])

	            # position of insertions in the cleaned sequence
	            # and their length
	            pos, num = np.unique(shifted, return_counts=True)

	            # insertions after the last column have no slot and are dropped
	            keep = pos < ncol
	            counts[pos[keep]] = num[keep]

	        # append to the matrix of insertions
	        ins.append(counts)

	    # convert letters into numbers
	    # NOTE: rows are expected to have equal length once lowercase letters
	    # are removed; NumPy cannot build a rectangular array otherwise.
	    alphabet = np.array(list("ARNDCQEGHILKMFPSTWYV-"), dtype='|S1').view(np.uint8)
	    msa = np.array([list(s) for s in msa], dtype='|S1').view(np.uint8)
	    for code, letter in enumerate(alphabet):
	        msa[msa == letter] = code

	    # treat all unknown characters as gaps
	    msa[msa > 20] = 20

	    # NOTE: uint8 storage means insertion runs longer than 255 are not
	    # representable; kept because the returned dtype is part of the contract.
	    ins = np.array(ins, dtype=np.uint8)

	    return {"msa": msa, "labels": lab, "insertions": ins}


	def parse_pdb(filename, **kwargs):
	    '''Read a PDB file and parse it with parse_pdb_lines.

	    Args:
	        filename: path to the PDB file.
	        **kwargs: forwarded unchanged to parse_pdb_lines.

	    Returns:
	        Whatever parse_pdb_lines returns for the lines of the file.
	    '''
	    # the context manager closes the handle even if reading fails
	    with open(filename, 'r') as f_in:
	        lines = f_in.readlines()
	    return parse_pdb_lines(lines, **kwargs)

	def parse_pdb_lines(lines, parse_hetatom=False, ignore_het_h=True):
	    '''Parse PDB-format text lines into coordinate, mask and sequence arrays.

	    A residue is recognised by its CA "ATOM" record; each ATOM record is
	    then placed in the atom slot whose name matches in
	    util.aa2long[util.aa2num[residue name]].

	    Args:
	        lines: iterable of PDB text lines (iterated more than once, so pass
	            a list rather than a one-shot iterator).
	        parse_hetatom: if True, also collect HETATM records.
	        ignore_het_h: if True, skip HETATM records whose element column
	            (line[77]) is 'H'. Only used when parse_hetatom is True.

	    Returns:
	        dict with
	          'xyz':     float32 array [L, 27, 3], 0.0 where an atom is missing
	          'mask':    bool array [L, 27], True where the atom was present
	          'idx':     array [L] of residue numbers
	          'seq':     array [L] of residue codes from util.aa2num
	                     (20 for names not in util.aa2num)
	          'pdb_idx': list of (chain letter, residue number), length L
	        and, when parse_hetatom is True,
	          'xyz_het':  array of HETATM coordinates
	          'info_het': list of dicts (idx, atom_id, atom_type, name)
	        L counts unique (chain, residue number) pairs; for repeated pairs
	        the first CA record defines the row.
	    '''
	    # residues observed in the structure: one entry per CA atom record
	    ca_lines = [line for line in lines
	                if line[:4] == "ATOM" and line[12:16].strip() == "CA"]
	    seq = [util.aa2num[line[17:20]] if line[17:20] in util.aa2num else 20
	           for line in ca_lines]
	    pdb_idx = [(line[21:22].strip(), int(line[22:26].strip()))
	               for line in ca_lines]  # chain letter, res num

	    # (chain, res num) -> row of its first occurrence; replaces a linear
	    # list.index() scan per atom and also drives the de-duplication below
	    first_row = {}
	    for row, key in enumerate(pdb_idx):
	        first_row.setdefault(key, row)

	    # 27 atom slots per residue; slot order follows util.aa2long
	    xyz = np.full((len(pdb_idx), 27, 3), np.nan, dtype=np.float32)
	    for line in lines:
	        if line[:4] != "ATOM":
	            continue
	        # the chain ID must be stripped exactly as it was for pdb_idx,
	        # otherwise a blank chain column (' ') never matches ('')
	        chain = line[21:22].strip()
	        resNo = int(line[22:26])
	        atom = line[12:16].strip()
	        aa = line[17:20]
	        idx = first_row.get((chain, resNo))
	        if idx is None:
	            # residue has no CA record, so it has no row in xyz
	            continue
	        # NOTE(review): a residue name missing from util.aa2num presumably
	        # raises KeyError here although seq maps it to 20 - confirm that
	        # util.aa2long has an entry for 20 before relaxing this lookup.
	        for i_atm, tgtatm in enumerate(util.aa2long[util.aa2num[aa]]):
	            if tgtatm is not None and tgtatm.strip() == atom:  # ignore whitespace
	                xyz[idx, i_atm, :] = [float(line[30:38]), float(line[38:46]), float(line[46:54])]
	                break

	    # save atom mask, then zero the coordinates of missing atoms
	    mask = np.logical_not(np.isnan(xyz[..., 0]))
	    xyz[np.logical_not(mask)] = 0.0

	    # remove duplicated (chain, resi), keeping the first occurrence
	    # (dicts preserve insertion order)
	    i_unique = list(first_row.values())
	    pdb_idx = list(first_row.keys())
	    xyz = xyz[i_unique]
	    mask = mask[i_unique]
	    seq = np.array(seq)[i_unique]

	    out = {'xyz': xyz,  # cartesian coordinates, [L, 27, 3]
	           'mask': mask,  # mask showing which atoms are present in the PDB file, [L, 27]
	           'idx': np.array([i[1] for i in pdb_idx]),  # residue numbers in the PDB file, [L]
	           'seq': np.array(seq),  # amino acid sequence, [L]
	           'pdb_idx': pdb_idx,  # list of (chain letter, residue number) in the pdb file, [L]
	           }

	    # heteroatoms (ligands, etc)
	    if parse_hetatom:
	        xyz_het, info_het = [], []
	        for line in lines:
	            if line[:6] != 'HETATM':
	                continue
	            # slicing yields '' on short lines instead of raising IndexError
	            # NOTE(review): this is only the last character of the
	            # two-column element field, so two-letter elements are truncated.
	            element = line[77:78]
	            if ignore_het_h and element == 'H':
	                continue
	            info_het.append(dict(
	                idx=int(line[6:11]),  # atom serial, PDB columns 7-11
	                atom_id=line[12:16],
	                atom_type=element,
	                name=line[16:20]  # alt-loc column followed by residue name
	            ))
	            xyz_het.append([float(line[30:38]), float(line[38:46]), float(line[46:54])])

	        out['xyz_het'] = np.array(xyz_het)
	        out['info_het'] = info_het

	    return out

	def parse_fasta(filename):
	    '''
	    Return dict of name: seq

	    name is the full header line without the leading '>'; seq is the
	    concatenation of all sequence lines up to the next header, so
	    sequences wrapped over several lines are supported. Blank lines are
	    skipped, and lines before the first header are ignored. If a name
	    occurs more than once, the last record wins.
	    '''
	    out = {}
	    name = None
	    chunks = []
	    with open(filename, 'r') as f_in:
	        for raw in f_in:
	            line = raw.strip()
	            if not line:
	                continue
	            if line.startswith('>'):
	                # a new header closes the previous record
	                if name is not None:
	                    out[name] = ''.join(chunks)
	                name = line[1:]
	                chunks = []
	            elif name is not None:
	                chunks.append(line)

	    # flush the final record (it may have an empty sequence)
	    if name is not None:
	        out[name] = ''.join(chunks)

	    return out