PascalNotin committed on
Commit
2650437
1 Parent(s): e750e94

Improved app layout

Browse files
Files changed (2) hide show
  1. README.md +1 -1
  2. app.py +148 -49
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Tranception Design
3
- emoji: 🐨
4
  colorFrom: blue
5
  colorTo: gray
6
  sdk: gradio
 
1
  ---
2
  title: Tranception Design
3
+ emoji: 🧬
4
  colorFrom: blue
5
  colorTo: gray
6
  sdk: gradio
app.py CHANGED
@@ -38,45 +38,71 @@ def create_all_single_mutants(sequence,AA_vocab=AA_vocab,mutation_range_start=No
38
  all_single_mutants.columns = ['mutant','mutated_sequence']
39
  return all_single_mutants
40
 
41
- def create_scoring_matrix_visual(scores,sequence,AA_vocab=AA_vocab,mutation_range_start=None,mutation_range_end=None):
42
- piv=scores.pivot(index='position',columns='target_AA',values='avg_score').transpose().round(4)
43
- fig, ax = plt.subplots(figsize=(len(sequence)*1.2,20))
44
  scores_dict = {}
45
  valid_mutant_set=set(scores.mutant)
46
  if mutation_range_start is None: mutation_range_start=1
47
  if mutation_range_end is None: mutation_range_end=len(sequence)
48
- for target_AA in list(AA_vocab):
 
 
49
  for position in range(mutation_range_start,mutation_range_end+1):
50
- mutant = sequence[position-1]+str(position)+target_AA
51
- if mutant in valid_mutant_set:
52
- scores_dict[mutant]= float(scores.loc[scores.mutant==mutant,'avg_score'])
53
- else:
54
- scores_dict[mutant]=0.0
55
- labels = (np.asarray(["{} \n {:.4f}".format(symb,value) for symb, value in scores_dict.items() ])).reshape(len(AA_vocab),mutation_range_end-mutation_range_start+1)
56
- heat = sns.heatmap(piv,annot=labels,fmt="",cmap='RdYlGn',linewidths=0.30,vmin=np.percentile(scores.avg_score,2),vmax=np.percentile(scores.avg_score,98),\
57
- cbar_kws={'label': 'Log likelihood ratio (mutant / starting sequence)'})
58
- heat.figure.axes[-1].yaxis.label.set_size(20)
59
- #heat.set_title("Fitness scores for all single amino acid substitutions",fontsize=30)
60
- heat.set_title("Higher predicted scores (green) imply higher protein fitness",fontsize=30, pad=40)
61
- heat.set_xlabel("Sequence position", fontsize = 20)
62
- heat.set_ylabel("Amino Acid mutation", fontsize = 20)
 
 
 
 
 
 
 
 
 
63
  plt.savefig('fitness_scoring_substitution_matrix.png')
64
- return plt
 
65
 
66
  def suggest_mutations(scores):
67
  intro_message = "The following mutations may be sensible options to improve fitness: \n\n"
68
  #Best mutants
69
  top_mutants=list(scores.sort_values(by=['avg_score'],ascending=False).head(5).mutant)
70
- mutant_recos = "The 5 single mutants with highest predicted fitness are:\n {} \n\n".format(", ".join(top_mutants))
 
 
71
  #Best positions
72
  positive_scores = scores[scores.avg_score > 0]
73
  positive_scores_position_avg = positive_scores.groupby(['position']).mean()
74
  top_positions=list(positive_scores_position_avg.sort_values(by=['avg_score'],ascending=False).head(5).index.astype(str))
75
  print(top_positions)
76
- position_recos = "The 5 positions with the highest average fitness increase are:\n {}".format(", ".join(top_positions))
77
  return intro_message+mutant_recos+position_recos
78
 
 
 
 
 
 
 
 
 
 
 
 
79
  def get_mutated_protein(sequence,mutant):
 
80
  mutated_sequence = list(sequence)
81
  mutated_sequence[int(mutant[1:-1])-1]=mutant[-1]
82
  return ''.join(mutated_sequence)
@@ -101,40 +127,113 @@ def score_and_create_matrix_all_singles(sequence,mutation_range_start=None,mutat
101
  scores["position"]=scores["mutant"].map(lambda x: int(x[1:-1]))
102
  scores["target_AA"] = scores["mutant"].map(lambda x: x[-1])
103
  score_heatmap = create_scoring_matrix_visual(scores,sequence,AA_vocab,mutation_range_start,mutation_range_end)
104
- return score_heatmap,suggest_mutations(scores)
 
 
 
 
 
 
 
 
 
 
105
 
106
  #######################################################################################################################################
107
  ############################################### GRADIO INTERFACE ####################################################################
108
  #######################################################################################################################################
109
 
110
- title = "Interactive in silico directed evolution with Tranception"
111
- description = "Perform in silico directed evolution with Tranception to iteratively improve the fitness of a starting protein sequence, one mutation at a time. At each step, the Tranception model computes the log likelihood ratios of all possible single amino acid substitution Vs the starting sequence, and outputs a fitness heatmap and recommandations to guide the selection of the mutation to apply. Note: The current version does not leverage retrieval of homologs at inference time to increase fitness prediction performance."
112
- article = "<p style='text-align: left'><b>Tranception: Protein Fitness Prediction with Autoregressive Transformers and Inference-time Retrieval</b></p>"
113
- article += "<p style='text-align: left'> Pascal Notin, Mafalda Dias, Jonathan Frazer, Javier Marchena-Hurtado, Aidan N. Gomez, Debora S. Marks<sup>*</sup>, Yarin Gal<sup>*</sup>"
114
- article += "<p style='text-align: left'> <a href='https://proceedings.mlr.press/v162/notin22a.html' target='_blank'>Paper</a> - <a href='https://github.com/OATML-Markslab/Tranception' target='_blank'>Code</a> </p>"
115
- examples=[
116
- ['ADRB2_HUMAN --> MGQPGNGSAFLLAPNGSHAPDHDVTQERDEVWVVGMGIVMSLIVLAIVFGNVLVITAIAKFERLQTVTNYFITSLACADLVMGLAVVPFGAAHILMKMWTFGNFWCEFWTSIDVLCVTASIETLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSGLTSFLPIQMHWYRATHQEAINCYANETCCDFFTNQAYAIASSIVSFYVPLVIMVFVYSRVFQEAKRQLQKIDKSEGRFHVQNLSQVEQDGRTGHGLRRSSKFCLKEHKALKTLGIIMGTFTLCWLPFFIVNIVHVIQDNLIRKEVYILLNWIGYVNSGFNPLIYCRSPDFRIAFQELLCLRRSSLKAYGNGYSSNGNTGEQSGYHVEQEKENKLLCEDLPGTEDFVGHQGTVPSDNIDSQGRNCSTNDSLL', 1, 10, "Small", True],
117
- ['IF1_ECOLI --> MAKEDNIEMQGTVLETLPNTMFRVELENGHVVTAHISGKMRKNYIRILTGDKVTVELTPYDLSKGRIVFRSR', 1, None, "Medium", False],
118
- ['P53_HUMAN --> MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPRVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD', 5, 10, "Large", False]
119
- ]
 
 
 
 
 
 
 
120
 
121
- model_size_selection = gr.Radio(label="Tranception model size (larger models are more accurate but are slower at inference)", choices=["Small","Medium","Large"], value="Small")
122
- protein_sequence_input = gr.Textbox(lines=1, label="Input protein sequence (default = RL40A_YEAST)",value="MQIFVKTLTGKTITLEVESSDTIDNVKSKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGGIIEPSLKALASKYNCDKSVCRKCYARLPPRATNCRKRKCGHTNQLRPKKKLK")
123
- mutation_range_start = gr.Number(label="Start of mutation range (min value = 1)",value=1,precision=0)
124
- mutation_range_end = gr.Number(label="End of mutation range (leave empty for full lenth)",value=10,precision=0)
125
- scoring_mirror = gr.Checkbox(label="Score protein from both directions (leads to more robust fitness predictions, but doubles inference time)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
- #output ==> find a way to make scroallable
128
- output_plot = gr.Plot(label="Fitness scores for all single amino acid substitutions in mutation range")
129
- output_recommendations = gr.Textbox(label="Mutation recommendations")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
- gr.Interface(
132
- fn=score_and_create_matrix_all_singles,
133
- inputs=[protein_sequence_input,mutation_range_start,mutation_range_end,model_size_selection,scoring_mirror],
134
- outputs=[output_plot,output_recommendations],
135
- title=title,
136
- description=description,
137
- article=article,
138
- ##examples=examples,
139
- allow_flagging="never"
140
- ).launch(debug=True)
 
38
  all_single_mutants.columns = ['mutant','mutated_sequence']
39
  return all_single_mutants
40
 
41
def create_scoring_matrix_visual(scores,sequence,AA_vocab=AA_vocab,mutation_range_start=None,mutation_range_end=None,annotate=True,fontsize=20):
    """Draw the substitution-score heatmap and save it as a PNG.

    Rows of the heatmap are sequence positions, columns are target amino
    acids, and cell colours are the `avg_score` values of `scores`.

    Args:
        scores: DataFrame with at least the columns `mutant`, `position`,
            `target_AA` and `avg_score`.
        sequence: Starting protein sequence (1-indexed by `position`).
        AA_vocab: Iterable of target amino-acid letters; its order must match
            the column order of the pivoted score matrix for the annotations
            to line up.
        mutation_range_start: First position of the window (defaults to 1).
        mutation_range_end: Last position of the window, inclusive (defaults
            to `len(sequence)`).
        annotate: When True, write "<mutant> \\n <score>" inside every cell.
        fontsize: Base font size; titles and axis labels are scaled from it.

    Returns:
        The path of the saved image, 'fitness_scoring_substitution_matrix.png'.
    """
    output_path = 'fitness_scoring_substitution_matrix.png'
    if mutation_range_start is None:
        mutation_range_start = 1
    if mutation_range_end is None:
        mutation_range_end = len(sequence)
    positions = range(mutation_range_start, mutation_range_end + 1)

    piv = scores.pivot(index='position', columns='target_AA', values='avg_score').round(4)
    # NOTE(review): the height scales with the full sequence length, not with
    # the size of the mutation window -- confirm this is intended.
    fig, ax = plt.subplots(figsize=(50, len(sequence) * 0.6))
    try:
        ax.tick_params(bottom=True, top=True, left=True, right=True)
        ax.tick_params(labelbottom=True, labeltop=True, labelleft=True, labelright=True)

        heatmap_kwargs = dict(
            cmap='RdYlGn',
            linewidths=0.30,
            ax=ax,
            # Clip the colour scale to the 2nd-98th percentiles of the scores.
            vmin=np.percentile(scores.avg_score, 2),
            vmax=np.percentile(scores.avg_score, 98),
            cbar_kws={'label': 'Log likelihood ratio (mutant / starting sequence)'},
            annot_kws={"size": fontsize},
        )
        if annotate:
            # One dict lookup per cell instead of a full DataFrame scan per cell.
            score_by_mutant = dict(zip(scores.mutant, scores.avg_score))
            cell_labels = []
            for position in positions:
                for target_AA in AA_vocab:
                    mutant = sequence[position - 1] + str(position) + target_AA
                    # Mutants that were not scored are labelled 0.0.
                    value = float(score_by_mutant.get(mutant, 0.0))
                    cell_labels.append("{} \n {:.4f}".format(mutant, value))
            labels = np.asarray(cell_labels).reshape(len(positions), len(AA_vocab))
            heat = sns.heatmap(piv, annot=labels, fmt="", **heatmap_kwargs)
        else:
            heat = sns.heatmap(piv, **heatmap_kwargs)

        # The last axes of the figure is the colour bar.
        colorbar_axis = heat.figure.axes[-1].yaxis
        colorbar_axis.label.set_size(int(fontsize * 1.5))
        colorbar_axis.set_ticklabels(colorbar_axis.get_ticklabels(), fontsize=fontsize)

        heat.set_title("Higher predicted scores (green) imply higher protein fitness", fontsize=fontsize*2, pad=40)
        heat.set_ylabel("Sequence position", fontsize=fontsize*2)
        heat.set_xlabel("Amino Acid mutation", fontsize=fontsize*2)
        # Row labels show both the position and the starting residue.
        yticklabels = [str(pos) + ' (' + sequence[pos - 1] + ')' for pos in positions]
        heat.set_yticklabels(yticklabels)
        heat.set_xticklabels(heat.get_xmajorticklabels(), fontsize=fontsize)
        heat.set_yticklabels(heat.get_ymajorticklabels(), fontsize=fontsize, rotation=0)

        fig.tight_layout()
        fig.savefig(output_path)
        plt.show()
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
    return output_path
77
 
def suggest_mutations(scores):
    """Build the text summary of recommended mutations.

    Args:
        scores: DataFrame with at least the columns `mutant`, `avg_score`
            and `position`.

    Returns:
        A string listing the (up to) 5 highest-scoring single mutants with
        their scores, followed by the (up to) 5 positions with the highest
        mean score computed over strictly positive scores only.
    """
    intro_message = "The following mutations may be sensible options to improve fitness: \n\n"

    # Best mutants: sort once and read both columns from the same rows.
    top_rows = scores.sort_values(by=['avg_score'], ascending=False).head(5)
    top_mutants_recos = [
        top_mutant + " (" + str(round(top_mutant_fitness, 4)) + ")"
        for top_mutant, top_mutant_fitness in zip(top_rows.mutant, top_rows.avg_score)
    ]
    mutant_recos = "The single mutants with highest predicted fitness are (positive scores indicate fitness increase Vs starting sequence, negative scores indicate fitness decrease):\n {} \n\n".format(", ".join(top_mutants_recos))

    # Best positions: average only the score column, so the string columns of
    # `scores` are never fed to mean().
    positive_scores = scores[scores.avg_score > 0]
    position_avg = positive_scores.groupby('position')['avg_score'].mean()
    top_positions = list(position_avg.sort_values(ascending=False).head(5).index.astype(str))
    position_recos = "The positions with the highest average fitness increase are (only positions with at least one fitness increase are considered):\n {}".format(", ".join(top_positions))

    return intro_message + mutant_recos + position_recos
92
 
93
def check_valid_mutant(sequence,mutant,AA_vocab=AA_vocab):
    """Return True when `mutant` is a valid single substitution of `sequence`.

    A mutant is written as <from_AA><position><to_AA>, e.g. "M1A", with
    positions indexed from 1. It is valid when it parses, the position lies
    inside the sequence, the residue at that position equals `from_AA`, and
    `to_AA` belongs to `AA_vocab`.

    Args:
        sequence: Starting protein sequence.
        mutant: Mutation triplet to check.
        AA_vocab: Container of allowed target amino-acid letters.

    Returns:
        True if the mutant is valid, False otherwise (never raises for a
        malformed `mutant`).
    """
    try:
        from_AA, position, to_AA = mutant[0], int(mutant[1:-1]), mutant[-1]
    except (IndexError, TypeError, ValueError):
        # Empty / non-string input, or a non-integer position.
        return False
    # Check the range before indexing: an out-of-range position must not raise,
    # and position <= 0 must not wrap around through negative indexing.
    if position < 1 or position > len(sequence):
        return False
    return sequence[position - 1] == from_AA and to_AA in AA_vocab
103
+
def get_mutated_protein(sequence,mutant):
    """Apply a single substitution to `sequence` and return the new sequence.

    Args:
        sequence: Starting protein sequence.
        mutant: Mutation triplet <from_AA><position><to_AA> (e.g. "M1A"),
            with positions indexed from 1.

    Returns:
        The sequence with the residue at `position` replaced by `to_AA`.

    Raises:
        AssertionError: If `check_valid_mutant` rejects the mutant.
    """
    # Raised explicitly (rather than via `assert`) so the validation still runs
    # under `python -O`; the exception type is kept for existing callers.
    if not check_valid_mutant(sequence,mutant):
        raise AssertionError("The mutant is not valid")
    position = int(mutant[1:-1])
    mutated_sequence = list(sequence)
    mutated_sequence[position - 1] = mutant[-1]
    return ''.join(mutated_sequence)
 
127
  scores["position"]=scores["mutant"].map(lambda x: int(x[1:-1]))
128
  scores["target_AA"] = scores["mutant"].map(lambda x: x[-1])
129
  score_heatmap = create_scoring_matrix_visual(scores,sequence,AA_vocab,mutation_range_start,mutation_range_end)
130
+ return [score_heatmap],suggest_mutations(scores)
131
+
132
def extract_sequence(example):
    """Return the sequence from a 3-item example.

    `example` is unpacked as (label, taxon, sequence); only the last item is
    returned. Unpacking fails if `example` does not hold exactly three items.
    """
    _label, _taxon, protein_sequence = example
    return protein_sequence
135
+
136
def clear_inputs(protein_sequence_input,mutation_range_start,mutation_range_end):
    """Return the reset values for the three input fields.

    The incoming values are ignored: the sequence is reset to an empty string
    and both mutation-range bounds are reset to None.
    """
    return "", None, None
141
 
142
  #######################################################################################################################################
143
  ############################################### GRADIO INTERFACE ####################################################################
144
  #######################################################################################################################################
145
 
146
+ tranception_design = gr.Blocks()
147
+
148
+ with tranception_design:
149
+ gr.Markdown("# Interactive in silico directed evolution with Tranception")
150
# Introductory description displayed below the page title.
gr.Markdown(" Perform in silico directed evolution with Tranception to iteratively improve the fitness of a protein of interest, one mutation at a time. At each step, the Tranception model computes the log likelihood ratios of all possible single amino acid substitutions Vs the starting sequence, and outputs a fitness heatmap and recommendations to guide the selection of the mutation to apply.")
151
+
152
+ with gr.Tabs():
153
+ with gr.TabItem("Input"):
154
+ with gr.Row():
155
+ protein_sequence_input = gr.Textbox(lines=1,
156
+ label="Protein sequence",
157
+ placeholder = "Input the sequence of amino acids representing the starting protein of interest or select one from the list of examples below. You may enter the full sequence or just a subdomain (providing full context typically leads to better results, but is slower at inference)"
158
+ )
159
+
160
+ with gr.Row():
161
+ mutation_range_start = gr.Number(label="Start of mutation window (first position indexed at 1)",value=1,precision=0)
162
# Inclusive end of the mutation window; pre-filled with 10.
mutation_range_end = gr.Number(label="End of mutation window (leave empty for full length)",value=10,precision=0)
163
 
164
+ with gr.TabItem("Parameters"):
165
+ with gr.Row():
166
+ model_size_selection = gr.Radio(label="Tranception model size (larger models are more accurate but are slower at inference)",
167
+ choices=["Small","Medium","Large"],
168
+ value="Small")
169
+ with gr.Row():
170
+ scoring_mirror = gr.Checkbox(label="Score protein from both directions (leads to more robust fitness predictions, but doubles inference time)")
171
+ with gr.Row():
172
+ gr.Markdown("Note: the current version does not leverage retrieval of homologs at inference time to increase fitness prediction performance.")
173
+ with gr.Row():
174
+ clear_button = gr.Button(value="Clear",variant="secondary")
175
+ run_button = gr.Button(value="Predict fitness",variant="primary")
176
+ protein_ID = gr.Textbox(label="Uniprot ID", visible=False)
177
+ taxon = gr.Textbox(label="Taxon", visible=False)
178
+ examples = gr.Examples(
179
+ inputs=[protein_ID, taxon, protein_sequence_input],
180
+ outputs=[protein_sequence_input],
181
+ fn=extract_sequence,
182
+ examples=[
183
+ ['ADRB2_HUMAN' ,'Human', 'MGQPGNGSAFLLAPNGSHAPDHDVTQERDEVWVVGMGIVMSLIVLAIVFGNVLVITAIAKFERLQTVTNYFITSLACADLVMGLAVVPFGAAHILMKMWTFGNFWCEFWTSIDVLCVTASIETLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSGLTSFLPIQMHWYRATHQEAINCYANETCCDFFTNQAYAIASSIVSFYVPLVIMVFVYSRVFQEAKRQLQKIDKSEGRFHVQNLSQVEQDGRTGHGLRRSSKFCLKEHKALKTLGIIMGTFTLCWLPFFIVNIVHVIQDNLIRKEVYILLNWIGYVNSGFNPLIYCRSPDFRIAFQELLCLRRSSLKAYGNGYSSNGNTGEQSGYHVEQEKENKLLCEDLPGTEDFVGHQGTVPSDNIDSQGRNCSTNDSLL'],
184
+ ['IF1_ECOLI' ,'Prokaryote', 'MAKEDNIEMQGTVLETLPNTMFRVELENGHVVTAHISGKMRKNYIRILTGDKVTVELTPYDLSKGRIVFRSR'],
185
+ ['P53_HUMAN' ,'Human', 'MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPRVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD'],
186
+ ['BLAT_ECOLX' ,'Prokaryote', 'MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW'],
187
+ ['BRCA1_HUMAN' ,'Human', 'MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKFCMLKLLNQKKGPSQCPLCKNDITKRSLQESTRFSQLVEELLKIICAFQLDTGLEYANSYNFAKKENNSPEHLKDEVSIIQSMGYRNRAKRLLQSEPENPSLQETSLSVQLSNLGTVRTLRTKQRIQPQKTSVYIELGSDSSEDTVNKATYCSVGDQELLQITPQGTRDEISLDSAKKAACEFSETDVTNTEHHQPSNNDLNTTEKRAAERHPEKYQGSSVSNLHVEPCGTNTHASSLQHENSSLLLTKDRMNVEKAEFCNKSKQPGLARSQHNRWAGSKETCNDRRTPSTEKKVDLNADPLCERKEWNKQKLPCSENPRDTEDVPWITLNSSIQKVNEWFSRSDELLGSDDSHDGESESNAKVADVLDVLNEVDEYSGSSEKIDLLASDPHEALICKSERVHSKSVESNIEDKIFGKTYRKKASLPNLSHVTENLIIGAFVTEPQIIQERPLTNKLKRKRRPTSGLHPEDFIKKADLAVQKTPEMINQGTNQTEQNGQVMNITNSGHENKTKGDSIQNEKNPNPIESLEKESAFKTKAEPISSSISNMELELNIHNSKAPKKNRLRRKSSTRHIHALELVVSRNLSPPNCTELQIDSCSSSEEIKKKKYNQMPVRHSRNLQLMEGKEPATGAKKSNKPNEQTSKRHDSDTFPELKLTNAPGSFTKCSNTSELKEFVNPSLPREEKEEKLETVKVSNNAEDPKDLMLSGERVLQTERSVESSSISLVPGTDYGTQESISLLEVSTLGKAKTEPNKCVSQCAAFENPKGLIHGCSKDNRNDTEGFKYPLGHEVNHSRETSIEMEESELDAQYLQNTFKVSKRQSFAPFSNPGNAEEECATFSAHSGSLKKQSPKVTFECEQKEENQGKNESNIKPVQTVNITAGFPVVGQKDKPVDNAKCSIKGGSRFCLSSQFRGNETGLITPNKHGLLQNPYRIPPLFPIKSFVKTKCKKNLLEENFEEHSMSPEREMGNENIPSTVSTISRNNIRENVFKEASSSNINEVGSSTNEVGSSINEIGSSDENIQAELGRNRGPKLNAMLRLGVLQPEVYKQSLPGSNCKHPEIKKQEYEEVVQTVNTDFSPYLISDNLEQPMGSSHASQVCSETPDDLLDDGEIKEDTSFAENDIKESSAVFSKSVQKGELSRSPSPFTHTHLAQGYRRGAKKLESSEENLSSEDEELPCFQHLLFGKVNNIPSQSTRHSTVATECLSKNTEENLLSLKNSLNDCSNQVILAKASQEHHLSEETKCSASLFSSQCSELEDLTANTNTQDPFLIGSSKQMRHQSESQGVGLSDKELVSDDEERGTGLEENNQEEQSMDSNLGEAASGCESETSVSEDCSGLSSQSDILTTQQRDTMQHNLIKLQQEMAELEAVLEQHGSQPSNSYPSIISDSSALEDLRNPEQSTSEKAVLTSQKSSEYPISQNPEGLSADKFEVSADSSTSKNKEPGVERSSPSKCPSLDDRWYMHSCSGSLQNRNYPSQEELIKVVDVEEQQLEESGPHDLTETSYLPRQDLEGTPYLESGISLFSDDPESDPSEDRAPESARVGNIPSSTSALKVPQLKVAESAQSPAAAHTTDTAGYNAMEESVSREKPELTASTERVNKRMSMVVSGLTPEEFMLVYKFARKHHITLTNLITEETTHVVMKTDAEFVCERTLKYFLGIAGGKWVVSYFWVTQSIKERKMLNEHDFEVRGDVVNGRNHQGPKRARESQDRKIFRGLEICCYGPFTNMPTDQLEWMVQLCGASVVKELSSFTLGTGVHPIVVVQPDAWTEDNGFHAIGQMCEAPVVTREWVLDSVALYQCQELDTYLIPQIPHSHY'],
188
+ ['CALM1_HUMAN' ,'Human', 'MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK'],
189
+ ['CCDB_ECOLI' ,'Prokaryote', 'MQFKVYTYKRESRYRLFVDVQSDIIDTPGRRMVIPLASARLLSDKVSRELYPVVHIGDESWRMMTTDMASVPVSVIGEEVADLSHRENDIKNAINLMFWGI'],
190
+ ['GFP_AEQVI' ,'Other eukaryote', 'MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK'],
191
+ ['GRB2_HUMAN' ,'Human', 'MEAIAKYDFKATADDELSFKRGDILKVLNEECDQNWYKAELNGKDGFIPKNYIEMKPHPWFFGKIPRAKAEEMLSKQRHDGAFLIRESESAPGDFSLSVKFGNDVQHFKVLRDGAGKYFLWVVKFNSLNELVDYHRSTSVSRNQQIFLRDIEQVPQQPTYVQALFDFDPQEDGELGFRRGDFIHVMDNSDPNWWKGACHGQTGMFPRNYVTPVNRNV'],
192
+ ['HSP82_YEAST' ,'Eukaryote ', 'MASETFEFQAEITQLMSLIINTVYSNKEIFLRELISNASDALDKIRYKSLSDPKQLETEPDLFIRITPKPEQKVLEIRDSGIGMTKAELINNLGTIAKSGTKAFMEALSAGADVSMIGQFGVGFYSLFLVADRVQVISKSNDDEQYIWESNAGGSFTVTLDEVNERIGRGTILRLFLKDDQLEYLEEKRIKEVIKRHSEFVAYPIQLVVTKEVEKEVPIPEEEKKDEEKKDEEKKDEDDKKPKLEEVDEEEEKKPKTKKVKEEVQEIEELNKTKPLWTRNPSDITQEEYNAFYKSISNDWEDPLYVKHFSVEGQLEFRAILFIPKRAPFDLFESKKKKNNIKLYVRRVFITDEAEDLIPEWLSFVKGVVDSEDLPLNLSREMLQQNKIMKVIRKNIVKKLIEAFNEIAEDSEQFEKFYSAFSKNIKLGVHEDTQNRAALAKLLRYNSTKSVDELTSLTDYVTRMPEHQKNIYYITGESLKAVEKSPFLDALKAKNFEVLFLTDPIDEYAFTQLKEFEGKTLVDITKDFELEETDEEKAEREKEIKEYEPLTKALKEILGDQVEKVVVSYKLLDAPAAIRTGQFGWSANMERIMKAQALRDSSMSSYMSSKKTFEISPKSPIIKELKKRVDEGGAQDKTVKDLTKLLYETALLTSGFSLDEPTSFASRINRLISLGLNIDEDEETETAPEASTAAPVEEVPADTEMEEVD'],
193
+ ['IF1_ECOLI' ,'Prokaryote', 'MAKEDNIEMQGTVLETLPNTMFRVELENGHVVTAHISGKMRKNYIRILTGDKVTVELTPYDLSKGRIVFRSR'],
194
+ ['KCNH2_HUMAN' ,'Human', 'MPVRRGHVAPQNTFLDTIIRKFEGQSRKFIIANARVENCAVIYCNDGFCELCGYSRAEVMQRPCTCDFLHGPRTQRRAAAQIAQALLGAEERKVEIAFYRKDGSCFLCLVDVVPVKNEDGAVIMFILNFEVVMEKDMVGSPAHDTNHRGPPTSWLAPGRAKTFRLKLPALLALTARESSVRSGGAGGAGAPGAVVVDVDLTPAAPSSESLALDEVTAMDNHVAGLGPAEERRALVGPGSPPRSAPGQLPSPRAHSLNPDASGSSCSLARTRSRESCASVRRASSADDIEAMRAGVLPPPPRHASTGAMHPLRSGLLNSTSDSDLVRYRTISKIPQITLNFVDLKGDPFLASPTSDREIIAPKIKERTHNVTEKVTQVLSLGADVLPEYKLQAPRIHRWTILHYSPFKAVWDWLILLLVIYTAVFTPYSAAFLLKETEEGPPATECGYACQPLAVVDLIVDIMFIVDILINFRTTYVNANEEVVSHPGRIAVHYFKGWFLIDMVAAIPFDLLIFGSGSEELIGLLKTARLLRLVRVARKLDRYSEYGAAVLFLLMCTFALIAHWLACIWYAIGNMEQPHMDSRIGWLHNLGDQIGKPYNSSGLGGPSIKDKYVTALYFTFSSLTSVGFGNVSPNTNSEKIFSICVMLIGSLMYASIFGNVSAIIQRLYSGTARYHTQMLRVREFIRFHQIPNPLRQRLEEYFQHAWSYTNGIDMNAVLKGFPECLQADICLHLNRSLLQHCKPFRGATKGCLRALAMKFKTTHAPPGDTLVHAGDLLTALYFISRGSIEILRGDVVVAILGKNDIFGEPLNLYARPGKSNGDVRALTYCDLHKIHRDDLLEVLDMYPEFSDHFWSSLEITFNLRDTNMIPGSPGSTELEGGFSRQRKRKLSFRRRTDKDTEQPGEVSALGPGRAGAGPSSRGRPGGPWGESPSSGPSSPESSEDEGPGRSSSPLRLVPFSSPRPPGEPPGGEPLMEDCEKSSDTCNPLSGAFSGVSNIFSFWGDSRGRQYQELPRCPAPTPSLLNIPLSSPGRRPRGDVESRLDALQRQLNRLETRLSADMATVLQLLQRQMTLVPPAYSAVTTPGPGPTSTSPLLPVSPLPTLTLDSLSQVSQFMACEELPPGAPELPQEGPTRRLSLPGQLGALTSQPLHRHGSDPGS'],
195
+ ['KKA2_KLEPN' ,'Prokaryote', 'MIEQDGLHAGSPAAWVERLFGYDWAQQTIGCSDAAVFRLSAQGRPVLFVKTDLSGALNELQDEAARLSWLATTGVPCAAVLDVVTEAGRDWLLLGEVPGQDLLSSHLAPAEKVSIMADAMRRLHTLDPATCPFDHQAKHRIERARTRMEAGLVDQDDLDEEHQGLAPAELFARLKARMPDGEDLVVTHGDACLPNIMVENGRFSGFIDCGRLGVADRYQDIALATRDIAEELGGEWADRFLVLYGIAAPDSQRIAFYRLLDEFF'],
196
+ ['MSH2_HUMAN' ,'Human', 'MAVQPKETLQLESAAEVGFVRFFQGMPEKPTTTVRLFDRGDFYTAHGEDALLAAREVFKTQGVIKYMGPAGAKNLQSVVLSKMNFESFVKDLLLVRQYRVEVYKNRAGNKASKENDWYLAYKASPGNLSQFEDILFGNNDMSASIGVVGVKMSAVDGQRQVGVGYVDSIQRKLGLCEFPDNDQFSNLEALLIQIGPKECVLPGGETAGDMGKLRQIIQRGGILITERKKADFSTKDIYQDLNRLLKGKKGEQMNSAVLPEMENQVAVSSLSAVIKFLELLSDDSNFGQFELTTFDFSQYMKLDIAAVRALNLFQGSVEDTTGSQSLAALLNKCKTPQGQRLVNQWIKQPLMDKNRIEERLNLVEAFVEDAELRQTLQEDLLRRFPDLNRLAKKFQRQAANLQDCYRLYQGINQLPNVIQALEKHEGKHQKLLLAVFVTPLTDLRSDFSKFQEMIETTLDMDQVENHEFLVKPSFDPNLSELREIMNDLEKKMQSTLISAARDLGLDPGKQIKLDSSAQFGYYFRVTCKEEKVLRNNKNFSTVDIQKNGVKFTNSKLTSLNEEYTKNKTEYEEAQDAIVKEIVNISSGYVEPMQTLNDVLAQLDAVVSFAHVSNGAPVPYVRPAILEKGQGRIILKASRHACVEVQDEIAFIPNDVYFEKDKQMFHIITGPNMGGKSTYIRQTGVIVLMAQIGCFVPCESAEVSIVDCILARVGAGDSQLKGVSTFMAEMLETASILRSATKDSLIIIDELGRGTSTYDGFGLAWAISEYIATKIGAFCMFATHFHELTALANQIPTVNNLHVTALTTEETLTMLYQVKKGVCDQSFGIHVAELANFPKHVIECAKQKALELEEFQYIGESQGYDIMEPAAKKCYLEREQGEKIIQEFLSKVKQMPFTEMSEENITIKLKQLKAEVIAKNNSFVNEIISRIKVTT'],
197
+ ['PABP_YEAST' ,'Other eukaryote', 'MADITDKTAEQLENLNIQDDQKQAATGSESQSVENSSASLYVGDLEPSVSEAHLYDIFSPIGSVSSIRVCRDAITKTSLGYAYVNFNDHEAGRKAIEQLNYTPIKGRLCRIMWSQRDPSLRKKGSGNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDENGKSKGFGFVHFEEEGAAKEAIDALNGMLLNGQEIYVAPHLSRKERDSQLEETKAHYTNLYVKNINSETTDEQFQELFAKFGPIVSASLEKDADGKLKGFGFVNYEKHEDAVKAVEALNDSELNGEKLYVGRAQKKNERMHVLKKQYEAYRLEKMAKYQGVNLFVKNLDDSVDDEKLEEEFAPYGTITSAKVMRTENGKSKGFGFVCFSTPEEATKAITEKNQQIVAGKPLYVAIAQRKDVRRSQLAQQIQARNQMRYQQATAAAAAAAAGMPGQFMPPMFYGVMPPRGVPFNGPNPQQMNPMGGMPKNGMPPQFRNGPVYGVPPQGGFPRNANDNNQFYQQKQRQALGEQLYKKVSAKTSNEEAAGKITGMILDLPPQEVFPLLESDELFEQHYKEASAAYESFKKEQEQQTEQA'],
198
+ ['P53_HUMAN' ,'Human', 'MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPRVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD'],
199
+ ['PTEN_HUMAN' ,'Human', 'MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV'],
200
+ ['RL40A_YEAST' ,'Eukaryote ', 'MQIFVKTLTGKTITLEVESSDTIDNVKSKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGGIIEPSLKALASKYNCDKSVCRKCYARLPPRATNCRKRKCGHTNQLRPKKKLK'],
201
+ ['SCN5A_HUMAN' ,'Human', 'MANFLLPRGTSSFRRFTRESLAAIEKRMAEKQARGSTTLQESREGLPEEEAPRPQLDLQASKKLPDLYGNPPQELIGEPLEDLDPFYSTQKTFIVLNKGKTIFRFSATNALYVLSPFHPIRRAAVKILVHSLFNMLIMCTILTNCVFMAQHDPPPWTKYVEYTFTAIYTFESLVKILARGFCLHAFTFLRDPWNWLDFSVIIMAYTTEFVDLGNVSALRTFRVLRALKTISVISGLKTIVGALIQSVKKLADVMVLTVFCLSVFALIGLQLFMGNLRHKCVRNFTALNGTNGSVEADGLVWESLDLYLSDPENYLLKNGTSDVLLCGNSSDAGTCPEGYRCLKAGENPDHGYTSFDSFAWAFLALFRLMTQDCWERLYQQTLRSAGKIYMIFFMLVIFLGSFYLVNLILAVVAMAYEEQNQATIAETEEKEKRFQEAMEMLKKEHEALTIRGVDTVSRSSLEMSPLAPVNSHERRSKRRKRMSSGTEECGEDRLPKSDSEDGPRAMNHLSLTRGLSRTSMKPRSSRGSIFTFRRRDLGSEADFADDENSTAGESESHHTSLLVPWPLRRTSAQGQPSPGTSAPGHALHGKKNSTVDCNGVVSLLGAGDPEATSPGSHLLRPVMLEHPPDTTTPSEEPGGPQMLTSQAPCVDGFEEPGARQRALSAVSVLTSALEELEESRHKCPPCWNRLAQRYLIWECCPLWMSIKQGVKLVVMDPFTDLTITMCIVLNTLFMALEHYNMTSEFEEMLQVGNLVFTGIFTAEMTFKIIALDPYYYFQQGWNIFDSIIVILSLMELGLSRMSNLSVLRSFRLLRVFKLAKSWPTLNTLIKIIGNSVGALGNLTLVLAIIVFIFAVVGMQLFGKNYSELRDSDSGLLPRWHMMDFFHAFLIIFRILCGEWIETMWDCMEVSGQSLCLLVFLLVMVIGNLVVLNLFLALLLSSFSADNLTAPDEDREMNNLQLALARIQRGLRFVKRTTWDFCCGLLRQRPQKPAALAAQGQLPSCIATPYSPPPPETEKVPPTRKETRFEEGEQPGQGTPGDPEPVCVPIAVAESDTDDQEEDEENSLGTEEESSKQQESQPVSGGPEAPPDSRTWSQVSATASSEAEASASQADWRQQWKAEPQAPGCGETPEDSCSEGSTADMTNTAELLEQIPDLGQDVKDPEDCFTEGCVRRCPCCAVDTTQAPGKVWWRLRKTCYHIVEHSWFETFIIFMILLSSGALAFEDIYLEERKTIKVLLEYADKMFTYVFVLEMLLKWVAYGFKKYFTNAWCWLDFLIVDVSLVSLVANTLGFAEMGPIKSLRTLRALRPLRALSRFEGMRVVVNALVGAIPSIMNVLLVCLIFWLIFSIMGVNLFAGKFGRCINQTEGDLPLNYTIVNNKSQCESLNLTGELYWTKVKVNFDNVGAGYLALLQVATFKGWMDIMYAAVDSRGYEEQPQWEYNLYMYIYFVIFIIFGSFFTLNLFIGVIIDNFNQQKKKLGGQDIFMTEEQKKYYNAMKKLGSKKPQKPIPRPLNKYQGFIFDIVTKQAFDVTIMFLICLNMVTMMVETDDQSPEKINILAKINLLFVAIFTGECIVKLAALRHYYFTNSWNIFDFVVVILSIVGTVLSDIIQKYFFSPTLFRVIRLARIGRILRLIRGAKGIRTLLFALMMSLPALFNIGLLLFLVMFIYSIFGMANFAYVKWEAGIDDMFNFQTFANSMLCLFQITTSAGWDGLLSPILNTGPPYCDPTLPNSNGSRGDCGSPAVGILFFTTYIIISFLIVVNMYIAIILENFSVATEESTEPLSEDDFDMFYEIWEKFDPEATQFIEYSVLSDFADALSEPLRIAKPNQISLINMDLPMVSGDRIHCMDILFAFTKRVLGESGEMDALKIQMEEKFMAANPSKISYEPITTTLRRKHEEVSAMVIQRAFRRHLLQRSLKHASFLFRQQAGSGLSEEDAPEREGLIAYVMSENFSRPLGPPSSSSISSTS
FPPSYDSVTRATSDNLQVRGSDYSHSEDLADFPPSPDRDRESIV'],
202
+ ['SUMO1_HUMAN' ,'Human', 'MSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHLKKLKESYCQRQGVPMNSLRFLFEGQRIADNHTPKELGMEEEDVIEVYQEQTGGHSTV']
203
+ ],
204
+ )
205
+ gr.Markdown("<br>")
206
+ gr.Markdown("# Fitness predictions for all single amino acid substitutions in mutation range")
207
 
208
+ #output_plot = gr.Plot(label="Fitness predictions for all single amino acid substitutions in mutation range")
209
+ #output_image = gr.Image(label="Fitness predictions for all single amino acid substitutions in mutation range",type="filepath")
210
+ output_image = gr.Gallery(label="Fitness predictions (inference may take a few seconds for short proteins & mutation ranges to several minutes for longer ones)",type="filepath") #Using Gallery to be able to scroll large matrix images
211
+
212
+ output_recommendations = gr.Textbox(label="Mutation recommendations")
213
+
214
+ clear_button.click(
215
+ inputs = [protein_sequence_input,mutation_range_start,mutation_range_end],
216
+ outputs = [protein_sequence_input,mutation_range_start,mutation_range_end],
217
+ fn=clear_inputs
218
+ )
219
+ run_button.click(
220
+ fn=score_and_create_matrix_all_singles,
221
+ inputs=[protein_sequence_input,mutation_range_start,mutation_range_end,model_size_selection,scoring_mirror],
222
+ outputs=[output_image,output_recommendations],
223
+ )
224
+ gr.Markdown("# Mutate the starting protein sequence")
225
+ with gr.Row():
226
+ mutation_triplet = gr.Textbox(lines=1,label="Selected mutation", placeholder = "Input the mutation triplet for the selected mutation (eg., M1A)")
227
+ mutate_button = gr.Button(value="Apply mutation to starting protein", variant="primary")
228
+ mutated_protein_sequence = gr.Textbox(lines=1,label="Mutated protein sequence")
229
+ mutate_button.click(
230
+ fn = get_mutated_protein,
231
+ inputs = [protein_sequence_input,mutation_triplet],
232
+ outputs = mutated_protein_sequence
233
+ )
234
+ gr.Markdown("<p>You may now use the output mutated sequence above as the starting sequence for another round of in silico directed evolution.</p>")
235
+ gr.Markdown("For more information about the Tranception model, please refer to our paper below:")
236
+ gr.Markdown("<p><b>Tranception: Protein Fitness Prediction with Autoregressive Transformers and Inference-time Retrieval</b><br>Pascal Notin, Mafalda Dias, Jonathan Frazer, Javier Marchena-Hurtado, Aidan N. Gomez, Debora S. Marks<sup>*</sup>, Yarin Gal<sup>*</sup><br><sup>* equal senior authorship</sup></p>")
237
+ gr.Markdown("Links: <a href='https://proceedings.mlr.press/v162/notin22a.html' target='_blank'>Paper</a> <a href='https://github.com/OATML-Markslab/Tranception' target='_blank'>Code</a> <a href='https://sites.google.com/view/proteingym/substitutions' target='_blank'>ProteinGym</a>")
238
 
239
+ tranception_design.launch(debug=True,share=True)