Xi "Alexander" Fu committed on
Commit
40af086
2 Parent(s): 9a25e9f b3e8ff2

Merge pull request #1 from fuxialexander/buendia/read-from-s3

Browse files
Files changed (2) hide show
  1. Dockerfile +2 -2
  2. app/main.py +73 -38
Dockerfile CHANGED
@@ -9,7 +9,7 @@ USER $MAMBA_USER
9
  # Set the working directory in the container to /app
10
  WORKDIR /app
11
  # Create a new environment using mamba with specified packages
12
- RUN micromamba install -n base -c conda-forge -c bioconda -y python=3.10 pip biopython
13
  RUN micromamba install -n base -c conda-forge -c bioconda -y nglview tqdm matplotlib pandas
14
  RUN micromamba install -n base -c conda-forge -c bioconda -y openpyxl pyarrow python-box xmlschema seaborn numpy py3Dmol pyranges scipy pyyaml zarr numcodecs
15
  RUN micromamba install -n base -c conda-forge -c bioconda -y pybigwig networkx plotly pysam requests seqlogo MOODS urllib3 pyliftover gprofiler-official pyfaidx
@@ -57,4 +57,4 @@ EXPOSE 7681
57
  # Set the working directory where your app resides
58
 
59
  # Command to run the Gradio app automatically
60
- CMD ["python", "app/main.py", "-p", "7681", "-s", "-d", "/data"]
 
9
  # Set the working directory in the container to /app
10
  WORKDIR /app
11
  # Create a new environment using mamba with specified packages
12
+ RUN micromamba install -n base -c conda-forge -c bioconda -y python=3.10 pip biopython s3fs
13
  RUN micromamba install -n base -c conda-forge -c bioconda -y nglview tqdm matplotlib pandas
14
  RUN micromamba install -n base -c conda-forge -c bioconda -y openpyxl pyarrow python-box xmlschema seaborn numpy py3Dmol pyranges scipy pyyaml zarr numcodecs
15
  RUN micromamba install -n base -c conda-forge -c bioconda -y pybigwig networkx plotly pysam requests seqlogo MOODS urllib3 pyliftover gprofiler-official pyfaidx
 
57
  # Set the working directory where your app resides
58
 
59
  # Command to run the Gradio app automatically
60
+ CMD ["python", "app/main.py", "-p", "7681", "-s", "-u", "s3://2023-get-xf2217/get_demo_test_data", "-d", "/data"]
app/main.py CHANGED
@@ -6,67 +6,102 @@ import matplotlib.pyplot as plt
6
  import pandas as pd
7
  import pkg_resources
8
  from dash_bio import Clustergram
9
- from proscope.data import get_genename_to_uniprot, get_lddt, get_seq
10
-
11
- seq = get_seq()
12
- genename_to_uniprot = get_genename_to_uniprot()
13
- lddt = get_lddt()
14
  import sys
 
15
  from glob import glob
16
-
17
  import numpy as np
 
18
  from atac_rna_data_processing.config.load_config import load_config
19
  from atac_rna_data_processing.io.celltype import GETCellType
20
  from atac_rna_data_processing.io.nr_motif_v1 import NrMotifV1
21
  from proscope.af2 import AFPairseg
 
22
  from proscope.protein import Protein
23
  from proscope.viewer import view_pdb_html
24
 
 
 
 
 
 
25
  args = argparse.ArgumentParser()
26
  args.add_argument("-p", "--port", type=int, default=7860, help="Port number")
27
  args.add_argument("-s", "--share", action="store_true", help="Share on network")
28
- args.add_argument("-d", "--data", type=str, default="/data", help="Data directory")
 
29
  args = args.parse_args()
30
- # set pseudo args
31
- # args = args.parse_args(['-p', '7869', '-s', '-d', '/manitou/pmg/users/xf2217/demo_data'])
32
- gene_pairs = glob(f"{args.data}/structures/causal/*")
33
- gene_pairs = [os.path.basename(pair) for pair in gene_pairs]
34
  GET_CONFIG = load_config(
35
- "/manitou/pmg/users/xf2217/atac_rna_data_processing/atac_rna_data_processing/config/GET"
36
  )
37
  GET_CONFIG.celltype.jacob = True
38
  GET_CONFIG.celltype.num_cls = 2
39
  GET_CONFIG.celltype.input = True
40
  GET_CONFIG.celltype.embed = True
41
- GET_CONFIG.celltype.data_dir = (
42
- "/manitou/pmg/users/xf2217/pretrain_human_bingren_shendure_apr2023/fetal_adult/"
43
- )
44
- GET_CONFIG.celltype.interpret_dir = (
45
- "/manitou/pmg/users/xf2217/Interpretation_all_hg38_allembed_v4_natac/"
46
- )
47
- GET_CONFIG.motif_dir = "/manitou/pmg/users/xf2217/interpret_natac/motif-clustering"
48
- motif = NrMotifV1.load_from_pickle(
49
- pkg_resources.resource_filename("atac_rna_data_processing", "data/NrMotifV1.pkl"),
50
- GET_CONFIG.motif_dir,
51
- )
52
- cell_type_annot = pd.read_csv(
53
- GET_CONFIG.celltype.data_dir.split("fetal_adult")[0]
54
- + "data/cell_type_pretrain_human_bingren_shendure_apr2023.txt"
55
- )
56
- cell_type_id_to_name = dict(zip(cell_type_annot["id"], cell_type_annot["celltype"]))
57
- cell_type_name_to_id = dict(zip(cell_type_annot["celltype"], cell_type_annot["id"]))
58
- avaliable_celltypes = sorted(
59
- [
60
- cell_type_id_to_name[f.split("/")[-1]]
61
- for f in glob(GET_CONFIG.celltype.interpret_dir + "*")
62
- ]
63
- )
64
  plt.rcParams["figure.dpi"] = 100
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  def visualize_AF2(tf_pair, a):
68
- strcture_dir = f"{args.data}/structures/causal/{tf_pair}"
69
- fasta_dir = f"{args.data}/sequences/causal/{tf_pair}"
 
 
 
 
70
  if not os.path.exists(strcture_dir):
71
  gr.ErrorText("No such gene pair")
72
 
@@ -185,7 +220,7 @@ This section enables you to select different cell types and generates a plot tha
185
  """
186
  )
187
  celltype_name = gr.Dropdown(
188
- label="Cell Type", choices=avaliable_celltypes, value='Fetal Astrocyte 1'
189
  )
190
  celltype_btn = gr.Button(value="Load & plot gene expression")
191
  gene_exp_plot = gr.Plot(label="Gene expression prediction vs observation")
 
6
  import pandas as pd
7
  import pkg_resources
8
  from dash_bio import Clustergram
 
 
 
 
 
9
  import sys
10
+ import s3fs
11
  from glob import glob
 
12
  import numpy as np
13
+
14
  from atac_rna_data_processing.config.load_config import load_config
15
  from atac_rna_data_processing.io.celltype import GETCellType
16
  from atac_rna_data_processing.io.nr_motif_v1 import NrMotifV1
17
  from proscope.af2 import AFPairseg
18
+ from proscope.data import get_genename_to_uniprot, get_lddt, get_seq
19
  from proscope.protein import Protein
20
  from proscope.viewer import view_pdb_html
21
 
22
+
23
+ seq = get_seq()
24
+ genename_to_uniprot = get_genename_to_uniprot()
25
+ lddt = get_lddt()
26
+
27
  args = argparse.ArgumentParser()
28
  args.add_argument("-p", "--port", type=int, default=7860, help="Port number")
29
  args.add_argument("-s", "--share", action="store_true", help="Share on network")
30
+ args.add_argument("-u", "--s3_uri", type=str, default=None, help="Path to demo S3 bucket")
31
+ args.add_argument("-d", "--data", type=str, default=None, help="Data directory")
32
  args = args.parse_args()
33
+
 
 
 
34
  GET_CONFIG = load_config(
35
+ "/app/modules/atac_rna_data_processing/atac_rna_data_processing/config/GET"
36
  )
37
  GET_CONFIG.celltype.jacob = True
38
  GET_CONFIG.celltype.num_cls = 2
39
  GET_CONFIG.celltype.input = True
40
  GET_CONFIG.celltype.embed = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  plt.rcParams["figure.dpi"] = 100
42
 
43
+ if args.s3_uri: # Use S3 path if exists
44
+ GET_CONFIG.s3_uri = args.s3_uri
45
+ s3 = s3fs.S3FileSystem()
46
+ GET_CONFIG.celltype.data_dir = (
47
+ f"{args.s3_uri}/pretrain_human_bingren_shendure_apr2023/fetal_adult/"
48
+ )
49
+ GET_CONFIG.celltype.interpret_dir = (
50
+ f"{args.s3_uri}/Interpretation_all_hg38_allembed_v4_natac/"
51
+ )
52
+ GET_CONFIG.motif_dir = f"{args.s3_uri}/interpret_natac/motif-clustering"
53
+ cell_type_annot = pd.read_csv(
54
+ GET_CONFIG.celltype.data_dir.split("fetal_adult")[0]
55
+ + "data/cell_type_pretrain_human_bingren_shendure_apr2023.txt"
56
+ )
57
+ cell_type_id_to_name = dict(zip(cell_type_annot["id"], cell_type_annot["celltype"]))
58
+ cell_type_name_to_id = dict(zip(cell_type_annot["celltype"], cell_type_annot["id"]))
59
+ available_celltypes = sorted(
60
+ [
61
+ cell_type_id_to_name[f.split("/")[-1]]
62
+ for f in s3.glob(GET_CONFIG.celltype.interpret_dir + "*")
63
+ ]
64
+ )
65
+ gene_pairs = s3.glob(f"{args.s3_uri}/structures/causal/*")
66
+ gene_pairs = [os.path.basename(pair) for pair in gene_pairs]
67
+ motif = NrMotifV1.load_from_pickle(
68
+ pkg_resources.resource_filename("atac_rna_data_processing", "data/NrMotifV1.pkl"),
69
+ GET_CONFIG.motif_dir,
70
+ )
71
+ else: # Run with local data
72
+ GET_CONFIG.celltype.data_dir = (
73
+ f"{args.data}/pretrain_human_bingren_shendure_apr2023/fetal_adult/"
74
+ )
75
+ GET_CONFIG.celltype.interpret_dir = (
76
+ f"{args.data}/Interpretation_all_hg38_allembed_v4_natac/"
77
+ )
78
+ GET_CONFIG.motif_dir = f"{args.data}/interpret_natac/motif-clustering"
79
+ cell_type_annot = pd.read_csv(
80
+ GET_CONFIG.celltype.data_dir.split("fetal_adult")[0]
81
+ + "data/cell_type_pretrain_human_bingren_shendure_apr2023.txt"
82
+ )
83
+ cell_type_id_to_name = dict(zip(cell_type_annot["id"], cell_type_annot["celltype"]))
84
+ cell_type_name_to_id = dict(zip(cell_type_annot["celltype"], cell_type_annot["id"]))
85
+ available_celltypes = sorted(
86
+ [
87
+ cell_type_id_to_name[f.split("/")[-1]]
88
+ for f in glob(GET_CONFIG.celltype.interpret_dir + "*")
89
+ ]
90
+ )
91
+ gene_pairs = glob(f"{args.data}/structures/causal/*")
92
+ gene_pairs = [os.path.basename(pair) for pair in gene_pairs]
93
+ motif = NrMotifV1.load_from_pickle(
94
+ pkg_resources.resource_filename("atac_rna_data_processing", "data/NrMotifV1.pkl"),
95
+ GET_CONFIG.motif_dir,
96
+ )
97
 
98
  def visualize_AF2(tf_pair, a):
99
+ if args.s3_uri:
100
+ strcture_dir = f"{args.s3_uri}/structures/causal/{tf_pair}"
101
+ fasta_dir = f"{args.s3_uri}/sequences/causal/{tf_pair}"
102
+ else:
103
+ strcture_dir = f"{args.data}/structures/causal/{tf_pair}"
104
+ fasta_dir = f"{args.data}/sequences/causal/{tf_pair}"
105
  if not os.path.exists(strcture_dir):
106
  gr.ErrorText("No such gene pair")
107
 
 
220
  """
221
  )
222
  celltype_name = gr.Dropdown(
223
+ label="Cell Type", choices=available_celltypes, value='Fetal Astrocyte 1'
224
  )
225
  celltype_btn = gr.Button(value="Load & plot gene expression")
226
  gene_exp_plot = gr.Plot(label="Gene expression prediction vs observation")