Christina Theodoris
committed on
Commit
•
0960cf6
1
Parent(s):
2f25aea
Add option for modified batch size for loom tokenizer
Browse files
- geneformer/tokenizer.py +4 -2
geneformer/tokenizer.py
CHANGED
@@ -157,7 +157,7 @@ class TranscriptomeTokenizer:
|
|
157 |
tokenize_file_fn = (
|
158 |
self.tokenize_loom if file_format == "loom" else self.tokenize_anndata
|
159 |
)
|
160 |
-
for file_path in data_directory.glob("*.{}".format(file_format)):
|
161 |
file_found = 1
|
162 |
print(f"Tokenizing {file_path}")
|
163 |
file_tokenized_cells, file_cell_metadata = tokenize_file_fn(file_path)
|
@@ -278,7 +278,9 @@ class TranscriptomeTokenizer:
|
|
278 |
|
279 |
# scan through .loom files and tokenize cells
|
280 |
tokenized_cells = []
|
281 |
-
for _ix, _selection, view in data.scan(items=filter_pass_loc, axis=1):
|
|
|
|
|
282 |
# select subview with protein-coding and miRNA genes
|
283 |
subview = view.view[coding_miRNA_loc, :]
|
284 |
|
|
|
157 |
tokenize_file_fn = (
|
158 |
self.tokenize_loom if file_format == "loom" else self.tokenize_anndata
|
159 |
)
|
160 |
+
for file_path in data_directory.glob(f"*.{file_format}"):
|
161 |
file_found = 1
|
162 |
print(f"Tokenizing {file_path}")
|
163 |
file_tokenized_cells, file_cell_metadata = tokenize_file_fn(file_path)
|
|
|
278 |
|
279 |
# scan through .loom files and tokenize cells
|
280 |
tokenized_cells = []
|
281 |
+
for _ix, _selection, view in data.scan(
|
282 |
+
items=filter_pass_loc, axis=1, batch_size=self.chunk_size
|
283 |
+
):
|
284 |
# select subview with protein-coding and miRNA genes
|
285 |
subview = view.view[coding_miRNA_loc, :]
|
286 |
|