Spaces:
Running
Running
jonatanklosko
committed on
Commit
•
808e59a
1
Parent(s):
3b6e0d3
Add tokenizer generator
Browse files
- Dockerfile +6 -1
- README.md +5 -5
- public-apps/tokenizer-generator.livemd +153 -0
- public-apps/welcome.livemd +0 -46
Dockerfile
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
FROM ghcr.io/livebook-dev/livebook:latest
|
2 |
|
3 |
ENV LIVEBOOK_APP_SERVICE_NAME "🐳 Hugging Face - $SPACE_TITLE"
|
4 |
ENV LIVEBOOK_APP_SERVICE_URL "https://huggingface.co/spaces/$SPACE_AUTHOR_NAME/$SPACE_REPO_NAME"
|
@@ -9,7 +9,12 @@ ENV LIVEBOOK_DATA_PATH "/data"
|
|
9 |
ENV LIVEBOOK_PORT 7860
|
10 |
|
11 |
EXPOSE 7860
|
|
|
12 |
USER root
|
|
|
|
|
|
|
|
|
13 |
COPY public-apps/ /public-apps
|
14 |
RUN mkdir -p /data
|
15 |
RUN chmod 777 /data
|
|
|
1 |
+
FROM ghcr.io/livebook-dev/livebook:latest
|
2 |
|
3 |
ENV LIVEBOOK_APP_SERVICE_NAME "🐳 Hugging Face - $SPACE_TITLE"
|
4 |
ENV LIVEBOOK_APP_SERVICE_URL "https://huggingface.co/spaces/$SPACE_AUTHOR_NAME/$SPACE_REPO_NAME"
|
|
|
9 |
ENV LIVEBOOK_PORT 7860
|
10 |
|
11 |
EXPOSE 7860
|
12 |
+
|
13 |
USER root
|
14 |
+
|
15 |
+
RUN apt-get update && apt-get install -y python3 python3-pip python-is-python3
|
16 |
+
RUN pip --no-cache-dir install transformers sentencepiece protobuf
|
17 |
+
|
18 |
COPY public-apps/ /public-apps
|
19 |
RUN mkdir -p /data
|
20 |
RUN chmod 777 /data
|
README.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: docker
|
7 |
fullWidth: true
|
8 |
---
|
9 |
|
10 |
-
|
|
|
1 |
---
|
2 |
+
title: Bumblebee tools
|
3 |
+
emoji: 🐝
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: brown
|
6 |
sdk: docker
|
7 |
fullWidth: true
|
8 |
---
|
9 |
|
10 |
+
Tools for [elixir-nx/bumblebee](https://github.com/elixir-nx/bumblebee).
|
public-apps/tokenizer-generator.livemd
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!-- livebook:{"app_settings":{"access_type":"public","auto_shutdown_ms":5000,"multi_session":true,"output_type":"rich","show_source":true,"slug":"tokenizer-generator"}} -->
|
2 |
+
|
3 |
+
# Tokenizer generator
|
4 |
+
|
5 |
+
```elixir
|
6 |
+
Mix.install([
|
7 |
+
{:kino, "~> 0.10.0"},
|
8 |
+
{:req, "~> 0.4.3"}
|
9 |
+
])
|
10 |
+
```
|
11 |
+
|
12 |
+
## Info
|
13 |
+
|
14 |
+
```elixir
|
15 |
+
Kino.Markdown.new("""
|
16 |
+
## Background
|
17 |
+
|
18 |
+
HuggingFace repositories store tokenizers in two flavours:
|
19 |
+
|
20 |
+
1. "slow tokenizer" - corresponds to a tokenizer implemented in Python
|
21 |
+
and stored as `tokenizer_config.json`
|
22 |
+
|
23 |
+
2. "fast tokenizer" - corresponds to a tokenizer implemented in Rust
|
24 |
+
and stored as `tokenizer.json`
|
25 |
+
|
26 |
+
Many repositories only include files for 1., but the `transformers` library
|
27 |
+
automatically converts "slow tokenizer" to "fast tokenizer" whenever possible.
|
28 |
+
|
29 |
+
Bumblebee relies on the Rust bindings and therefore always requires the
|
30 |
+
`tokenizer.json` file. This app generates that file for any repository with the
|
31 |
+
"slow tokenizer".
|
32 |
+
""")
|
33 |
+
```
|
34 |
+
|
35 |
+
## Generator
|
36 |
+
|
37 |
+
```elixir
|
38 |
+
Kino.Markdown.new("## Converter")
|
39 |
+
```
|
40 |
+
|
41 |
+
```elixir
|
42 |
+
{version, 0} =
|
43 |
+
System.cmd("python", ["-c", "import transformers; print(transformers.__version__, end='')"])
|
44 |
+
|
45 |
+
Kino.Markdown.new("""
|
46 |
+
`transformers: #{version}`
|
47 |
+
""")
|
48 |
+
```
|
49 |
+
|
50 |
+
```elixir
|
51 |
+
repo_input = Kino.Input.text("HuggingFace repo")
|
52 |
+
```
|
53 |
+
|
54 |
+
```elixir
|
55 |
+
repo = Kino.Input.read(repo_input)
|
56 |
+
|
57 |
+
if repo == "" do
|
58 |
+
Kino.interrupt!(:normal, "Enter repository.")
|
59 |
+
end
|
60 |
+
```
|
61 |
+
|
62 |
+
```elixir
|
63 |
+
response =
|
64 |
+
Req.post!("https://huggingface.co/api/models/#{repo}/paths-info/main",
|
65 |
+
json: %{paths: ["tokenizer.json"]}
|
66 |
+
)
|
67 |
+
|
68 |
+
case response do
|
69 |
+
%{status: 200, body: []} ->
|
70 |
+
:ok
|
71 |
+
|
72 |
+
%{status: 200, body: [%{"path" => "tokenizer.json"}]} ->
|
73 |
+
Kino.interrupt!(:error, "The tokenizer.json file already exists in the given repository.")
|
74 |
+
|
75 |
+
_ ->
|
76 |
+
Kino.interrupt!(:error, "The repository does not exist or requires authentication.")
|
77 |
+
end
|
78 |
+
```
|
79 |
+
|
80 |
+
```elixir
|
81 |
+
output_dir = Path.join(System.tmp_dir!(), repo)
|
82 |
+
```
|
83 |
+
|
84 |
+
````elixir
|
85 |
+
script = """
|
86 |
+
import sys
|
87 |
+
from transformers import AutoTokenizer
|
88 |
+
|
89 |
+
repo = sys.argv[1]
|
90 |
+
output_dir = sys.argv[2]
|
91 |
+
|
92 |
+
|
93 |
+
try:
|
94 |
+
tokenizer = AutoTokenizer.from_pretrained(repo)
|
95 |
+
assert tokenizer.is_fast
|
96 |
+
tokenizer.save_pretrained(output_dir)
|
97 |
+
except Exception as error:
|
98 |
+
print(error)
|
99 |
+
exit(1)
|
100 |
+
"""
|
101 |
+
|
102 |
+
case System.cmd("python", ["-c", script, repo, output_dir]) do
|
103 |
+
{_, 0} ->
|
104 |
+
:ok
|
105 |
+
|
106 |
+
{output, _} ->
|
107 |
+
Kino.Markdown.new("""
|
108 |
+
```
|
109 |
+
#{output}
|
110 |
+
```
|
111 |
+
""")
|
112 |
+
|> Kino.render()
|
113 |
+
|
114 |
+
Kino.interrupt!(:error, "Tokenizer conversion failed.")
|
115 |
+
end
|
116 |
+
````
|
117 |
+
|
118 |
+
```elixir
|
119 |
+
tokenizer_path = Path.join(output_dir, "tokenizer.json")
|
120 |
+
|
121 |
+
Kino.Download.new(
|
122 |
+
fn -> File.read!(tokenizer_path) end,
|
123 |
+
filename: "tokenizer.json",
|
124 |
+
label: "tokenizer.json"
|
125 |
+
)
|
126 |
+
```
|
127 |
+
|
128 |
+
`````elixir
|
129 |
+
Kino.Markdown.new("""
|
130 |
+
### Next steps
|
131 |
+
|
132 |
+
1. Go to https://huggingface.co/#{repo}/upload/main.
|
133 |
+
|
134 |
+
2. Upload the `tokenizer.json` file.
|
135 |
+
|
136 |
+
3. Add the following description:
|
137 |
+
|
138 |
+
````markdown
|
139 |
+
Generated with:
|
140 |
+
|
141 |
+
```python
|
142 |
+
from transformers import AutoTokenizer
|
143 |
+
|
144 |
+
tokenizer = AutoTokenizer.from_pretrained("#{repo}")
|
145 |
+
assert tokenizer.is_fast
|
146 |
+
tokenizer.save_pretrained("...")
|
147 |
+
```
|
148 |
+
````
|
149 |
+
|
150 |
+
4. Submit the PR.
|
151 |
+
|
152 |
+
""")
|
153 |
+
`````
|
public-apps/welcome.livemd
DELETED
@@ -1,46 +0,0 @@
|
|
1 |
-
<!-- livebook:{"app_settings":{"access_type":"public","slug":"welcome"}} -->
|
2 |
-
|
3 |
-
# Livebook <3 Hugging Face
|
4 |
-
|
5 |
-
```elixir
|
6 |
-
Mix.install([
|
7 |
-
{:kino, "~> 0.9"}
|
8 |
-
])
|
9 |
-
```
|
10 |
-
|
11 |
-
## Section
|
12 |
-
|
13 |
-
This is the source of a deployed notebook.
|
14 |
-
This notebook is static and simply renders the markdown content below.
|
15 |
-
|
16 |
-
```elixir
|
17 |
-
Kino.Markdown.new("""
|
18 |
-
Welcome to Livebook in Hugging Face!
|
19 |
-
|
20 |
-
This is a deployed notebook, which is also a perfect place to teach you
|
21 |
-
the ropes in using Livebook with Hugging Face.
|
22 |
-
|
23 |
-
## Getting started
|
24 |
-
|
25 |
-
First off, if you want to run your own copy of Livebook,
|
26 |
-
[check our tutorial](https://news.livebook.dev/livebook-inside-hugging-face-spaces-3LQaRi).
|
27 |
-
Once you clone the space, remember to set `LIVEBOOK_PASSWORD` as
|
28 |
-
an environment variable on your Space Settings page (a minimum of
|
29 |
-
12 digits is required).
|
30 |
-
|
31 |
-
If you are new to Elixir and Livebook, [head out to the Learn page](/learn)
|
32 |
-
(it requires a password), there you will find resources to get started
|
33 |
-
with both.
|
34 |
-
|
35 |
-
## Deploying notebooks
|
36 |
-
|
37 |
-
Livebook is fully collaborative and it enables you to deploy interactive
|
38 |
-
and collaborative apps just as well. All of your deployable notebooks will
|
39 |
-
be in the "public-apps" directory of your Spaces repository.
|
40 |
-
|
41 |
-
To deploy your own notebook on Hugging Face, you must click the
|
42 |
-
<i class="ri-livebook-deploy"></i> icon on the notebook sidebar, set a "Slug"
|
43 |
-
for the notebook, mark it as public and then drop its `.livemd` file into
|
44 |
-
the "public-apps" directory of your Spaces repo.
|
45 |
-
""")
|
46 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|