Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -5,6 +5,113 @@ from huggingface_hub import InferenceClient, login
|
|
5 |
from transformers import AutoTokenizer
|
6 |
import evaluate
|
7 |
import theme
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
bleu = evaluate.load("bleu")
|
10 |
|
@@ -15,6 +122,8 @@ login(token=HF_TOKEN)
|
|
15 |
checkpoint = "bigcode/starcoder"
|
16 |
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_auth_token=True)
|
17 |
|
|
|
|
|
18 |
df = pd.read_csv("samples.csv")
|
19 |
df = df[["content"]].iloc[:50]
|
20 |
|
@@ -24,11 +133,9 @@ description = """
|
|
24 |
This ability of LLMs to learn their training set by heart can pose huge privacy issues, as many large-scale Conversational AI available commercially collect users' data at scale and fine-tune their models on it.
|
25 |
This means that if sensitive data is sent and memorized by an AI, other users can willingly or unwillingly prompt the AI to spit out this sensitive data. 🔓
|
26 |
|
27 |
-
|
28 |
To raise awareness of this issue, we show in this demo how much [StarCoder](https://huggingface.co/bigcode/starcoder), an LLM specialized in coding tasks, memorizes its training set, [The Stack](https://huggingface.co/datasets/bigcode/the-stack-dedup).
|
29 |
We found that **StarCoder memorized at least 8% of the training samples** we used, which highlights the high risks of LLMs exposing the training set. We provide a notebook to reproduce our results [here](https://colab.research.google.com/drive/1YaaPOXzodEAc4JXboa12gN5zdlzy5XaR?usp=sharing). 👈
|
30 |
|
31 |
-
|
32 |
To evaluate memorization of the training set, we can prompt StarCoder with the first tokens of an example from the training set. If StarCoder completes the prompt with an output that looks very similar to the original sample, we will consider this sample to be memorized by the LLM. 💾
|
33 |
"""
|
34 |
|
@@ -60,8 +167,8 @@ A training sentence is approximately memorized if the [BLEU score](https://huggi
|
|
60 |
The researchers found that the threshold of 0.75 provided good empirical results in terms of semantic and syntactic similarity.
|
61 |
"""
|
62 |
|
63 |
-
|
64 |
-
"
|
65 |
from .models import SearchResult
|
66 |
|
67 |
# Register your models here.
|
@@ -70,7 +177,7 @@ class SearchResultAdmin(admin.ModelAdmin):
|
|
70 |
|
71 |
admin.site.register(SearchResult, SearchResultAdmin)""",
|
72 |
|
73 |
-
"
|
74 |
def finalPrices(self, prices: List[int]) -> List[int]:
|
75 |
res = []
|
76 |
for i in range(len(prices)):
|
@@ -82,7 +189,7 @@ admin.site.register(SearchResult, SearchResultAdmin)""",
|
|
82 |
res.append(prices[i])
|
83 |
res.append(prices[-1])
|
84 |
return res""",
|
85 |
-
"
|
86 |
|
87 |
class Command(BaseXpressDemocracyClubCsvImporter):
|
88 |
council_id = 'E06000027'
|
@@ -90,11 +197,8 @@ class Command(BaseXpressDemocracyClubCsvImporter):
|
|
90 |
stations_name = 'parl.2017-06-08/Version 1/Torbay Democracy_Club__08June2017.tsv'
|
91 |
elections = ['parl.2017-06-08']
|
92 |
csv_delimiter = '\t'
|
93 |
-
"""
|
94 |
-
|
95 |
-
|
96 |
-
low_bleu_examples = {
|
97 |
-
"Example 1": """from zeit.cms.i18n import MessageFactory as _
|
98 |
import zope.interface
|
99 |
import zope.schema
|
100 |
|
@@ -125,7 +229,7 @@ class IGlobalSettings(zope.interface.Interface):
|
|
125 |
|
126 |
\"""
|
127 |
""",
|
128 |
-
"
|
129 |
|
130 |
\"""Context managers implemented for (mostly) internal use\"""
|
131 |
|
@@ -171,7 +275,7 @@ RedirectStdout = functools.partial(_stdchannel_redirected, sys.stdout)
|
|
171 |
RedirectStderr = functools.partial(_stdchannel_redirected, sys.stderr)
|
172 |
RedirectNoOp = functools.partial(_stdchannel_redirected, None, "")
|
173 |
""",
|
174 |
-
"
|
175 |
import torch
|
176 |
import torch.nn.functional as F
|
177 |
|
@@ -205,76 +309,104 @@ def cosine_dist(x, y):
|
|
205 |
"""
|
206 |
}
|
207 |
|
208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
prefix_tokens = tokenizer(sample)["input_ids"][:k]
|
210 |
prefix = tokenizer.decode(prefix_tokens)
|
211 |
-
|
212 |
output = prefix
|
213 |
for token in client.text_generation(prefix, do_sample=False, max_new_tokens=512, stream=True):
|
214 |
if token == "<|endoftext|>":
|
215 |
-
bleu_score = {"BLEU": bleu.compute(predictions=[
|
216 |
-
references=[
|
217 |
-
return output, gr.Label.update(value=bleu_score)
|
218 |
output += token
|
219 |
-
bleu_score = {"BLEU": bleu.compute(predictions=[
|
220 |
-
references=[
|
221 |
-
yield output, gr.Label.update(value=bleu_score)
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
output = high_bleu_examples[x]
|
228 |
-
return output
|
229 |
-
|
230 |
-
def low_bleu_mirror(x):
|
231 |
-
output = low_bleu_examples[x]
|
232 |
-
return output
|
233 |
-
|
234 |
-
def df_select(evt: gr.SelectData):
|
235 |
|
236 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
237 |
|
238 |
style = theme.Style()
|
239 |
|
240 |
-
with gr.Blocks(theme=style) as demo:
|
|
|
241 |
with gr.Column():
|
242 |
gr.Markdown(title)
|
243 |
with gr.Row():
|
244 |
with gr.Column():
|
245 |
-
gr.Markdown(description)
|
246 |
with gr.Accordion("Learn more about memorization definition", open=False):
|
247 |
gr.Markdown(memorization_definition)
|
248 |
with gr.Row():
|
249 |
with gr.Column():
|
250 |
instruction = gr.Textbox(
|
251 |
-
|
|
|
252 |
lines=5,
|
253 |
-
label="
|
254 |
-
value=
|
|
|
|
|
255 |
)
|
256 |
|
257 |
with gr.Column():
|
258 |
-
|
|
|
|
|
|
|
|
|
259 |
with gr.Row():
|
260 |
with gr.Column():
|
261 |
-
with gr.Accordion("
|
262 |
-
k = gr.Slider(minimum=1, maximum=
|
263 |
-
|
|
|
264 |
info="""Number of tokens used in the prompt.
|
265 |
Lower (higher) levels reduce (increase) the risk of memorization, as large context length increase memorization risks.""")
|
266 |
submit = gr.Button("Check", variant="primary")
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
low_bleu_examples = gr.Examples(list(low_bleu_examples.keys()), label = "Low memorization samples",
|
271 |
-
inputs=instruction, outputs=instruction,
|
272 |
-
fn=low_bleu_mirror, cache_examples=True)
|
273 |
with gr.Column():
|
274 |
-
|
275 |
-
gr.
|
276 |
-
|
277 |
-
|
278 |
|
279 |
with gr.Row():
|
280 |
with gr.Column():
|
@@ -283,10 +415,19 @@ with gr.Blocks(theme=style) as demo:
|
|
283 |
To try other examples from The Stack, you can browse the table below and select different training samples to re-run the checker with to assess their memorization score.""")
|
284 |
with gr.Accordion("More samples", open=False):
|
285 |
table = gr.DataFrame(value=df, row_count=5, label="Samples from The Stack", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
286 |
submit.click(
|
287 |
complete,
|
288 |
-
inputs=[instruction, k],
|
289 |
-
outputs=[
|
290 |
)
|
291 |
-
table.select(fn=df_select, outputs=instruction)
|
292 |
demo.queue(concurrency_count=16).launch(debug=True)
|
|
|
5 |
from transformers import AutoTokenizer
|
6 |
import evaluate
|
7 |
import theme
|
8 |
+
from difflib import Differ
|
9 |
+
|
10 |
+
import difflib
|
11 |
+
import six
|
12 |
+
import xml.sax.saxutils
|
13 |
+
|
14 |
+
default_css = """\
|
15 |
+
<style type="text/css">
|
16 |
+
.diff {
|
17 |
+
border: 1px solid #cccccc;
|
18 |
+
background: none repeat scroll 0 0 #f8f8f8;
|
19 |
+
font-family: 'Bitstream Vera Sans Mono','Courier',monospace;
|
20 |
+
font-size: 12px;
|
21 |
+
line-height: 1.4;
|
22 |
+
white-space: normal;
|
23 |
+
word-wrap: break-word;
|
24 |
+
}
|
25 |
+
.diff div:hover {
|
26 |
+
background-color:#ffc;
|
27 |
+
}
|
28 |
+
.diff .control {
|
29 |
+
background-color: #eaf2f5;
|
30 |
+
color: #999999;
|
31 |
+
}
|
32 |
+
.diff .insert {
|
33 |
+
background-color: #ddffdd;
|
34 |
+
color: #000000;
|
35 |
+
}
|
36 |
+
.diff .insert .highlight {
|
37 |
+
background-color: #aaffaa;
|
38 |
+
color: #000000;
|
39 |
+
}
|
40 |
+
.diff .delete {
|
41 |
+
background-color: #ffdddd;
|
42 |
+
color: #000000;
|
43 |
+
}
|
44 |
+
.diff .delete .highlight {
|
45 |
+
background-color: #ffaaaa;
|
46 |
+
color: #000000;
|
47 |
+
}
|
48 |
+
</style>
|
49 |
+
"""
|
50 |
+
|
51 |
+
|
52 |
+
def escape(text):
|
53 |
+
return xml.sax.saxutils.escape(text, {" ": " "})
|
54 |
+
|
55 |
+
|
56 |
+
def diff(a, b, n=3, css=True):
|
57 |
+
if isinstance(a, six.string_types):
|
58 |
+
a = a.splitlines()
|
59 |
+
if isinstance(b, six.string_types):
|
60 |
+
b = b.splitlines()
|
61 |
+
return colorize(list(difflib.unified_diff(a, b, n=n)), css=css)
|
62 |
+
|
63 |
+
|
64 |
+
def colorize(diff, css=True):
|
65 |
+
css = default_css if css else ""
|
66 |
+
return css + "\n".join(_colorize(diff))
|
67 |
+
|
68 |
+
|
69 |
+
def _colorize(diff):
|
70 |
+
if isinstance(diff, six.string_types):
|
71 |
+
lines = diff.splitlines()
|
72 |
+
else:
|
73 |
+
lines = diff
|
74 |
+
lines.reverse()
|
75 |
+
while lines and not lines[-1].startswith("@@"):
|
76 |
+
lines.pop()
|
77 |
+
yield '<div class="diff">'
|
78 |
+
while lines:
|
79 |
+
line = lines.pop()
|
80 |
+
klass = ""
|
81 |
+
if line.startswith("@@"):
|
82 |
+
klass = "control"
|
83 |
+
elif line.startswith("-"):
|
84 |
+
klass = "delete"
|
85 |
+
if lines:
|
86 |
+
_next = []
|
87 |
+
while lines and len(_next) < 2:
|
88 |
+
_next.append(lines.pop())
|
89 |
+
if _next[0].startswith("+") and (
|
90 |
+
len(_next) == 1 or _next[1][0] not in ("+", "-")):
|
91 |
+
aline, bline = _line_diff(line[1:], _next.pop(0)[1:])
|
92 |
+
yield '<div class="delete">-%s</div>' % (aline,)
|
93 |
+
yield '<div class="insert">+%s</div>' % (bline,)
|
94 |
+
if _next:
|
95 |
+
lines.append(_next.pop())
|
96 |
+
continue
|
97 |
+
lines.extend(reversed(_next))
|
98 |
+
elif line.startswith("+"):
|
99 |
+
klass = "insert"
|
100 |
+
yield '<div class="%s">%s</div>' % (klass, escape(line),)
|
101 |
+
yield "</div>"
|
102 |
+
|
103 |
+
|
104 |
+
def _line_diff(a, b):
|
105 |
+
aline = []
|
106 |
+
bline = []
|
107 |
+
for tag, i1, i2, j1, j2 in difflib.SequenceMatcher(a=a, b=b).get_opcodes():
|
108 |
+
if tag == "equal":
|
109 |
+
aline.append(escape(a[i1:i2]))
|
110 |
+
bline.append(escape(b[j1:j2]))
|
111 |
+
continue
|
112 |
+
aline.append('<span class="highlight">%s</span>' % (escape(a[i1:i2]),))
|
113 |
+
bline.append('<span class="highlight">%s</span>' % (escape(b[j1:j2]),))
|
114 |
+
return "".join(aline), "".join(bline)
|
115 |
|
116 |
bleu = evaluate.load("bleu")
|
117 |
|
|
|
122 |
checkpoint = "bigcode/starcoder"
|
123 |
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_auth_token=True)
|
124 |
|
125 |
+
DEFAULT_K = 50
|
126 |
+
|
127 |
df = pd.read_csv("samples.csv")
|
128 |
df = df[["content"]].iloc[:50]
|
129 |
|
|
|
133 |
This ability of LLMs to learn their training set by heart can pose huge privacy issues, as many large-scale Conversational AI available commercially collect users' data at scale and fine-tune their models on it.
|
134 |
This means that if sensitive data is sent and memorized by an AI, other users can willingly or unwillingly prompt the AI to spit out this sensitive data. 🔓
|
135 |
|
|
|
136 |
To raise awareness of this issue, we show in this demo how much [StarCoder](https://huggingface.co/bigcode/starcoder), an LLM specialized in coding tasks, memorizes its training set, [The Stack](https://huggingface.co/datasets/bigcode/the-stack-dedup).
|
137 |
We found that **StarCoder memorized at least 8% of the training samples** we used, which highlights the high risks of LLMs exposing the training set. We provide a notebook to reproduce our results [here](https://colab.research.google.com/drive/1YaaPOXzodEAc4JXboa12gN5zdlzy5XaR?usp=sharing). 👈
|
138 |
|
|
|
139 |
To evaluate memorization of the training set, we can prompt StarCoder with the first tokens of an example from the training set. If StarCoder completes the prompt with an output that looks very similar to the original sample, we will consider this sample to be memorized by the LLM. 💾
|
140 |
"""
|
141 |
|
|
|
167 |
The researchers found that the threshold of 0.75 provided good empirical results in terms of semantic and syntactic similarity.
|
168 |
"""
|
169 |
|
170 |
+
examples = {
|
171 |
+
"High memorization sample 1": """from django.contrib import admin
|
172 |
from .models import SearchResult
|
173 |
|
174 |
# Register your models here.
|
|
|
177 |
|
178 |
admin.site.register(SearchResult, SearchResultAdmin)""",
|
179 |
|
180 |
+
"High memorization sample 2": """class Solution:
|
181 |
def finalPrices(self, prices: List[int]) -> List[int]:
|
182 |
res = []
|
183 |
for i in range(len(prices)):
|
|
|
189 |
res.append(prices[i])
|
190 |
res.append(prices[-1])
|
191 |
return res""",
|
192 |
+
"High memorization sample 3": """from data_collection.management.commands import BaseXpressDemocracyClubCsvImporter
|
193 |
|
194 |
class Command(BaseXpressDemocracyClubCsvImporter):
|
195 |
council_id = 'E06000027'
|
|
|
197 |
stations_name = 'parl.2017-06-08/Version 1/Torbay Democracy_Club__08June2017.tsv'
|
198 |
elections = ['parl.2017-06-08']
|
199 |
csv_delimiter = '\t'
|
200 |
+
""",
|
201 |
+
"Low memorization sample 1": """from zeit.cms.i18n import MessageFactory as _
|
|
|
|
|
|
|
202 |
import zope.interface
|
203 |
import zope.schema
|
204 |
|
|
|
229 |
|
230 |
\"""
|
231 |
""",
|
232 |
+
"Low memorization sample 2": """# -*- coding: utf-8 -*-
|
233 |
|
234 |
\"""Context managers implemented for (mostly) internal use\"""
|
235 |
|
|
|
275 |
RedirectStderr = functools.partial(_stdchannel_redirected, sys.stderr)
|
276 |
RedirectNoOp = functools.partial(_stdchannel_redirected, None, "")
|
277 |
""",
|
278 |
+
"Low memorization sample 3": """\"""Utils for criterion.\"""
|
279 |
import torch
|
280 |
import torch.nn.functional as F
|
281 |
|
|
|
309 |
"""
|
310 |
}
|
311 |
|
312 |
+
|
313 |
+
def diff_texts(text1, text2):
|
314 |
+
d = Differ()
|
315 |
+
ret = [
|
316 |
+
(token[2:], token[0] if token[0] != " " else None)
|
317 |
+
for token in d.compare(text1, text2)
|
318 |
+
]
|
319 |
+
return ret
|
320 |
+
|
321 |
+
def complete(sample, k, current_example):
|
322 |
prefix_tokens = tokenizer(sample)["input_ids"][:k]
|
323 |
prefix = tokenizer.decode(prefix_tokens)
|
|
|
324 |
output = prefix
|
325 |
for token in client.text_generation(prefix, do_sample=False, max_new_tokens=512, stream=True):
|
326 |
if token == "<|endoftext|>":
|
327 |
+
bleu_score = {"Memorization score (BLEU)": bleu.compute(predictions=[output],
|
328 |
+
references=[current_example])["bleu"]}
|
329 |
+
return diff(output, current_example), gr.Label.update(value=bleu_score), current_example
|
330 |
output += token
|
331 |
+
bleu_score = {"Memorization score (BLEU)": bleu.compute(predictions=[output],
|
332 |
+
references=[current_example])["bleu"]}
|
333 |
+
yield diff(output, current_example), gr.Label.update(value=bleu_score), current_example
|
334 |
+
# yield output, diff_texts(output, sample), gr.Label.update(value=bleu_score)
|
335 |
+
bleu_score = {"Memorization score (BLEU)": bleu.compute(predictions=[output],
|
336 |
+
references=[current_example])["bleu"]}
|
337 |
+
# return output, diff_texts(output, sample), gr.Label.update(value=bleu_score)
|
338 |
+
return diff(output, current_example), gr.Label.update(value=bleu_score), current_example
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
339 |
|
340 |
+
|
341 |
+
def df_select(evt: gr.SelectData, current_example):
|
342 |
+
# TODO: FIND A WAY TO UPDATE CURRENT_EXAMPLE, SAMPLE_MAX AND SAMPLE_MED
|
343 |
+
instruction = evt.value
|
344 |
+
max_tokens = get_max(instruction)
|
345 |
+
prefix_tokens = tokenizer(instruction)["input_ids"][:DEFAULT_K]
|
346 |
+
prefix = tokenizer.decode(prefix_tokens)
|
347 |
+
return prefix, instruction, gr.Slider.update(maximum=max_tokens), gr.HTML.update(value="")
|
348 |
+
|
349 |
+
def get_max(current_example):
|
350 |
+
tokens = tokenizer(current_example)["input_ids"]
|
351 |
+
return len(tokens)
|
352 |
+
|
353 |
+
def mirror(example_key, current_example):
|
354 |
+
instruction = examples[example_key]
|
355 |
+
max_tokens = get_max(instruction)
|
356 |
+
prefix_tokens = tokenizer(instruction)["input_ids"][:DEFAULT_K]
|
357 |
+
prefix = tokenizer.decode(prefix_tokens)
|
358 |
+
return prefix, instruction, gr.Slider.update(maximum=max_tokens), gr.HTML.update(value="")
|
359 |
+
|
360 |
+
DEFAULT_SAMPLE = examples["High memorization sample 1"]
|
361 |
+
DEFAULT_SAMPLE_MAX_TOKENS = get_max(DEFAULT_SAMPLE)
|
362 |
+
DEFAULT_SAMPLE_PREFIX = tokenizer.decode(tokenizer(DEFAULT_SAMPLE)["input_ids"][:DEFAULT_K])
|
363 |
|
364 |
style = theme.Style()
|
365 |
|
366 |
+
with gr.Blocks(theme=style, css=modifs) as demo:
|
367 |
+
current_example = gr.State(value=DEFAULT_SAMPLE)
|
368 |
with gr.Column():
|
369 |
gr.Markdown(title)
|
370 |
with gr.Row():
|
371 |
with gr.Column():
|
372 |
+
gr.Markdown(description, line_breaks=True)
|
373 |
with gr.Accordion("Learn more about memorization definition", open=False):
|
374 |
gr.Markdown(memorization_definition)
|
375 |
with gr.Row():
|
376 |
with gr.Column():
|
377 |
instruction = gr.Textbox(
|
378 |
+
id="instruction",
|
379 |
+
placeholder="Output",
|
380 |
lines=5,
|
381 |
+
label="Prompt",
|
382 |
+
value=DEFAULT_SAMPLE_PREFIX,
|
383 |
+
disable=True,
|
384 |
+
interactive=False,
|
385 |
)
|
386 |
|
387 |
with gr.Column():
|
388 |
+
label = gr.Label(value={"Memorization score (BLEU)": 0},label="BLEU")
|
389 |
+
with gr.Accordion("What is BLEU?", open=False): # NOTE - THIS WEIRDLY BREAKS EVERYTHING IF I UNCOMMENT
|
390 |
+
gr.Markdown("""[BLEU](https://huggingface.co/spaces/evaluate-metric/bleu) score is a metric that can be used to measure the similarity of two sentences.
|
391 |
+
Here, the higher the BLEU score, the more likely the model will learn the example by heart.
|
392 |
+
You can reduce the Prefix size in the Advanced parameters to reduce the context length and see if the model still extracts the training sample.""")
|
393 |
with gr.Row():
|
394 |
with gr.Column():
|
395 |
+
with gr.Accordion("Prompt size", open=True):
|
396 |
+
k = gr.Slider(minimum=1, maximum=DEFAULT_SAMPLE_MAX_TOKENS, value=DEFAULT_K,
|
397 |
+
step=1,
|
398 |
+
label="Prompt size",
|
399 |
info="""Number of tokens used in the prompt.
|
400 |
Lower (higher) levels reduce (increase) the risk of memorization, as large context length increase memorization risks.""")
|
401 |
submit = gr.Button("Check", variant="primary")
|
402 |
+
examples_dropdown = gr.Dropdown(choices=list(examples.keys()), value=list(examples.keys())[0],
|
403 |
+
interactive=True,
|
404 |
+
label="Training set samples")
|
|
|
|
|
|
|
405 |
with gr.Column():
|
406 |
+
# with gr.Row():
|
407 |
+
# output = gr.Textbox(lines=5, label="Completion", interactive=False)
|
408 |
+
diff_HTML = gr.HTML(
|
409 |
+
label="Diff")
|
410 |
|
411 |
with gr.Row():
|
412 |
with gr.Column():
|
|
|
415 |
To try other examples from The Stack, you can browse the table below and select different training samples to re-run the checker with to assess their memorization score.""")
|
416 |
with gr.Accordion("More samples", open=False):
|
417 |
table = gr.DataFrame(value=df, row_count=5, label="Samples from The Stack", interactive=False)
|
418 |
+
def update_x(current_example, k):
|
419 |
+
int_k = int(k)
|
420 |
+
tokens = tokenizer(current_example)["input_ids"][:int_k]
|
421 |
+
prefix = tokenizer.decode(tokens)
|
422 |
+
return current_example, prefix
|
423 |
+
|
424 |
+
k.input(update_x, inputs=[current_example, k], outputs=[current_example, instruction])
|
425 |
+
examples_dropdown.input(mirror, inputs=[examples_dropdown, current_example],
|
426 |
+
outputs=[instruction, current_example, k, diff_HTML])
|
427 |
submit.click(
|
428 |
complete,
|
429 |
+
inputs=[instruction, k, current_example],
|
430 |
+
outputs=[diff_HTML, label, current_example],
|
431 |
)
|
432 |
+
table.select(fn=df_select, inputs=current_example, outputs=[instruction, current_example, k, diff_HTML])
|
433 |
demo.queue(concurrency_count=16).launch(debug=True)
|