Upload 3 files
- .gitattributes +1 -0
- app.py +223 -0
- requirements.txt +5 -0
- samples.csv +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+samples.csv filter=lfs diff=lfs merge=lfs -text
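The added rule routes samples.csv through Git LFS, which is why the samples.csv entry at the bottom of this diff shows only an LFS pointer rather than the data itself, and presumably why a commit titled "Upload 3 files" ends up touching four files: the Hub adds the tracking rule for the large CSV alongside the uploaded files. For reference, a minimal sketch (not taken from this repo) of producing an equivalent single commit with the huggingface_hub client; the Space id is a placeholder and the token is assumed to come from a prior login.

from huggingface_hub import HfApi, CommitOperationAdd

# Hypothetical Space id; the real one is not shown in this diff.
REPO_ID = "username/space-name"

api = HfApi()  # assumes a token from `huggingface-cli login` or the HF_TOKEN env var
api.create_commit(
    repo_id=REPO_ID,
    repo_type="space",
    operations=[
        CommitOperationAdd(path_in_repo=p, path_or_fileobj=p)
        for p in ["app.py", "requirements.txt", "samples.csv"]
    ],
    commit_message="Upload 3 files",
)
# Files matching an LFS rule in .gitattributes (here, the ~47 MB samples.csv)
# are uploaded as LFS objects, so only a pointer lands in the repository history.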
app.py
ADDED
@@ -0,0 +1,223 @@
import gradio as gr
import pandas as pd
import os
from huggingface_hub import InferenceClient, login
from transformers import AutoTokenizer
import evaluate

bleu = evaluate.load("bleu")

HF_TOKEN = os.environ.get("HF_TOKEN", None)
print(HF_TOKEN)
client = InferenceClient(model="bigcode/starcoder", token=HF_TOKEN)

login(token=HF_TOKEN)
checkpoint = "bigcode/starcoder"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_auth_token=True)

df = pd.read_csv("samples.csv")
sample_df = df.loc[~df.prediction_50.isna()]

description = "<h1 style='text-align: center; color: #333333; font-size: 40px;'>StarCoder Memorization Verifier</h1>"
high_bleu_examples = {
    "Example 1": """from django.contrib import admin
from .models import SearchResult

# Register your models here.
class SearchResultAdmin(admin.ModelAdmin):
    fields = ["query", "heading", "url", "text"]

admin.site.register(SearchResult, SearchResultAdmin)""",

    "Example 2": """class Solution:
    def finalPrices(self, prices: List[int]) -> List[int]:
        res = []
        for i in range(len(prices)):
            for j in range(i+1,len(prices)):
                if prices[j]<=prices[i]:
                    res.append(prices[i]-prices[j])
                    break
                if j==len(prices)-1:
                    res.append(prices[i])
        res.append(prices[-1])
        return res""",
    "Example 3": """from data_collection.management.commands import BaseXpressDemocracyClubCsvImporter

class Command(BaseXpressDemocracyClubCsvImporter):
    council_id = 'E06000027'
    addresses_name = 'parl.2017-06-08/Version 1/Torbay Democracy_Club__08June2017.tsv'
    stations_name = 'parl.2017-06-08/Version 1/Torbay Democracy_Club__08June2017.tsv'
    elections = ['parl.2017-06-08']
    csv_delimiter = '\t'
"""
}

low_bleu_examples = {
    "Example 1": """from zeit.cms.i18n import MessageFactory as _
import zope.interface
import zope.schema


class IGlobalSettings(zope.interface.Interface):
    \"""Global CMS settings.\"""

    default_year = zope.schema.Int(
        title=_("Default year"),
        min=1900,
        max=2100)

    default_volume = zope.schema.Int(
        title=_("Default volume"),
        min=1,
        max=54)

    def get_working_directory(template):
        \"""Return the collection which is the main working directory.

        template:
            Template which will be filled with year and volume. In
            ``template`` the placeholders $year and $volume will be replaced.
            Example: 'online/$year/$volume/foo'

        If the respective collection does not exist, it will be created before
        returning it.

        \"""
""",
    "Example 2": """# -*- coding: utf-8 -*-

\"""Context managers implemented for (mostly) internal use\"""

import contextlib
import functools
from io import UnsupportedOperation
import os
import sys


__all__ = ["RedirectStdout", "RedirectStderr"]


@contextlib.contextmanager
def _stdchannel_redirected(stdchannel, dest_filename, mode="w"):
    \"""
    A context manager to temporarily redirect stdout or stderr

    Originally by Marc Abramowitz, 2013
    (http://marc-abramowitz.com/archives/2013/07/19/python-context-manager-for-redirected-stdout-and-stderr/)
    \"""

    oldstdchannel = None
    dest_file = None
    try:
        if stdchannel is None:
            yield iter([None])
        else:
            oldstdchannel = os.dup(stdchannel.fileno())
            dest_file = open(dest_filename, mode)
            os.dup2(dest_file.fileno(), stdchannel.fileno())
            yield
    except (UnsupportedOperation, AttributeError):
        yield iter([None])
    finally:
        if oldstdchannel is not None:
            os.dup2(oldstdchannel, stdchannel.fileno())
        if dest_file is not None:
            dest_file.close()


RedirectStdout = functools.partial(_stdchannel_redirected, sys.stdout)
RedirectStderr = functools.partial(_stdchannel_redirected, sys.stderr)
RedirectNoOp = functools.partial(_stdchannel_redirected, None, "")
""",
    "Example 3": """\"""Utils for criterion.\"""
import torch
import torch.nn.functional as F


def normalize(x, axis=-1):
    \"""Performs L2-Norm.\"""
    num = x
    denom = torch.norm(x, 2, axis, keepdim=True).expand_as(x) + 1e-12
    return num / denom


# Source : https://github.com/earhian/Humpback-Whale-Identification-1st-/blob/master/models/triplet_loss.py
def euclidean_dist(x, y):
    \"""Computes Euclidean distance.\"""
    m, n = x.size(0), y.size(0)
    xx = torch.pow(x, 2).sum(1, keepdim=True).expand(m, n)
    yy = torch.pow(x, 2).sum(1, keepdim=True).expand(m, m).t()
    dist = xx + yy - 2 * torch.matmul(x, y.t())

    dist = dist.clamp(min=1e-12).sqrt()

    return dist


def cosine_dist(x, y):
    \"""Computes Cosine Distance.\"""
    x = F.normalize(x, dim=1)
    y = F.normalize(y, dim=1)
    dist = 2 - 2 * torch.mm(x, y.t())
    return dist
"""
}

def complete(sample, k):
    # Prompt the model with the first k tokens of the submitted sample.
    prefix_tokens = tokenizer(sample)["input_ids"][:k]
    prefix = tokenizer.decode(prefix_tokens)

    output = prefix
    # Stream a greedy completion, updating the completion box and the BLEU score
    # (computed against the original sample) after every generated token.
    for token in client.text_generation(prefix, do_sample=False, max_new_tokens=512, stream=True):
        if token == "<|endoftext|>":
            bleu_score = {"BLEU": bleu.compute(predictions=[sample],
                                               references=[output])["bleu"]}
            yield output, gr.Label.update(value=bleu_score)
            return
        output += token
        bleu_score = {"BLEU": bleu.compute(predictions=[sample],
                                           references=[output])["bleu"]}
        yield output, gr.Label.update(value=bleu_score)
    bleu_score = {"BLEU": bleu.compute(predictions=[sample],
                                       references=[output])["bleu"]}
    yield output, gr.Label.update(value=bleu_score)

def high_bleu_mirror(x):
    output = high_bleu_examples[x]
    return output

def low_bleu_mirror(x):
    output = low_bleu_examples[x]
    return output

with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(description)
        with gr.Row():
            with gr.Column():
                instruction = gr.Textbox(
                    placeholder="Enter your code here",
                    lines=5,
                    label="Original",
                )

                with gr.Accordion("Advanced parameters", open=False):
                    k = gr.Slider(minimum=1, maximum=250, value=50)
                submit = gr.Button("Check", variant="primary")
                high_bleu_examples = gr.Examples(list(high_bleu_examples.keys()), label="High memorization samples",
                                                 inputs=instruction, outputs=instruction,
                                                 fn=high_bleu_mirror, cache_examples=True)
                low_bleu_examples = gr.Examples(list(low_bleu_examples.keys()), label="Low memorization samples",
                                                inputs=instruction, outputs=instruction,
                                                fn=low_bleu_mirror, cache_examples=True)
            with gr.Column():
                output = gr.Textbox(lines=5,
                                    label="Completion", interactive=False)
                label = gr.Label(value={"BLEU": 0},
                                 label="Similarity score (BLEU)")
        submit.click(
            complete,
            inputs=[instruction, k],
            outputs=[output, label],
        )
demo.queue(concurrency_count=16).launch(debug=True)
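The app above boils down to one check, implemented in complete(): take the first k tokens of a submitted code sample as a prompt, let StarCoder complete it greedily, and report BLEU between the original sample and the prefix-plus-completion as a memorization signal (the "High/Low memorization samples" example lists are meant to land near the two ends of that score). Below is a minimal sketch of the same check without the Gradio UI; it assumes an HF_TOKEN environment variable with access to bigcode/starcoder, network access to the hosted inference endpoint, and a huggingface_hub version that provides InferenceClient.

import os

import evaluate
from huggingface_hub import InferenceClient
from transformers import AutoTokenizer

# Same building blocks as app.py above.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
bleu = evaluate.load("bleu")
client = InferenceClient(model="bigcode/starcoder", token=HF_TOKEN)
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder", token=HF_TOKEN)


def memorization_score(sample: str, k: int = 50) -> float:
    """Prompt StarCoder with the first k tokens of `sample` and return BLEU
    between the original sample and the prefix plus the greedy completion."""
    prefix = tokenizer.decode(tokenizer(sample)["input_ids"][:k])
    completion = client.text_generation(prefix, do_sample=False, max_new_tokens=512)
    return bleu.compute(predictions=[sample], references=[prefix + completion])["bleu"]


if __name__ == "__main__":
    # Hypothetical input file; any code snippet longer than k tokens works.
    snippet = open("some_snippet.py").read()
    print(f"BLEU vs. original: {memorization_score(snippet):.3f}")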
requirements.txt
ADDED
@@ -0,0 +1,5 @@
evaluate==0.4.0
gradio==3.47.1
huggingface_hub==0.14.1
pandas==2.0.1
transformers==4.34.0
samples.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0e350eaf65087c4eee5db1305170e86ea2417bff2459a097d7c3169cf31251f9
size 46909037
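Because samples.csv is tracked by LFS (per the .gitattributes change above), the diff records only this pointer; the roughly 47 MB of data is stored out of band. The only part of its schema visible in this commit is the prediction_50 column that app.py filters on; a small sketch of that load step follows, with the column's meaning inferred rather than confirmed by the diff.

import pandas as pd

df = pd.read_csv("samples.csv")
# app.py keeps only rows with a non-null `prediction_50`, presumably completions
# generated from 50-token prefixes (matching the slider default k=50).
sample_df = df.loc[~df.prediction_50.isna()]
print(f"{len(sample_df)} of {len(df)} rows have a prediction_50 value")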