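# StarCoder Memorization Verifier -- a Gradio Space that checks whether
# bigcode/starcoder can regenerate a training sample verbatim from a short
# prefix, scoring the greedy completion against the original with BLEU.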
import gradio as gr
import pandas as pd
import os
from huggingface_hub import InferenceClient, login
from transformers import AutoTokenizer
import evaluate
bleu = evaluate.load("bleu")

# The token comes from the Space secrets; it is needed both for the Inference
# API client and (via login) to download the gated StarCoder tokenizer.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
client = InferenceClient(model="bigcode/starcoder", token=HF_TOKEN)
login(token=HF_TOKEN)

checkpoint = "bigcode/starcoder"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_auth_token=True)

# Extra samples for the browsing table: the first 50 files of the bundled
# samples.csv, keeping only the file contents.
df = pd.read_csv("samples.csv")
df = df[["content"]].iloc[:50]
description = "<h1 style='text-align: center; color: #333333; font-size: 40px;'>StarCoder Memorization Verifier</h1>"
high_bleu_examples = {
    "Example 1": """from django.contrib import admin
from .models import SearchResult
# Register your models here.
class SearchResultAdmin(admin.ModelAdmin):
    fields = ["query", "heading", "url", "text"]
admin.site.register(SearchResult, SearchResultAdmin)""",
    "Example 2": """class Solution:
    def finalPrices(self, prices: List[int]) -> List[int]:
        res = []
        for i in range(len(prices)):
            for j in range(i+1,len(prices)):
                if prices[j]<=prices[i]:
                    res.append(prices[i]-prices[j])
                    break
                if j==len(prices)-1:
                    res.append(prices[i])
        res.append(prices[-1])
        return res""",
    "Example 3": """from data_collection.management.commands import BaseXpressDemocracyClubCsvImporter
class Command(BaseXpressDemocracyClubCsvImporter):
    council_id = 'E06000027'
    addresses_name = 'parl.2017-06-08/Version 1/Torbay Democracy_Club__08June2017.tsv'
    stations_name = 'parl.2017-06-08/Version 1/Torbay Democracy_Club__08June2017.tsv'
    elections = ['parl.2017-06-08']
    csv_delimiter = '\t'
"""
}
low_bleu_examples = {
    "Example 1": """from zeit.cms.i18n import MessageFactory as _
import zope.interface
import zope.schema
class IGlobalSettings(zope.interface.Interface):
    \"""Global CMS settings.\"""
    default_year = zope.schema.Int(
        title=_("Default year"),
        min=1900,
        max=2100)
    default_volume = zope.schema.Int(
        title=_("Default volume"),
        min=1,
        max=54)
    def get_working_directory(template):
        \"""Return the collection which is the main working directory.
        template:
            Template which will be filled with year and volume. In
            ``template`` the placeholders $year and $volume will be replaced.
            Example: 'online/$year/$volume/foo'
        If the respective collection does not exist, it will be created before
        returning it.
        \"""
""",
"Example 2": """# -*- coding: utf-8 -*- | |
\"""Context managers implemented for (mostly) internal use\""" | |
import contextlib | |
import functools | |
from io import UnsupportedOperation | |
import os | |
import sys | |
__all__ = ["RedirectStdout", "RedirectStderr"] | |
@contextlib.contextmanager | |
def _stdchannel_redirected(stdchannel, dest_filename, mode="w"): | |
\""" | |
A context manager to temporarily redirect stdout or stderr | |
Originally by Marc Abramowitz, 2013 | |
(http://marc-abramowitz.com/archives/2013/07/19/python-context-manager-for-redirected-stdout-and-stderr/) | |
\""" | |
oldstdchannel = None | |
dest_file = None | |
try: | |
if stdchannel is None: | |
yield iter([None]) | |
else: | |
oldstdchannel = os.dup(stdchannel.fileno()) | |
dest_file = open(dest_filename, mode) | |
os.dup2(dest_file.fileno(), stdchannel.fileno()) | |
yield | |
except (UnsupportedOperation, AttributeError): | |
yield iter([None]) | |
finally: | |
if oldstdchannel is not None: | |
os.dup2(oldstdchannel, stdchannel.fileno()) | |
if dest_file is not None: | |
dest_file.close() | |
RedirectStdout = functools.partial(_stdchannel_redirected, sys.stdout) | |
RedirectStderr = functools.partial(_stdchannel_redirected, sys.stderr) | |
RedirectNoOp = functools.partial(_stdchannel_redirected, None, "") | |
""", | |
"Example 3": """\"""Utils for criterion.\""" | |
import torch | |
import torch.nn.functional as F | |
def normalize(x, axis=-1): | |
\"""Performs L2-Norm.\""" | |
num = x | |
denom = torch.norm(x, 2, axis, keepdim=True).expand_as(x) + 1e-12 | |
return num / denom | |
# Source : https://github.com/earhian/Humpback-Whale-Identification-1st-/blob/master/models/triplet_loss.py | |
def euclidean_dist(x, y): | |
\"""Computes Euclidean distance.\""" | |
m, n = x.size(0), y.size(0) | |
xx = torch.pow(x, 2).sum(1, keepdim=True).expand(m, n) | |
yy = torch.pow(x, 2).sum(1, keepdim=True).expand(m, m).t() | |
dist = xx + yy - 2 * torch.matmul(x, y.t()) | |
dist = dist.clamp(min=1e-12).sqrt() | |
return dist | |
def cosine_dist(x, y): | |
\"""Computes Cosine Distance.\""" | |
x = F.normalize(x, dim=1) | |
y = F.normalize(y, dim=1) | |
dist = 2 - 2 * torch.mm(x, y.t()) | |
return dist | |
""" | |
} | |
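
# Completion loop: tokenize the sample, keep its first k tokens as the prompt,
# and greedily stream up to 512 new tokens from the Inference API. After each
# streamed token, BLEU between the original sample and the running completion
# is recomputed so the score label updates live.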
def complete(sample, k):
    prefix_tokens = tokenizer(sample)["input_ids"][:k]
    prefix = tokenizer.decode(prefix_tokens)
    output = prefix
    for token in client.text_generation(prefix, do_sample=False, max_new_tokens=512, stream=True):
        if token == "<|endoftext|>":
            bleu_score = {"BLEU": bleu.compute(predictions=[sample],
                                               references=[output])["bleu"]}
            # Yield rather than return: a value returned from a generator is
            # swallowed by StopIteration and never reaches the UI.
            yield output, gr.Label.update(value=bleu_score)
            return
        output += token
        bleu_score = {"BLEU": bleu.compute(predictions=[sample],
                                           references=[output])["bleu"]}
        yield output, gr.Label.update(value=bleu_score)
    bleu_score = {"BLEU": bleu.compute(predictions=[sample],
                                       references=[output])["bleu"]}
    yield output, gr.Label.update(value=bleu_score)
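
# The Examples widgets below only hold the example names; these mirrors map a
# selected name back to its full code sample.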
def high_bleu_mirror(x):
    return high_bleu_examples[x]

def low_bleu_mirror(x):
    return low_bleu_examples[x]

def df_select(evt: gr.SelectData):
    # Forward the clicked cell's content into the input textbox.
    return evt.value
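
# UI: original sample and streamed completion side by side, a prefix-size
# slider, cached example pickers, the live BLEU label, and a browsable table
# of additional samples from The Stack.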
with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(description)
        with gr.Row():
            with gr.Column():
                instruction = gr.Textbox(
                    placeholder="Enter your code here",
                    lines=5,
                    label="Original",
                )
            with gr.Column():
                output = gr.Textbox(lines=5, label="Completion", interactive=False)
    with gr.Row():
        with gr.Column():
            with gr.Accordion("Advanced parameters", open=False):
                k = gr.Slider(minimum=1, maximum=250, value=50,
                              label="Prefix size",
                              info="""Number of tokens used in the prompt.
                              Lower values reduce the risk of verbatim recall; longer prefixes increase it.""")
            submit = gr.Button("Check", variant="primary")
            # Do not bind these to high_bleu_examples/low_bleu_examples: the
            # mirror callbacks still need the dicts defined above.
            gr.Examples(list(high_bleu_examples.keys()), label="High memorization samples",
                        inputs=instruction, outputs=instruction,
                        fn=high_bleu_mirror, cache_examples=True)
            gr.Examples(list(low_bleu_examples.keys()), label="Low memorization samples",
                        inputs=instruction, outputs=instruction,
                        fn=low_bleu_mirror, cache_examples=True)
        with gr.Column():
            label = gr.Label(value={"BLEU": 0}, label="Memorization score (BLEU)")
            gr.Markdown("""The [BLEU](https://huggingface.co/spaces/evaluate-metric/bleu) score measures the similarity between two texts.
            Here, the higher the BLEU score, the more likely it is that the model has memorized the example.
            You can reduce the prefix size under Advanced parameters to shorten the context and check whether the model can still reproduce the training sample.""")
    with gr.Row():
        with gr.Column():
            gr.Markdown("""# More samples from The Stack
            The examples shown above come from [The Stack](https://huggingface.co/datasets/bigcode/the-stack-dedup), an open dataset of source code.
            To try other examples from The Stack, browse the table below and click on any training sample whose memorization score you wish to assess.""")
            with gr.Accordion("More samples", open=False):
                table = gr.DataFrame(value=df, row_count=5, label="Samples from The Stack", interactive=False)
    submit.click(
        complete,
        inputs=[instruction, k],
        outputs=[output, label],
    )
    table.select(fn=df_select, outputs=instruction)
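
# The queue is required for streaming generator outputs; concurrency_count is
# the Gradio 3.x way to allow up to 16 concurrent generation requests.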
demo.queue(concurrency_count=16).launch(debug=True)