Upload 3 files
- .gitattributes +1 -0
- app.py +223 -0
- requirements.txt +5 -0
- samples.csv +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+samples.csv filter=lfs diff=lfs merge=lfs -text
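The added rule routes samples.csv through Git LFS, which is why the samples.csv entry at the bottom of this diff shows only an LFS pointer rather than the data itself, and presumably why a commit titled "Upload 3 files" ends up touching four files: the Hub adds the tracking rule for the large CSV alongside the uploaded files. For reference, a minimal sketch (not taken from this repo) of producing an equivalent single commit with the huggingface_hub client; the Space id is a placeholder and the token is assumed to come from a prior login.

from huggingface_hub import HfApi, CommitOperationAdd

# Hypothetical Space id; the real one is not shown in this diff.
REPO_ID = "username/space-name"

api = HfApi()  # assumes a token from `huggingface-cli login` or the HF_TOKEN env var
api.create_commit(
    repo_id=REPO_ID,
    repo_type="space",
    operations=[
        CommitOperationAdd(path_in_repo=p, path_or_fileobj=p)
        for p in ["app.py", "requirements.txt", "samples.csv"]
    ],
    commit_message="Upload 3 files",
)
# Files matching an LFS rule in .gitattributes (here, the ~47 MB samples.csv)
# are uploaded as LFS objects, so only a pointer lands in the repository history.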
app.py
ADDED
@@ -0,0 +1,223 @@
import gradio as gr
import pandas as pd
import os
from huggingface_hub import InferenceClient, login
from transformers import AutoTokenizer
import evaluate

bleu = evaluate.load("bleu")

HF_TOKEN = os.environ.get("HF_TOKEN", None)
print(HF_TOKEN)
client = InferenceClient(model="bigcode/starcoder", token=HF_TOKEN)

login(token=HF_TOKEN)
checkpoint = "bigcode/starcoder"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_auth_token=True)

df = pd.read_csv("samples.csv")
sample_df = df.loc[~df.prediction_50.isna()]

description = "<h1 style='text-align: center; color: #333333; font-size: 40px;'>StarCoder Memorization Verifier</h1>"
high_bleu_examples = {
    "Example 1": """from django.contrib import admin
from .models import SearchResult

# Register your models here.
class SearchResultAdmin(admin.ModelAdmin):
    fields = ["query", "heading", "url", "text"]

admin.site.register(SearchResult, SearchResultAdmin)""",

    "Example 2": """class Solution:
    def finalPrices(self, prices: List[int]) -> List[int]:
        res = []
        for i in range(len(prices)):
            for j in range(i+1,len(prices)):
                if prices[j]<=prices[i]:
                    res.append(prices[i]-prices[j])
                    break
                if j==len(prices)-1:
                    res.append(prices[i])
        res.append(prices[-1])
        return res""",
    "Example 3": """from data_collection.management.commands import BaseXpressDemocracyClubCsvImporter

class Command(BaseXpressDemocracyClubCsvImporter):
    council_id = 'E06000027'
    addresses_name = 'parl.2017-06-08/Version 1/Torbay Democracy_Club__08June2017.tsv'
    stations_name = 'parl.2017-06-08/Version 1/Torbay Democracy_Club__08June2017.tsv'
    elections = ['parl.2017-06-08']
    csv_delimiter = '\t'
"""
}

low_bleu_examples = {
    "Example 1": """from zeit.cms.i18n import MessageFactory as _
import zope.interface
import zope.schema


class IGlobalSettings(zope.interface.Interface):
    \"""Global CMS settings.\"""

    default_year = zope.schema.Int(
        title=_("Default year"),
        min=1900,
        max=2100)

    default_volume = zope.schema.Int(
        title=_("Default volume"),
        min=1,
        max=54)

    def get_working_directory(template):
        \"""Return the collection which is the main working directory.

        template:
            Template which will be filled with year and volume. In
            ``template`` the placeholders $year and $volume will be replaced.
            Example: 'online/$year/$volume/foo'

        If the respective collection does not exist, it will be created before
        returning it.

        \"""
""",
    "Example 2": """# -*- coding: utf-8 -*-

\"""Context managers implemented for (mostly) internal use\"""

import contextlib
import functools
from io import UnsupportedOperation
import os
import sys


__all__ = ["RedirectStdout", "RedirectStderr"]


@contextlib.contextmanager
def _stdchannel_redirected(stdchannel, dest_filename, mode="w"):
    \"""
    A context manager to temporarily redirect stdout or stderr

    Originally by Marc Abramowitz, 2013
    (http://marc-abramowitz.com/archives/2013/07/19/python-context-manager-for-redirected-stdout-and-stderr/)
    \"""

    oldstdchannel = None
    dest_file = None
    try:
        if stdchannel is None:
            yield iter([None])
        else:
            oldstdchannel = os.dup(stdchannel.fileno())
            dest_file = open(dest_filename, mode)
            os.dup2(dest_file.fileno(), stdchannel.fileno())
            yield
    except (UnsupportedOperation, AttributeError):
        yield iter([None])
    finally:
        if oldstdchannel is not None:
            os.dup2(oldstdchannel, stdchannel.fileno())
        if dest_file is not None:
            dest_file.close()


RedirectStdout = functools.partial(_stdchannel_redirected, sys.stdout)
RedirectStderr = functools.partial(_stdchannel_redirected, sys.stderr)
RedirectNoOp = functools.partial(_stdchannel_redirected, None, "")
""",
    "Example 3": """\"""Utils for criterion.\"""
import torch
import torch.nn.functional as F


def normalize(x, axis=-1):
    \"""Performs L2-Norm.\"""
    num = x
    denom = torch.norm(x, 2, axis, keepdim=True).expand_as(x) + 1e-12
    return num / denom


# Source : https://github.com/earhian/Humpback-Whale-Identification-1st-/blob/master/models/triplet_loss.py
def euclidean_dist(x, y):
    \"""Computes Euclidean distance.\"""
    m, n = x.size(0), y.size(0)
    xx = torch.pow(x, 2).sum(1, keepdim=True).expand(m, n)
    yy = torch.pow(x, 2).sum(1, keepdim=True).expand(m, m).t()
    dist = xx + yy - 2 * torch.matmul(x, y.t())

    dist = dist.clamp(min=1e-12).sqrt()

    return dist


def cosine_dist(x, y):
    \"""Computes Cosine Distance.\"""
    x = F.normalize(x, dim=1)
    y = F.normalize(y, dim=1)
    dist = 2 - 2 * torch.mm(x, y.t())
    return dist
"""
}

def complete(sample, k):
    # Prompt the model with the first k tokens of the submitted sample.
    prefix_tokens = tokenizer(sample)["input_ids"][:k]
    prefix = tokenizer.decode(prefix_tokens)

    output = prefix
    # Stream a greedy completion, updating the completion box and the BLEU score
    # (computed against the original sample) after every generated token.
    for token in client.text_generation(prefix, do_sample=False, max_new_tokens=512, stream=True):
        if token == "<|endoftext|>":
            bleu_score = {"BLEU": bleu.compute(predictions=[sample],
                                               references=[output])["bleu"]}
            yield output, gr.Label.update(value=bleu_score)
            return
        output += token
        bleu_score = {"BLEU": bleu.compute(predictions=[sample],
                                           references=[output])["bleu"]}
        yield output, gr.Label.update(value=bleu_score)
    bleu_score = {"BLEU": bleu.compute(predictions=[sample],
                                       references=[output])["bleu"]}
    yield output, gr.Label.update(value=bleu_score)

def high_bleu_mirror(x):
    output = high_bleu_examples[x]
    return output

def low_bleu_mirror(x):
    output = low_bleu_examples[x]
    return output

with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(description)
        with gr.Row():
            with gr.Column():
                instruction = gr.Textbox(
                    placeholder="Enter your code here",
                    lines=5,
                    label="Original",
                )

                with gr.Accordion("Advanced parameters", open=False):
                    k = gr.Slider(minimum=1, maximum=250, value=50)
                submit = gr.Button("Check", variant="primary")
                high_bleu_examples = gr.Examples(list(high_bleu_examples.keys()), label="High memorization samples",
                                                 inputs=instruction, outputs=instruction,
                                                 fn=high_bleu_mirror, cache_examples=True)
                low_bleu_examples = gr.Examples(list(low_bleu_examples.keys()), label="Low memorization samples",
                                                inputs=instruction, outputs=instruction,
                                                fn=low_bleu_mirror, cache_examples=True)
            with gr.Column():
                output = gr.Textbox(lines=5,
                                    label="Completion", interactive=False)
                label = gr.Label(value={"BLEU": 0},
                                 label="Similarity score (BLEU)")
        submit.click(
            complete,
            inputs=[instruction, k],
            outputs=[output, label],
        )
demo.queue(concurrency_count=16).launch(debug=True)
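The app above boils down to one check, implemented in complete(): take the first k tokens of a submitted code sample as a prompt, let StarCoder complete it greedily, and report BLEU between the original sample and the prefix-plus-completion as a memorization signal (the "High/Low memorization samples" example lists are meant to land near the two ends of that score). Below is a minimal sketch of the same check without the Gradio UI; it assumes an HF_TOKEN environment variable with access to bigcode/starcoder, network access to the hosted inference endpoint, and a huggingface_hub version that provides InferenceClient.

import os

import evaluate
from huggingface_hub import InferenceClient
from transformers import AutoTokenizer

# Same building blocks as app.py above.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
bleu = evaluate.load("bleu")
client = InferenceClient(model="bigcode/starcoder", token=HF_TOKEN)
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder", token=HF_TOKEN)


def memorization_score(sample: str, k: int = 50) -> float:
    """Prompt StarCoder with the first k tokens of `sample` and return BLEU
    between the original sample and the prefix plus the greedy completion."""
    prefix = tokenizer.decode(tokenizer(sample)["input_ids"][:k])
    completion = client.text_generation(prefix, do_sample=False, max_new_tokens=512)
    return bleu.compute(predictions=[sample], references=[prefix + completion])["bleu"]


if __name__ == "__main__":
    # Hypothetical input file; any code snippet longer than k tokens works.
    snippet = open("some_snippet.py").read()
    print(f"BLEU vs. original: {memorization_score(snippet):.3f}")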
requirements.txt
ADDED
@@ -0,0 +1,5 @@
evaluate==0.4.0
gradio==3.47.1
huggingface_hub==0.14.1
pandas==2.0.1
transformers==4.34.0
samples.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0e350eaf65087c4eee5db1305170e86ea2417bff2459a097d7c3169cf31251f9
size 46909037
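Because samples.csv is tracked by LFS (per the .gitattributes change above), the diff records only this pointer; the roughly 47 MB of data is stored out of band. The only part of its schema visible in this commit is the prediction_50 column that app.py filters on; a small sketch of that load step follows, with the column's meaning inferred rather than confirmed by the diff.

import pandas as pd

df = pd.read_csv("samples.csv")
# app.py keeps only rows with a non-null `prediction_50`, presumably completions
# generated from 50-token prefixes (matching the slider default k=50).
sample_df = df.loc[~df.prediction_50.isna()]
print(f"{len(sample_df)} of {len(df)} rows have a prediction_50 value")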