Make file-submission instructions task-specific & other small changes
- app.py +29 -27
- requirements.txt +2 -1
- src/content.py +4 -2
- src/evaluation/base_task_metrics.py +1 -3
- src/evaluation/commit_message_generation/cmg_metrics.py +6 -17
- src/formatting.py +1 -3
- src/get_results_for_task.py +13 -21
- src/leaderboard_formatting.py +1 -5
- src/submission_uploader.py +23 -66
- src/{tasks.py → tasks_content.py} +20 -6
app.py
CHANGED
```diff
@@ -2,17 +2,27 @@ import logging
 import os
 
 import gradio as gr  # type: ignore[import]
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import HfApi
 
-from src.content import (
-    …
-)
+from src.content import (
+    INTRODUCTION_TEXT,
+    INTRODUCTION_TITLE,
+    LEADERBOARD_TEXT,
+    LEADERBOARD_TITLE,
+    SUBMISSION_TEXT_FILES,
+    SUBMISSION_TEXT_INTRO,
+    SUBMISSION_TEXT_METADATA,
+    SUBMISSION_TEXT_SUBMIT,
+    SUBMISSION_TEXT_TASK,
+    SUBMISSION_TITLE,
+)
 from src.get_results_for_task import get_results_for_task
 from src.submission_uploader import SubmissionUploader
-from src.tasks import …
+from src.tasks_content import (
+    TASKS_DESCRIPTIONS,
+    TASKS_PRETTY,
+    TASKS_PRETTY_REVERSE,
+    get_submission_text_files_for_task,
+)
 
 logging.basicConfig(
     level=logging.INFO,
@@ -23,35 +33,28 @@ logging.basicConfig(
 submission_uploader = SubmissionUploader(os.environ["DATASET_ID"])
 
 
-def restart_space():
-    HfApi(token=os.environ["HF_TOKEN"]).restart_space(
-        repo_id="JetBrains-Research/long-code-arena", token=os.environ["HF_TOKEN"]
-    )
-
-
 with gr.Blocks() as demo:
+    # intro
     gr.HTML(INTRODUCTION_TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
+    # leaderboard
     gr.HTML(LEADERBOARD_TITLE)
     gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")
-
     with gr.Tabs():
-        for …
-            with gr.TabItem(…
+        for task_pretty in TASKS_PRETTY_REVERSE:
+            with gr.TabItem(task_pretty):
                 with gr.Row():
-                    gr.Markdown(TASKS_DESCRIPTIONS[…
+                    gr.Markdown(TASKS_DESCRIPTIONS[TASKS_PRETTY_REVERSE[task_pretty]])
 
-                leaderboard_table = gr.components.Dataframe(
-                    value=get_results_for_task(task), interactive=False
-                )
+                leaderboard_table = gr.components.Dataframe(value=get_results_for_task(task_pretty), interactive=False)
 
+    # submission
     gr.HTML(SUBMISSION_TITLE)
     gr.Markdown(SUBMISSION_TEXT_INTRO, elem_classes="markdown-text")
-
     with gr.Accordion("🚀 Submit new results"):
         gr.Markdown(SUBMISSION_TEXT_TASK, elem_classes="markdown-text")
-        …
+        task_selection = gr.Radio(TASKS_PRETTY_REVERSE.keys(), label="Task")
 
         gr.Markdown(SUBMISSION_TEXT_METADATA, elem_classes="markdown-text")
         with gr.Row():
@@ -91,6 +94,8 @@ with gr.Blocks() as demo:
             )
 
         gr.Markdown(SUBMISSION_TEXT_FILES, elem_classes="markdown-text")
+        task_specific_instructions = gr.Markdown(get_submission_text_files_for_task(None))
+        task_selection.select(get_submission_text_files_for_task, [task_selection], task_specific_instructions)
         file_output = gr.File(file_count="multiple")
 
         gr.Markdown(SUBMISSION_TEXT_SUBMIT, elem_classes="markdown-text")
@@ -99,7 +104,7 @@ with gr.Blocks() as demo:
         submit_button.click(
             submission_uploader.upload_files,
             [
-                …
+                task_selection,
                 model_folder_textbox,
                 model_name_textbox,
                 model_availability_textbox,
@@ -112,7 +117,4 @@ with gr.Blocks() as demo:
         )
 
 if __name__ == "__main__":
-    scheduler = BackgroundScheduler()
-    scheduler.add_job(restart_space, "interval", seconds=30 * 60)
-    scheduler.start()
     demo.launch()
```
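The heart of this change is the pair of new `app.py` lines wiring a `gr.Radio` to a `gr.Markdown`: selecting a task re-renders the file-submission instructions. Below is a minimal standalone sketch of that pattern, assuming a recent Gradio release; `TASKS` and `instructions_for` are illustrative stand-ins for `TASKS_PRETTY_REVERSE` and `get_submission_text_files_for_task`, not the real implementations.

```python
import gradio as gr

# Hypothetical stand-in for TASKS_PRETTY_REVERSE.keys()
TASKS = ["Commit Message Generation", "Bug Localization on Issue"]


def instructions_for(task_pretty):
    # Stand-in for get_submission_text_files_for_task: returns the Markdown
    # shown under the "Attach files" step.
    if not task_pretty:
        return "Please, select a task to see submission instructions."
    return f"**{task_pretty} Instructions:** attach predictions in JSONLines format."


with gr.Blocks() as demo:
    task_selection = gr.Radio(TASKS, label="Task")
    task_specific_instructions = gr.Markdown(instructions_for(None))
    # On selection, Gradio passes the Radio's current value to the callback and
    # writes the returned string back into the Markdown component.
    task_selection.select(instructions_for, [task_selection], task_specific_instructions)

if __name__ == "__main__":
    demo.launch()
```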
requirements.txt
CHANGED
```diff
@@ -1,8 +1,9 @@
 huggingface_hub
 jsonlines
 pandas
+gradio
+datasets
 tqdm
-apscheduler
 # CMG metrics
 evaluate
 bert-score
```
src/content.py
CHANGED
```diff
@@ -1,3 +1,5 @@
+from .formatting import styled_warning
+
 # ================================
 # =            ABOUT             =
 # ================================
@@ -25,9 +27,9 @@ SUBMISSION_TEXT_TASK = """1. Select a task you want to submit results for."""
 SUBMISSION_TEXT_METADATA = """2. Fill in some metadata about your submission."""
 
 SUBMISSION_TEXT_FILES = """3. Attach one or more files with your model's predictions.
-* If several files are attached, they will be treated as separate runs of the submitted model (e.g., with different seeds), and the metrics will be averaged across runs. For baselines provided by 🏟️ Long Code Arena Team, the results are averaged across 3 runs.
-* Please, attach files in [JSONLines format](https://jsonlines.org/). For an example, check the predictions provided by 🏟️ Long Code Arena Team in 🤗 [JetBrains-Research/lca-results](https://huggingface.co/datasets/JetBrains-Research/lca-results). Make sure to include `"prediction"` and `"reference"` fields for each example, the rest are optional.
+* If several files are attached, they will be treated as separate runs of the submitted model (e.g., with different seeds), and the metrics will be averaged across runs. For baselines provided by 🏟️ Long Code Arena Team, the results are averaged across 3 runs.
 """
+
 SUBMISSION_TEXT_SUBMIT = """All set! A new PR to 🤗 [JetBrains-Research/lca-results](https://huggingface.co/datasets/JetBrains-Research/lca-results) should be opened when you press "Submit" button. 🏟️ Long Code Arena Team will review it shortly, and the results will appear in the leaderboard.
 
 ⏳ **Note:** It might take some time (up to 40 minutes) for PR to get created, since it involves computing metrics for your submission."""
```
src/evaluation/base_task_metrics.py
CHANGED
```diff
@@ -7,9 +7,7 @@ class BaseTaskMetrics(ABC):
         pass
 
     @abstractmethod
-    def add_batch(
-        self, predictions: List[str], references: List[str], *args, **kwargs
-    ) -> None:
+    def add_batch(self, predictions: List[str], references: List[str], *args, **kwargs) -> None:
         pass
 
     @abstractmethod
```
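For context, `BaseTaskMetrics` is the abstract interface that task metrics such as `CMGMetrics` below implement. A hedged sketch of a custom implementation, based only on the methods visible in this diff (`reset`, `add_batch`, `compute`); the exact abstract method set may differ, and `ExactMatchMetrics` is a made-up example, not part of the repository.

```python
from typing import Dict, List

from src.evaluation.base_task_metrics import BaseTaskMetrics


class ExactMatchMetrics(BaseTaskMetrics):
    """Hypothetical metric: percentage of predictions that match the reference exactly."""

    def __init__(self):
        self._hits = 0
        self._total = 0

    def reset(self) -> None:
        self._hits = 0
        self._total = 0

    def add_batch(self, predictions: List[str], references: List[str], *args, **kwargs) -> None:
        # Accumulate exact-match counts batch by batch.
        for pred, ref in zip(predictions, references):
            self._hits += int(pred.strip() == ref.strip())
            self._total += 1

    def compute(self, *args, **kwargs) -> Dict[str, float]:
        return {"exact_match": 100.0 * self._hits / max(self._total, 1)}
```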
src/evaluation/commit_message_generation/cmg_metrics.py
CHANGED
```diff
@@ -13,27 +13,17 @@ class CMGMetrics(BaseTaskMetrics):
         self.bertscore = evaluate.load("bertscore")
         self.bertscore_normalized = evaluate.load("bertscore")
 
-    def add_batch(
-        self, predictions: List[str], references: List[str], *args, **kwargs
-    ) -> None:
-        self.bleu.add_batch(
-            predictions=predictions, references=[[ref] for ref in references]
-        )
-        self.chrf.add_batch(
-            predictions=predictions, references=[[ref] for ref in references]
-        )
+    def add_batch(self, predictions: List[str], references: List[str], *args, **kwargs) -> None:
+        self.bleu.add_batch(predictions=predictions, references=[[ref] for ref in references])
+        self.chrf.add_batch(predictions=predictions, references=[[ref] for ref in references])
         self.rouge.add_batch(predictions=predictions, references=references)
         self.bertscore.add_batch(predictions=predictions, references=references)
-        self.bertscore_normalized.add_batch(
-            predictions=predictions, references=references
-        )
+        self.bertscore_normalized.add_batch(predictions=predictions, references=references)
 
     def compute(self, *args, **kwargs) -> Dict[str, float]:
         rouge = self.rouge.compute()
         bertscore = self.bertscore.compute(lang="en")
-        bertscore_normalized = self.bertscore_normalized.compute(
-            lang="en", rescale_with_baseline=True
-        )
+        bertscore_normalized = self.bertscore_normalized.compute(lang="en", rescale_with_baseline=True)
         return {
             "bleu": self.bleu.compute(tokenize="13a")["score"],
             "chrf": self.chrf.compute()["score"],
@@ -41,6 +31,5 @@ class CMGMetrics(BaseTaskMetrics):
             "rouge2": rouge["rouge2"] * 100,
             "rougeL": rouge["rougeL"] * 100,
             "bertscore": sum(bertscore["f1"]) / len(bertscore["f1"]),
-            "bertscore_normalized": sum(bertscore_normalized["f1"])
-            / len(bertscore_normalized["f1"]),
+            "bertscore_normalized": sum(bertscore_normalized["f1"]) / len(bertscore_normalized["f1"]),
         }
```
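Usage stays the same after the reflow: feed prediction/reference pairs with `add_batch`, then call `compute` once. A hedged sketch follows; the zero-argument constructor is assumed from this diff, and loading the underlying `evaluate` metrics (sacrebleu, chrF, rouge, bertscore) downloads them on first use, so this is illustrative rather than something to run in CI.

```python
from src.evaluation.commit_message_generation.cmg_metrics import CMGMetrics

# CMGMetrics() with no arguments is an assumption based on the diff above.
metrics = CMGMetrics()
metrics.add_batch(
    predictions=["Fix off-by-one error in pagination"],
    references=["Fix off-by-one bug in pagination logic"],
)
scores = metrics.compute()
# e.g. {"bleu": ..., "chrf": ..., "rouge2": ..., "rougeL": ..., "bertscore": ..., "bertscore_normalized": ...}
print(scores)
```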
src/formatting.py
CHANGED
```diff
@@ -7,6 +7,4 @@ def styled_warning(warn):
 
 
 def styled_message(message):
-    return (
-        f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
-    )
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
```
src/get_results_for_task.py
CHANGED
```diff
@@ -2,13 +2,15 @@ import logging
 import os
 
 import pandas as pd  # type: ignore[import]
-from datasets import (get_dataset_config_names,
-                      load_dataset)
+from datasets import get_dataset_config_names, load_dataset  # type: ignore[import]
 
-from .leaderboard_formatting import (
-    …
-)
+from .leaderboard_formatting import (
+    COLUMNS_PRETTY,
+    METRICS_PER_TASK,
+    SORT_COLUMN_PER_TASK,
+    get_columns_per_task,
+)
+from .tasks_content import TASKS_PRETTY_REVERSE
 
 AVAILABLE_TASKS = get_dataset_config_names(os.environ["DATASET_ID"])
 
@@ -44,27 +46,17 @@ def _get_results_stub() -> pd.DataFrame:
 
 
 def _get_results_dataset(task_id: str) -> pd.DataFrame:
-    results_df = load_dataset(
-        os.environ["DATASET_ID"], task_id, split="test"
-    ).to_pandas()
+    results_df = load_dataset(os.environ["DATASET_ID"], task_id, split="test").to_pandas()
     results_df = results_df.rename(columns=COLUMNS_PRETTY, errors="ignore")
-    results_df["Context Size"] = results_df["Context Size"].map(
-        lambda x: f"{int(x) // 1000}k" if int(x) >= 1000 else x
-    )
+    results_df["Context Size"] = results_df["Context Size"].map(lambda x: f"{int(x) // 1000}k" if int(x) >= 1000 else x)
 
-    results_df = results_df.sort_values(
-        by=SORT_COLUMN_PER_TASK[task_id], ascending=False
-    )
+    results_df = results_df.sort_values(by=SORT_COLUMN_PER_TASK[task_id], ascending=False)
 
     for metric_column in METRICS_PER_TASK[task_id]:
         if "BERTScore" in metric_column:
-            results_df[metric_column] = results_df[metric_column].map(
-                lambda x: f"{x:.5f}"
-            )
+            results_df[metric_column] = results_df[metric_column].map(lambda x: f"{x:.5f}")
         else:
-            results_df[metric_column] = results_df[metric_column].map(
-                lambda x: f"{x:.2f}"
-            )
+            results_df[metric_column] = results_df[metric_column].map(lambda x: f"{x:.2f}")
 
     results_df = results_df[get_columns_per_task(task_id)]
     return results_df
```
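The inlined lambdas are behavior-preserving; for illustration, here is what the same formatting does on toy data (not the real leaderboard dataset):

```python
import pandas as pd

# Toy frame; column names mirror the leaderboard, values are invented.
df = pd.DataFrame({"Context Size": [512, 16000], "ROUGE-1": [24.318, 27.9]})
# Context sizes of 1000+ become "Nk"; metric columns get fixed precision.
df["Context Size"] = df["Context Size"].map(lambda x: f"{int(x) // 1000}k" if int(x) >= 1000 else x)
df["ROUGE-1"] = df["ROUGE-1"].map(lambda x: f"{x:.2f}")
print(df.to_dict(orient="list"))
# {'Context Size': [512, '16k'], 'ROUGE-1': ['24.32', '27.90']}
```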
src/leaderboard_formatting.py
CHANGED
```diff
@@ -35,8 +35,4 @@ SORT_COLUMN_PER_TASK = {"commit_message_generation": "ROUGE-1"}
 def get_columns_per_task(task_id: str) -> List[str]:
     metrics_per_task = METRICS_PER_TASK[task_id]
 
-    return (
-        ["Model Name", "Availability", "Context Size"]
-        + metrics_per_task
-        + ["Submitted By"]
-    )
+    return ["Model Name", "Availability", "Context Size"] + metrics_per_task + ["Submitted By"]
```
src/submission_uploader.py
CHANGED
```diff
@@ -11,7 +11,7 @@ from tqdm import tqdm
 
 from .evaluation import METRICS
 from .formatting import styled_error, styled_message, styled_warning
-from .tasks import TASKS_PRETTY_REVERSE
+from .tasks_content import TASKS_PRETTY_REVERSE
 
 
 class AlreadyExists(Exception):
@@ -34,17 +34,11 @@ class SubmissionUploader:
     def _get_previous_pr(self, pr_title: str) -> Optional[Discussion]:
         """Searches among discussions of dataset repo for a PR with the given title."""
         try:
-            discussions = self._api.get_repo_discussions(
-                repo_id=self._dataset_id, repo_type="dataset"
-            )
+            discussions = self._api.get_repo_discussions(repo_id=self._dataset_id, repo_type="dataset")
         except Exception:
             return None
         for discussion in discussions:
-            if (
-                discussion.status == "open"
-                and discussion.is_pull_request
-                and discussion.title == pr_title
-            ):
+            if discussion.status == "open" and discussion.is_pull_request and discussion.title == pr_title:
                 return discussion
         return None
 
@@ -79,41 +73,30 @@ class SubmissionUploader:
         ]
         return commit_operations
 
-    def _compute_metrics_for_predictions(
-        self, task_id: str, filenames: Optional[List[str]], temp_directory: str
-    ) -> None:
+    def _compute_metrics_for_predictions(self, task_id: str, filenames: List[str], temp_directory: str) -> None:
         metrics_module = METRICS[task_id]
-        assert (
-            metrics_module is not None
-        ), f"Computing metrics for {task_id} is not supported."
+        assert metrics_module is not None, f"Computing metrics for {task_id} is not supported."
         metrics_module.reset()
         open(os.path.join(temp_directory, "metrics.jsonl"), "w").close()
 
         # compute the metrics for each submitted file
         for filename in filenames:
             with jsonlines.open(filename, "r") as reader:
-                for example in tqdm(
-                    reader, desc=f"Computing metrics for {os.path.basename(filename)}"
-                ):
+                for example in tqdm(reader, desc=f"Computing metrics for {os.path.basename(filename)}"):
                     metrics_module.add_batch(
                         predictions=[example["prediction"]],
                         references=[example["reference"]],
                     )
             computed_metrics = metrics_module.compute()
             metrics_module.reset()
-            with jsonlines.open(
-                os.path.join(temp_directory, "metrics.jsonl"), "a"
-            ) as writer:
+            with jsonlines.open(os.path.join(temp_directory, "metrics.jsonl"), "a") as writer:
                 writer.write(computed_metrics)
 
         # aggregate the metrics over submitted files
-        with jsonlines.open(
-            os.path.join(temp_directory, "metrics.jsonl"), "r"
-        ) as reader:
+        with jsonlines.open(os.path.join(temp_directory, "metrics.jsonl"), "r") as reader:
             metrics_results = [line for line in reader]
         final_metrics_results = {
-            key: sum(entry[key] for entry in metrics_results) / len(metrics_results)
-            for key in metrics_results[0]
+            key: sum(entry[key] for entry in metrics_results) / len(metrics_results) for key in metrics_results[0]
         }
         with open(os.path.join(temp_directory, "final_metrics.json"), "w") as f:
             json.dump(final_metrics_results, f)
@@ -142,9 +125,7 @@ class SubmissionUploader:
             )
         final_results.update(metadata_dict)
 
-        with jsonlines.open(
-            os.path.join(temp_directory, "final_results.jsonl"), "w"
-        ) as writer:
+        with jsonlines.open(os.path.join(temp_directory, "final_results.jsonl"), "w") as writer:
             writer.write(final_results)
 
         return [
@@ -165,29 +146,17 @@ class SubmissionUploader:
         submitted_by: str,
         filenames: Optional[List[str]],
     ):
-        assert (
-            task_pretty and task_pretty in TASKS_PRETTY_REVERSE
-        ), "Please, select one of the supported tasks."
-        assert (
-            model_folder
-        ), "Please, specify non-empty name for a directory with a model's results."
+        assert task_pretty and task_pretty in TASKS_PRETTY_REVERSE, "Please, select one of the supported tasks."
+        assert model_folder, "Please, specify non-empty name for a directory with a model's results."
         assert model_name_pretty, "Please, specify non-empty name for a model."
-        assert (
-            model_availability
-        ), "Please, specify non-empty information about a model's availability."
-        assert (
-            context_size
-        ), "Please, specify non-empty information about a model's context size."
+        assert model_availability, "Please, specify non-empty information about a model's availability."
+        assert context_size, "Please, specify non-empty information about a model's context size."
         try:
            _ = int(context_size)
         except:
-            raise ValueError(
-                "Please, specify a model's context size as an integer (e.g., 16000)."
-            )
+            raise ValueError("Please, specify a model's context size as an integer (e.g., 16000).")
 
-        assert (
-            submitted_by
-        ), "Please, specify non-empty information about a submission's author(s)."
+        assert submitted_by, "Please, specify non-empty information about a submission's author(s)."
         assert filenames, "Please, attach at least one file with predictions."
 
     def upload_files(
@@ -221,25 +190,16 @@ class SubmissionUploader:
 
         logging.info("Checking if this request has already been submitted...")
         if not force:
-            if model_name_pretty in self._fs.ls(
-                f"datasets/{self._dataset_id}/{task_id}/predictions"
-            ) and all(
-                filename
-                in self._fs.ls(
-                    f"datasets/{self._dataset_id}/{task_id}/predictions/{model_name_pretty}"
-                )
-                for filename in filenames + ["metadata.json"]
+            if model_name_pretty in self._fs.ls(f"datasets/{self._dataset_id}/{task_id}/predictions") and all(
+                filename in self._fs.ls(f"datasets/{self._dataset_id}/{task_id}/predictions/{model_name_pretty}")
+                for filename in filenames
             ):
-                return styled_warning(
-                    f"{model_name_pretty} is already present in {self._dataset_id}."
-                )
+                return styled_warning(f"{model_name_pretty} is already present in {self._dataset_id}.")
 
             prev_pr = self._get_previous_pr(pr_title)
             if prev_pr is not None:
                 url = f"https://huggingface.co/datasets/{self._dataset_id}/discussions/{prev_pr.num}"
-                return styled_warning(
-                    f"{self._dataset_id} already has an open PR for this submission: {url}."
-                )
+                return styled_warning(f"{self._dataset_id} already has an open PR for this submission: {url}.")
 
         logging.info("Processing predictions...")
         predictions_commit_operations = self._upload_predictions(
@@ -250,9 +210,7 @@ class SubmissionUploader:
 
         with TemporaryDirectory() as d:
             logging.info("Computing metrics...")
-            self._compute_metrics_for_predictions(
-                task_id=task_id, filenames=filenames, temp_directory=str(d)
-            )
+            self._compute_metrics_for_predictions(task_id=task_id, filenames=filenames, temp_directory=str(d))
 
             logging.info("Processing results...")
             results_commit_operations = self._upload_results(
@@ -269,8 +227,7 @@ class SubmissionUploader:
             logging.info("Creating commit...")
             new_pr = self._api.create_commit(
                 repo_id=self._dataset_id,
-                operations=predictions_commit_operations
-                + results_commit_operations,
+                operations=predictions_commit_operations + results_commit_operations,
                 commit_message=pr_title,
                 commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}""",
                 create_pr=True,
```
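Two details worth noting for submitters: each attached predictions file must be JSONLines with at least a `"prediction"` and a `"reference"` field per record, and the per-file (per-run) metric dicts are averaged key-by-key before being written to `final_metrics.json`. A toy illustration of that aggregation step, with invented values:

```python
# One line of a valid predictions file (JSONLines):
# {"prediction": "Fix typo in README", "reference": "Fix typo in README.md"}

metrics_results = [
    {"bleu": 10.0, "chrf": 30.0},  # metrics computed for run 1 (invented)
    {"bleu": 12.0, "chrf": 34.0},  # metrics computed for run 2 (invented)
]
# Same dict comprehension as in _compute_metrics_for_predictions above.
final_metrics_results = {
    key: sum(entry[key] for entry in metrics_results) / len(metrics_results) for key in metrics_results[0]
}
print(final_metrics_results)  # {'bleu': 11.0, 'chrf': 32.0}
```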
src/{tasks.py → tasks_content.py}
RENAMED
```diff
@@ -1,3 +1,5 @@
+from typing import Optional
+
 TASKS_PRETTY = {
     "commit_message_generation": "Commit Message Generation",
     "bug_localization": "Bug Localization on Issue",
@@ -9,7 +11,7 @@ TASKS_PRETTY = {
 TASKS_PRETTY_REVERSE = {value: key for key, value in TASKS_PRETTY.items()}
 
 TASKS_DESCRIPTIONS = {
-    "commit_message_generation": …
+    "commit_message_generation": """# Commit Message Generation\n
 
 Our Commit Message Generation benchmark 🤗 [JetBrains-Research/lca-cmg](https://huggingface.co/datasets/JetBrains-Research/lca-cmg) includes 163 manually curated commits from Python projects.
 
@@ -21,9 +23,21 @@ TASKS_DESCRIPTIONS = {
 
 For further details on the dataset and the baselines from 🏟️ Long Code Arena Team, refer to `commit_message_generation` folder in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines) or to our preprint (TODO).
 """,
-    "bug_localization": …,
-    "module_to_text": …,
-    "library_usage": …,
-    "project_code_completion": …,
-    "bug_localization_build_logs": …,
+    "bug_localization": "cool description for Bug Localization on Issue task",
+    "module_to_text": "cool description for Module-to-Text task",
+    "library_usage": "cool description for Library Usage Examples Generation task",
+    "project_code_completion": "cool description for Project-level Code Completion task",
+    "bug_localization_build_logs": "cool description for Bug Localization on Build Logs task",
 }
+
+
+def get_submission_text_files_for_task(task_pretty: Optional[str]) -> str:
+    if not task_pretty:
+        return "Please, select a specific task to see more detailed instructions regarding submitting files."
+
+    task_id = TASKS_PRETTY_REVERSE[task_pretty]
+
+    if task_id == "commit_message_generation":
+        return f"""**{task_pretty} Instructions:**\n\n* Please, attach files in [JSONLines format](https://jsonlines.org/). For an example, check the predictions provided by 🏟️ Long Code Arena Team in 🤗 [JetBrains-Research/lca-results](https://huggingface.co/datasets/JetBrains-Research/lca-results/tree/main/commit_message_generation/predictions). Make sure to include `"prediction"` and `"reference"` fields for each example, the rest are optional."""
+
+    return f"**{task_pretty} Instructions:**\n\n* 🚧 There are no instructions for the current task yet."
```