Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
pminervini
committed on
Merge branch 'main' of https://huggingface.co/spaces/pminervini/hallucinations-leaderboard into main
Browse files
src/backend/tasks/selfcheckgpt/task.py
CHANGED
@@ -36,7 +36,8 @@ class SelfCheckGpt(Task):
|
|
36 |
self.selfcheckgpt = SelfCheckMQAG(device=self.selfcheckgpt_device)
|
37 |
elif self.selfcheckgpt_type == 'SelfCheckNLI':
|
38 |
self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
|
39 |
-
|
|
|
40 |
def has_training_docs(self):
|
41 |
return False
|
42 |
|
@@ -105,6 +106,20 @@ class SelfCheckGpt(Task):
|
|
105 |
sentences = sentences,
|
106 |
sampled_passages = other_responses,
|
107 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
|
109 |
selfcheckgpt_scores_avg = sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) if len(selfcheckgpt_scores) > 0 else 0
|
110 |
selfcheckgpt_scores_max = max(selfcheckgpt_scores)
|
|
|
36 |
self.selfcheckgpt = SelfCheckMQAG(device=self.selfcheckgpt_device)
|
37 |
elif self.selfcheckgpt_type == 'SelfCheckNLI':
|
38 |
self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
|
39 |
+
self.SelfCheckNLI_error_cnt = 0
|
40 |
+
|
41 |
def has_training_docs(self):
|
42 |
return False
|
43 |
|
|
|
106 |
sentences = sentences,
|
107 |
sampled_passages = other_responses,
|
108 |
)
|
109 |
+
|
110 |
+
if len(selfcheckgpt_scores) == 0:
|
111 |
+
self.SelfCheckNLI_error_cnt += 1
|
112 |
+
print(f"SelfCheckNLI Warning.SelfCheckNLI_error_cnt:{self.SelfCheckNLI_error_cnt}. This instance is marked as hallucinated with 1.0.")
|
113 |
+
result = {'avg-selfcheckgpt': 1.0, 'max-selfcheckgpt': 1.0}
|
114 |
+
|
115 |
+
else:
|
116 |
+
threshold = 0.5
|
117 |
+
# The passage is marked hallucinated if any single sentence is hallucinated. This is a very strict criterion.
|
118 |
+
selfcheckgpt_scores_max = 1.0 if max(selfcheckgpt_scores) > threshold else 0.0
|
119 |
+
# The passage is marked hallucinated if the average score over all sentences exceeds the threshold.
|
120 |
+
selfcheckgpt_scores_avg = 1.0 if sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) > threshold else 0.0
|
121 |
+
result = {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
|
122 |
+
return result
|
123 |
|
124 |
selfcheckgpt_scores_avg = sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) if len(selfcheckgpt_scores) > 0 else 0
|
125 |
selfcheckgpt_scores_max = max(selfcheckgpt_scores)
|