Upload 2 files
Browse files- .gitattributes +1 -0
- alpaca_data_cleaned_spanish.json +3 -0
- rebuild-alpaca-dataset-argentina.py +81 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
alpaca_data_cleaned_spanish.json filter=lfs diff=lfs merge=lfs -text
|
alpaca_data_cleaned_spanish.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2bd962077eb4015c85350922869d96801d0d2b8ed56bc56fe9e4197926bb9ff1
|
3 |
+
size 26652389
|
rebuild-alpaca-dataset-argentina.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os,openai,json,time
|
2 |
+
|
3 |
+
def check_api_key_validity(api_key):
    """Install *api_key* on the ``openai`` module and verify it with a request.

    Assigning ``openai.api_key`` is a plain attribute write and cannot fail,
    so the key is checked by listing the available models, a call that is
    only accepted with working credentials.

    Args:
        api_key: The OpenAI API key to install and check.

    Raises:
        SystemExit: With status 1 when the verification request fails.
    """
    openai.api_key = api_key
    try:
        # Any authenticated endpoint works as a probe; listing models
        # generates no completion tokens.
        openai.Model.list()
    except openai.OpenAIError as e:
        print("Invalid OpenAI API key")
        # Show the underlying error so a network problem is not mistaken
        # for a rejected key.
        print("Reason: %s" % str(e))
        raise SystemExit(1)
    print("OpenAI API key is valid")
|
10 |
+
|
11 |
+
|
12 |
+
# gpt-3.5-turbo is much cheaper and more powerful than Davinci
|
13 |
+
def call_AI_chatGPT(prompt):
    """Send *prompt* to gpt-3.5-turbo and return the text of the first choice.

    The request is repeated until one succeeds: any ``Exception`` raised by
    an attempt is printed and the attempt is made again. Every attempt is
    preceded by a two-second pause.

    Args:
        prompt: The user message to send alongside the fixed system message.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    system_message = {
        'role': 'system',
        'content': 'You are an Argentine assistant. When given an instruction, answer the instruction in spanish and write the response of the instruction and nothing else. Use Argentine slang like boludo, che, pelotudo, gil, laburo, forro, sacarla barata, mala leche, al pedo, gil de cuarta, quilombo, estas en pedo, ya fue, etc. Do not always start with che or boludo.',
    }
    user_message = {'role': 'user', 'content': prompt}

    while True:
        try:
            # Pause before every attempt, including the first one.
            time.sleep(2)
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[system_message, user_message],
                temperature=1.0,
                max_tokens=1024,
                request_timeout=40,
            )
        except Exception as err:
            print("%s: Trying again..." % str(err))
        else:
            break

    # Extracted outside the retry loop so a malformed response is not retried.
    return response.choices[0]['message']['content']
|
33 |
+
|
34 |
+
# Load the API key: the OPENAI_API_KEY environment variable takes precedence,
# with the file 'api-key.txt' (relative to the working directory) as fallback.
api_key = os.environ.get("OPENAI_API_KEY")
if api_key:
    print('Loaded api key from environment variable.')
else:
    try:
        # The context manager closes the file even when reading fails.
        with open('api-key.txt', 'rb') as key_file:
            api_key = key_file.read().strip().decode()
    except (OSError, UnicodeDecodeError):
        api_key = None
    if not api_key:
        # Covers a missing, unreadable, undecodable, or empty key file.
        print("Couldn't load OpenAI Api key, please load it in OPENAI_API_KEY env variable, or alternatively in 'api-key.txt' file.")
        # Non-zero status so a calling shell can tell the run failed.
        raise SystemExit(1)

check_api_key_validity(api_key)
|
45 |
+
|
46 |
+
# Rewrite prompts adapted from the Alpaca prompt layout. Each template is
# filled with a dataset entry via str.format_map:
#   "prompt_input"    expects {instruction}, {input} and {output}
#   "prompt_no_input" expects {instruction} and {output}; it has no
#                     "### Input:" section because the entry has no input.
PROMPT_DICT = {
    "prompt_input": (
        "You are an Argentine assistant. When given an instruction, answer the instruction in spanish and write the response of the instruction and nothing else. Use Argentine slang like boludo, che, pelotudo, gil, laburo, forro, mala leche, al pedo, gil de cuarta, quilombo, estas en pedo, ya fue. Do not always start with Che or boludo. Answer in a colloquial manner. Below is an instruction that describes the task, paired with an input that provides further context, and the output response. "
        "Rewrite the response that appropriately completes the request, taking into account that you are Argentine, and respond as such.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response: \n{output}\n\n### Rewrite: \n"
    ),
    "prompt_no_input": (
        "You are an Argentine assistant. When given an instruction, answer the instruction in spanish and write the response to the instruction and nothing else. Use Argentine slang like boludo, che, pelotudo, gil, laburo, forro, mala leche, al pedo, gil de cuarta, quilombo, estas en pedo, ya fue, etc. Do not overuse slang. Below is an instruction that describes a task, and the output response. "
        "Rewrite the response that appropriately completes the request, taking into account that you are Argentine, and respond as such, but do not always start with che or boludo.\n\n"
        "### Instruction:\n{instruction}\n\n### Response: \n{output}\n\n### Rewrite: \n"
    ),
}
|
59 |
+
|
60 |
+
# Rewrite the "output" of every dataset entry through the chat model and
# append each rewritten entry to the output file as soon as it is produced.
originalDataFile = "alpaca_data_cleaned_spanish.json"

# Read as UTF-8 explicitly so loading the Spanish text does not depend on the
# platform's default encoding.
with open(originalDataFile, "r", encoding="utf-8") as f:
    data = json.load(f)
print("Loaded %s data file with %d entries." % (originalDataFile, len(data)))

prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
for index, i in enumerate(data):
    # Entries with an empty "input" use the template without that section.
    if i['input']:
        prompt = prompt_input.format_map(i)
    else:
        prompt = prompt_no_input.format_map(i)
    i["output"] = call_AI_chatGPT(prompt).strip()
    js = json.dumps(i)
    print("Index: %d/%d: %s" % (index, len(data), js))
    # Open, append and close per entry so everything written so far is on disk
    # before the next request starts.
    # NOTE: each line is a JSON object followed by a comma; the file as written
    # has no enclosing brackets, so it is not a valid JSON document by itself.
    with open("dataset-alpaca-Arg.json", "a", encoding="utf-8") as e:
        e.write("%s,\n" % js)
|
81 |
+
|