File size: 3,038 Bytes
1086ffd
 
 
 
 
 
 
 
 
c6ad667
 
 
1086ffd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6ad667
 
 
 
 
 
9078d0a
 
 
c6ad667
 
 
 
 
 
 
 
 
 
 
fe413ad
 
c6ad667
 
 
 
 
1086ffd
 
 
c6ad667
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import openai
import json
import sys
sys.path.append('.')
from local_config import openai_key

# Configure the openai module with the API key imported from local_config.
openai.api_key = openai_key

def get_ready_key(name, type, start):
    """Build the string key that identifies one entity occurrence.

    The key is the entity name, its type label and its start offset, in
    that order, joined by hyphens.
    """
    key_parts = (name, type, start)
    return "{}-{}-{}".format(*key_parts)

def _find_unclaimed_start(src_txt, name, entity_type, claimed_keys):
    """Return the first offset of ``name`` in ``src_txt`` not yet claimed, or -1."""
    pos = src_txt.find(name)
    while pos != -1:
        if get_ready_key(name, entity_type, pos) not in claimed_keys:
            return pos
        # Step by one character so overlapping occurrences are considered too.
        pos = src_txt.find(name, pos + 1)
    return -1


def extract_named_entities(src_txt, type_arr):
    """Extract named entities of the requested types from ``src_txt``.

    The text is sent to the OpenAI chat completion API (``gpt-3.5-turbo``)
    together with a one-shot example. The reply is parsed as a JSON array
    and every reported entity is reconciled with the real text, because the
    offsets returned by the model are not trusted.

    Args:
        src_txt: The text to analyse.
        type_arr: Collection of entity type labels to keep. It is also
            interpolated into the system prompt.

    Returns:
        A list of the entity dicts parsed from the reply, restricted to the
        types in ``type_arr`` and ordered by their reported position. For
        every returned dict ``src_txt[item['start']:item['end']]`` equals
        ``item['name']``. Entities that cannot be located in the text, or
        whose every occurrence is already taken by an earlier entity of the
        same name and type, are dropped.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        ValueError: If the reply is valid JSON but not an array.
    """
    system = f"你是一个聪明而且有百年经验的命名实体识别(NER)识别器. 你的任务是从一段文本里面提取出相应的实体并且给出标签。你的回答必须用统一的格式。文本用```符号分割。输出采用Json的格式并且标记实体在文本中的位置。实体类型保存在一个数组里{type_arr}"
    user = "输入|```皮卡丘神奇宝贝```输出|"
    # The example offsets must be exact (0-3 and 3-7); wrong offsets here
    # teach the model to report shifted positions.
    assistant = """[{"name": "皮卡丘", "type": "Person", "start": 0, "end": 3}, {"name": "神奇宝贝", "type": "物种", "start": 3, "end": 7}]"""
    prompt = f"输入|```{src_txt}```输出|"

    # Call the OpenAI API
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
            {"role": "assistant", "content": assistant},
            {"role": "user", "content": prompt},
        ],
    )

    # Extract the output and parse the JSON array
    content = completion.choices[0].message.content
    print(content)
    entities = json.loads(content)
    if not isinstance(entities, list):
        raise ValueError(
            f"expected a JSON array of entities, got {type(entities).__name__}"
        )

    def reported_offset(item, key):
        # A missing or non-integer offset is treated as unknown (-1), which
        # forces a search for the entity in the text below.
        value = item.get(key)
        return value if isinstance(value, int) else -1

    # Keep only well-formed entities of a requested type.
    candidates = [
        item for item in entities
        if isinstance(item, dict)
        and isinstance(item.get('name'), str)
        and item['name']
        and item.get('type') in type_arr
    ]
    # Tuple key: orders by start, then end, for texts of any length.
    candidates.sort(
        key=lambda item: (reported_offset(item, 'start'),
                          reported_offset(item, 'end'))
    )

    result = []
    ready_keys = set()
    for item in candidates:
        name = item['name']
        entity_type = item['type']
        start = reported_offset(item, 'start')

        # Trust the reported start only if the name really begins there and
        # that occurrence has not been handed out already.
        start_is_valid = (
            start >= 0
            and src_txt[start:start + len(name)] == name
            and get_ready_key(name, entity_type, start) not in ready_keys
        )
        if not start_is_valid:
            start = _find_unclaimed_start(src_txt, name, entity_type, ready_keys)
            if start == -1:
                # The name does not occur in the text (or every occurrence
                # is taken): there is no valid span to report.
                continue

        item['start'] = start
        # Always derive the end from the start so the span is consistent.
        item['end'] = start + len(name)
        result.append(item)
        ready_keys.add(get_ready_key(name, entity_type, start))
    return result

if __name__ == '__main__':
    # Demo run: extract entities from a sample sentence and print them.
    # Alternative sample input, kept for manual experiments:
    # extract_named_entities("```汤姆每天都被杰瑞欺负,皮卡丘越来越想帮忙,竟然还总是被拒绝,心想难道我“皮大仙”这点能力都没有?而且,这货不是被虐狂吧```", ["Person", "物种"])
    sample_text = '老百姓心新乡新闻网话说这几天新乡天气还好吧偷笑'
    wanted_types = ['代称', '行政区']
    print(extract_named_entities(sample_text, wanted_types))