File size: 1,163 Bytes
1086ffd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def bio_2_json_one(anno_txt):
    """Convert one BIO-tagged sentence into its text and entity spans.

    Every non-blank line of ``anno_txt`` must hold a token and its label
    separated by a single tab character. A label starting with ``B-`` opens
    an entity, an ``O`` label closes the open entity, and any other label
    (e.g. ``I-XXX``) leaves the current state untouched.

    Args:
        anno_txt: Newline-separated ``token<TAB>label`` rows of one sentence.
            Blank lines (such as the one produced by a trailing newline) are
            skipped.

    Returns:
        A dict ``{'text': str, 'anno': list}`` where ``text`` is the
        concatenation of all tokens and each item of ``anno`` is
        ``[start, end, surface, label]``; ``start``/``end`` are character
        offsets into ``text`` (``end`` exclusive) and ``surface`` is
        ``text[start:end]``.

    Raises:
        ValueError: If a non-blank line does not contain exactly one tab.
    """
    tokens = []
    spans = []  # (start, end, label) triples; surfaces are sliced afterwards
    now_label = ''
    start = 0
    offset = 0  # character offset of the current token inside the text

    for line in anno_txt.split('\n'):
        line = line.rstrip('\r')  # tolerate CRLF line endings
        if not line.strip():
            # Blank rows carry no token; skipping them keeps empty input and
            # trailing newlines from failing the unpacking below.
            continue
        token, label = line.split('\t')

        if label.startswith('B-'):
            # A new entity begins: flush the one still open, otherwise two
            # adjacent entities would lose the first one.
            if now_label:
                spans.append((start, offset, now_label))
            start = offset
            now_label = label.split('-')[1]
        elif label == 'O' and now_label:
            spans.append((start, offset, now_label))
            now_label = ''

        tokens.append(token)
        # Advance by the token length (not by one) so that offsets stay
        # correct for tokens longer than a single character.
        offset += len(token)

    # An entity that runs up to the end of the sentence is still open here.
    if now_label:
        spans.append((start, offset, now_label))

    text = ''.join(tokens)
    anno = [[begin, end, text[begin:end], tag] for begin, end, tag in spans]
    return {'text': text, 'anno': anno}


def bit_2_json(txt):
    """Convert a BIO-tagged corpus into a list of per-sentence dicts.

    Sentences in ``txt`` are separated by a blank line (two consecutive
    newlines). Empty chunks are ignored; every other chunk is handed to
    ``bio_2_json_one`` and the results are returned in input order.
    """
    return [
        bio_2_json_one(sentence)
        for sentence in txt.split('\n\n')
        if sentence
    ]


if __name__ == '__main__':
    # Demo input: one "token<TAB>label" row per line. The rows are separated
    # by blank lines, which bit_2_json treats as sentence boundaries.
    demo_txt = '''你\tB-PER

是\tO

一\tO

个\tO

聪\tB-PER

明\tI-PER

的\tO

软\tB-ORG

件\tI-ORG

工\tI-ORG

程\tI-ORG

师\tI-ORG'''
    # NOTE: to convert a real corpus, pass the UTF-8 contents of a file such
    # as data/ner/weibo_ner/dev.txt instead of the demo string.
    print(bit_2_json(demo_txt))