File size: 3,597 Bytes
f4abbca
 
 
 
8786bb1
 
f4abbca
 
 
 
 
 
8786bb1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4abbca
 
 
 
 
 
 
 
8786bb1
f4abbca
 
 
8786bb1
 
f4abbca
 
8786bb1
f4abbca
 
 
8786bb1
f4abbca
 
 
8786bb1
f4abbca
 
 
 
 
 
8786bb1
f4abbca
 
8786bb1
f4abbca
 
 
 
8786bb1
f4abbca
 
 
 
 
 
 
 
8786bb1
f4abbca
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
from typing import List 
from resources import set_start, audit_elapsedtime, entities_list_to_dict
from transformers import BertTokenizer, BertForTokenClassification
import torch
from gliner import GLiNER


#Named-Entity Recognition model

def init_model_ner():
    """Load the GLiNER checkpoint used for entity recognition.

    Prints a progress message, loads ``urchade/gliner_multi`` through
    ``GLiNER.from_pretrained`` and hands the start marker obtained from
    ``set_start()`` to ``audit_elapsedtime`` once loading has finished.

    Returns:
        The object returned by ``GLiNER.from_pretrained``.
    """
    print("Initiating NER model...")
    started_at = set_start()

    gliner_model = GLiNER.from_pretrained("urchade/gliner_multi")

    audit_elapsedtime(function="Initiating NER model", start=started_at)
    return gliner_model

def get_entity_results(model: GLiNER, text: str, entities_list: List[str]): #-> Lead_labels:
    """Run GLiNER over ``text`` and collect one matched text per label.

    Args:
        model: Object exposing ``predict_entities(text, labels)``.
        text: Input text to analyse.
        entities_list: Labels passed to the model; also used to seed the
            result mapping through ``entities_list_to_dict``.

    Returns:
        The mapping produced by ``entities_list_to_dict(entities_list)``,
        updated so that each predicted label maps to its predicted text.
        When a label is predicted several times, the last prediction wins.
    """
    print("Initiating entity recognition...")
    started_at = set_start()

    predictions = model.predict_entities(text, entities_list)

    # Seed the result from the requested labels.
    # NOTE(review): presumably every requested label gets a default entry
    # here - confirm against entities_list_to_dict.
    found = entities_list_to_dict(entities_list)
    for prediction in predictions:
        label, matched_text = prediction["label"], prediction["text"]
        print(label, "=>", matched_text)
        found[label] = matched_text

    audit_elapsedtime(function="Retreiving entity labels from text", start=started_at)
    return found

def init_model_ner_v2():
    """Load the BERT token-classification model and its matching tokenizer.

    Both objects are loaded from the same checkpoint. The model is a *cased*
    CoNLL-03 fine-tune, so pairing it with the ``bert-base-uncased`` tokenizer
    (different vocabulary, lower-cased input) would feed it token ids it was
    never trained on and degrade predictions.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    print("Initiating NER model...")
    start = set_start()

    # Single source of truth so tokenizer and model can never drift apart.
    checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = BertForTokenClassification.from_pretrained(checkpoint)

    audit_elapsedtime(function="Initiating NER model", start=start)
    return tokenizer, model

def _merge_bio_tokens(tokens, tags):
    """Group tokens into entity strings according to their BIO tags.

    Args:
        tokens: Token strings, one per model position.
        tags: Label strings (``B-*``, ``I-*`` or anything else for "outside"),
            aligned one-to-one with ``tokens``.

    Returns:
        A list of entity strings in order of appearance. Word-piece
        continuations (``##xx``) are glued to the previous piece; other
        tokens inside an entity are separated by a single space.
    """
    entities = []
    current = ""
    for token, tag in zip(tokens, tags):
        is_begin = tag.startswith("B-")
        if not (is_begin or tag.startswith("I-")):
            # Outside of any entity: close whatever was being built.
            if current:
                entities.append(current)
                current = ""
            continue

        if token.startswith("##") and current:
            # Continuation of the previous word piece: no separating space.
            current += token[2:]
        elif is_begin:
            # Beginning of a new entity: flush the previous one first.
            if current:
                entities.append(current)
            current = token
        else:
            # Inside an entity; tolerate an I- tag with no preceding B- tag.
            current = f"{current} {token}" if current else token

    # Flush an entity that runs up to the very last token.
    if current:
        entities.append(current)
    return entities


def get_entity_results_v2(tokenizer, model, text: str, entities_list: List[str]): #-> Lead_labels:
    """Extract entities from ``text`` with a BERT token classifier.

    Args:
        tokenizer: Tokenizer providing ``encode`` and ``convert_ids_to_tokens``.
        model: Token-classification model whose output has ``logits`` and
            whose ``config.id2label`` maps class ids to BIO tag strings.
        text: Input text to analyse.
        entities_list: Entity strings to keep; a detected entity is returned
            only if it is equal to one of these.

    Returns:
        The detected entity strings that appear in ``entities_list``, in
        order of appearance.
    """
    print("Initiating entity recognition...")
    start = set_start()

    input_ids = tokenizer.encode(text, return_tensors="pt")
    # Derive the tokens from the very ids fed to the model so that
    # tokens[i] always corresponds to the prediction at position i.
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(input_ids)

    # Highest-scoring class per position for the single sequence in the batch.
    predicted_ids = torch.argmax(outputs.logits, dim=2)[0].tolist()
    tags = [model.config.id2label[label_id] for label_id in predicted_ids]

    entities = _merge_bio_tokens(tokens, tags)

    # Keep only the entities the caller asked for (exact string match).
    # NOTE(review): punctuation is tokenized separately, so an entity such as
    # "Inc ." will not equal "Inc." - confirm whether callers need that.
    filtered_entities = [entity for entity in entities if entity in entities_list]
    print("filtered_entities:", filtered_entities)

    audit_elapsedtime(function="Retreiving entity labels from text", start=start)
    return filtered_entities