Spaces:
Runtime error
Runtime error
nkasmanoff
committed on
Commit
•
f8aa4be
1
Parent(s):
b4e3258
Create knowledge_extraction.py
Browse files- knowledge_extraction.py +80 -0
knowledge_extraction.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import spacy
|
2 |
+
from spacy.matcher import Matcher
|
3 |
+
|
4 |
+
|
5 |
+
def get_entities(sent):
    """Extract a ``[subject, object]`` entity pair from a single sentence.

    The sentence is parsed with spaCy's ``en_core_web_sm`` pipeline and its
    tokens are scanned left to right. Compound words and ``*mod`` modifiers
    seen along the way are remembered and prepended to the subject/object
    token they precede.

    Args:
        sent: Sentence text. ``None``, empty, or whitespace-only input
            yields two empty strings without loading the model.

    Returns:
        A two-item list ``[ent1, ent2]`` where ``ent1`` is built from the
        last token whose dependency label contains ``"subj"`` and ``ent2``
        from the last token whose label contains ``"obj"``. An item is
        ``""`` when no such token was seen.
    """
    if not sent or not sent.strip():
        return ["", ""]

    # This module defines no global pipeline, so load one on first use and
    # keep it on the function object; loading a model per call is expensive.
    nlp = getattr(get_entities, "_nlp", None)
    if nlp is None:
        nlp = get_entities._nlp = spacy.load("en_core_web_sm")

    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""   # dependency tag of the previous non-punctuation token
    prv_tok_text = ""  # text of the previous non-punctuation token

    prefix = ""    # pending compound word(s)
    modifier = ""  # pending modifier word(s)

    for tok in nlp(sent):
        # Punctuation carries no entity information.
        if tok.dep_ == "punct":
            continue

        # Compound word: remember it, glued to a directly preceding compound.
        # Only the immediately preceding token is kept, so at most two words.
        if tok.dep_ == "compound":
            prefix = tok.text
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " " + tok.text

        # Modifier (amod, nmod, advmod, ...): same treatment as compounds.
        if tok.dep_.endswith("mod"):
            modifier = tok.text
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        # Subject: emit the entity and start collecting afresh for the object.
        # Substring membership is used instead of ``find(...) == True``,
        # which only matched when the substring happened to sit at index 1.
        if "subj" in tok.dep_:
            parts = (modifier, prefix, tok.text)
            # Skip empty parts so the entity never contains doubled spaces.
            ent1 = " ".join(part for part in parts if part)
            prefix = ""
            modifier = ""

        # Object: prefix/modifier are intentionally left as they are, so a
        # later object token may reuse them.
        if "obj" in tok.dep_:
            parts = (modifier, prefix, tok.text)
            ent2 = " ".join(part for part in parts if part)

        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]
|
55 |
+
|
56 |
+
|
57 |
+
|
58 |
+
|
59 |
+
def get_relation(sent):
    """Return the relation (predicate) phrase of a single sentence.

    The sentence is parsed with spaCy's ``en_core_web_sm`` pipeline and
    matched against one token pattern: the ROOT token, optionally followed
    by a preposition, an agent, and an adjective.

    Args:
        sent: Sentence text. ``None``, empty, or whitespace-only input
            yields ``""`` without loading the model.

    Returns:
        The text of the last match reported by the matcher, or ``""`` when
        the pattern does not match anywhere in the sentence.
    """
    if not sent or not sent.strip():
        return ""

    # Loading a spaCy pipeline is expensive; do it once and keep the result
    # on the function object instead of reloading on every call.
    nlp = getattr(get_relation, "_nlp", None)
    if nlp is None:
        nlp = get_relation._nlp = spacy.load('en_core_web_sm')

    doc = nlp(sent)

    # Matcher class object
    matcher = Matcher(nlp.vocab)

    # define the pattern
    pattern = [
        {'DEP': 'ROOT'},
        {'DEP': 'prep', 'OP': "?"},
        {'DEP': 'agent', 'OP': "?"},
        {'POS': 'ADJ', 'OP': "?"},
    ]
    matcher.add('matching_pattern', patterns=[pattern])

    matches = matcher(doc)
    # Without this guard an unmatched sentence would raise IndexError.
    if not matches:
        return ""

    # Each match is a (match_id, start, end) triple; use the last one.
    _, start, end = matches[-1]
    return doc[start:end].text
|
80 |
+
|