nkasmanoff committed on
Commit
f8aa4be
1 Parent(s): b4e3258

Create knowledge_extraction.py

Files changed (1)
  knowledge_extraction.py +80 -0
knowledge_extraction.py ADDED
@@ -0,0 +1,80 @@
import spacy
from spacy.matcher import Matcher

# load the English model once at module level so both helpers can reuse it
nlp = spacy.load("en_core_web_sm")


def get_entities(sent):
    ## chunk 1
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""   # dependency tag of previous token in the sentence
    prv_tok_text = ""  # previous token in the sentence

    prefix = ""
    modifier = ""

    #############################################################

    for tok in nlp(sent):
        ## chunk 2
        # if token is a punctuation mark then move on to the next token
        if tok.dep_ != "punct":
            # check: token is a compound word or not
            if tok.dep_ == "compound":
                prefix = tok.text
                # if the previous word was also a 'compound' then add the current word to it
                if prv_tok_dep == "compound":
                    prefix = prv_tok_text + " " + tok.text

            # check: token is a modifier or not
            if tok.dep_.endswith("mod"):
                modifier = tok.text
                # if the previous word was also a 'compound' then add the current word to it
                if prv_tok_dep == "compound":
                    modifier = prv_tok_text + " " + tok.text

            ## chunk 3
            # token is the subject (nsubj, nsubjpass, ...): build the first entity
            if "subj" in tok.dep_:
                ent1 = modifier + " " + prefix + " " + tok.text
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""

            ## chunk 4
            # token is an object (dobj, pobj, ...): build the second entity
            if "obj" in tok.dep_:
                ent2 = modifier + " " + prefix + " " + tok.text

            ## chunk 5
            # update variables
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text
    #############################################################

    return [ent1.strip(), ent2.strip()]


def get_relation(sent):
    doc = nlp(sent)

    # Matcher class object
    matcher = Matcher(nlp.vocab)

    # define the pattern: the ROOT verb, optionally followed by a preposition,
    # an agent, or an adjective
    pattern = [{"DEP": "ROOT"},
               {"DEP": "prep", "OP": "?"},
               {"DEP": "agent", "OP": "?"},
               {"POS": "ADJ", "OP": "?"}]

    matcher.add("matching_pattern", [pattern])
    matches = matcher(doc)
    if not matches:
        # no match found (e.g. empty input)
        return ""

    # use the last match found
    k = len(matches) - 1
    span = doc[matches[k][1]:matches[k][2]]

    return span.text
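For reference, a minimal sketch of how these helpers could be combined into a (subject, relation, object) triple. The `extract_triple` name and the sample sentence are illustrative assumptions, not part of the committed module, and the `en_core_web_sm` model must be installed for the module-level load to succeed.

# Illustrative sketch only: pair get_entities with get_relation for one sentence.
def extract_triple(sent):
    subj, obj = get_entities(sent)  # subject and object from the dependency walk
    rel = get_relation(sent)        # relation from the ROOT-based matcher pattern
    return (subj, rel, obj)


if __name__ == "__main__":
    # hypothetical sample sentence
    print(extract_triple("John completed the task"))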