eloukas committed on
Commit
92218bf
1 Parent(s): 1430cec
Files changed (1) hide show
  1. test_run.py +181 -0
test_run.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # !pip install gr-nlp-toolkit
2
+
3
+ from gr_nlp_toolkit import Pipeline
4
+
5
# Instantiate the Pipeline once, at import time; every helper function in this
# module reuses this single instance. The processor string requests the "pos",
# "ner", "dp" and "g2g" processors together.
nlp_pos_ner_dp_with_g2g = Pipeline("pos,ner,dp,g2g")
7
+
8
+
9
def greeklish_to_greek(text: str) -> str:
    """
    Transliterate Greeklish (Greek typed with Latin characters) into Greek.

    The input is run through the module-level pipeline, which includes the
    ``g2g`` processor, and the text of every resulting token is joined back
    together with single spaces. For instance, "larisa" becomes "λαρισα".

    Args:
        text (str): The Greeklish text to convert.

    Returns:
        str: The token texts produced by the pipeline, separated by one space.

    Examples:
        >>> greeklish_to_greek("H thessaloniki einai wraia polh")
        'η θεσσαλονικη ειναι ωραια πολη'
    """
    analysed = nlp_pos_ner_dp_with_g2g(text)
    words = (tok.text for tok in analysed.tokens)
    return " ".join(words)
25
+
26
+
27
def process_ner(text: str) -> dict:
    """
    Tag every token of ``text`` with its Named Entity Recognition (NER) label.

    Args:
        text (str): The text to process.

    Returns:
        dict: Maps each token's text to that token's NER label.

        NOTE: the result is keyed by token text, so a token that occurs more
        than once in the input yields a single entry, holding the label of
        its last occurrence.

    Examples:
        Illustrative output (the word 'το' occurs twice in the sentence but
        appears once in the result):

        >>> process_ner("Η Αργεντινή κέρδισε το Παγκόσμιο Κύπελλο το 2022")
        {
            'η': 'O',
            'αργεντινη': 'S-ORG',
            'κερδισε': 'O',
            'το': 'O',
            'παγκοσμιο': 'B-EVENT',
            'κυπελλο': 'E-EVENT',
            '2022': 'S-DATE'
        }

    NER Possible Labels List:
        ner_labels = [
            'O', 'S-GPE', 'S-ORG', 'S-CARDINAL', 'B-ORG', 'E-ORG', 'B-DATE', 'E-DATE', 'S-NORP',
            'B-GPE', 'E-GPE', 'S-EVENT', 'S-DATE', 'S-PRODUCT', 'S-LOC', 'I-ORG', 'S-PERSON',
            'S-ORDINAL', 'B-PERSON', 'I-PERSON', 'E-PERSON', 'B-LAW', 'I-LAW', 'E-LAW', 'B-MONEY',
            'I-MONEY', 'E-MONEY', 'B-EVENT', 'I-EVENT', 'E-EVENT', 'B-FAC', 'E-FAC', 'I-DATE',
            'S-PERCENT', 'B-QUANTITY', 'E-QUANTITY', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'E-WORK_OF_ART',
            'I-FAC', 'S-LAW', 'S-TIME', 'B-LOC', 'E-LOC', 'I-LOC', 'S-FAC', 'B-TIME', 'E-TIME',
            'S-WORK_OF_ART', 'B-PRODUCT', 'E-PRODUCT', 'B-CARDINAL', 'E-CARDINAL', 'S-MONEY',
            'S-LANGUAGE', 'I-TIME', 'I-PRODUCT', 'I-GPE', 'I-QUANTITY', 'B-NORP', 'E-NORP',
            'S-QUANTITY', 'B-PERCENT', 'I-PERCENT', 'E-PERCENT', 'I-CARDINAL', 'B-ORDINAL',
            'I-ORDINAL', 'E-ORDINAL'
        ]
    """
    analysed = nlp_pos_ner_dp_with_g2g(text)
    labels_by_token = {}
    for tok in analysed.tokens:
        word, label = tok.text, tok.ner
        # Later occurrences of the same word overwrite earlier ones.
        labels_by_token[word] = label
    return labels_by_token
67
+
68
+
69
def process_pos(text: str) -> dict:
    """
    Extract Part-of-Speech information (UPOS tag and morphological features) per token.

    Complete list of UPOS tags
    (https://universaldependencies.org/u/pos/ and
    https://github.com/nlpaueb/gr-nlp-toolkit/blob/main/gr_nlp_toolkit/configs/pos_labels.py):
        ADJ: adjective
        ADP: adposition
        ADV: adverb
        AUX: auxiliary
        CCONJ: coordinating conjunction
        DET: determiner
        INTJ: interjection
        NOUN: noun
        NUM: numeral
        PART: particle
        PRON: pronoun
        PROPN: proper noun
        PUNCT: punctuation
        SCONJ: subordinating conjunction
        SYM: symbol
        VERB: verb
        X: other

    The complete list of morphological features can be found at
    https://github.com/nlpaueb/gr-nlp-toolkit/blob/main/gr_nlp_toolkit/configs/pos_labels.py.
    Because there are many of them, only the most common are listed here:
        - Aspect
        - Case
        - Definite
        - Mood
        - Number
        - Person
        - PronType
        - Tense
        - Gender
        - VerbForm
        - Voice

    Args:
        text (str): The text to process.

    Returns:
        dict: Maps each token's text to a dictionary with two keys:
            - 'UPOS': the token's ``upos`` value.
            - 'Morphological_Features': the token's ``feats`` value.

        NOTE: the result is keyed by token text, so a token that occurs more
        than once in the input yields a single entry, holding the values of
        its last occurrence.

    Examples:
        >>> process_pos("Μου αρέσει να διαβάζω τα post του Andrew Ng στο Twitter.")
        {
            'μου': {'UPOS': 'PRON', 'Morphological_Features': {'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing', 'Person': '1', 'Poss': '_', 'PronType': 'Prs'}},
            'αρεσει': {'UPOS': 'VERB', 'Morphological_Features': {'Aspect': 'Imp', 'Case': '_', 'Gender': '_', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}},
            'να': {'UPOS': 'AUX', 'Morphological_Features': {'Aspect': '_', 'Mood': '_', 'Number': '_', 'Person': '_', 'Tense': '_', 'VerbForm': '_', 'Voice': '_'}},
            'διαβαζω': {'UPOS': 'VERB', 'Morphological_Features': {'Aspect': 'Imp', 'Case': '_', 'Gender': '_', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '1', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}},
            'τα': {'UPOS': 'DET', 'Morphological_Features': {'Case': 'Acc', 'Definite': 'Def', 'Gender': 'Neut', 'Number': 'Plur', 'PronType': 'Art'}},
            'post': {'UPOS': 'X', 'Morphological_Features': {'Foreign': 'Yes'}},
            'του': {'UPOS': 'DET', 'Morphological_Features': {'Case': 'Gen', 'Definite': 'Def', 'Gender': 'Masc', 'Number': 'Sing', 'PronType': 'Art'}},
            'andrew': {'UPOS': 'X', 'Morphological_Features': {'Foreign': 'Yes'}},
            'ng': {'UPOS': 'X', 'Morphological_Features': {'Foreign': 'Yes'}},
            'στο': {'UPOS': '_', 'Morphological_Features': {}},
            'twitter': {'UPOS': 'X', 'Morphological_Features': {'Foreign': 'Yes'}},
            '.': {'UPOS': 'PUNCT', 'Morphological_Features': {}}
        }
    """
    analysed = nlp_pos_ner_dp_with_g2g(text)
    tags_by_token = {}
    for tok in analysed.tokens:
        word = tok.text
        # Later occurrences of the same word overwrite earlier ones.
        tags_by_token[word] = {
            "UPOS": tok.upos,
            "Morphological_Features": tok.feats,
        }
    return tags_by_token
135
+
136
+
137
def process_dp(text: str) -> dict:
    """
    Extract Dependency Parsing information for every token of ``text``.

    The text is run through the module-level pipeline and, for each token,
    its syntactic head and its dependency relation to that head are collected.

    Args:
        text (str): The text to process.

    Returns:
        dict: Maps each token's text to a dictionary containing:
            - 'Head': The position of the syntactic head of the word (0 indicates the root).
            - 'Deprel': The dependency relation to the head.

        NOTE: the result is keyed by token text, so a token that occurs more
        than once in the input yields a single entry, holding the values of
        its last occurrence.

    Examples:
        Illustrative output. The word 'την' occurs twice in the sentence
        (heads 4 and 7); only the last occurrence survives in the result:

        >>> process_dp("Προτιμώ την πρωινή πτήση από την Αθήνα στη Θεσσαλονίκη.")
        {
            'προτιμω': {'Head': 0, 'Deprel': 'root'},
            'την': {'Head': 7, 'Deprel': 'det'},
            'πρωινη': {'Head': 4, 'Deprel': 'amod'},
            'πτηση': {'Head': 1, 'Deprel': 'obj'},
            'απο': {'Head': 7, 'Deprel': 'case'},
            'αθηνα': {'Head': 4, 'Deprel': 'nmod'},
            'στη': {'Head': 9, 'Deprel': 'case'},
            'θεσσαλονικη': {'Head': 4, 'Deprel': 'nmod'},
            '.': {'Head': 1, 'Deprel': 'punct'}
        }

    Dependency Parsing Possible Labels List:
        dp_labels = [
            'obl', 'obj', 'dep', 'mark', 'case', 'flat', 'nummod', 'obl:arg', 'punct', 'cop',
            'acl:relcl', 'expl', 'nsubj', 'csubj:pass', 'root', 'advmod', 'nsubj:pass', 'ccomp',
            'conj', 'amod', 'xcomp', 'aux', 'appos', 'csubj', 'fixed', 'nmod', 'iobj', 'parataxis',
            'orphan', 'det', 'advcl', 'vocative', 'compound', 'cc', 'discourse', 'acl', 'obl:agent'
        ]
    """
    analysed = nlp_pos_ner_dp_with_g2g(text)
    relations_by_token = {}
    for tok in analysed.tokens:
        word = tok.text
        # Later occurrences of the same word overwrite earlier ones.
        relations_by_token[word] = {"Head": tok.head, "Deprel": tok.deprel}
    return relations_by_token