nam194 committed on
Commit
bd35a5d
1 Parent(s): 5d0c783

Create parse_info.py

Browse files
Files changed (1) hide show
  1. parse_info.py +93 -0
parse_info.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
import string
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import List, Optional

import chardet
import gdown
import pandas as pd
from dateutil import parser, relativedelta
from nltk import everygrams
9
+
10
punc = list(string.punctuation)

# Applied in this order: the two irregular forms must be expanded before the
# generic "'t" rule would split them.
_CONTRACTIONS = (
    (r"won\'t", "will not"),
    (r"can\'t", "can not"),
    (r"\'re", " are"),
    (r"\'s", " of"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
)


def parse_string(inp: str, rep=" ", punc=punc, excp=()) -> str:
    """Normalize a piece of free text.

    The text is lower-cased, a fixed set of English contractions is expanded,
    every punctuation mark in *punc* is replaced by *rep*, and runs of
    whitespace are collapsed to single spaces.

    Args:
        inp: Text to normalize.
        rep: Replacement inserted for each punctuation mark.
        punc: Punctuation marks to replace. Defaults to the module-level
            list built from ``string.punctuation``; it is only read here.
        excp: Marks from *punc* that must be kept in the text. Any iterable
            of strings is accepted (a plain string is treated as its
            individual characters). Entries that are not in *punc* are
            ignored.

    Returns:
        The normalized text, stripped and single-spaced.
    """
    # Filter per call instead of deleting from the shared ``punc`` list, so
    # one call's exceptions never leak into later calls.
    keep = set(excp)

    text = inp.lower()
    for pattern, expansion in _CONTRACTIONS:
        text = re.sub(pattern, expansion, text)

    for mark in punc:
        if mark not in keep:
            text = text.replace(mark, rep)

    return " ".join(text.split())
30
+
31
def parse_time(inp: List) -> str:
    """Add up the date ranges described in *inp* and report the total in years.

    Every item is lower-cased, range words ("đến", " to ") are turned into
    " - ", open-ended markers ("now", "present", ...) are replaced by today's
    UTC date, all remaining letters and all punctuation except "/" and "-"
    are dropped, and the result is split on "-". Items that do not yield
    exactly two time points, or whose points cannot be parsed as dates, are
    skipped.

    Args:
        inp: Strings that each describe one period. The list is not modified.

    Returns:
        The summed duration as ``"<years, one decimal> năm"``, where years is
        the total number of days divided by 365.
    """
    # Year-first on purpose: the date parser reads ambiguous dd/mm/yyyy text
    # month-first, which would swap day and month whenever the day is <= 12.
    now = datetime.utcnow().strftime("%Y/%m/%d")
    range_words = ["đến", " to "]  # words that separate the two time points
    # Markers for an open-ended end point. " đến nay" can never match because
    # "đến" has already been rewritten by then; " nay" covers that case.
    present_words = ["now", "hiện tại", " nay", " đến nay", "present"]

    duration = 0
    for item in inp:
        text = item.lower()
        for word in range_words:
            text = text.replace(word, " - ")
        for word in present_words:
            text = text.replace(word, now)
        # Only digits and separators are useful to the date parser.
        text = "".join(ch for ch in text if not ch.isalpha())
        text = parse_string(text, rep="", excp=["/", "-"])

        time_point = text.split("-")  # split into the two time points
        if len(time_point) != 2:  # anything else is not a single range
            continue
        try:
            start = parser.parse(time_point[0]).date()
            end = parser.parse(time_point[1]).date()
        except (ValueError, OverflowError):
            # Unparseable time point: skip this item, keep the rest.
            continue
        duration += (end - start).days
    return "{:.1f} năm".format(duration / 365)
57
+
58
# Load the known-skill vocabulary once, at import time.
filename = "./skills.csv"
# The file's encoding is sniffed from its raw bytes rather than assumed.
detected = chardet.detect(Path(filename).read_bytes())  # e.g. "ISO-8859-1"
skill_frame = pd.read_csv(filename, encoding=detected["encoding"])
# Empty cells come back as NaN (a float) and would break ``.replace``, so they
# are dropped; ``str()`` guards against cells parsed as numbers.
skill_list = [
    str(skill).replace("\n", "")
    for skill in skill_frame["Skill"].dropna().to_list()
]
62
def parse_skill(inp: List) -> str:
    """Pick out the known skills mentioned in *inp*.

    All items are normalized with ``parse_string`` and joined into one token
    stream; every 1- to 3-word n-gram of that stream is looked up in the
    module-level ``skill_list``.

    Args:
        inp: Text fragments to scan. The list is not modified.

    Returns:
        The distinct matching skills, each capitalized, joined with ". " in
        the order they are first produced by the n-gram scan. Empty string
        when nothing matches.
    """
    tokens = " ".join(parse_string(item) for item in inp).split()
    known = set(skill_list)  # one hashed lookup per n-gram instead of a list scan

    # A dict keeps one entry per skill and remembers first-seen order, so the
    # result is the same on every run.
    found = {}
    for gram in everygrams(tokens, 1, 3):
        phrase = " ".join(gram)
        if phrase in known:
            found[phrase] = None
    return ". ".join(skill.capitalize() for skill in found)
70
+
71
def parse_gender(inp: List) -> str:
    """Return the first known gender term that appears in *inp*.

    The items are normalized with ``parse_string`` and split into words; the
    candidate terms are tried in their listed order.

    Args:
        inp: Text fragments to scan.

    Returns:
        The matching term in lower case, or "" when none is present.
    """
    # Compare whole words, not substrings: otherwise "nam" would also fire on
    # words such as "name" or "vietnam".
    words = set(" ".join(parse_string(item) for item in inp).split())
    gender = ["nam", "nữ", "female", "male", "bisexual", "asexual", "heterosexual", "homosexual", "lgbt"]
    for gen in gender:
        if gen in words:
            return gen
    return ""
78
+
79
def parse_address(inp: List) -> str:
    """Format address fragments as capitalized words joined with ". ".

    Each fragment is normalized with ``parse_string`` (commas are kept), then
    every word in it is capitalized.
    """
    formatted = []
    for fragment in inp:
        cleaned = parse_string(fragment, excp=",")
        words = [word.capitalize() for word in cleaned.split()]
        formatted.append(" ".join(words))
    return ". ".join(formatted)
84
+
85
def parse_designation(inp: List) -> str:
    """Format the distinct designations in *inp* as capitalized words.

    Each item is normalized with ``parse_string``; duplicates (after
    normalization) are dropped and every remaining word is capitalized.

    Args:
        inp: Designation strings.

    Returns:
        The distinct designations joined with ". ", in first-seen order.
    """
    # dict.fromkeys removes duplicates while keeping input order, so the
    # output does not change from run to run.
    unique = dict.fromkeys(parse_string(item) for item in inp)
    return ". ".join(
        " ".join(word.capitalize() for word in title.split())
        for title in unique
    )
90
+
91
def parse_email(inp: List) -> str:
    """Clean the e-mail addresses in *inp* and return the distinct ones.

    Each item is normalized with ``parse_string`` with punctuation removed
    outright, except for the characters that are part of an address.

    Args:
        inp: Raw e-mail strings.

    Returns:
        The distinct cleaned addresses joined with a single space, in
        first-seen order.
    """
    # "_", "+" and "-" are legitimate in addresses; stripping them would
    # silently turn "john_doe@x.com" into a different mailbox.
    keep = ["@", ".", "_", "+", "-"]
    unique = dict.fromkeys(parse_string(item, rep="", excp=keep) for item in inp)
    return " ".join(unique)