File size: 3,431 Bytes
bd35a5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re
import string
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import List, Optional

import chardet
import gdown
import pandas as pd
from dateutil import parser, relativedelta
from nltk import everygrams

punc = list(string.punctuation)
def parse_string(inp: str, rep=" ", punc=punc, excp=()) -> str:
    """Lower-case ``inp``, expand English contractions and strip punctuation.

    Args:
        inp: Text to normalise.
        rep: String substituted for every stripped punctuation character.
        punc: Characters treated as punctuation. Defaults to the module-level
            list built from ``string.punctuation``; it is never modified.
        excp: Characters of ``punc`` to keep for this call only. Any iterable
            of single characters works, including a plain string; ``None``
            is treated as "no exceptions".

    Returns:
        The normalised text, with runs of whitespace collapsed to a single
        space and no leading or trailing whitespace.
    """
    # Filter per call instead of calling punc.remove(): removing from the
    # shared default list made every exception permanent, so one call with
    # excp=["/", "-"] stopped "/" and "-" from being stripped in all later
    # calls anywhere in the process.
    keep = set(excp) if excp else set()
    strip_chars = [ch for ch in punc if ch not in keep]

    inp = inp.lower()
    # Order matters: the two irregular forms must be rewritten before the
    # generic "'t" rule sees them.
    contractions = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"\'re", " are"),
        (r"\'s", " of"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in contractions:
        inp = re.sub(pattern, expansion, inp)

    for ch in strip_chars:
        inp = inp.replace(ch, rep)
    return " ".join(inp.split())

def parse_time(inp: List) -> str:
    """Add up the date ranges described in ``inp`` and report them in years.

    Each item is expected to describe one range such as
    ``"01/2020 to 05/2021"`` or ``"03/2019 đến nay"``. Items that do not
    split into exactly two time points, or whose time points cannot be
    parsed as dates, are skipped.

    Args:
        inp: Range descriptions as strings. The list is not modified.

    Returns:
        The summed duration as ``"<years, one decimal> năm"``, where a year
        is counted as 365 days.
    """
    # Written year-first on purpose: dateutil reads an ambiguous "a/b/yyyy"
    # as month/day, so a day-first "now" was misread whenever today's day of
    # month is 12 or less. A leading 4-digit year is unambiguous.
    now = datetime.utcnow().strftime("%Y/%m/%d")
    range_words = ["đến", " to "]  # words separating the two time points
    present_words = ["now", "hiện tại", " nay", " đến nay", "present"]  # open end

    duration = 0
    for raw in inp:
        text = raw.lower()
        for word in range_words:
            text = text.replace(word, " - ")
        for word in present_words:
            text = text.replace(word, now)
        # Drop any remaining letters (labels, month names, ...).
        text = "".join(ch for ch in text if not ch.isalpha())
        # Keep only the date separator "/" and the range separator "-".
        text = parse_string(text, rep="", excp=["/", "-"])

        time_point = text.split("-")
        if len(time_point) != 2:  # must split into exactly 2 time points
            continue
        try:
            start = parser.parse(time_point[0]).date()
            end = parser.parse(time_point[1]).date()
        except Exception:
            # Best effort: an unparseable range contributes nothing.
            continue
        duration += (end - start).days
    return "{:.1f} năm".format(duration / 365)

# Load the known-skill vocabulary once, at import time.
filename = "./skills.csv"
# The encoding is guessed from the raw bytes rather than assumed.
detected = chardet.detect(Path(filename).read_bytes()) # "ISO-8859-1"
_skill_frame = pd.read_csv(filename, encoding=detected["encoding"])
# Empty cells are read as NaN floats, which have no .replace(); drop them and
# coerce the rest to str so one bad row cannot break the import.
skill_list = [i.replace("\n","") for i in _skill_frame["Skill"].dropna().astype(str).to_list()]
def parse_skill(inp: List) -> str:
    """Find known skills mentioned in ``inp``.

    Every item is normalised with ``parse_string``, the results are joined
    into one token stream, and each 1- to 3-word n-gram of that stream is
    looked up in the module-level ``skill_list``.

    Args:
        inp: Text fragments to scan. The list is not modified.

    Returns:
        The matched skills, each capitalised, joined by ``". "`` in order of
        first appearance. An empty string when nothing matches.
    """
    tokens = " ".join(parse_string(text) for text in inp).split()
    known_skills = set(skill_list)  # O(1) lookups instead of scanning the list

    # A dict keeps first-seen order while de-duplicating. The previous
    # list(set(...)) made the output order differ from run to run.
    found = {}
    for ngram in map(" ".join, everygrams(tokens, 1, 3)):
        if ngram in known_skills:
            found.setdefault(ngram, None)
    return ". ".join(skill.capitalize() for skill in found)

def parse_gender(inp: List) -> str:
    """Return the first known gender term that occurs in ``inp``.

    The items are normalised with ``parse_string`` and split into words.
    Terms are tried in the order of the ``gender`` list below.

    Args:
        inp: Text fragments to scan.

    Returns:
        The matched term, or an empty string when none is present.
    """
    # Compare whole words, not substrings: a substring test reports "nam"
    # for text that merely contains "vietnam" or "name".
    words = set(" ".join(parse_string(i) for i in inp).split())
    gender = ["nam", "nữ", "female", "male", "bisexual", "asexual", "heterosexual", "homosexual", "lgbt"]
    for gen in gender:
        if gen in words:
            return gen
    return ""

def parse_address(inp: List) -> str:
    """Normalise address fragments and join them into one string.

    Each fragment goes through ``parse_string`` with commas kept, then every
    word is capitalised. The fragments are joined with ``". "``.
    """
    formatted = []
    for fragment in inp:
        words = parse_string(fragment, excp=",").split()
        formatted.append(" ".join(map(str.capitalize, words)))
    return ". ".join(formatted)

def parse_designation(inp: List) -> str:
    """Normalise, de-duplicate and title-case designation strings.

    Args:
        inp: Designation fragments.

    Returns:
        The distinct normalised designations, every word capitalised, joined
        by ``". "`` in order of first appearance.
    """
    # dict.fromkeys de-duplicates while keeping input order. The previous
    # list(set(...)) produced an order that varied between runs.
    unique = dict.fromkeys(parse_string(i) for i in inp)
    return ". ".join(
        " ".join(word.capitalize() for word in title.split()) for title in unique
    )

def parse_email(inp: List) -> str:
    """Normalise and de-duplicate e-mail strings.

    Each item goes through ``parse_string`` with ``"@"`` and ``"."`` kept and
    all other punctuation removed.

    Args:
        inp: E-mail fragments.

    Returns:
        The distinct normalised addresses joined by a single space, in order
        of first appearance.
    """
    # dict.fromkeys de-duplicates while keeping input order. The previous
    # list(set(...)) produced an order that varied between runs.
    unique = dict.fromkeys(parse_string(i, rep="", excp=["@", "."]) for i in inp)
    return " ".join(unique)