# Page header captured from the hosting site's file view (not part of the program):
# nam194's picture
# Create parse_info.py
# bd35a5d
# raw
# history blame
# 3.43 kB
import re
import string
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Optional

import chardet
import gdown
import pandas as pd
from dateutil import parser, relativedelta
from nltk import everygrams
punc = list(string.punctuation)

# (regex, replacement) pairs applied in this order to the lowercased text.
# The specific "won't"/"can't" forms must run before the generic "'t" rule.
_CONTRACTIONS = (
    (r"won't", "will not"),
    (r"can't", "can not"),
    (r"'re", " are"),
    (r"'s", " of"),
    (r"'d", " would"),
    (r"'ll", " will"),
    (r"'t", " not"),
    (r"'ve", " have"),
    (r"'m", " am"),
)


def parse_string(inp: str, rep=" ", punc=punc, excp=()) -> str:
    """Normalize free text: lowercase, expand contractions, strip punctuation.

    Args:
        inp: Text to clean.
        rep: String substituted for every stripped punctuation entry.
        punc: Entries to strip from the text; defaults to the module-level
            ``punc`` list built from ``string.punctuation``.
        excp: Entries of ``punc`` to keep for this call only. Any container
            supporting ``in`` works (list, tuple, or a string of characters).

    Returns:
        The cleaned text, with runs of whitespace collapsed to single spaces
        and no leading or trailing whitespace.
    """
    # Build a per-call strip list rather than removing from ``punc`` in
    # place: ``punc`` defaults to the shared module-level list, so mutating
    # it made one call's exceptions leak into every later call.
    to_strip = [mark for mark in punc if mark not in excp]

    inp = inp.lower()
    for pattern, replacement in _CONTRACTIONS:
        inp = re.sub(pattern, replacement, inp)
    for mark in to_strip:
        inp = inp.replace(mark, rep)
    return " ".join(inp.split())
def parse_time(inp: List):
    """Sum the lengths of the date ranges in ``inp`` and report them in years.

    Each entry is lowercased, range words ("đến", " to ") are rewritten to
    " - ", "present" words are replaced by today's UTC date, every remaining
    alphabetic character is dropped, and the result is split on "-". Entries
    that do not split into exactly two parts, or whose parts dateutil cannot
    parse, are skipped. The caller's list is not modified.

    Args:
        inp: Strings that each describe one period, e.g. "01/2020 - now".

    Returns:
        The total number of days divided by 365, formatted with one decimal
        and the suffix " năm" (Vietnamese for "years"), e.g. "2.5 năm".
    """
    # Year-first, so dateutil cannot swap day and month when it parses the
    # substituted value back (its default reads "05/03/2024" as May 3rd).
    now = datetime.now(timezone.utc).strftime("%Y/%m/%d")
    range_words = ["đến", " to "]  # words separating the two time points
    present_words = ["now", "hiện tại", " nay", " đến nay", "present"]  # open-ended end point

    duration = 0
    for raw in inp:
        text = raw.lower()
        for word in range_words:
            text = text.replace(word, " - ")
        for word in present_words:
            text = text.replace(word, now)
        # Only digits and separators are meaningful from here on.
        text = "".join(ch for ch in text if not ch.isalpha())
        text = parse_string(text, rep="", excp=["/", "-"])

        time_point = text.split("-")  # expected: [start, end]
        if len(time_point) != 2:
            continue
        try:
            start = parser.parse(time_point[0])
            end = parser.parse(time_point[1])
        except (ValueError, OverflowError):
            # dateutil could not interpret one side; skip this entry.
            continue
        duration += (end.date() - start.date()).days
    return "{:.1f} năm".format(duration / 365)
# Load the reference skill vocabulary once, at import time.
filename = "./skills.csv"
# The CSV's encoding is sniffed from its raw bytes before pandas reads it
# (the original author noted "ISO-8859-1" here).
detected = chardet.detect(Path(filename).read_bytes())
_skill_frame = pd.read_csv(filename, encoding=detected["encoding"])
# Keep only the "Skill" column, with embedded newlines removed from each entry.
# NOTE(review): assumes every "Skill" cell is a string — confirm there are no empty rows.
skill_list = [entry.replace("\n", "") for entry in _skill_frame["Skill"].tolist()]
def parse_skill(inp: List) -> str:
    """Return the known skills mentioned in ``inp`` as one ". "-joined string.

    Every entry is cleaned with ``parse_string``, the cleaned texts are joined
    and split into words, and each 1- to 3-word phrase is looked up in the
    module-level ``skill_list``. The caller's list is not modified.

    Args:
        inp: Text fragments to scan for skills.

    Returns:
        The matched phrases, each capitalized, de-duplicated, in order of
        first appearance, joined with ". ". Empty string when nothing matches.
    """
    tokens = " ".join(parse_string(text) for text in inp).split()
    known_skills = set(skill_list)  # O(1) membership instead of scanning the list

    # A dict keeps first-seen order, so the output is stable between runs
    # (iterating a set of strings is not).
    found = {}
    for gram in everygrams(tokens, 1, 3):
        phrase = " ".join(gram)
        if phrase in known_skills:
            found[phrase] = None
    return ". ".join(phrase.capitalize() for phrase in found)
def parse_gender(inp: List) -> str:
    """Return the first gender keyword found in the cleaned, joined ``inp``.

    Keywords are tried in the order listed below and matched as substrings;
    an empty string is returned when none of them occurs.
    """
    text = " ".join(parse_string(piece) for piece in inp)
    # Order matters: matching is by substring, so "female" must be tried
    # before "male".
    # NOTE(review): "nam" also matches inside longer words — confirm inputs
    # are short gender fields only.
    keywords = ("nam", "nữ", "female", "male", "bisexual", "asexual", "heterosexual", "homosexual", "lgbt")
    return next((keyword for keyword in keywords if keyword in text), "")
def parse_address(inp: List) -> str:
    """Clean each address fragment, capitalize its words, and join with ". ".

    Commas are kept; all other punctuation is handled by ``parse_string``.
    """
    cleaned = [parse_string(fragment, excp=",") for fragment in inp]
    formatted = []
    for fragment in cleaned:
        words = (word.capitalize() for word in fragment.split())
        formatted.append(" ".join(words))
    return ". ".join(formatted)
def parse_designation(inp: List) -> str:
    """Clean, de-duplicate, and capitalize the designations in ``inp``.

    Args:
        inp: Raw designation strings.

    Returns:
        The distinct cleaned designations, each word capitalized, in order of
        first appearance, joined with ". ".
    """
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is the same on every run (a set's iteration order is not).
    unique = dict.fromkeys(parse_string(text) for text in inp)
    return ". ".join(
        " ".join(word.capitalize() for word in title.split()) for title in unique
    )
def parse_email(inp: List) -> str:
    """Clean and de-duplicate the e-mail strings in ``inp``.

    Each entry goes through ``parse_string`` with "@" and "." preserved and
    all other punctuation deleted (not replaced by a space).

    Args:
        inp: Raw e-mail strings.

    Returns:
        The distinct cleaned entries, in order of first appearance, joined
        with a single space.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is the same on every run (a set's iteration order is not).
    unique = dict.fromkeys(
        parse_string(text, rep="", excp=["@", "."]) for text in inp
    )
    return " ".join(unique)