"""Text-normalisation helpers for free-form fields.

Covers time ranges, skills, gender, address, designation and e-mail.
Importing this module loads the skill vocabulary from ``./skills.csv``.
"""
import re
import string
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Optional

import chardet
import gdown
import pandas as pd
from dateutil import parser, relativedelta
from nltk import everygrams

# Every ASCII punctuation character; the default strip set of parse_string.
punc = list(string.punctuation)

# Contraction expansions, applied in order. The specific forms come first so
# the generic "'t" rule only sees what is left over.
_CONTRACTIONS = (
    (re.compile(r"won't"), "will not"),
    (re.compile(r"can't"), "can not"),
    (re.compile(r"n't"), " not"),  # "don't" -> "do not", not "don not"
    (re.compile(r"'re"), " are"),
    (re.compile(r"'s"), " of"),
    (re.compile(r"'d"), " would"),
    (re.compile(r"'ll"), " will"),
    (re.compile(r"'t"), " not"),
    (re.compile(r"'ve"), " have"),
    (re.compile(r"'m"), " am"),
)

# Words that separate the two ends of a time range.
_RANGE_WORDS = ("đến", " to ")
# Words that stand for "today" as the end of a range.
# NOTE: " đến nay" can never match because "đến" has already been replaced
# by the range separator; " nay" covers that case.
_PRESENT_WORDS = ("now", "hiện tại", " nay", " đến nay", "present")

# Labels recognised by parse_gender, in priority order.
_GENDER_LABELS = (
    "nam", "nữ", "female", "male", "bisexual", "asexual",
    "heterosexual", "homosexual", "lgbt",
)


def parse_string(inp: str, rep=" ", punc=punc, excp=()) -> str:
    """Lower-case *inp*, expand contractions and replace punctuation.

    Args:
        inp: Text to normalise.
        rep: Replacement for every stripped punctuation character.
        punc: Punctuation characters to strip. This sequence is only read,
            never modified.
        excp: Punctuation characters to keep for this call only.

    Returns:
        The normalised text with runs of whitespace collapsed to one space.
    """
    # Filter instead of calling punc.remove(): removing from the shared
    # module-level list would silently change every later call.
    keep = set(excp)
    text = inp.lower()
    for pattern, expansion in _CONTRACTIONS:
        text = pattern.sub(expansion, text)
    for mark in punc:
        if mark not in keep:
            text = text.replace(mark, rep)
    return " ".join(text.split())


def parse_time(inp: List) -> str:
    """Sum the lengths of the time ranges found in *inp*.

    Each string is expected to hold one ``start - end`` range. Strings that
    do not split into exactly two parts on ``-``, or whose parts cannot be
    parsed as dates, are skipped. The input list is not modified.

    Args:
        inp: Strings that each describe a time range.

    Returns:
        The total duration formatted as ``"<years> năm"`` with one decimal,
        where a year counts as 365 days.
    """
    total_days = 0
    # Year-first so dateutil cannot swap day and month when it parses the
    # substituted value back (its default reads 05/06/2024 as May 6th).
    today = datetime.now(timezone.utc).strftime("%Y/%m/%d")
    for raw in inp:
        text = raw.lower()
        for word in _RANGE_WORDS:
            text = text.replace(word, " - ")
        for word in _PRESENT_WORDS:
            text = text.replace(word, today)
        # Only digits and separators are meaningful from here on.
        text = "".join(ch for ch in text if not ch.isalpha())
        text = parse_string(text, rep="", excp=["/", "-"])
        bounds = text.split("-")
        if len(bounds) != 2:  # must split into exactly two time points
            continue
        try:
            start = parser.parse(bounds[0])
            end = parser.parse(bounds[1])
        except (ValueError, OverflowError):
            # Best effort: ignore ranges dateutil cannot interpret.
            continue
        total_days += (end.date() - start.date()).days
    return "{:.1f} năm".format(total_days / 365)


filename = "./skills.csv"
# Guess the file encoding from its raw bytes, e.g. "ISO-8859-1".
detected = chardet.detect(Path(filename).read_bytes())
_skill_table = pd.read_csv(filename, encoding=detected["encoding"])
# Drop empty cells first: they are floats (NaN) and have no .replace().
skill_list = [
    i.replace("\n", "") for i in _skill_table["Skill"].dropna().to_list()
]


def parse_skill(inp: List) -> str:
    """Return the known skills mentioned in *inp*.

    The strings are normalised with parse_string, joined, and every 1- to
    3-word n-gram is looked up in ``skill_list``. The input list is not
    modified.

    Args:
        inp: Text fragments to scan.

    Returns:
        The matched skills, capitalised, de-duplicated in order of first
        appearance and joined with ``". "``.
    """
    tokens = " ".join(parse_string(item) for item in inp).split()
    # parse_string lower-cases the text, so compare against lower-cased
    # entries. Built per call so later changes to skill_list are honoured.
    known = {skill.lower() for skill in skill_list}
    found = dict.fromkeys(
        ngram
        for ngram in map(" ".join, everygrams(tokens, 1, 3))
        if ngram in known
    )
    return ". ".join(skill.capitalize() for skill in found)


def parse_gender(inp: List) -> str:
    """Return the first recognised gender label found in *inp*.

    Labels are matched as whole words, so "name" or "vietnam" do not count
    as "nam".

    Args:
        inp: Text fragments to scan.

    Returns:
        The matching label, or ``""`` when none is present.
    """
    words = set(" ".join(parse_string(item) for item in inp).split())
    for label in _GENDER_LABELS:
        if label in words:
            return label
    return ""


def parse_address(inp: List) -> str:
    """Normalise address fragments, keeping commas.

    Args:
        inp: Address fragments.

    Returns:
        The fragments with every word capitalised, joined with ``". "``.
    """
    lines = []
    for item in inp:
        words = parse_string(item, excp=[","]).split()
        lines.append(" ".join(word.capitalize() for word in words))
    return ". ".join(lines)


def parse_designation(inp: List) -> str:
    """Normalise and de-duplicate designation strings.

    Args:
        inp: Designation fragments.

    Returns:
        The unique designations in order of first appearance, every word
        capitalised, joined with ``". "``.
    """
    unique = dict.fromkeys(parse_string(item) for item in inp)
    return ". ".join(
        " ".join(word.capitalize() for word in title.split())
        for title in unique
    )


def parse_email(inp: List) -> str:
    """Normalise and de-duplicate e-mail strings.

    Punctuation is deleted except for characters that are legal inside an
    address: ``@``, ``.``, ``_``, ``-`` and ``+``.

    Args:
        inp: E-mail fragments.

    Returns:
        The unique cleaned strings in order of first appearance, joined
        with a single space.
    """
    unique = dict.fromkeys(
        parse_string(item, rep="", excp=["@", ".", "_", "-", "+"])
        for item in inp
    )
    return " ".join(unique)