File size: 3,826 Bytes
b149299
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
027387e
b149299
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import re

# Maps a regular expression matching a Portuguese contraction to the
# replacement template holding its expanded form.
#
# Every pattern is wrapped in the same guards:
#   (?<![\w.])  - the match must not be preceded by a word character or a dot
#   (?![$\w])   - the match must not be followed by a word character or a
#                 literal dollar sign
# An optional "(s)" group captures a plural suffix, which the template puts
# back through "\g<1>".
#
# Insertion order is the order in which expand_contractions applies the rules.
contractions = {
    r"(?<![\w.])no(s)?(?![$\w])": r"em o\g<1>",
    r"(?<![\w.])na(s)?(?![$\w])": r"em a\g<1>",
    r"(?<![\w.])da(s)?(?![$\w])": r"de a\g<1>",
    r"(?<![\w.])do(s)?(?![$\w])": r"de o\g<1>",
    r"(?<![\w.])ao(s)?(?![$\w])": r"a o\g<1>",
    r"(?<![\w.])à(s)?(?![$\w])": r"a a\g<1>",
    r"(?<![\w.])pela(s)?(?![$\w])": r"por a\g<1>",
    r"(?<![\w.])pelo(s)?(?![$\w])": r"por o\g<1>",
    r"(?<![\w.])nesta(s)?(?![$\w])": r"em esta\g<1>",
    r"(?<![\w.])neste(s)?(?![$\w])": r"em este\g<1>",
    r"(?<![\w.])nessa(s)?(?![$\w])": r"em essa\g<1>",
    r"(?<![\w.])nesse(s)?(?![$\w])": r"em esse\g<1>",
    r"(?<![\w.])num(?![$\w])": r"em um",
    r"(?<![\w.])nuns(?![$\w])": r"em uns",
    r"(?<![\w.])numa(s)?(?![$\w])": r"em uma\g<1>",
    r"(?<![\w.])nisso(?![$\w])": r"em isso",
    r"(?<![\w.])naquele(s)?(?![$\w])": r"em aquele\g<1>",
    r"(?<![\w.])naquela(s)?(?![$\w])": r"em aquela\g<1>",
    r"(?<![\w.])naquilo(?![$\w])": r"em aquilo",
    r"(?<![\w.])duma(s)?(?![$\w])": r"de uma\g<1>",
    r"(?<![\w.])daqui(?![$\w])": r"de aqui",
    r"(?<![\w.])dali(?![$\w])": r"de ali",
    r"(?<![\w.])daquele(s)?(?![$\w])": r"de aquele\g<1>",
    r"(?<![\w.])daquela(s)?(?![$\w])": r"de aquela\g<1>",
    r"(?<![\w.])deste(s)?(?![$\w])": r"de este\g<1>",
    r"(?<![\w.])desta(s)?(?![$\w])": r"de esta\g<1>",
    r"(?<![\w.])desse(s)?(?![$\w])": r"de esse\g<1>",
    r"(?<![\w.])dessa(s)?(?![$\w])": r"de essa\g<1>",
    r"(?<![\w.])daí(?![$\w])": r"de aí",
    r"(?<![\w.])dum(?![$\w])": r"de um",
    r"(?<![\w.])donde(?![$\w])": r"de onde",
    r"(?<![\w.])disto(?![$\w])": r"de isto",
    r"(?<![\w.])disso(?![$\w])": r"de isso",
    r"(?<![\w.])daquilo(?![$\w])": r"de aquilo",
    r"(?<![\w.])dela(s)?(?![$\w])": r"de ela\g<1>",
    r"(?<![\w.])dele(s)?(?![$\w])": r"de ele\g<1>",
    r"(?<![\w.])nisto(?![$\w])": r"em isto",
    r"(?<![\w.])nele(s)?(?![$\w])": r"em ele\g<1>",
    r"(?<![\w.])nela(s)?(?![$\w])": r"em ela\g<1>",
    # The optional apostrophe also covers the spellings d'ele / d'ela.
    r"(?<![\w.])d'?ele(s)?(?![$\w])": r"de ele\g<1>",
    r"(?<![\w.])d'?ela(s)?(?![$\w])": r"de ela\g<1>",
    r"(?<![\w.])noutro(s)?(?![$\w])": r"em outro\g<1>",
    r"(?<![\w.])aonde(?![$\w])": r"a onde",
    r"(?<![\w.])àquela(s)?(?![$\w])": r"a aquela\g<1>",
    r"(?<![\w.])àquele(s)?(?![$\w])": r"a aquele\g<1>",
    r"(?<![\w.])àquilo(?![$\w])": r"a aquilo",
    r"(?<![\w.])contigo(?![$\w])": r"com ti",
    r"(?<![\w.])né(?![$\w])": r"não é",
    r"(?<![\w.])comigo(?![$\w])": r"com mim",
    r"(?<![\w.])conosco(?![$\w])": r"com nós",
    r"(?<![\w.])consigo(?![$\w])": r"com si",
    r"(?<![\w.])pra(?![$\w])": r"para a",
    r"(?<![\w.])pro(?![$\w])": r"para o",
}


def replace_keep_case(word, replacement, text):
    """
    Substitute a pattern in a text, mirroring the casing of each match.

    Parameters
    ----------
    word: str
        Regular expression to look for; it is matched ignoring case.
    replacement: str
        Replacement template; group references such as ``\\g<1>`` are
        expanded against each match.
    text:
        Text to be processed.

    Returns
    -------
    str:
        Text in which every match was replaced by the expanded template,
        lower-cased, capitalized or upper-cased to follow the matched text.
        A match with mixed casing gets the template as written.
    """

    def _mirror_case(found):
        original = found.group()
        expanded = found.expand(replacement)
        # Title case is tested before upper case, so a single capital
        # letter is treated as a capitalized word.
        if original.islower():
            cased = expanded.lower()
        elif original.istitle():
            cased = expanded.capitalize()
        elif original.isupper():
            cased = expanded.upper()
        else:
            cased = expanded
        return cased

    matcher = re.compile(word, flags=re.I)
    return matcher.sub(_mirror_case, text)


def expand_contractions(text: str) -> str:
    """
    Expand the contractions found in a text into their base forms.

    Each rule of the module-level ``contractions`` mapping is applied in
    turn, in the mapping's iteration order, to the result of the previous
    rule.

    Parameters
    ----------
    text: str
        Text that may contain contractions.

    Returns
    -------
    str:
        Text with expanded contractions.
    """
    expanded = text
    for pattern, template in contractions.items():
        expanded = replace_keep_case(pattern, template, expanded)
    return expanded