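# Streamlit demo: character-level word segmentation. A CNN/BiLSTM model
# scores each character of the input inside a 21-character window, and the
# app splits the text at the predicted word boundaries.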
import json

import numpy as np
import streamlit as st

import keras.backend as K
from keras import Model
from keras.layers import (
    Activation, BatchNormalization, Bidirectional, Concatenate, Conv1D,
    Dense, Dropout, Embedding, Flatten, Input, LSTM, Maximum,
    SpatialDropout1D, TimeDistributed, ZeroPadding1D,
)

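# Lookup tables mapping each character (and its character type) to the
# integer ids expected by the embedding layers; unseen characters fall back
# to the default ids used in create_feature_array() below.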
with open('CHAR_TYPES_MAP.json', encoding='utf-8') as json_file:
    CHAR_TYPES_MAP = json.load(json_file)
with open('CHARS_MAP.json', encoding='utf-8') as json_file:
    CHARS_MAP = json.load(json_file)
with open('CHAR_TYPE_FLATTEN.json', encoding='utf-8') as json_file:
    CHAR_TYPE_FLATTEN = json.load(json_file)


class TimestepDropout(Dropout):
    """Dropout that zeroes out entire timesteps (all features of a character
    position at once) rather than individual activations."""

    def _get_noise_shape(self, inputs):
        input_shape = K.shape(inputs)
        # One mask value per (sample, timestep), broadcast across features.
        return (input_shape[0], input_shape[1], 1)


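# Build the boundary classifier: parallel Conv1D branches over character
# embeddings (window sizes 1-12, merged with an element-wise Maximum), a
# BiLSTM over the same embeddings, and character-type embeddings, all
# concatenated and flattened into a dense head with a single sigmoid output.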
def model_(n_gram=21):
    # Two parallel inputs: character ids and character-type ids for an
    # n_gram-wide window around each character.
    input1 = Input(shape=(n_gram,), dtype='float32', name='char_input')
    input2 = Input(shape=(n_gram,), dtype='float32', name='type_input')

    a = Embedding(180, 32, input_length=n_gram)(input1)
    a = SpatialDropout1D(0.1)(a)
    a = TimestepDropout(0.05)(a)
    char_input = BatchNormalization()(a)

    # Each convolutional branch captures character n-grams of one width.
    # Every branch is projected down to 5 channels and right-padded by
    # (window_size - 1) so all branches keep the same temporal length
    # before the Maximum merge.
    a_concat = []
    filters = [[1, 200], [2, 200], [3, 200], [4, 200], [5, 200], [6, 200],
               [7, 200], [8, 200], [9, 150], [10, 150], [11, 150], [12, 100]]

    for (window_size, filters_size) in filters:
        convs = Conv1D(filters=filters_size, kernel_size=window_size, strides=1)(char_input)
        convs = Activation('elu')(convs)
        convs = TimeDistributed(Dense(5))(convs)
        convs = ZeroPadding1D(padding=(0, window_size - 1))(convs)
        a_concat.append(convs)
    token_max = Maximum()(a_concat)

    lstm_char = Bidirectional(LSTM(100, return_sequences=True))(char_input)

    b = Embedding(12, 12, input_length=n_gram)(input2)
    b = SpatialDropout1D(0.1)(b)
    type_inputs = TimestepDropout(0.05)(b)

    # Merge BiLSTM features, type embeddings, raw character embeddings and
    # the max-pooled convolutional features, then classify the window.
    x = Concatenate()([lstm_char, type_inputs, char_input, token_max])
    x = BatchNormalization()(x)

    x = Flatten()(x)
    x = Dense(200, activation='elu')(x)
    x = Dropout(0.2)(x)
    out = Dense(1, activation='sigmoid', dtype='float32')(x)

    model = Model(inputs=[input1, input2], outputs=out)
    return model


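# Encode `text` into the two model inputs: an (n, n_pad) array of character
# ids and a parallel array of character-type ids, one row per character.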
def create_feature_array(text, n_pad=21):
    # Pad the text with spaces so every character has a full window of
    # (n_pad - 1) / 2 neighbours on each side.
    n = len(text)
    n_pad_2 = int((n_pad - 1) / 2)
    text_pad = [' '] * n_pad_2 + [t for t in text] + [' '] * n_pad_2
    x_char, x_type = [], []
    for i in range(n_pad_2, n_pad_2 + n):
        # Window layout: the next n_pad_2 characters, the previous n_pad_2
        # characters in reverse order, then the character itself.
        char_list = text_pad[i + 1: i + n_pad_2 + 1] + \
                    list(reversed(text_pad[i - n_pad_2: i])) + \
                    [text_pad[i]]
        # Unknown characters map to id 179, unknown types to 4 ('other').
        char_map = [CHARS_MAP.get(c, 179) for c in char_list]
        char_type = [CHAR_TYPES_MAP.get(CHAR_TYPE_FLATTEN.get(c, 'o'), 4)
                     for c in char_list]
        x_char.append(char_map)
        x_type.append(char_type)
    x_char = np.array(x_char).astype(float)
    x_type = np.array(x_type).astype(float)
    return x_char, x_type

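# Split `text` into tokens: the model flags word-start characters, which
# tokenize() converts into end-of-word cut points.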
def tokenize(text):
    n_pad = 21

    if not text:
        return ['']

    x_char, x_type = create_feature_array(text, n_pad=n_pad)

    # The model scores each character; a score above 0.4 marks the start of
    # a new word. Shifting the flags left by one turns "word starts here"
    # into "word ends at the previous character", and the final character
    # always ends a word.
    y_predict = model.predict([x_char, x_type], batch_size=512)
    y_predict = (y_predict.ravel() > 0.4).astype(int)
    word_end = y_predict[1:].tolist() + [1]

    tokens = []
    word = ''
    for char, w_e in zip(text, word_end):
        word += char
        if w_e:
            tokens.append(word)
            word = ''
    return tokens


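# Build the network and restore the trained weights (the .h5 file is
# expected next to this script).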
model = model_()
model.load_weights("cutto_tf2.h5")

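# Minimal Streamlit UI: read text, tokenize it, and display the tokens
# separated by '|'.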
text = st.text_area("Enter original text!")
words = tokenize(text)

st.write('|'.join(words))