chatthai commited on
Commit
db031ee
1 Parent(s): 46931c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -2
app.py CHANGED
@@ -1,6 +1,110 @@
1
  import streamlit as st
 
 
 
 
 
2
 
 
3
 
4
- text = st.text_area("Enter some text!")
 
5
 
6
- st.write(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys

import numpy as np
import streamlit as st

import keras.backend as K
from keras import initializers, regularizers, constraints, activations
from keras.initializers import Constant
from keras.layers import (
    LSTM, Dropout, Bidirectional, Dense, Embedding, Flatten, Maximum,
    Activation, Conv2D, LayerNormalization, add, BatchNormalization,
    SpatialDropout1D, Input, Layer, Multiply, Reshape, Add, GRU, Concatenate,
    Conv1D, TimeDistributed, ZeroPadding1D, concatenate, MaxPool1D,
    GlobalMaxPooling1D,
)
from keras.models import Model
7
 
8
class TimestepDropout(Dropout):
    """Dropout that zeroes whole timesteps ("word dropout").

    Standard ``Dropout`` draws an independent mask per feature; here the
    noise shape is ``(batch, timesteps, 1)`` so the mask broadcasts across
    the feature axis and an entire embedding vector is dropped at once.
    """

    # NOTE: the original defined ``__init__(self, rate, **kwargs)`` that only
    # forwarded to Dropout -- a redundant passthrough, so it is removed; the
    # inherited constructor accepts the same arguments.

    def _get_noise_shape(self, inputs):
        # Broadcast the dropout mask over the last (feature) dimension.
        input_shape = K.shape(inputs)
        return (input_shape[0], input_shape[1], 1)
17
+
18
+
19
def model_(n_gram=21):
    """Build the character-level word-segmentation network.

    The original accepted ``n_gram`` but ignored it (every window length was
    hard-coded to 21); it is now used throughout. The default of 21 matches
    the pretrained ``cutto_tf2.h5`` weights, so existing callers see no change.

    Args:
        n_gram: context window length in characters per sample.

    Returns:
        An uncompiled Keras ``Model`` taking ``[char_input, type_input]``
        (each shaped ``(batch, n_gram)``) and emitting one sigmoid
        probability per sample (character ends a word).
    """
    input1 = Input(shape=(n_gram,), dtype='float32', name='char_input')
    input2 = Input(shape=(n_gram,), dtype='float32', name='type_input')

    # Character embedding with both feature-level and timestep-level dropout.
    a = Embedding(180, 32, input_length=n_gram)(input1)
    a = SpatialDropout1D(0.1)(a)
    a = TimestepDropout(0.05)(a)
    char_input = BatchNormalization()(a)

    # Multi-width 1-D convolutions; each branch is zero-padded back to
    # n_gram timesteps so the branches can be combined element-wise.
    a_concat = []
    filters = [[1, 200], [2, 200], [3, 200], [4, 200], [5, 200], [6, 200],
               [7, 200], [8, 200], [9, 150], [10, 150], [11, 150], [12, 100]]
    for (window_size, filters_size) in filters:
        convs = Conv1D(filters=filters_size, kernel_size=window_size,
                       strides=1)(char_input)
        convs = Activation('elu')(convs)
        convs = TimeDistributed(
            Dense(5, input_shape=(n_gram, filters_size)))(convs)
        convs = ZeroPadding1D(padding=(0, window_size - 1))(convs)
        a_concat.append(convs)
    token_max = Maximum()(a_concat)
    lstm_char = Bidirectional(LSTM(100, return_sequences=True))(char_input)

    # Character-type embedding (12 coarse character classes).
    b = Embedding(12, 12, input_length=n_gram)(input2)
    b = SpatialDropout1D(0.1)(b)
    type_inputs = TimestepDropout(0.05)(b)

    x = Concatenate()([lstm_char, type_inputs, char_input, token_max])
    x = BatchNormalization()(x)

    x = Flatten()(x)
    x = Dense(200, activation='elu')(x)
    x = Dropout(0.2)(x)
    out = Dense(1, activation='sigmoid', dtype='float32')(x)

    # ``Model`` comes from keras.models; the original file never imported it,
    # which raised NameError at startup (fixed in the import block).
    model = Model(inputs=[input1, input2], outputs=out)
    return model
57
+
58
+
59
def create_feature_array(text, n_pad=21):
    """Turn *text* into the (char-id, char-type) context windows the model eats.

    For each character, the window is: the following ``(n_pad-1)//2``
    characters, then the preceding ``(n_pad-1)//2`` characters in reverse,
    then the character itself. Unknown characters map to id 179 / type 4.

    Relies on the module-level ``CHARS_MAP``, ``CHAR_TYPES_MAP`` and
    ``CHAR_TYPE_FLATTEN`` lookup tables (defined elsewhere in the project).

    Returns:
        ``(x_char, x_type)``: two float arrays of shape ``(len(text), n_pad)``.
    """
    half = (n_pad - 1) // 2
    padded = [' '] * half + list(text) + [' '] * half

    char_rows, type_rows = [], []
    for idx in range(half, half + len(text)):
        window = (padded[idx + 1: idx + half + 1]
                  + list(reversed(padded[idx - half: idx]))
                  + [padded[idx]])
        char_rows.append([CHARS_MAP.get(ch, 179) for ch in window])
        type_rows.append([CHAR_TYPES_MAP.get(CHAR_TYPE_FLATTEN.get(ch, 'o'), 4)
                          for ch in window])

    x_char = np.array(char_rows).astype(float)
    x_type = np.array(type_rows).astype(float)
    return x_char, x_type
77
+
78
def tokenize(text):
    """Segment *text* into words using the module-level ``model``.

    The network predicts, per character, the probability that it ends a
    word; a 0.4 threshold turns those probabilities into cut points.

    Returns:
        A list of word strings (``['']`` for empty input).
    """
    n_pad = 21

    if not text:
        return ['']

    # Python 2 compatibility shim: promote byte strings to unicode.
    if isinstance(text, str) and sys.version_info.major == 2:
        text = text.decode('utf-8')

    x_char, x_type = create_feature_array(text, n_pad=n_pad)

    probs = model.predict([x_char, x_type], batch_size=512)
    flags = (probs.ravel() > 0.4).astype(int)
    # Shift by one so the flag sits on the character closing each word, and
    # force a cut after the final character.
    word_end = flags[1:].tolist() + [1]

    tokens = []
    current = ''
    for ch, cut in zip(text, word_end):
        current += ch
        if cut:
            tokens.append(current)
            current = ''
    return tokens
102
+
103
+
104
# --- Streamlit app: build the network, load pretrained weights once, then
# segment whatever the user types and show the words joined by '|'.
model = model_()
model.load_weights("cutto_tf2.h5")

text = st.text_area("Enter original text!")
words = tokenize(text)

st.write('|'.join(words))