import json

import numpy as np
import streamlit as st
import keras.backend as K
from keras import Model, regularizers
from keras.layers import (LSTM, Dropout, Bidirectional, Dense, Embedding,
                          Flatten, Maximum, Activation, BatchNormalization,
                          SpatialDropout1D, Input, Concatenate, Conv1D,
                          TimeDistributed, ZeroPadding1D)

with open('CHAR_TYPES_MAP.json') as json_file:
    CHAR_TYPES_MAP = json.load(json_file)
with open('CHARS_MAP.json') as json_file:
    CHARS_MAP = json.load(json_file)
with open('CHAR_TYPE_FLATTEN.json') as json_file:
    CHAR_TYPE_FLATTEN = json.load(json_file)
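
# How the three maps are used below (inferred from create_feature_array):
#   CHARS_MAP:         character -> index into the 178-entry char embedding
#   CHAR_TYPE_FLATTEN: character -> coarse type code ('o' = other)
#   CHAR_TYPES_MAP:    type code -> index into the 12-entry type embedding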


class TimestepDropout(Dropout):
    """Dropout that drops entire timesteps: a masked position loses all of
    its features at once instead of individual feature values."""

    def _get_noise_shape(self, inputs):
        input_shape = K.shape(inputs)
        # Broadcast the mask over the last (feature) axis.
        noise_shape = (input_shape[0], input_shape[1], 1)
        return noise_shape
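
# Shape illustration for the class above (values assumed for the sketch):
# with rate=0.05 and inputs of shape (batch, 21, 32), the mask has shape
# (batch, 21, 1) and broadcasts across the 32 features, so a dropped
# position zeroes a character's whole embedding vector at once.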

def model_(n_gram=21):
    # Two parallel inputs: character indices and character-type indices,
    # each a window of n_gram positions centred on the target character.
    input1 = Input(shape=(n_gram,), dtype='float32', name='char_input')
    input2 = Input(shape=(n_gram,), dtype='float32', name='type_input')

    a = Embedding(178, 32)(input1)
    a = SpatialDropout1D(0.15)(a)
    #a = TimestepDropout(0.05)(a)
    char_input = BatchNormalization()(a)

    # Multi-width 1D convolutions over the character window; each branch is
    # projected to 5 channels and right-padded back to n_gram timesteps so
    # the branches can be merged element-wise by Maximum().
    a_concat = []
    filters = [[1, 200], [2, 200], [3, 200], [4, 200], [5, 200], [6, 200],
               [8, 200], [11, 150], [12, 100]]
    #filters = [[1,200],[2,200],[3,200],[4,200],[5,200],[6,200],[7,200],[8,200],[9,150],[10,150],[11,150],[12,100]]
    for window_size, filters_size in filters:
        convs = Conv1D(filters=filters_size, kernel_size=window_size, strides=1)(char_input)
        convs = Activation('elu')(convs)
        convs = TimeDistributed(Dense(5, input_shape=(n_gram, filters_size)))(convs)
        convs = ZeroPadding1D(padding=(0, window_size - 1))(convs)
        a_concat.append(convs)
    token_max = Maximum()(a_concat)

    lstm_char = Bidirectional(LSTM(128, return_sequences=True,
                                   kernel_regularizer=regularizers.L2(1e-7),
                                   bias_regularizer=regularizers.L2(1e-7)))(char_input)
    lstm_char = Dense(64, activation='elu')(lstm_char)
    #lstm_char = Bidirectional(LSTM(64, return_sequences=True))(lstm_char)
    #lstm_char = Attention(return_sequences=True)(lstm_char)

    b = Embedding(12, 12)(input2)
    type_inputs = SpatialDropout1D(0.15)(b)
    #type_inputs = TimestepDropout(0.05)(b)

    x = Concatenate()([type_inputs, char_input, lstm_char, token_max])
    x = BatchNormalization()(x)
    x = Flatten()(x)
    x = Dense(100, activation='elu')(x)
    x = Dropout(0.2)(x)
    out = Dense(1, activation='sigmoid', dtype='float32',
                kernel_regularizer=regularizers.L2(0.01),
                bias_regularizer=regularizers.L2(0.01))(x)
    model = Model(inputs=[input1, input2], outputs=out)
    return model
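
# Minimal standalone sketch of the model (kept commented out so the
# Streamlit app below remains the single entry point):
#   m = model_()
#   m.load_weights("cutto_tf2.h5")
#   probs = m.predict([np.zeros((4, 21)), np.zeros((4, 21))])
#   print(probs.shape)  # (4, 1): one boundary probability per window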


def create_feature_array(text, n_pad=21):
    """Build the character-index and character-type windows for every
    position in `text`, padded with spaces at both edges."""
    n = len(text)
    n_pad_2 = int((n_pad - 1) / 2)
    text_pad = [' '] * n_pad_2 + list(text) + [' '] * n_pad_2
    x_char, x_type = [], []
    for i in range(n_pad_2, n_pad_2 + n):
        # Window layout: the next n_pad_2 characters, then the previous
        # n_pad_2 characters in reverse, then the current character itself.
        char_list = text_pad[i + 1: i + n_pad_2 + 1] + \
                    list(reversed(text_pad[i - n_pad_2: i])) + \
                    [text_pad[i]]
        # Note: the fallback index 179 lies outside the 178-entry char
        # embedding, so CHARS_MAP is presumably expected to cover every
        # character (including an explicit unknown) that can reach here.
        char_map = [CHARS_MAP.get(c, 179) for c in char_list]
        char_type = [CHAR_TYPES_MAP.get(CHAR_TYPE_FLATTEN.get(c, 'o'), 4)
                     for c in char_list]
        x_char.append(char_map)
        x_type.append(char_type)
    x_char = np.array(x_char).astype(float)
    x_type = np.array(x_type).astype(float)
    return x_char, x_type
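
# A worked shape example (sketch, values implied by the defaults above):
# with n_pad=21 each position gets a 21-slot window of 10 following
# characters, 10 preceding characters in reverse, and the character itself,
# so create_feature_array returns two arrays of shape (len(text), 21).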

def tokenize(text):
    n_pad = 21
    if not text:
        return ['']
    x_char, x_type = create_feature_array(text, n_pad=n_pad)
    # Per-character boundary probabilities, binarised with a tuned decision
    # threshold rather than the default 0.5.
    y_predict = model.predict([x_char, x_type], batch_size=512)
    y_predict = (y_predict.ravel() > 0.46542968749999997).astype(int)
    # Shift the flags left by one: a boundary predicted at position i + 1
    # marks position i as the last character of a word, and the final
    # character always closes a word.
    word_end = y_predict[1:].tolist() + [1]
    tokens = []
    word = ''
    for char, w_e in zip(text, word_end):
        word += char
        if w_e:
            tokens.append(word)
            word = ''
    return tokens
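
# Usage sketch (the split shown is illustrative; actual output depends on
# the trained weights loaded below):
#   tokenize("ตัดคำภาษาไทย")  # e.g. ['ตัด', 'คำ', 'ภาษา', 'ไทย']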

model = model_()
model.load_weights("cutto_tf2.h5")

st.title("Cutto: Thai word segmentation")
st.write("ปรีชานนท์ ชาติไทย and สัจจวัจน์ ส่งเสริม (2024). "
         "Thai News Text Summarization Using Neural Network. "
         "B.Sc. thesis, Khon Kaen University, Khon Kaen.")
text = st.text_area("Enter the text to segment")
if st.button("Cut it!"):
    if text:
        words = tokenize(text)
        st.subheader("Segments:")
        st.write('|'.join(words))
    else:
        st.warning("Please enter some text to segment.")