import streamlit as st
from keras.layers import (LSTM, Dropout, Bidirectional, Dense, Embedding, Flatten,
                          Maximum, Activation, BatchNormalization, SpatialDropout1D,
                          Input, Concatenate, Conv1D, TimeDistributed, ZeroPadding1D)
import keras.backend as K
from keras import regularizers
from keras import Model
import json
import numpy as np
# Lookup tables mapping characters and character types to integer ids.
with open('CHAR_TYPES_MAP.json') as json_file:
    CHAR_TYPES_MAP = json.load(json_file)
with open('CHARS_MAP.json') as json_file:
    CHARS_MAP = json.load(json_file)
with open('CHAR_TYPE_FLATTEN.json') as json_file:
    CHAR_TYPE_FLATTEN = json.load(json_file)
class TimestepDropout(Dropout):
    """Dropout variant that drops entire timesteps (noise shape (batch, steps, 1))."""

    def __init__(self, rate, **kwargs):
        super(TimestepDropout, self).__init__(rate, **kwargs)

    def _get_noise_shape(self, inputs):
        input_shape = K.shape(inputs)
        return (input_shape[0], input_shape[1], 1)
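# Sketch (assumption, matching the commented-out lines in model_() below):
# TimestepDropout would replace SpatialDropout1D on an embedded sequence,
# zeroing whole character positions instead of whole feature channels:
#   emb = Embedding(178, 32)(char_in)   # char_in: hypothetical (batch, 21) input
#   emb = TimestepDropout(0.05)(emb)    # drops entire timesteps at rate 0.05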
def model_(n_gram=21):
    """Builds the character-level segmentation network over n_gram-wide windows."""
    input1 = Input(shape=(n_gram,), dtype='float32', name='char_input')
    input2 = Input(shape=(n_gram,), dtype='float32', name='type_input')

    # Character-embedding branch with spatial dropout and batch normalization.
    a = Embedding(178, 32)(input1)
    a = SpatialDropout1D(0.15)(a)
    # a = TimestepDropout(0.05)(a)
    char_input = BatchNormalization()(a)

    # Multi-width convolutions over the character window; each branch is
    # zero-padded back to n_gram steps so they can be merged element-wise.
    a_concat = []
    filters = [[1, 200], [2, 200], [3, 200], [4, 200], [5, 200], [6, 200],
               [8, 200], [11, 150], [12, 100]]
    # filters = [[1,200],[2,200],[3,200],[4,200],[5,200],[6,200],[7,200],[8,200],[9,150],[10,150],[11,150],[12,100]]
    for (window_size, filters_size) in filters:
        convs = Conv1D(filters=filters_size, kernel_size=window_size, strides=1)(char_input)
        convs = Activation('elu')(convs)
        convs = TimeDistributed(Dense(5, input_shape=(n_gram, filters_size)))(convs)
        convs = ZeroPadding1D(padding=(0, window_size - 1))(convs)
        a_concat.append(convs)
    token_max = Maximum()(a_concat)

    # Recurrent branch over the embedded characters.
    lstm_char = Bidirectional(LSTM(128, return_sequences=True,
                                   kernel_regularizer=regularizers.L2(1e-7),
                                   bias_regularizer=regularizers.L2(1e-7)))(char_input)
    lstm_char = Dense(64, activation='elu')(lstm_char)
    # lstm_char = Bidirectional(LSTM(64, return_sequences=True))(lstm_char)
    # lstm_char = Attention(return_sequences=True)(lstm_char)

    # Character-type embedding branch.
    b = Embedding(12, 12)(input2)
    type_inputs = SpatialDropout1D(0.15)(b)
    # type_inputs = TimestepDropout(0.05)(b)

    # Merge all branches and predict a boundary probability for the centre character.
    x = Concatenate()([type_inputs, char_input, lstm_char, token_max])
    x = BatchNormalization()(x)
    x = Flatten()(x)
    x = Dense(100, activation='elu')(x)
    x = Dropout(0.2)(x)
    out = Dense(1, activation='sigmoid', dtype='float32',
                kernel_regularizer=regularizers.L2(0.01),
                bias_regularizer=regularizers.L2(0.01))(x)
    model = Model(inputs=[input1, input2], outputs=out)
    return model
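# Shape sketch (assumption: untrained weights, dummy zero-id inputs): the model
# maps two (batch, 21) id arrays to one (batch, 1) sigmoid score per character.
#   m = model_()
#   probs = m.predict([np.zeros((2, 21)), np.zeros((2, 21))])  # probs.shape == (2, 1)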
def create_feature_array(text, n_pad=21):
    """Builds the character-id and character-type-id context window for each character."""
    n = len(text)
    n_pad_2 = int((n_pad - 1) / 2)
    text_pad = [' '] * n_pad_2 + [t for t in text] + [' '] * n_pad_2
    x_char, x_type = [], []
    for i in range(n_pad_2, n_pad_2 + n):
        # Window layout: the next n_pad_2 characters, the previous n_pad_2
        # characters in reverse order, then the centre character itself.
        char_list = text_pad[i + 1: i + n_pad_2 + 1] + \
                    list(reversed(text_pad[i - n_pad_2: i])) + \
                    [text_pad[i]]
        char_map = [CHARS_MAP.get(c, 179) for c in char_list]
        char_type = [CHAR_TYPES_MAP.get(CHAR_TYPE_FLATTEN.get(c, 'o'), 4)
                     for c in char_list]
        x_char.append(char_map)
        x_type.append(char_type)
    x_char = np.array(x_char).astype(float)
    x_type = np.array(x_type).astype(float)
    return x_char, x_type
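# Worked example (hypothetical input): each character of the text yields one
# n_pad-wide row in both output arrays.
#   x_char, x_type = create_feature_array('กา')
#   # x_char.shape == (2, 21) and x_type.shape == (2, 21)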
def tokenize(text):
    """Splits Thai text into words using the trained model."""
    n_pad = 21
    if not text:
        return ['']
    x_char, x_type = create_feature_array(text, n_pad=n_pad)
    # Predict a per-character score and apply the tuned decision threshold.
    y_predict = model.predict([x_char, x_type], batch_size=512)
    y_predict = (y_predict.ravel() > 0.46542968749999997).astype(int)
    # Convert the predictions into end-of-word flags: shift by one position,
    # and the final character always ends a word.
    word_end = y_predict[1:].tolist() + [1]
    tokens = []
    word = ''
    for char, w_e in zip(text, word_end):
        word += char
        if w_e:
            tokens.append(word)
            word = ''
    return tokens
model = model_()
model.load_weights("cutto_tf2.h5")
st.title("Cutto Thai word seggmentation.")
text = st.text_area("Enter original text!")
if st.button("cut it!!"):
if text:
words = tokenize(text)
st.subheader("seggemt:")
st.write('|'.join(words))
else:
st.warning("Please enter some text to seggmentation")
multi = '''### Score
The model was evaluated on a test set held out from the BEST Corpus 2009 (10 percent of the data), with the following scores:
- F1-score: 98.37
- Precision: 98.02
- Recall: 98.67
### Resource Funding
We thank the NSTDA Supercomputer Center (ThaiSC) and the National e-Science Infrastructure Consortium for their support of computing facilities.
### Citation
If you use cutto in your project or publication, please cite the model as follows:
'''
st.markdown(multi)
st.code(f"""
ปรีชานนท์ ชาติไทย และ สัจจวัจน์ ส่งเสริม. (2567), การสรุปข้อความข่าวภาษาไทยด้วยโครงข่ายประสาทเทียม (Thai News Text Summarization Using Neural Network), วิทยาศาสตรบัณฑิต (วทบ.):ขอนแก่น, มหาวิทยาลัยขอนแก่น)
""")