import streamlit as st
from keras.layers import (
    LSTM, Dropout, Bidirectional, Dense, Embedding, Flatten, Maximum,
    Activation, BatchNormalization, SpatialDropout1D, Input, Concatenate,
    Conv1D, TimeDistributed, ZeroPadding1D,
)
import keras.backend as K
from keras import regularizers
from keras import Model
import json
import numpy as np
# Lookup tables for featurization: CHARS_MAP maps characters to indices,
# CHAR_TYPE_FLATTEN maps characters to a type label, and CHAR_TYPES_MAP maps
# type labels to indices.
with open('CHAR_TYPES_MAP.json', encoding='utf-8') as json_file:
    CHAR_TYPES_MAP = json.load(json_file)
with open('CHARS_MAP.json', encoding='utf-8') as json_file:
    CHARS_MAP = json.load(json_file)
with open('CHAR_TYPE_FLATTEN.json', encoding='utf-8') as json_file:
    CHAR_TYPE_FLATTEN = json.load(json_file)
class TimestepDropout(Dropout):
    """Dropout that zeroes entire timesteps rather than individual features.

    The noise shape (batch, timesteps, 1) broadcasts one dropout decision
    across all features of a timestep. Currently unused (see model_()).
    """
    def __init__(self, rate, **kwargs):
        super(TimestepDropout, self).__init__(rate, **kwargs)

    def _get_noise_shape(self, inputs):
        input_shape = K.shape(inputs)
        noise_shape = (input_shape[0], input_shape[1], 1)
        return noise_shape
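# Illustrative note: for an embedded batch of shape (batch, 21, 32), the
# dropout mask above has shape (batch, 21, 1), so all 32 features of a
# timestep are kept or dropped together. It would slot in where
# SpatialDropout1D is used in model_(), e.g. a = TimestepDropout(0.05)(a).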
def model_(n_gram=21):
    """Builds the character-level segmentation network.

    Takes two inputs per character window (character indices and
    character-type indices) and outputs the probability that the focused
    character begins a new word (see tokenize() for how this becomes
    word boundaries).
    """
    input1 = Input(shape=(n_gram,), dtype='float32', name='char_input')
    input2 = Input(shape=(n_gram,), dtype='float32', name='type_input')

    # Character embedding with spatial dropout and batch normalisation.
    a = Embedding(178, 32, input_length=n_gram)(input1)
    a = SpatialDropout1D(0.15)(a)
    # a = TimestepDropout(0.05)(a)
    char_input = BatchNormalization()(a)

    # Parallel 1-D convolutions over several window sizes; each branch is
    # projected to 5 channels and zero-padded back to n_gram timesteps, then
    # the branches are merged with an element-wise maximum.
    a_concat = []
    filters = [[1, 200], [2, 200], [3, 200], [4, 200], [5, 200],
               [6, 200], [8, 200], [11, 150], [12, 100]]
    for (window_size, filters_size) in filters:
        convs = Conv1D(filters=filters_size, kernel_size=window_size, strides=1)(char_input)
        convs = Activation('elu')(convs)
        convs = TimeDistributed(Dense(5, input_shape=(n_gram, filters_size)))(convs)
        convs = ZeroPadding1D(padding=(0, window_size - 1))(convs)
        a_concat.append(convs)
    token_max = Maximum()(a_concat)

    # Recurrent branch over the character embeddings.
    lstm_char = Bidirectional(LSTM(128, return_sequences=True,
                                   kernel_regularizer=regularizers.L2(1e-7),
                                   bias_regularizer=regularizers.L2(1e-7)))(char_input)
    lstm_char = Dense(64, activation='elu')(lstm_char)

    # Character-type embedding branch.
    b = Embedding(12, 12, input_length=n_gram)(input2)
    type_inputs = SpatialDropout1D(0.15)(b)

    # Merge all branches and classify.
    x = Concatenate()([type_inputs, char_input, lstm_char, token_max])
    x = BatchNormalization()(x)
    x = Flatten()(x)
    x = Dense(100, activation='elu')(x)
    x = Dropout(0.2)(x)
    out = Dense(1, activation='sigmoid', dtype='float32',
                kernel_regularizer=regularizers.L2(0.01),
                bias_regularizer=regularizers.L2(0.01))(x)
    model = Model(inputs=[input1, input2], outputs=out)
    return model
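# Architecture at a glance (derived from model_ above): a 32-d character
# embedding feeds both a bank of Conv1D branches (window sizes 1 to 12, each
# projected to 5 channels, zero-padded back to 21 steps, and merged with an
# element-wise Maximum) and a Bidirectional LSTM; these join a 12-d
# character-type embedding, and the flattened result passes through a dense
# head emitting one sigmoid probability per 21-character window.
# Quick shape check (illustrative):
#   m = model_()
#   m.summary()  # two (None, 21) inputs, one (None, 1) output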
def create_feature_array(text, n_pad=21):
    """Builds one feature window of length n_pad per character of `text`.

    Each window holds the following characters, the preceding characters in
    reverse order, and finally the character itself.
    """
    n = len(text)
    n_pad_2 = int((n_pad - 1) / 2)
    text_pad = [' '] * n_pad_2 + [t for t in text] + [' '] * n_pad_2
    x_char, x_type = [], []
    for i in range(n_pad_2, n_pad_2 + n):
        char_list = text_pad[i + 1: i + n_pad_2 + 1] + \
                    list(reversed(text_pad[i - n_pad_2: i])) + \
                    [text_pad[i]]
        # Map characters (and their types) to integer indices; unknown
        # characters fall back to index 179 and type index 4.
        char_map = [CHARS_MAP.get(c, 179) for c in char_list]
        char_type = [CHAR_TYPES_MAP.get(CHAR_TYPE_FLATTEN.get(c, 'o'), 4)
                     for c in char_list]
        x_char.append(char_map)
        x_type.append(char_type)
    x_char = np.array(x_char).astype(float)
    x_type = np.array(x_type).astype(float)
    return x_char, x_type
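# Worked example (illustrative): with n_pad=21 the text is padded with 10
# spaces on each side, so for character i the window is
#   text_pad[i+1 : i+11] + reversed(text_pad[i-10 : i]) + [text_pad[i]]
# i.e. the next 10 characters, the previous 10 in reverse, then the character
# itself -- one row of length 21 in both x_char and x_type.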
def tokenize(text):
    """Segments `text` into a list of Thai words."""
    n_pad = 21
    if not text:
        return ['']
    x_char, x_type = create_feature_array(text, n_pad=n_pad)
    # Predict, per character, the probability that it begins a word, then
    # threshold. A character ends a word exactly when the next character
    # begins one, hence the shift by one position below.
    y_predict = model.predict([x_char, x_type], batch_size=512)
    y_predict = (y_predict.ravel() > 0.46542968749999997).astype(int)
    word_end = y_predict[1:].tolist() + [1]
    tokens = []
    word = ''
    for char, w_e in zip(text, word_end):
        word += char
        if w_e:
            tokens.append(word)
            word = ''
    return tokens
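# Usage sketch (illustrative; the exact split depends on the trained weights):
#   tokenize("ตัดคำภาษาไทย")  # -> e.g. ['ตัด', 'คำ', 'ภาษา', 'ไทย']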
# Build the network and load the pre-trained weights.
model = model_()
model.load_weights("cutto_tf2.h5")
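# Note (assumption): on Streamlit versions that provide st.cache_resource,
# the build-and-load step above could be wrapped in a cached helper so the
# model is not rebuilt on every script rerun, e.g.:
#   @st.cache_resource
#   def load_model():
#       m = model_()
#       m.load_weights("cutto_tf2.h5")
#       return m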
st.title("Cutto: Thai word segmentation")
st.write("ปรีชานนท์ ชาติไทย and สัจจวัจน์ ส่งเสริม (2567 B.E.). Thai News Text "
         "Summarization Using Neural Network "
         "(การสรุปข้อความข่าวภาษาไทยด้วยโครงข่ายประสาทเทียม). "
         "B.Sc. thesis, Khon Kaen University, Khon Kaen.")
text = st.text_area("Enter original text!")
if st.button("Cut it!"):
    if text:
        words = tokenize(text)
        st.subheader("Segments:")
        st.write('|'.join(words))
    else:
        st.warning("Please enter some text to segment.")