harshvardhan96 committed on
Commit
421e135
1 Parent(s): 1029368

Delete custom-chatbot.ipynb

Files changed (1)
custom-chatbot.ipynb +0 -1
custom-chatbot.ipynb DELETED
@@ -1 +0,0 @@
- {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# **Chatbot using Seq2Seq LSTM models**","metadata":{"id":"6xoKYBBO6xaV"}},{"cell_type":"markdown","source":"# Step 1: Import all the packages ","metadata":{"id":"mVuZTAV08qWY"}},{"cell_type":"code","source":"import numpy as np \nimport tensorflow as tf\nimport pickle\nfrom tensorflow.keras import layers, activations, models, preprocessing\nfrom tensorflow.keras import preprocessing, utils\nimport os\nimport yaml","metadata":{"id":"U0mJXRse83hp","execution":{"iopub.status.busy":"2023-07-23T17:55:07.814789Z","iopub.execute_input":"2023-07-23T17:55:07.815567Z","iopub.status.idle":"2023-07-23T17:55:07.820584Z","shell.execute_reply.started":"2023-07-23T17:55:07.815523Z","shell.execute_reply":"2023-07-23T17:55:07.819803Z"},"trusted":true},"execution_count":13,"outputs":[]},{"cell_type":"markdown","source":"# Step 3: Preprocessing the data","metadata":{"id":"l4kJp6uO-fQE"}},{"cell_type":"markdown","source":"### a) Reading the data from the files\nWe parse each of the .yaml files.\n\n1. Concatenate two or more sentences if the answer has two or more of them.\n2. Remove unwanted data types which are produced while parsing the data.\n3. Append <START> and <END> to all the answers.\n4. Create a Tokenizer and load the whole vocabulary ( questions + answers ) into it.","metadata":{"id":"QEV_hSXs-7mF"}},{"cell_type":"markdown","source":"The dataset contains .yml files which have pairs of different questions and their answers on varied subjects like history, bot profile, science etc.\nWe can easily read them as folows:","metadata":{"id":"qyUnopqjDjud"}},{"cell_type":"code","source":"dir_path = '/kaggle/input/chatterbotenglish/'\nfiles_list = os.listdir(dir_path + os.sep)","metadata":{"id":"RxG-s4k0CowI","execution":{"iopub.status.busy":"2023-07-23T17:02:53.045463Z","iopub.execute_input":"2023-07-23T17:02:53.045772Z","iopub.status.idle":"2023-07-23T17:02:53.061189Z","shell.execute_reply.started":"2023-07-23T17:02:53.045737Z","shell.execute_reply":"2023-07-23T17:02:53.060184Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"questions = list()\nanswers = list()\n\nfor filepath in files_list:\n stream = open( dir_path + os.sep + filepath , 'rb')\n docs = yaml.safe_load(stream)\n conversations = docs['conversations']\n for con in conversations:\n if len( con ) > 2 :\n questions.append(con[0])\n replies = con[ 1 : ]\n ans = ''\n for rep in replies:\n ans += ' ' + rep\n answers.append( ans )\n elif len( con )> 1:\n questions.append(con[0])\n answers.append(con[1])\n\nanswers_with_tags = list()\nfor i in range( len( answers ) ):\n if type( answers[i] ) == str:\n answers_with_tags.append( answers[i] )\n else:\n questions.pop( i )\n\nanswers = list()\nfor i in range( len( answers_with_tags ) ) :\n answers.append( '<START> ' + answers_with_tags[i] + ' <END>' )\n\ntokenizer = preprocessing.text.Tokenizer()\ntokenizer.fit_on_texts( questions + answers )\nVOCAB_SIZE = len( tokenizer.word_index )+1\nprint( 'VOCAB SIZE : {}'.format( VOCAB_SIZE 
))","metadata":{"id":"-bRvbQ00Coy5","outputId":"ef129a1c-3071-4d10-e6b2-7ac5a6ff2bac","execution":{"iopub.status.busy":"2023-07-23T17:02:54.876987Z","iopub.execute_input":"2023-07-23T17:02:54.877282Z","iopub.status.idle":"2023-07-23T17:02:55.244957Z","shell.execute_reply.started":"2023-07-23T17:02:54.877248Z","shell.execute_reply":"2023-07-23T17:02:55.244150Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### b) Preparing data for Seq2Seq model\n\nThis model requires 3 arrays encoder_input_data, decoder_input_data and decoder_output_data.\n\nFor encoder_input_data:\nTokensize the Questions and Pad them to their maximum Length.\n\nFor decoder_input_data:\nTokensize the Answers and Pad them to their maximum Length.\n\nFor decoder_output_data:\nTokensize the Answers and Remove the 1st element from all the tokenized_answers. This is the <START> element which was added earlier.","metadata":{"id":"WMPqb8LxIeGI"}},{"cell_type":"code","source":"from gensim.models import Word2Vec\nimport re","metadata":{"id":"oEfAPL4HCo1t","execution":{"iopub.status.busy":"2023-07-23T17:02:58.444355Z","iopub.execute_input":"2023-07-23T17:02:58.445174Z","iopub.status.idle":"2023-07-23T17:02:59.032740Z","shell.execute_reply.started":"2023-07-23T17:02:58.445124Z","shell.execute_reply":"2023-07-23T17:02:59.031934Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"vocab = []\nfor word in tokenizer.word_index:\n vocab.append(word)\n\ndef tokenize(sentences):\n tokens_list = []\n vocabulary = []\n for sentence in sentences:\n sentence = sentence.lower()\n sentence = re.sub('[^a-zA-Z]', ' ', sentence)\n tokens = sentence.split()\n vocabulary += tokens\n tokens_list.append(tokens)\n return tokens_list, vocabulary","metadata":{"id":"QqYoDsbSCo4f","execution":{"iopub.status.busy":"2023-07-23T17:02:59.318612Z","iopub.execute_input":"2023-07-23T17:02:59.318922Z","iopub.status.idle":"2023-07-23T17:02:59.325560Z","shell.execute_reply.started":"2023-07-23T17:02:59.318889Z","shell.execute_reply":"2023-07-23T17:02:59.324835Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#encoder_input_data\ntokenized_questions = tokenizer.texts_to_sequences( questions )\nmaxlen_questions = max( [len(x) for x in tokenized_questions ] )\nprint(maxlen_questions)\npadded_questions = preprocessing.sequence.pad_sequences( tokenized_questions, maxlen = maxlen_questions, padding = 'post')\nencoder_input_data = np.array(padded_questions)\nprint(encoder_input_data.shape, maxlen_questions)","metadata":{"id":"9vKhieIwCo7J","outputId":"e97b4a74-7384-478c-d4ae-082513257107","execution":{"iopub.status.busy":"2023-07-23T17:03:00.373561Z","iopub.execute_input":"2023-07-23T17:03:00.374167Z","iopub.status.idle":"2023-07-23T17:03:00.392248Z","shell.execute_reply.started":"2023-07-23T17:03:00.374126Z","shell.execute_reply":"2023-07-23T17:03:00.391360Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# decoder_input_data\ntokenized_answers = tokenizer.texts_to_sequences( answers )\nmaxlen_answers = max( [ len(x) for x in tokenized_answers ] )\nprint(maxlen_answers)\npadded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )\ndecoder_input_data = np.array( padded_answers )\nprint( decoder_input_data.shape , maxlen_answers 
)","metadata":{"id":"AJo7WPjLCo-q","outputId":"28b5e209-5389-4313-f3cd-f5451fa8c519","execution":{"iopub.status.busy":"2023-07-23T17:03:05.261867Z","iopub.execute_input":"2023-07-23T17:03:05.262722Z","iopub.status.idle":"2023-07-23T17:03:05.292070Z","shell.execute_reply.started":"2023-07-23T17:03:05.262657Z","shell.execute_reply":"2023-07-23T17:03:05.291345Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# decoder_output_data\ntokenized_answers = tokenizer.texts_to_sequences( answers )\nfor i in range(len(tokenized_answers)) :\n tokenized_answers[i] = tokenized_answers[i][1:]\npadded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )\nonehot_answers = utils.to_categorical( padded_answers , VOCAB_SIZE )\ndecoder_output_data = np.array( onehot_answers )\nprint( decoder_output_data.shape )","metadata":{"id":"ccY0wWdRCpCa","outputId":"07877cda-7e07-42b2-bb7e-772d118a3cf0","execution":{"iopub.status.busy":"2023-07-23T16:48:14.572481Z","iopub.execute_input":"2023-07-23T16:48:14.572833Z","iopub.status.idle":"2023-07-23T16:48:14.850481Z","shell.execute_reply.started":"2023-07-23T16:48:14.572792Z","shell.execute_reply":"2023-07-23T16:48:14.849696Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Saving tokenizer params for reloading tokenizer during inference time","metadata":{}},{"cell_type":"code","source":"with open('tokenizer.pkl', 'wb') as f:\n pickle.dump(tokenizer, f)","metadata":{"execution":{"iopub.status.busy":"2023-07-23T16:48:30.551736Z","iopub.execute_input":"2023-07-23T16:48:30.552488Z","iopub.status.idle":"2023-07-23T16:48:30.561895Z","shell.execute_reply.started":"2023-07-23T16:48:30.552449Z","shell.execute_reply":"2023-07-23T16:48:30.561003Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"with open('tokenizer_params.pkl', 'wb') as f:\n tokenizer_params = {\n 'word_index': tokenizer.word_index,\n 'maxlen_questions' : maxlen_questions,\n 'maxlen_answers': maxlen_answers\n # Add other tokenizer attributes you might need for inference\n }\n pickle.dump(tokenizer_params, f)","metadata":{"execution":{"iopub.status.busy":"2023-07-23T17:20:18.427552Z","iopub.execute_input":"2023-07-23T17:20:18.428077Z","iopub.status.idle":"2023-07-23T17:20:18.437099Z","shell.execute_reply.started":"2023-07-23T17:20:18.428036Z","shell.execute_reply":"2023-07-23T17:20:18.435877Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Using Glove Embedding for better embeddings computation and better understanding of the context","metadata":{}},{"cell_type":"code","source":"glove_path = \"/kaggle/input/glove-embeddings/glove.6B.100d.txt\"\nembedding_dim = 100","metadata":{"execution":{"iopub.status.busy":"2023-07-23T16:48:14.851752Z","iopub.execute_input":"2023-07-23T16:48:14.852514Z","iopub.status.idle":"2023-07-23T16:48:14.856676Z","shell.execute_reply.started":"2023-07-23T16:48:14.852463Z","shell.execute_reply":"2023-07-23T16:48:14.855645Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"embeddings_index = {}\nwith open(glove_path, \"r\", encoding=\"utf-8\") as f:\n for line in f:\n values = line.split()\n word = values[0]\n coefs = np.asarray(values[1:], dtype=\"float32\")\n embeddings_index[word] = 
coefs","metadata":{"execution":{"iopub.status.busy":"2023-07-23T16:48:14.857994Z","iopub.execute_input":"2023-07-23T16:48:14.858817Z","iopub.status.idle":"2023-07-23T16:48:27.279229Z","shell.execute_reply.started":"2023-07-23T16:48:14.858780Z","shell.execute_reply":"2023-07-23T16:48:27.278437Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"embedding_matrix = np.zeros((VOCAB_SIZE, embedding_dim))\nfor word, i in tokenizer.word_index.items():\n embedding_vector = embeddings_index.get(word)\n if embedding_vector is not None:\n embedding_matrix[i] = embedding_vector","metadata":{"execution":{"iopub.status.busy":"2023-07-23T16:48:27.280408Z","iopub.execute_input":"2023-07-23T16:48:27.280694Z","iopub.status.idle":"2023-07-23T16:48:27.292402Z","shell.execute_reply.started":"2023-07-23T16:48:27.280658Z","shell.execute_reply":"2023-07-23T16:48:27.291575Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Step 4: Defining Encoder Decoder Model\n\n\n\n","metadata":{"id":"-D53pyucPCnk"}},{"cell_type":"code","source":"encoder_inputs = tf.keras.layers.Input(shape=( maxlen_questions , ))\nencoder_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, embedding_dim, mask_zero=True, weights=[embedding_matrix], trainable=False)(encoder_inputs)\nencoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 200 , return_state=True )( encoder_embedding )\nencoder_states = [ state_h , state_c ]\n\ndecoder_inputs = tf.keras.layers.Input(shape=( maxlen_answers , ))\ndecoder_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, embedding_dim, mask_zero=True, weights=[embedding_matrix], trainable=False)(decoder_inputs)\ndecoder_lstm = tf.keras.layers.LSTM( 200 , return_state=True , return_sequences=True )\ndecoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )\ndecoder_dense = tf.keras.layers.Dense( VOCAB_SIZE , activation=tf.keras.activations.softmax ) \noutput = decoder_dense ( decoder_outputs )\n\nmodel = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )\nmodel.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy', metrics=['accuracy'])\n\nmodel.summary()","metadata":{"id":"W3YjCFDwPRVN","outputId":"7bc112a0-6945-4100-e8d9-3bc5691797e5","execution":{"iopub.status.busy":"2023-07-23T12:45:01.209359Z","iopub.execute_input":"2023-07-23T12:45:01.209640Z","iopub.status.idle":"2023-07-23T12:45:02.970804Z","shell.execute_reply.started":"2023-07-23T12:45:01.209608Z","shell.execute_reply":"2023-07-23T12:45:02.970038Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Step 5: Training the Model\n\nWe train the model for a number of epochs with RMSprop optimizer and categorical_crossentropy loss function.","metadata":{"id":"wVfSormAPb3w"}},{"cell_type":"code","source":"model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=50, epochs=150 ) \nmodel.save( 'model.h5' )","metadata":{"id":"OHlqQq64PYTH","outputId":"c477c08f-55e4-41ac-b8b7-1973aa38f48a","execution":{"iopub.status.busy":"2023-07-23T12:45:13.040687Z","iopub.execute_input":"2023-07-23T12:45:13.041452Z","iopub.status.idle":"2023-07-23T12:46:25.775676Z","shell.execute_reply.started":"2023-07-23T12:45:13.041415Z","shell.execute_reply":"2023-07-23T12:46:25.774900Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Step 6: Defining Inference Models\n\nEncoder Inference Model: Takes questions as input and outputs LSTM 
# Step 6: Defining Inference Models

Encoder inference model: takes a question as input and outputs the LSTM states (h and c).

Decoder inference model: takes two inputs, the encoder's LSTM states and the answer sequence generated so far; it outputs the next tokens of the answer for the question fed to the encoder, along with its updated state values.

```python
def make_inference_models():
    # Encoder: question in, (h, c) states out.
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    encoder_model.save('encoder_model.h5')

    # Decoder: previous token plus states in, next-token distribution plus new states out.
    decoder_state_input_h = tf.keras.layers.Input(shape=(200,))
    decoder_state_input_c = tf.keras.layers.Input(shape=(200,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)

    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    decoder_model.save('decoder_model.h5')

    return encoder_model, decoder_model
```
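Since the function above also saves both models to disk, a fresh session could skip rebuilding them and reload the saved files directly; a minimal sketch, assuming only the file names used above:

```python
# Reload the saved inference models in a new process.
enc_model = tf.keras.models.load_model('encoder_model.h5')
dec_model = tf.keras.models.load_model('decoder_model.h5')
```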
# Step 7: Talking with the Chatbot

We define a helper `str_to_tokens` that converts a question string into padded integer tokens, then decode answers as follows:

1. Take a question as input and predict the state values using `enc_model`.
2. Set the state values in the decoder's LSTM.
3. Generate a target sequence containing only the `start` token.
4. Feed this sequence, together with the states, into `dec_model`.
5. Replace the token with the one predicted by `dec_model` and update the state values.
6. Carry out the above steps iteratively until we hit the `end` tag or the maximum answer length.

```python
def str_to_tokens(sentence: str):
    words = sentence.lower().split()
    # Skip words the tokenizer has never seen instead of raising a KeyError.
    tokens_list = [tokenizer.word_index[word] for word in words
                   if word in tokenizer.word_index]
    return preprocessing.sequence.pad_sequences(
        [tokens_list], maxlen=maxlen_questions, padding='post')
```

```python
# Build the inference models defined above before chatting.
enc_model, dec_model = make_inference_models()

tests = ['You can not move', 'You sound like Data', 'Stupid',
         'you are idiot', 'i am going to die']
for i in range(5):
    states_values = enc_model.predict(str_to_tokens(tests[i]))
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''

    while not stop_condition:
        dec_outputs, h, c = dec_model.predict([empty_target_seq] + states_values)
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = None

        # Look up the word for the sampled index.
        for word, index in tokenizer.word_index.items():
            if sampled_word_index == index:
                decoded_translation += f' {word}'
                sampled_word = word

        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True

        # Feed the sampled token back in as the next decoder input.
        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = sampled_word_index
        states_values = [h, c]

    print(f'Human: {tests[i]}')
    print()
    decoded_translation = decoded_translation.split(' end')[0]
    print(f'Bot: {decoded_translation}')
    print('_' * 100)
```

Output:

```
Human: You can not move

Bot: i can move through a network easily assuming that that the ability to me
____________________________________________________________________________________________________
Human: You sound like Data

Bot: yes i am inspired by commander data's artificial personality
____________________________________________________________________________________________________
Human: Stupid

Bot: bots are a lot of fun of the party
____________________________________________________________________________________________________
Human: you are idiot

Bot: you are a cheat
____________________________________________________________________________________________________
Human: i am going to die

Bot: do you think about
____________________________________________________________________________________________________
```
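Finally, the decoding loop can be folded into a single reusable helper. This is a sketch of the same greedy-decoding logic, not part of the original notebook; the `chat` name and the `index_to_word` lookup are ours:

```python
# Reverse lookup built once instead of scanning word_index on every step.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

def chat(question: str) -> str:
    """Greedily decode an answer for one question (hypothetical helper)."""
    states = enc_model.predict(str_to_tokens(question))
    target = np.zeros((1, 1))
    target[0, 0] = tokenizer.word_index['start']
    decoded = []
    while True:
        out, h, c = dec_model.predict([target] + states)
        idx = int(np.argmax(out[0, -1, :]))
        word = index_to_word.get(idx, '')
        if word == 'end' or len(decoded) >= maxlen_answers:
            break
        decoded.append(word)
        target[0, 0] = idx
        states = [h, c]
    return ' '.join(decoded)

print(chat('You sound like Data'))
```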