hzonuz committed on
Commit
7637ffd
1 Parent(s): 8d2e260

Create recommender.py

Browse files
Files changed (1) hide show
  1. recommender.py +265 -0
recommender.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ from ast import literal_eval
5
+ import gc
6
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
7
+ from sklearn.preprocessing import MinMaxScaler
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+ # import seaborn as sns
10
+ from collections import Counter
11
+
12
+ import mlflow
13
+
14
+
15
def init_mlflow(tracking_uri="http://0.0.0.0:8889", experiment_name="Default"):
    """Configure MLflow tracking, open a run and enable scikit-learn autologging.

    Args:
        tracking_uri: URI of the MLflow tracking server to log to.
        experiment_name: Name of the experiment the new run is recorded under.

    The run opened here is not closed by this function; the caller is
    responsible for calling ``mlflow.end_run()`` when finished.
    """
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)
    # Same order as before: the run is started first, then autologging is enabled.
    mlflow.start_run()
    mlflow.sklearn.autolog()
20
+
21
+
22
def load_data(data_dir='./datasets'):
    """Read the five movie-dataset CSV files from ``data_dir``.

    Args:
        data_dir: Directory containing credits.csv, keywords.csv,
            links_small.csv, movies_metadata.csv and ratings_small.csv.
            Defaults to './datasets', the location that was previously hard-coded.

    Returns:
        tuple: ``(credits_df, keywords_df, links_df, movies_df, ratings_df)``,
        one DataFrame per file, in that order.
    """
    # Local import keeps this block self-contained without touching the file's import section.
    from pathlib import Path

    base = Path(data_dir)
    file_names = (
        'credits.csv',
        'keywords.csv',
        'links_small.csv',
        'movies_metadata.csv',
        'ratings_small.csv',
    )
    credits_df, keywords_df, links_df, movies_df, ratings_df = (
        pd.read_csv(base / file_name) for file_name in file_names
    )
    return credits_df, keywords_df, links_df, movies_df, ratings_df
29
+
30
+
31
def draw_adult_movies_pie_chart(movies_df):
    """Draw the adult / non-adult movie counts and log the image to MLflow.

    Despite the name, the figure consists of two large scatter markers (one
    per category) with the count written inside each. The image is saved as
    'adult.png' in the working directory and logged as an MLflow artifact.

    Args:
        movies_df: DataFrame with an 'adult' column; its values are compared
            against the strings 'True' and 'False'.
    """
    fig = plt.figure(figsize=(8, 4))
    try:
        plt.scatter(x=[0.5, 1.5], y=[1, 1], s=15000, color=['#06837f', '#fdc100'])
        plt.xlim(0, 2)
        plt.ylim(0.9, 1.2)

        plt.title('Distribution of Adult and Non Adult Movies', fontsize=18, weight=600, color='#333d29')

        # (x position, value of the 'adult' column, caption shown above the marker)
        bubbles = ((0.5, 'True', 'Adult'), (1.5, 'False', 'Non Adult'))
        for x_pos, flag, caption in bubbles:
            count = int((movies_df['adult'] == flag).sum())
            plt.text(x_pos, 1, f'{count}\nMovies', va='center', ha='center',
                     fontsize=18, weight=600, color='white')
            plt.text(x_pos, 1.11, caption, va='center', ha='center',
                     fontsize=17, weight=500, color='#1c2541')

        plt.axis('off')

        plt.savefig('adult.png')
        mlflow.log_artifact('adult.png')
    finally:
        # pyplot keeps figures registered until they are closed; release it
        # so repeated calls do not accumulate open figures.
        plt.close(fig)
49
+
50
+
51
def draw_genres_pie_chart(df):
    """Draw a pie chart of the five most frequent genres plus an 'Others' slice.

    The image is saved as 'genres.png' in the working directory and logged
    as an MLflow artifact.

    Args:
        df: DataFrame whose 'genres' column holds the string form of a list
            of genre names (e.g. "['Drama', 'Comedy']"), as produced by
            ``clean_data``.
    """
    genre_counts = Counter()
    for genres_repr in df['genres']:
        # Drop the surrounding brackets, then split the comma-separated names.
        tokens = genres_repr[1:-1].split(', ')
        # "[]" (no genres) yields a single empty token; it is not a genre.
        genre_counts.update(token for token in tokens if token)

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
    try:
        top_genres = genre_counts.most_common(5)
        df_plot = pd.DataFrame(top_genres, columns=['genre', 'total'])
        # axes[0] is currently left empty: the bar chart below is disabled
        # because the seaborn import is commented out.
        # ax = sns.barplot(data=df_plot, x='genre', y='total', ax=axes[0],
        #                  palette=['#06837f', '#02cecb', '#b4ffff', '#f8e16c', '#fed811'])
        # ax.set_title('Top 5 Genres in Movies', fontsize=18, weight=600, color='#333d29')
        # sns.despine()

        # 'Others' is everything that is not one of the displayed top genres.
        others_total = sum(genre_counts.values()) - sum(total for _, total in top_genres)
        df_plot.loc[len(df_plot)] = {'genre': 'Others', 'total': others_total}

        palette = ['#06837f', '#02cecb', '#b4ffff', '#f8e16c', '#fed811', '#fdc100']
        slice_count = len(df_plot)
        # Only the last slice ('Others') is pulled out; sized to the actual
        # number of slices so fewer than five genres does not break the call.
        explode = [0] * (slice_count - 1) + [0.1]

        axes[1].set_title('Percentage Ratio of Movie Genres', fontsize=18, weight=600, color='#333d29')
        wedges, texts, autotexts = axes[1].pie(x=df_plot['total'], labels=df_plot['genre'], autopct='%.2f%%',
                                               textprops=dict(fontsize=14), explode=explode,
                                               colors=palette[:slice_count])

        for autotext in autotexts:
            autotext.set_color('#1c2541')
            autotext.set_weight('bold')

        axes[1].axis('off')

        plt.savefig('genres.png')
        mlflow.log_artifact('genres.png')
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
81
+
82
+
83
def director(x):
    """Return the name of the first crew entry whose job is "Director".

    Args:
        x: Iterable of crew dicts, each providing "job" and "name" keys.

    Returns:
        The matching entry's "name", or "" when no entry is a director.
    """
    return next((member["name"] for member in x if member["job"] == "Director"), "")
88
+
89
+
90
def writer_screenplay(x):
    """Collect the names of crew entries credited with a writing job.

    Args:
        x: Iterable of crew dicts, each providing "job" and "name" keys.

    Returns:
        List of names, in input order, of the entries whose job is
        "Writer", "Screenplay" or "Author".
    """
    writing_jobs = ("Writer", "Screenplay", "Author")
    return [member["name"] for member in x if member["job"] in writing_jobs]
97
+
98
+
99
def calculate_cosine_similarity(train_df):
    """Return scikit-learn's ``cosine_similarity`` result for ``train_df``.

    Args:
        train_df: Feature matrix passed straight to ``cosine_similarity``.
    """
    return cosine_similarity(train_df)
102
+
103
+
104
def _extract_names(column):
    """Parse a column of stringified lists of dicts, keeping each dict's 'name'.

    Missing values are treated as empty lists; a parsed value that is not a
    list becomes "".
    """
    parsed = column.fillna('[]').apply(literal_eval)
    return parsed.apply(
        lambda items: [item['name'] for item in items] if isinstance(items, list) else "")


def _vectorized_frame(vectorizer, texts):
    """Fit ``vectorizer`` on ``texts`` and return the dense matrix as a DataFrame indexed like ``texts``."""
    matrix = vectorizer.fit_transform(texts)
    return pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out(), index=texts.index)


def clean_data(credits_df, keywords_df, movies_df):
    """Merge the raw frames, clean them and build the feature matrix.

    Args:
        credits_df: Frame with 'id', 'cast' and 'crew' columns.
        keywords_df: Frame with 'id' and 'keywords' columns.
        movies_df: Movie metadata frame; the columns read here are 'id',
            'title', 'vote_count', 'vote_average', 'release_date', 'runtime',
            'original_language', 'genres', 'tagline' and 'overview'.

    Returns:
        tuple: ``(train_df, df)``. ``df`` is the cleaned, merged frame indexed
        by title. ``train_df`` has the same index and row order, holding the
        one-hot language, genre, cast, writer and director columns followed
        by the TF-IDF text columns, all as float32.

    The input frames are not modified.
    """
    # draw_adult_movies_pie_chart(movies_df)
    # Cast id columns to int. Movie ids that are not numeric are dropped:
    # they could never match an integer id in the other two frames anyway.
    movies_df = movies_df.assign(id=pd.to_numeric(movies_df["id"], errors="coerce")).dropna(subset=["id"])
    movies_df = movies_df.assign(id=movies_df["id"].astype("int64"))
    keywords_df = keywords_df.assign(id=keywords_df["id"].astype("int64"))
    credits_df = credits_df.assign(id=credits_df["id"].astype("int64"))

    # Merge movies, keywords, credits based on id column
    df = movies_df.merge(keywords_df, on="id").merge(credits_df, on="id")

    # Remove rows duplicated on (title, id) and rows without a title
    df = df.drop_duplicates(subset=["title", "id"])
    df = df[df["title"].notnull()]

    # Keep only movies with more than 30 votes
    df = df[df["vote_count"] > 30].copy()

    # Replace the release date by its year; unparseable dates become null
    # and are removed together with the other null rows below.
    df["release_year"] = pd.to_datetime(df["release_date"], errors="coerce").dt.year
    df = df.drop("release_date", axis=1)

    # Remove rows with a null release year or runtime
    df = df[df["release_year"].notnull() & df["runtime"].notnull()].copy()

    # Bin vote_average (equal-width) and release_year (deciles) into 10
    # buckets labelled 1..10, then scale the labels to [0, 1].
    # NOTE: these two columns are stored on ``df`` only; they are not part
    # of ``train_df``.
    vote_bins = pd.cut(df["vote_average"].astype(float), 10, labels=range(1, 11)).astype(int)
    df["vote_average_bins"] = MinMaxScaler().fit_transform(vote_bins.values.reshape(-1, 1)).ravel()

    year_bins = pd.qcut(df["release_year"].astype(float), q=10, labels=range(1, 11)).astype(int)
    df["release_year_bins"] = MinMaxScaler().fit_transform(year_bins.values.reshape(-1, 1)).ravel()

    # Set data frame primary index to title
    df = df.set_index("title")

    # Make languages one-hotted
    languages = pd.get_dummies(df["original_language"])

    # Extract genre names from the JSON-like column and keep them as text
    df["genres"] = _extract_names(df["genres"]).astype(str)
    # draw_genres_pie_chart(df)

    # Make genres one-hotted
    genres_df = _vectorized_frame(CountVectorizer(lowercase=False), df["genres"])

    # Build one text field from overview, tagline and keywords. The fields
    # are joined with spaces so the last word of one field is not fused with
    # the first word of the next.
    df["keywords"] = _extract_names(df["keywords"]).astype(str)
    df["tagline"] = df["tagline"].fillna("")
    df["overview"] = df["overview"].fillna("")
    df["text"] = df["overview"] + " " + df["tagline"] + " " + df["keywords"]

    tfidf_df = _vectorized_frame(TfidfVectorizer(max_features=5000), df["text"])

    # Make cast one-hotted: first 15 names per movie, spaces removed
    # (presumably so each person ends up as a single token).
    df["cast"] = _extract_names(df["cast"]).apply(
        lambda names: [name.replace(" ", "") for name in names][:15])
    df["CC"] = df["cast"].astype(str)
    cast_df = _vectorized_frame(CountVectorizer(lowercase=False, min_df=4), df["CC"])

    # Parse the crew column once and reuse it for directors and writers
    crew = df["crew"].apply(literal_eval)

    df["dir"] = crew.apply(director)
    directors = pd.get_dummies(df["dir"])

    # First 3 writing credits per movie, spaces removed, kept as text
    df["writer_screenplay"] = crew.apply(writer_screenplay).apply(
        lambda names: [name.replace(" ", "") for name in names][:3]).astype(str)
    writing_df = _vectorized_frame(CountVectorizer(lowercase=False, min_df=2), df["writer_screenplay"])

    gc.collect()
    train_df = pd.concat([languages, genres_df, cast_df, writing_df, directors, tfidf_df], axis=1)
    # float32, not an integer type: TF-IDF weights are fractional and an
    # integer cast would truncate almost all of them to 0.
    train_df = train_df.astype(np.float32)
    gc.collect()

    return train_df, df
213
+
214
+
215
class RecommenderSystem(mlflow.pyfunc.PythonModel):
    """Content-based movie recommender packaged as an MLflow pyfunc model."""

    def load_context(self, context):
        """Load the datasets, build the feature matrix and precompute similarities.

        Sets ``self.train_df``, ``self.df`` and ``self.cosine_sim``.
        """
        credits_df, keywords_df, _links_df, movies_df, _ratings_df = load_data()
        self.train_df, self.df = clean_data(credits_df, keywords_df, movies_df)
        self.cosine_sim = calculate_cosine_similarity(self.train_df)

    def predict(self, context, model_input):
        """Return recommendations for the first title in ``model_input``."""
        return self.recommend(model_input[0], self.cosine_sim)

    def recommend(self, title, cosine_sim, number=10):
        """Return the movies most similar to ``title`` and log summary metrics.

        Args:
            title: Movie title to look up in ``self.train_df``'s index. When
                several rows share the title, the first one is used.
            cosine_sim: Square similarity matrix whose rows and columns follow
                the row order of ``self.train_df``.
            number: Maximum number of recommendations to return.

        Returns:
            DataFrame with 'Movies' (titles), 'Id' (imdb ids) and 'Similarity'
            columns, ordered from most to least similar. The queried movie
            itself is never included.

        Raises:
            KeyError: If ``title`` is not present in the index.
        """
        positions = pd.Series(range(len(self.train_df.index)), index=self.train_df.index)
        # Titles are not unique; keep the first row for each title so the
        # lookup below always yields a single position.
        positions = positions[~positions.index.duplicated(keep="first")]
        if title not in positions.index:
            raise KeyError(title)
        idx = int(positions.loc[title])

        # Similarity of the requested movie to every movie (itself included)
        scores = np.asarray(cosine_sim[idx], dtype=float)
        mlflow.log_metric("cosine-total-avg", float(scores.mean()))

        # Descending order; the stable sort keeps ties in row order. The
        # queried movie is removed by position rather than by assuming it
        # sorts first, since another movie can tie with it.
        order = np.argsort(-scores, kind="stable")
        order = order[order != idx][:number]
        top_scores = scores[order]

        if len(order) > 0:
            mlflow.log_metric("cosine-result-avg", float(top_scores.mean()))
            mlflow.log_metric("cosine-result-max", float(top_scores[0]))
            mlflow.log_metric("cosine-result-min", float(top_scores[-1]))
        # NOTE(review): MLflow may reject logging a different value for the
        # same param within one run; confirm before varying ``number``.
        mlflow.log_param("number-of-results", number)

        matches = self.df.iloc[order]
        recommendations = pd.DataFrame({"Movies": matches.index.tolist(),
                                        "Id": matches.imdb_id.tolist(),
                                        "Similarity": top_scores.tolist()})
        return recommendations
256
+
257
+
258
def main():
    """Save the model locally, log and register it, reload it and print one prediction."""
    model_path = "imdb-recommendation-v2"

    # The local copy is written before the tracking run is opened.
    mlflow.pyfunc.save_model(path=model_path, python_model=RecommenderSystem())

    init_mlflow()
    mlflow.pyfunc.log_model(
        model_path,
        python_model=RecommenderSystem(),
        registered_model_name="recommendation-model-v2",
    )

    # Reload from the local path and run a smoke-test prediction.
    loaded_model = mlflow.pyfunc.load_model(model_path)
    print(loaded_model.predict(["The Dark Knight Rises"]))

    mlflow.end_run()


if __name__ == '__main__':
    main()
265
+