Create recommender.py
recommender.py
ADDED
@@ -0,0 +1,265 @@
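"""Content-based movie recommender packaged as an MLflow pyfunc model.

The script builds a per-movie feature matrix (language, genres, cast, writers,
director, TF-IDF of overview/tagline/keywords), ranks movies by cosine
similarity, and logs the model plus a few metrics and plots to MLflow.

The CSVs read by load_data() appear to be the files from the Kaggle
"The Movies Dataset" (an assumption based on the file names); they are
expected under ./datasets/.
"""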
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval
import gc
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
# import seaborn as sns
from collections import Counter

import mlflow

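# Note: the tracking URI below assumes an MLflow tracking server is already
# listening on port 8889, e.g. one started beforehand with something like
# `mlflow server --host 0.0.0.0 --port 8889` (a sketch; adjust host, port and
# backend store to your own setup).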
def init_mlflow():
    mlflow.set_tracking_uri("http://0.0.0.0:8889")
    mlflow.set_experiment("Default")
    mlflow.start_run()
    mlflow.sklearn.autolog()


def load_data():
    credits_df = pd.read_csv('./datasets/credits.csv')
    keywords_df = pd.read_csv('./datasets/keywords.csv')
    links_df = pd.read_csv('./datasets/links_small.csv')
    movies_df = pd.read_csv('./datasets/movies_metadata.csv')
    ratings_df = pd.read_csv('./datasets/ratings_small.csv')
    return credits_df, keywords_df, links_df, movies_df, ratings_df


def draw_adult_movies_pie_chart(movies_df):
    plt.figure(figsize=(8, 4))
    plt.scatter(x=[0.5, 1.5], y=[1, 1], s=15000, color=['#06837f', '#fdc100'])
    plt.xlim(0, 2)
    plt.ylim(0.9, 1.2)

    plt.title('Distribution of Adult and Non Adult Movies', fontsize=18, weight=600, color='#333d29')
    plt.text(0.5, 1, '{}\nMovies'.format(str(len(movies_df[movies_df['adult'] == 'True']))), va='center', ha='center',
             fontsize=18, weight=600, color='white')
    plt.text(1.5, 1, '{}\nMovies'.format(str(len(movies_df[movies_df['adult'] == 'False']))), va='center', ha='center',
             fontsize=18, weight=600, color='white')
    plt.text(0.5, 1.11, 'Adult', va='center', ha='center', fontsize=17, weight=500, color='#1c2541')
    plt.text(1.5, 1.11, 'Non Adult', va='center', ha='center', fontsize=17, weight=500, color='#1c2541')

    plt.axis('off')

    plt.savefig('adult.png')
    mlflow.log_artifact('adult.png')


def draw_genres_pie_chart(df):
    genres_list = []
    for i in df['genres']:
        i = i[1:]
        i = i[:-1]
        genres_list.extend(i.split(', '))

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

    df_plot = pd.DataFrame(Counter(genres_list).most_common(5), columns=['genre', 'total'])
    # ax = sns.barplot(data=df_plot, x='genre', y='total', ax=axes[0],
    #                  palette=['#06837f', '#02cecb', '#b4ffff', '#f8e16c', '#fed811'])
    # ax.set_title('Top 5 Genres in Movies', fontsize=18, weight=600, color='#333d29')
    # sns.despine()

    # Group everything outside the top 5 genres into an "Others" slice
    df_plot_full = pd.DataFrame([Counter(genres_list)]).transpose().sort_values(by=0, ascending=False)
    df_plot.loc[len(df_plot)] = {'genre': 'Others', 'total': df_plot_full[5:].sum()[0]}
    plt.title('Percentage Ratio of Movie Genres', fontsize=18, weight=600, color='#333d29')
    wedges, texts, autotexts = axes[1].pie(x=df_plot['total'], labels=df_plot['genre'], autopct='%.2f%%',
                                           textprops=dict(fontsize=14), explode=[0, 0, 0, 0, 0, 0.1],
                                           colors=['#06837f', '#02cecb', '#b4ffff', '#f8e16c', '#fed811', '#fdc100'])

    for autotext in autotexts:
        autotext.set_color('#1c2541')
        autotext.set_weight('bold')

    axes[1].axis('off')

    plt.savefig('genres.png')
    mlflow.log_artifact('genres.png')


def director(x):
    # Return the director's name from a parsed crew list
    for i in x:
        if i["job"] == "Director":
            return i["name"]
    return ""


def writer_screenplay(x):
    # Collect writer / screenplay / author names from a parsed crew list
    names = []
    for i in x:
        if i["job"] in ("Writer", "Screenplay", "Author"):
            names.append(i["name"])
    return names


def calculate_cosine_similarity(train_df):
    cosine_sim = cosine_similarity(train_df)
    return cosine_sim
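
# For intuition: on rows of 0/1 features, cosine similarity is the number of
# shared features divided by the geometric mean of each row's feature count.
# A tiny illustration (not part of the pipeline): two movies that share one of
# their two features each score cosine_similarity([[1, 1, 0], [1, 0, 1]])[0, 1] ≈ 0.5.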


def clean_data(credits_df, keywords_df, movies_df):
    # draw_adult_movies_pie_chart(movies_df)
    # Cast the id columns to numeric types
    movies_df["id"] = movies_df["id"].apply(pd.to_numeric, errors="ignore")
    keywords_df["id"] = keywords_df["id"].apply(int)
    credits_df["id"] = credits_df["id"].apply(int)

    # Merge movies, keywords and credits on the id column
    df = movies_df.merge(keywords_df, on="id").merge(credits_df, on="id")

    # Clean the merged data of duplicated and null values

    # Count null values in the merged data frame
    df.isnull().sum()

    # Remove duplicated rows with the same title and id
    df.drop_duplicates(subset=["title", "id"], inplace=True)

    # Remove movies with null titles
    df = df[df.title.notnull()]

    # Count movies with vote count < 30
    (df.vote_count < 30).sum()

    # Keep only movies with more than 30 votes
    df = df[df.vote_count > 30]

    # Extract a numeric release year from the release date
    df["release_date"] = pd.to_datetime(df['release_date'])
    df["release_year"] = df["release_date"].dt.year

    df.drop("release_date", axis=1, inplace=True)

    # Remove null values
    df = df[df["release_year"].notnull()]
    df = df[df["runtime"].notnull()]

    # Bin vote_average and release_year into 10 buckets and scale them to [0, 1]
    df["vote_average_bins"] = pd.cut(df["vote_average"].astype(float), 10, labels=range(1, 11))
    scaler = MinMaxScaler()
    df["vote_average_bins"] = df["vote_average_bins"].astype(int)
    df["vote_average_bins"] = scaler.fit_transform(df["vote_average_bins"].values.reshape(-1, 1))

    df["release_year_bins"] = pd.qcut(df["release_year"].astype(float), q=10, labels=range(1, 11))
    scaler = MinMaxScaler()
    df["release_year_bins"] = df["release_year_bins"].astype(int)
    df["release_year_bins"] = scaler.fit_transform(df["release_year_bins"].values.reshape(-1, 1))

    # Set the data frame index to the movie title
    df.set_index("title", inplace=True)

    # One-hot encode the original language
    languages = pd.get_dummies(df["original_language"])

    # Extract genre names from the JSON-like genres column
    df['genres'] = df['genres'].fillna('[]').apply(literal_eval).apply(
        lambda x: [i['name'] for i in x] if isinstance(x, list) else "")
    df["genres"] = df["genres"].astype(str)
    # draw_genres_pie_chart(df)

    # Multi-hot encode genres by vectorizing the stringified genre lists
    cv = CountVectorizer(lowercase=False)
    genres = cv.fit_transform(df["genres"])
    genres_df = pd.DataFrame(genres.todense(), columns=cv.get_feature_names_out())
    genres_df.set_index(df.index, inplace=True)
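    # For example, a movie whose genres were parsed to "['Action', 'Comedy']" ends up
    # with 1s in the Action and Comedy columns of genres_df and 0s elsewhere, so
    # CountVectorizer(lowercase=False) effectively acts as a multi-hot encoder here
    # (lowercase is kept so the column names stay readable genre names).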

    # Combine keywords, tagline and overview into one text field and TF-IDF encode it
    df['keywords'] = df['keywords'].fillna('[]').apply(literal_eval).apply(
        lambda x: [i['name'] for i in x] if isinstance(x, list) else "")
    df["keywords"] = df["keywords"].astype(str)
    df["tagline"].fillna("", inplace=True)
    df["overview"].fillna("", inplace=True)
    df["keywords"].fillna("", inplace=True)
    df["text"] = df["overview"] + df["tagline"] + df["keywords"]

    tfidf = TfidfVectorizer(max_features=5000)
    tfidf_matrix = tfidf.fit_transform(df["text"])
    tfidf_df = pd.DataFrame(tfidf_matrix.todense(), columns=tfidf.get_feature_names_out())
    tfidf_df.set_index(df.index, inplace=True)

    # Multi-hot encode the top-billed cast (first 15 names, spaces stripped)
    df['cast'] = df['cast'].fillna('[]').apply(literal_eval).apply(
        lambda x: [i['name'] for i in x] if isinstance(x, list) else "")
    df["cast"] = df["cast"].apply(lambda x: [c.replace(" ", "") for c in x])
    df["cast"] = df["cast"].apply(lambda x: x[:15])
    df["CC"] = df["cast"].astype(str)
    cv = CountVectorizer(lowercase=False, min_df=4)
    cast = cv.fit_transform(df["CC"])
    cast_df = pd.DataFrame(cast.todense(), columns=cv.get_feature_names_out())
    cast_df.set_index(df.index, inplace=True)

    # One-hot encode the director
    df["dir"] = df["crew"].apply(literal_eval).apply(director)
    directors = pd.get_dummies(df["dir"])

    # Multi-hot encode up to three writers / screenplay authors
    df["writer_screenplay"] = df["crew"].apply(literal_eval).apply(writer_screenplay)
    df["writer_screenplay"] = df["writer_screenplay"].apply(lambda x: [c.replace(" ", "") for c in x])
    df["writer_screenplay"] = df["writer_screenplay"].apply(lambda x: x[:3])
    df["writer_screenplay"] = df["writer_screenplay"].astype(str)
    cv = CountVectorizer(lowercase=False, min_df=2)
    writing = cv.fit_transform(df["writer_screenplay"])
    writing_df = pd.DataFrame(writing.todense(), columns=cv.get_feature_names_out())
    writing_df.set_index(df.index, inplace=True)

    gc.collect()
    # Cast the one-hot blocks to int8 to save memory, but keep the TF-IDF block as
    # float32: an integer cast would truncate its weights (all below 1.0) to zero.
    train_df = pd.concat([languages, genres_df, cast_df, writing_df, directors], axis=1).astype(np.int8)
    train_df = pd.concat([train_df, tfidf_df.astype(np.float32)], axis=1)
    gc.collect()

    return train_df, df


class RecommenderSystem(mlflow.pyfunc.PythonModel):
    def load_context(self, context):
        credits_df, keywords_df, links_df, movies_df, ratings_df = load_data()
        self.train_df, self.df = clean_data(credits_df, keywords_df, movies_df)
        self.cosine_sim = calculate_cosine_similarity(self.train_df)

    def predict(self, context, model_input):
        return self.recommend(model_input[0], self.cosine_sim)

    def recommend(self, title, cosine_sim):
        indices = pd.Series(range(0, len(self.train_df.index)), index=self.train_df.index).drop_duplicates()
        number = 10
        # Get the index of the movie that matches the title
        idx = indices[title]
        # Get the pairwise similarity scores of all movies with that movie
        sim_scores = list(enumerate(cosine_sim[idx]))

        # Sort the movies based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        scores_arr = np.array(sim_scores)
        scores_mean = np.average(scores_arr, axis=0)
        mlflow.log_metric("cosine-total-avg", scores_mean[1])

        # Get the scores of the 10 most similar movies (skipping the movie itself)
        sim_scores = sim_scores[1:number + 1]
        scores_arr = np.array(sim_scores)
        scores_mean = np.average(scores_arr, axis=0)

        mlflow.log_metric("cosine-result-avg", scores_mean[1])
        mlflow.log_metric("cosine-result-max", sim_scores[0][1])
        mlflow.log_metric("cosine-result-min", sim_scores[number - 1][1])
        mlflow.log_param("number-of-results", number)

        # Get the movie indices
        movie_indices = [i[0] for i in sim_scores]

        recommendations = pd.DataFrame({"Movies": self.df.iloc[movie_indices].index.tolist(),
                                        "Id": self.df.iloc[movie_indices].imdb_id.tolist(),
                                        "Similarity": [sim[1] for sim in sim_scores]})
        return recommendations


if __name__ == '__main__':
    mlflow.pyfunc.save_model(path="imdb-recommendation-v2", python_model=RecommenderSystem())
    init_mlflow()
    mlflow.pyfunc.log_model("imdb-recommendation-v2", python_model=RecommenderSystem(),
                            registered_model_name="recommendation-model-v2")
    loaded_model = mlflow.pyfunc.load_model("imdb-recommendation-v2")
    print(loaded_model.predict(["The Dark Knight Rises"]))
    mlflow.end_run()
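
# A possible way to serve the saved model over REST (a sketch, not part of this
# script; the exact `mlflow models serve` flags and /invocations payload schema
# depend on the installed MLflow version, so check the docs for your release):
#
#   mlflow models serve -m imdb-recommendation-v2 -p 5001
#   curl -X POST http://127.0.0.1:5001/invocations \
#        -H 'Content-Type: application/json' \
#        -d '{"inputs": ["The Dark Knight Rises"]}'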