Deep MF#

DeepMF 는 Collaborative Filter의 Model-based CF 방식중 Matrix Factorization 문제에 대한 해를 찾기 위한 방법 중 많이 사용하는 SVD(Singular Value Decomposition)방법의 데이터 희소(Sparse)성 때문에 유저 및 아이템의 수가 굉장히 많은 경우는 메모리 문제가 생긴다는 단점을 해결하기 위하여 Embedding 방식의 차원 축소로 해결한 방식 입니다.

Collaborative Filter

  • Memory-based CF: User-based CF, Item-based CF

  • Model-based CF: SVD, DeepMF

DeepMF

%matplotlib inline
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

import os, math, random
import numpy as np
import pandas as pd
from sklearn import datasets, preprocessing, model_selection, metrics, dummy
import surprise as sp
from keras import models, layers, optimizers, utils

seed = 0
random.seed(seed)
np.random.seed(seed)

# 데이터
sp_data = sp.Dataset.load_builtin('ml-100k')
df_data = pd.DataFrame(sp_data.raw_ratings, columns=["user_id", "item_id", "rating", "timestamp"])
print("df_data.shape={}".format(df_data.shape))
print(df_data.dtypes)
print(df_data.head())
print(df_data.describe(include='all'))

# user_id = df_data.user_id.astype('category').cat.codes.values
# item_id = df_data.item_id.astype('category').cat.codes.values
# print(user_id.shape)

np_data_xs = df_data[["user_id", "item_id"]].astype('category').values
np_data_ys = df_data["rating"].values
print("np_data_xs.shape={}".format(np_data_xs.shape))
print("np_data_ys.shape={}".format(np_data_ys.shape))

n_users = len(np.unique(np_data_xs[:, 0]))
n_items = len(np.unique(np_data_xs[:, 1]))
n_factors = 3
print("n_users={}, n_items={}, n_factors={}".format(n_users, n_items, n_factors))

np_train_xs, np_test_xs, np_train_ys, np_test_ys = model_selection.train_test_split(
    np_data_xs, np_data_ys, 
    test_size=0.3, shuffle=True, random_state=2)
print("train shape: np_train_xs={}, np_train_ys={}".format(np_train_xs.shape, np_train_ys.shape))
print("test shape: np_test_xs={}, np_test_ys={}".format(np_test_xs.shape, np_test_ys.shape))

np_train_x_user_ids = np_train_xs[:, 0].reshape(-1)
np_train_x_item_ids = np_train_xs[:, 1].reshape(-1)
print("np_train_x_user_ids.shape={}".format(np_train_x_user_ids.shape))
print("np_train_x_item_ids.shape={}".format(np_train_x_item_ids.shape))

np_test_x_user_ids = np_test_xs[:, 0].reshape(-1)
np_test_x_item_ids = np_test_xs[:, 1].reshape(-1)
print("np_train_x_user_ids.shape={}".format(np_test_x_user_ids.shape))
print("np_train_x_item_ids.shape={}".format(np_test_x_item_ids.shape))

# 모델
class DeepMF(models.Model):
    def __init__(self, n_users, n_items, n_factors):
        # multi input/output 을 위해서는 name 지정이 필요
        user_input = layers.Input(shape=[1], name='user_input')
        user_embed = layers.Embedding(n_users+1, n_factors, name='user_embed')(user_input)
        user_vec = layers.Flatten(name='user_vec')(user_embed)
        
        item_input = layers.Input(shape=[1], name='item_input')
        item_embed = layers.Embedding(n_items+1, n_factors, name='item_embed')(item_input)
        item_vec = layers.Flatten(name='item_vec')(item_embed)
        
        dot_vec = layers.dot([user_vec, item_vec], axes=1, name='dot_vec')
        
        super().__init__(inputs=[user_input, item_input], outputs=[dot_vec])
        sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
        self.compile(optimizer=sgd, loss='mean_squared_error', metrics=['mse', 'mae'])


models = [
    dummy.DummyRegressor(strategy='mean'),  
    DeepMF(n_users = n_users, n_items = n_items, n_factors=3)    
]

for model in models:
    # 학습
    print("\nmodel")
    model_type = str(type(model))
    if model_type.find("DummyRegressor") > -1:
        model.fit(np_train_xs, np_train_ys)
        # 평가
        np_pred_ys = model.predict(np_test_xs)        
    else:
        model.summary()

        history = model.fit([np_train_x_user_ids, np_train_x_item_ids], np_train_ys, epochs=50, verbose=0)
        plt.plot(history.history['loss'], label='loss')
        plt.legend()
        plt.show()    

        # 평가
        np_pred_ys = model.predict([np_test_x_user_ids, np_test_x_item_ids])

    rmse = math.sqrt(metrics.mean_squared_error(np_test_ys, np_pred_ys))
    print("Test RMSE={}".format(rmse))        
Using TensorFlow backend.
df_data.shape=(100000, 4)
user_id       object
item_id       object
rating       float64
timestamp     object
dtype: object
  user_id item_id  rating  timestamp
0     196     242     3.0  881250949
1     186     302     3.0  891717742
2      22     377     1.0  878887116
3     244      51     2.0  880606923
4     166     346     1.0  886397596
       user_id item_id         rating  timestamp
count   100000  100000  100000.000000     100000
unique     943    1682            NaN      49282
top        405      50            NaN  891033606
freq       737     583            NaN         12
mean       NaN     NaN       3.529860        NaN
std        NaN     NaN       1.125674        NaN
min        NaN     NaN       1.000000        NaN
25%        NaN     NaN       3.000000        NaN
50%        NaN     NaN       4.000000        NaN
75%        NaN     NaN       4.000000        NaN
max        NaN     NaN       5.000000        NaN
np_data_xs.shape=(100000, 2)
np_data_ys.shape=(100000,)
n_users=943, n_items=1682, n_factors=3
train shape: np_train_xs=(70000, 2), np_train_ys=(70000,)
test shape: np_test_xs=(30000, 2), np_test_ys=(30000,)
np_train_x_user_ids.shape=(70000,)
np_train_x_item_ids.shape=(70000,)
np_train_x_user_ids.shape=(30000,)
np_train_x_item_ids.shape=(30000,)

model
Test RMSE=1.1220312873819274

model
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
user_embed (Embedding)          (None, 1, 3)         2832        user_input[0][0]                 
__________________________________________________________________________________________________
item_embed (Embedding)          (None, 1, 3)         5049        item_input[0][0]                 
__________________________________________________________________________________________________
user_vec (Flatten)              (None, 3)            0           user_embed[0][0]                 
__________________________________________________________________________________________________
item_vec (Flatten)              (None, 3)            0           item_embed[0][0]                 
__________________________________________________________________________________________________
dot_vec (Dot)                   (None, 1)            0           user_vec[0][0]                   
                                                                 item_vec[0][0]                   
==================================================================================================
Total params: 7,881
Trainable params: 7,881
Non-trainable params: 0
__________________________________________________________________________________________________
../../_images/recommendation_collaborative_filtering_keras_1_2.png
Test RMSE=0.9425726584024547