Model Persistence

머신러닝 모델을 학습시킨 후, 모델 재사용을 위해 저장하고 로드하는 방법을 알아봅니다.

# Silence every warning category for the rest of the session so that
# library warnings do not clutter the cell outputs.
import warnings
warnings.simplefilter("ignore")

%matplotlib inline
import matplotlib.pyplot as plt
import IPython

import sys

import random

import numpy as np

# One seed shared by the stdlib and NumPy global random number generators.
rseed = 22
random.seed(rseed)
np.random.seed(rseed)

# Print NumPy values with three decimals; float types go through the
# explicit fixed-point formatter, so they always show exactly three digits.
np.set_printoptions(precision=3, formatter={'float_kind': "{:.3f}".format})

import pandas as pd
# Remove pandas' display limits (rows, columns, column width) by setting
# each of them to None.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
# Floats are shown with thousands separators and five decimal places.
pd.set_option('display.float_format', '{:,.5f}'.format)

import sklearn

# Report the interpreter and library versions this notebook runs against,
# one "<name> ver=<version>" line per component.
print(
    "\n".join(
        f"{name} ver={version}"
        for name, version in (
            ("python", sys.version),
            ("pandas", pd.__version__),
            ("numpy", np.__version__),
            ("sklearn", sklearn.__version__),
        )
    )
)
python ver=3.8.9 (default, Jun 12 2021, 23:47:44) 
[Clang 12.0.5 (clang-1205.0.22.9)]
pandas ver=1.2.4
numpy ver=1.19.5
sklearn ver=0.24.2
from sklearn import datasets, model_selection, linear_model, metrics
import os, joblib

def _evaluate(estimator, features, labels):
    """Predict on ``features``, print the classification report, and
    return ``(predictions, report)``."""
    predictions = estimator.predict(features)
    report = metrics.classification_report(labels, predictions)
    print(f"classification_report\n{report}")
    return predictions, report


# Data: a synthetic 3-class problem with 10 features, 3 of them informative.
n_samples = 100000
xs, ys = datasets.make_classification(
    n_samples=n_samples,
    n_features=10,
    n_informative=3,
    n_classes=3,
    random_state=rseed,  # seed for the data generator
)
print(f"data shape: xs={xs.shape}, ys={ys.shape}")

# Shuffled split that holds out 30% of the samples for testing.
train_xs, test_xs, train_ys, test_ys = model_selection.train_test_split(
    xs,
    ys,
    test_size=0.3,
    shuffle=True,
    random_state=2,
)
print(f"train shape: train_xs={train_xs.shape}, train_ys={train_ys.shape}")
print(f"test shape: test_xs={test_xs.shape}, test_ys={test_ys.shape}")

# Model: multinomial logistic regression fitted with the SAG solver.
model = linear_model.LogisticRegression(multi_class='multinomial', solver='sag')

# Training
print(f"model={model}")
model.fit(train_xs, train_ys)

# Evaluation of the freshly trained, in-memory model.
pred_ys, cr = _evaluate(model, test_xs, test_ys)

# Save the model to disk, then rebind `model` to the copy read back from
# the file, so everything below uses the deserialized estimator.
# NOTE: joblib.load unpickles the file; only load files from trusted sources.
path_model = "/tmp/model.joblib"
joblib.dump(model, path_model)
model = joblib.load(path_model)

# Re-evaluation with the reloaded model.
pred_ys, cr = _evaluate(model, test_xs, test_ys)
data shape: xs=(100000, 10), ys=(100000,)
train shape: train_xs=(70000, 10), train_ys=(70000,)
test shape: test_xs=(30000, 10), test_ys=(30000,)
model=LogisticRegression(multi_class='multinomial', solver='sag')
classification_report
              precision    recall  f1-score   support

           0       0.69      0.81      0.74      9939
           1       0.66      0.58      0.62     10041
           2       0.81      0.77      0.79     10020

    accuracy                           0.72     30000
   macro avg       0.72      0.72      0.72     30000
weighted avg       0.72      0.72      0.72     30000

classification_report
              precision    recall  f1-score   support

           0       0.69      0.81      0.74      9939
           1       0.66      0.58      0.62     10041
           2       0.81      0.77      0.79     10020

    accuracy                           0.72     30000
   macro avg       0.72      0.72      0.72     30000
weighted avg       0.72      0.72      0.72     30000