SGD (Stochastic Gradient Descent) Classification
Stochastic Gradient Descent (SGD) is a very efficient approach to optimization problems with convex loss functions. SGD also performs well on large-scale, sparse data (for example, more than 10^5 training samples with more than 10^5 features).
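One reason SGD scales to data of this size is that the model can be trained incrementally. Below is a minimal sketch (not part of the original notebook; the dataset and batch size are illustrative assumptions) that streams the data through partial_fit in mini-batches:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier

X, y = make_classification(n_samples=10000, n_features=20, random_state=0)
classes = np.unique(y)  # all class labels must be passed to the first partial_fit call

clf = SGDClassifier(random_state=0)
for start in range(0, len(X), 1000):  # stream the data in chunks of 1000
    X_b, y_b = X[start:start + 1000], y[start:start + 1000]
    clf.partial_fit(X_b, y_b, classes=classes)
print(f"train accuracy={clf.score(X, y):.3f}")

Because each update touches only one mini-batch, the full data set never has to fit in memory at once.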
Advantages:
Efficient and easy to implement
Disadvantages:
Requires hyperparameter tuning for effective learning
Sensitive to feature scaling (see the sketch after this list)
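To mitigate the feature-scaling sensitivity noted above, a common pattern is to standardize the inputs before fitting. A hedged sketch (the dataset and parameters here are illustrative, not from the original run) using StandardScaler in a Pipeline:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier

X, y = make_classification(n_samples=1000, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# StandardScaler gives each feature zero mean and unit variance,
# which typically helps SGD converge; fitting it inside a Pipeline
# keeps the test data out of the scaling statistics.
clf = make_pipeline(StandardScaler(), SGDClassifier(random_state=0))
clf.fit(X_train, y_train)
print(f"test accuracy={clf.score(X_test, y_test):.3f}")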
The main parameters to look at for SGD are loss, penalty, and l1_ratio.
loss
loss="hinge": (soft-margin) linear Support Vector Machine
loss="modified_huber": smoothed hinge loss
loss="log": logistic regression
Other options: "squared_hinge", "perceptron", or a regression loss ("squared_loss", "huber", "epsilon_insensitive", or "squared_epsilon_insensitive")
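The loss choice also changes what the model exposes. For instance, loss="log" supports probability estimates via predict_proba, while the default loss="hinge" does not. A small sketch (illustrative data; note that loss="log" was renamed "log_loss" in scikit-learn >= 1.1):

from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier

X, y = make_classification(n_samples=1000, random_state=0)

clf = SGDClassifier(loss="log", random_state=0).fit(X, y)
print(clf.predict_proba(X[:3]))  # per-class probability estimates

svm = SGDClassifier(loss="hinge", random_state=0).fit(X, y)
print(hasattr(svm, "predict_proba"))  # False: hinge loss does not support probabilities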
penalty
penalty="l2": L2 norm penalty on coef_.
penalty="l1": L1 norm penalty on coef_.
penalty="elasticnet": Convex combination of L2 and L1; (1 - l1_ratio) * L2 + l1_ratio * L1.
l1_ratio
The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. Defaults to 0.15.
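Putting penalty and l1_ratio together: with penalty="elasticnet", raising l1_ratio increases the L1 share of the regularizer, which tends to push more coefficients exactly to zero. A hedged sketch (alpha and the dataset are illustrative assumptions):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier

X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=3, random_state=0)

for l1_ratio in (0.0, 0.15, 1.0):
    clf = SGDClassifier(penalty="elasticnet", l1_ratio=l1_ratio,
                        alpha=0.01, random_state=0).fit(X, y)
    # count how many coefficients were driven exactly to zero
    print(f"l1_ratio={l1_ratio}: zero coefficients={np.sum(clf.coef_ == 0)}")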
# Suppress warning messages
import warnings
warnings.filterwarnings(action='ignore')
%matplotlib inline
import matplotlib.pyplot as plt
import IPython
import platform, sys
rseed = 22
import random
random.seed(rseed)
import numpy as np
np.random.seed(rseed)
np.set_printoptions(precision=3)
np.set_printoptions(formatter={'float_kind': "{:.3f}".format})
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.options.display.float_format = '{:,.5f}'.format
import sklearn
print(f"python ver={sys.version}")
print(f"python platform={platform.architecture()}")
print(f"pandas ver={pd.__version__}")
print(f"numpy ver={np.__version__}")
print(f"sklearn ver={sklearn.__version__}")
python ver=3.8.9 (default, Jun 12 2021, 23:47:44)
[Clang 12.0.5 (clang-1205.0.22.9)]
python platform=('64bit', '')
pandas ver=1.2.4
numpy ver=1.19.5
sklearn ver=0.24.2
from sklearn import datasets, model_selection, linear_model, metrics
# Data: synthetic multi-class classification set
n_samples = 100000
xs, ys = datasets.make_classification(
    n_samples=n_samples,    # number of samples
    n_features=10,          # number of X features
    n_informative=3,        # number of informative features
    n_classes=3,            # number of Y classes
    random_state=rseed)     # seed for random number generation
print(f"data shape: xs={xs.shape}, ys={ys.shape}")
train_xs, test_xs, train_ys, test_ys = model_selection.train_test_split(
    xs, ys, test_size=0.3, shuffle=True, random_state=2)
print(f"train shape: train_xs={train_xs.shape}, train_ys={train_ys.shape}")
print(f"test shape: test_xs={test_xs.shape}, test_ys={test_ys.shape}")
# Models
models = [
    linear_model.SGDClassifier()
]

for model in models:
    # Train
    print(f"model={model}")
    model.fit(train_xs, train_ys)

    # Predict on the test data
    pred_ys = model.predict(test_xs)

    # Coefficients and intercepts of the fitted linear model
    print(f"coefficient={model.coef_}")
    print(f"intercept={model.intercept_}")

    # Evaluation: accuracy on the test data
    acc = metrics.accuracy_score(test_ys, pred_ys)
    print(f"acc={acc:.5f}")
    cr = metrics.classification_report(test_ys, pred_ys)
    print(f"classification_report\n{cr}")
data shape: xs=(100000, 10), ys=(100000,)
train shape: train_xs=(70000, 10), train_ys=(70000,)
test shape: test_xs=(30000, 10), test_ys=(30000,)
model=SGDClassifier()
coefficient=[[0.012 0.403 -0.444 0.501 0.024 0.863 0.017 0.452 0.000 0.013]
[0.000 -0.144 0.019 0.034 0.014 -0.054 -0.015 -0.145 -0.090 -0.029]
[0.105 0.041 0.318 -0.604 0.057 -0.914 -0.099 -0.125 0.060 0.087]]
intercept=[-0.406 -0.970 -1.119]
acc=0.69253
classification_report
              precision    recall  f1-score   support

           0       0.64      0.93      0.75      9939
           1       0.75      0.35      0.48     10041
           2       0.74      0.80      0.77     10020

    accuracy                           0.69     30000
   macro avg       0.71      0.69      0.67     30000
weighted avg       0.71      0.69      0.67     30000
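Since SGD needs hyperparameter tuning to perform well, a grid search over alpha, loss, and penalty is a natural next step. A minimal sketch reusing train_xs/train_ys from above (the grid values are illustrative assumptions, and this run is not part of the original notebook):

param_grid = {
    "alpha": [1e-4, 1e-3, 1e-2],   # regularization strength
    "loss": ["hinge", "log"],      # "log" is "log_loss" in scikit-learn >= 1.1
    "penalty": ["l2", "elasticnet"],
}
search = model_selection.GridSearchCV(
    linear_model.SGDClassifier(random_state=rseed),
    param_grid, cv=3, n_jobs=-1)
search.fit(train_xs, train_ys)
print(f"best params={search.best_params_}")
print(f"best cv accuracy={search.best_score_:.5f}")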