NeuralNetMLP

import numpy as np
import sys


class NeuralNetMLP(object):
    """Feedforward neural network / multilayer perceptron classifier."""

    def __init__(self, n_hidden=30, l2=0., epochs=100, eta=0.001,
                 shuffle=True, minibatch_size=1, seed=None):
        self.random = np.random.RandomState(seed)
        self.n_hidden = n_hidden
        self.l2 = l2
        self.epochs = epochs
        self.eta = eta
        self.shuffle = shuffle
        self.minibatch_size = minibatch_size

    def _onehot(self, y, n_classes):
        """Encode integer labels into a one-hot matrix of shape
        [n_examples, n_classes]."""
        onehot = np.zeros((n_classes, y.shape[0]))
        for idx, val in enumerate(y.astype(int)):
            onehot[val, idx] = 1.
        return onehot.T

    def _sigmoid(self, z):
        """Logistic sigmoid; clipping z avoids overflow in np.exp."""
        return 1. / (1. + np.exp(-np.clip(z, -250, 250)))

    def _forward(self, X):
        """Forward pass: input -> hidden -> output."""
        z_h = np.dot(X, self.w_h) + self.b_h
        a_h = self._sigmoid(z_h)
        z_out = np.dot(a_h, self.w_out) + self.b_out
        a_out = self._sigmoid(z_out)
        return z_h, a_h, z_out, a_out

    def _compute_cost(self, y_enc, output):
        """L2-regularized logistic cost."""
        L2_term = (self.l2 * (np.sum(self.w_h ** 2.) +
                              np.sum(self.w_out ** 2.)))
        term1 = -y_enc * (np.log(output))
        term2 = (1. - y_enc) * np.log(1. - output)
        cost = np.sum(term1 - term2) + L2_term
        return cost

    def predict(self, X):
        """Predict class labels; argmax over z_out gives the same result
        as argmax over a_out because the sigmoid is monotonic."""
        z_h, a_h, z_out, a_out = self._forward(X)
        y_pred = np.argmax(z_out, axis=1)
        return y_pred

    def fit(self, X_train, y_train, X_valid, y_valid):
        n_output = np.unique(y_train).shape[0]
        n_features = X_train.shape[1]

        # Initialize weights: small random values, zero biases
        self.b_h = np.zeros(self.n_hidden)
        self.w_h = self.random.normal(loc=0.0, scale=0.1,
                                      size=(n_features, self.n_hidden))
        self.b_out = np.zeros(n_output)
        self.w_out = self.random.normal(loc=0.0, scale=0.1,
                                        size=(self.n_hidden, n_output))

        epoch_strlen = len(str(self.epochs))
        self.eval_ = {'cost': [], 'train_acc': [], 'valid_acc': []}
        y_train_enc = self._onehot(y_train, n_output)

        for i in range(self.epochs):
            indices = np.arange(X_train.shape[0])
            if self.shuffle:
                self.random.shuffle(indices)

            for start_idx in range(0, indices.shape[0] -
                                   self.minibatch_size + 1,
                                   self.minibatch_size):
                batch_idx = indices[start_idx:start_idx +
                                    self.minibatch_size]

                # Forward propagation
                z_h, a_h, z_out, a_out = self._forward(X_train[batch_idx])

                # Backpropagation
                delta_out = a_out - y_train_enc[batch_idx]
                sigmoid_derivative_h = a_h * (1. - a_h)
                delta_h = (np.dot(delta_out, self.w_out.T) *
                           sigmoid_derivative_h)

                grad_w_h = np.dot(X_train[batch_idx].T, delta_h)
                grad_b_h = np.sum(delta_h, axis=0)
                grad_w_out = np.dot(a_h.T, delta_out)
                grad_b_out = np.sum(delta_out, axis=0)

                # Regularize and update weights (biases are not regularized)
                delta_w_h = (grad_w_h + self.l2 * self.w_h)
                delta_b_h = grad_b_h
                self.w_h -= self.eta * delta_w_h
                self.b_h -= self.eta * delta_b_h

                delta_w_out = (grad_w_out + self.l2 * self.w_out)
                delta_b_out = grad_b_out
                self.w_out -= self.eta * delta_w_out
                self.b_out -= self.eta * delta_b_out

            # Evaluate after each epoch
            z_h, a_h, z_out, a_out = self._forward(X_train)
            cost = self._compute_cost(y_enc=y_train_enc, output=a_out)

            y_train_pred = self.predict(X_train)
            y_valid_pred = self.predict(X_valid)
            train_acc = (np.sum(y_train == y_train_pred).astype(float) /
                         X_train.shape[0])
            valid_acc = (np.sum(y_valid == y_valid_pred).astype(float) /
                         X_valid.shape[0])

            sys.stderr.write('\r%0*d/%d | Cost: %.2f '
                             '| Train/Valid Acc.: %.2f%%/%.2f%% ' %
                             (epoch_strlen, i + 1, self.epochs, cost,
                              train_acc * 100, valid_acc * 100))
            sys.stderr.flush()

            self.eval_['cost'].append(cost)
            self.eval_['train_acc'].append(train_acc)
            self.eval_['valid_acc'].append(valid_acc)

        return self
Reference: https://wikidocs.net/37406 (backpropagation)
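In brief, fit() implements the following backpropagation equations (this is a restatement of the code above in math notation, with \lambda standing for l2 and \odot for elementwise multiplication):

\delta^{(out)} = a^{(out)} - y_{\mathrm{enc}}, \qquad \delta^{(h)} = \delta^{(out)} \, (W^{(out)})^{\top} \odot a^{(h)} \odot \bigl(1 - a^{(h)}\bigr)

\nabla_{W^{(h)}} J = X^{\top} \delta^{(h)} + \lambda W^{(h)}, \qquad \nabla_{W^{(out)}} J = (a^{(h)})^{\top} \delta^{(out)} + \lambda W^{(out)}

Each weight matrix is then updated as W := W - \eta \nabla_{W} J; the bias gradients are the column sums of the corresponding deltas and are not regularized.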
Training the neural network
nn = NeuralNetMLP(n_hidden=100, l2=0.01, epochs=200, eta=0.0005,
                  minibatch_size=100, shuffle=True, seed=1)
nn.fit(X_train=X_train[:55000], y_train=y_train[:55000],
       X_valid=X_train[55000:], y_valid=y_train[55000:])

200/200 | Cost: 5065.78 | Train/Valid Acc.: 99.28%/97.98% 
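(The training call assumes X_train and y_train were prepared in an earlier section. For a self-contained run, MNIST could be loaded roughly as in the following sketch; fetch_openml and the [-1, 1] scaling are assumptions of the sketch, not part of this section.)

from sklearn.datasets import fetch_openml

# Illustrative MNIST loading; the original text prepares these
# arrays in an earlier section.
X, y = fetch_openml('mnist_784', return_X_y=True, as_frame=False)
y = y.astype(int)
X = ((X / 255.) - .5) * 2   # scale pixel values to [-1, 1]
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]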

Graph
import matplotlib.pyplot as plt
plt.plot(range(nn.epochs), nn.eval_['cost'])
plt.ylabel('Cost')
plt.xlabel('Epochs')
plt.show()

The cost drops sharply over the first 100 epochs; in the remaining epochs it converges only slowly.
plt.plot(range(nn.epochs), nn.eval_['train_acc'], label='Training')
plt.plot(range(nn.epochs), nn.eval_['valid_acc'], label='Validation', linestyle='--')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend()
plt.show()
The training and validation accuracies match up to about epoch 50; after that, the network begins to overfit the training data.

To reduce the effect of overfitting, we can increase the regularization strength; raising l2 from the 0.01 used above to 0.1 reduces overfitting.
Another widely used technique for reducing overfitting in neural networks is dropout.
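As an illustration only (dropout is not part of the NeuralNetMLP class above), inverted dropout could be applied to the hidden activations during training roughly like this; drop_p, training, and rng are hypothetical names introduced for the sketch:

import numpy as np

def forward_with_dropout(X, w_h, b_h, w_out, b_out, rng,
                         drop_p=0.5, training=True):
    # Sketch: forward pass with inverted dropout on the hidden layer.
    # drop_p, training, and rng are illustrative additions, not part
    # of the original class.
    sigmoid = lambda z: 1. / (1. + np.exp(-np.clip(z, -250, 250)))
    a_h = sigmoid(np.dot(X, w_h) + b_h)
    if training:
        # Zero out hidden units at random; dividing by the keep
        # probability keeps the expected activation unchanged at test time.
        mask = (rng.rand(*a_h.shape) > drop_p) / (1. - drop_p)
        a_h = a_h * mask
    a_out = sigmoid(np.dot(a_h, w_out) + b_out)
    return a_out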
Test accuracy
y_test_pred = nn.predict(X_test)
acc = (np.sum(y_test == y_test_pred).astype(float) / X_test.shape[0])
print('Test accuracy: %.2f%%' % (acc * 100))

Test accuracy: 97.54%

Hyperparameters of this network, such as the number of hidden units, the regularization parameter, and the learning rate, can be tuned further.
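For example, a small grid search over these hyperparameters could look like the sketch below; the grids, the shortened epoch count, and the reuse of the train/validation split from above are illustrative choices:

best_acc, best_params = 0., None
for n_hidden in (50, 100):              # illustrative grid
    for l2 in (0.01, 0.1):
        for eta in (0.0005, 0.001):
            nn = NeuralNetMLP(n_hidden=n_hidden, l2=l2, epochs=50,
                              eta=eta, minibatch_size=100,
                              shuffle=True, seed=1)
            nn.fit(X_train=X_train[:55000], y_train=y_train[:55000],
                   X_valid=X_train[55000:], y_valid=y_train[55000:])
            valid_acc = nn.eval_['valid_acc'][-1]
            if valid_acc > best_acc:
                best_acc, best_params = valid_acc, (n_hidden, l2, eta)
print('Best valid acc: %.2f%% with (n_hidden, l2, eta) = %s'
      % (best_acc * 100, best_params))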
Misclassified images
miscl_img = X_test[y_test != y_test_pred][:25]
correct_lab = y_test[y_test != y_test_pred][:25]
miscl_lab = y_test_pred[y_test != y_test_pred][:25]

fig, ax = plt.subplots(nrows=5, ncols=5, sharex=True, sharey=True)
ax = ax.flatten()
for i in range(25):
    img = miscl_img[i].reshape(28, 28)
    ax[i].imshow(img, cmap='Greys', interpolation='nearest')
    ax[i].set_title('%d) t: %d p: %d' % (i + 1, correct_lab[i], miscl_lab[i]))
ax[0].set_xticks([])
ax[0].set_yticks([])
plt.tight_layout()
plt.show()