# import required packages
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# load data and split it into training and test set
bc = load_breast_cancer()
train_X, test_X, train_y, test_y = train_test_split(bc.data, bc.target, random_state=42)
class_labels = bc.target_names

# plot shapes
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

(426, 30) (426,) (143, 30) (143,)


# activation function
sigmoid_activation = lambda y: 1.0 / (1 + np.exp(-y))
np.seterr(over='ignore') # ignore overflow from exponential function

# learning rate represents the step size as in classical gradient descent
l_rate = 1e-4

# epochs represents the number of maximum iterations for minimization
epochs = 2000

# batch size amounts to the number of samples used in one epoch iteration (up to hardware memory)
b_size = 2**6
assert b_size <= train_X.shape[0]

# insert column of ones as first feature entry to cover bias as a trainable parameter within the weight vector (instead of separate variable)
train_X, test_X = np.c_[np.ones(train_X.shape[0]), train_X], np.c_[np.ones(test_X.shape[0]), test_X]

# initialize weight vector such it has the same number of columns as input features
w = np.random.uniform(size=(train_X.shape[1],))

# initialize a list to track the loss value for each epoch
loss_list = []

# batch composition
def next_batch(X, y, b_size):
    
    # loop over our dataset in mini-batches
    for i in np.arange(0, X.shape[0], b_size):
        
        # yield a tuple for current batch of data and labels
        yield (X[i:i+b_size], y[i:i+b_size])

for epoch in range(epochs+1):
    
    # reset total epoch loss
    epoch_loss = []
    
    # loop over data in batches
    for (batch_X, batch_y) in next_batch(train_X, train_y, b_size):
        
        # take dot product between current feature batch and weights
        y = np.dot(batch_X, w)
        
        # pass product to sigmoid activation function
        preds_y = sigmoid_activation(y)
        
        # loss determination via difference of true values
        loss = preds_y - batch_y
        
        # compute sum of squared loss for current batch
        epoch_loss.append(np.sum(loss))
        
        # compute gradient from transposed batch data and batch error
        gradient = 2*np.dot(batch_X.T, loss)
        
        # scale gradient of current batch to step in the correct direction
        w -= l_rate * gradient
        
    # update loss list by taking average across all batches
    loss_list.append(np.average(epoch_loss))


%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
plt.style.use('seaborn-notebook')

# animated figure that plots the loss over time
fig, ax = (plt.figure(figsize=(8, 5)), plt.axes())
ax.set_xlabel('Epoch #', fontsize=16)
ax.set_ylabel('Loss', fontsize=16)
ax.xaxis.set_tick_params(labelsize=12)
ax.yaxis.set_tick_params(labelsize=12)
line, = ax.plot(range(len(loss_list)), loss_list, lw=2)
point, = ax.plot(0, np.nan, 'r', marker='.', markersize=14)
plt.tight_layout()
plt.close()

# animation
div = 250
def animate(i):
    line.set_data(np.arange(len(loss_list))[:i*div], loss_list[:i*div])
    point.set_data(i*div, loss_list[i*div])
    return line, point

anim = FuncAnimation(fig, animate, interval=200, frames=epochs//div+1)
HTML(anim.to_jshtml())

if False:
    anim.save('./data/sgd_anim.gif', writer='imagemagick', fps=5)


# compute predictions from test set
pred_y = np.round(sigmoid_activation(np.dot(test_X, w))).astype('uint8')

# compose confusion matrix
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_true=test_y, y_pred=pred_y)

# print confusion matrix using matplotlib
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_mat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(conf_mat.shape[0]):
    for j in range(conf_mat.shape[1]):
        ax.text(x=j, y=i, s=conf_mat[i, j], va='center', ha='center', size='xx-large')

plt.title('Confusion Matrix', fontsize=18)
plt.xlabel('Predictions $\mathbf{\hat{y}}$', fontsize=18)
plt.ylabel('Actuals $\mathbf{y}$', fontsize=18)
plt.xticks([0, 1], class_labels)
plt.yticks([0, 1], class_labels)
plt.tick_params(top=False, bottom=True, labeltop=False, labelbottom=True)
plt.show()


# TP/(TP+FP)
precision = lambda conf_mat: conf_mat[0, 0]/(conf_mat[0, 0]+conf_mat[1, 0]) * 100
print('%.2f %% Precision' % precision(conf_mat))

# TP/(TP+FN)
recall = lambda conf_mat: conf_mat[0, 0]/(conf_mat[0, 0]+conf_mat[0, 1]) * 100
print('%.2f %% Recall' % recall(conf_mat))

# (TP+TN)/ALL
accuracy = lambda conf_mat: (conf_mat[0, 0]+conf_mat[1, 1])/np.sum(conf_mat) * 100
print('%.2f %% Accuracy' % accuracy(conf_mat))

84.13 % Precision
98.15 % Recall
92.31 % Accuracy


# mean squared error
MSE = lambda y, pred_y: np.round(sum((y-pred_y)**2)/len(y), 4)

# compute predictions of test and training sets
pred_test_y = np.round(sigmoid_activation(np.dot(test_X, w))).astype('uint8')
pred_train_y = np.round(sigmoid_activation(np.dot(train_X, w))).astype('uint8')

# compare MSEs
test_MSE = MSE(test_y, pred_test_y)
train_MSE = MSE(train_y, pred_train_y)
res = np.isclose(test_MSE, train_MSE, rtol=.95)

if res:
    print('MSEs are close enough with %s (test) and %s (train).' % (test_MSE, train_MSE))
else:
    print('Potential over-/underfitting from MSEs with %s (test) and %s (train).' % (test_MSE, train_MSE))

MSEs are close enough with 0.0769 (test) and 0.1009 (train).


import torch

# define single layer model
class SingleLayerNet(torch.nn.Module):
    def __init__(self, n_features):
        super(SingleLayerNet, self).__init__()
        self.linear = torch.nn.Linear(n_features, 1, bias=False)
        torch.nn.init.uniform_(self.linear.weight, 0, 1)

    def forward(self, X):
        z = self.linear(X)
        return torch.squeeze(z, 1)
    
# convert to torch data types
train_Xt = torch.autograd.Variable(torch.FloatTensor(train_X))
train_yt = torch.autograd.Variable(torch.FloatTensor(train_y))
test_Xt = torch.autograd.Variable(torch.FloatTensor(test_X))
test_yt = torch.autograd.Variable(torch.FloatTensor(test_y))

# instantiate model and loss
model = SingleLayerNet(n_features=train_Xt.shape[1])
optimizer = torch.optim.SGD(model.parameters(), lr=l_rate)
criterion = torch.nn.BCEWithLogitsLoss()

# training
loss_list = []
for epoch in range(epochs+1):
    # loop over data in batches
    for i_X, i_y in next_batch(train_Xt, train_yt, b_size):
        optimizer.zero_grad()
        pred_y = model(i_X)
        loss = criterion(pred_y, i_y)
        loss.backward()
        optimizer.step()
    # track loss
    loss_list.append(loss.detach())
    if (epoch) % 500 == 0:
        y_test_pred = model(test_Xt)
        y_train_pred = model(train_Xt)
        test_loss = criterion(y_test_pred, test_yt)
        train_loss = criterion(y_train_pred, train_yt)
        pred_yt = np.round(sigmoid_activation(y_test_pred.detach().numpy()), 0)
        conf_mat = confusion_matrix(y_true=pred_yt, y_pred=test_yt.numpy())
        test_acc = accuracy(conf_mat)
        train_acc = accuracy(conf_mat)
        print(
            f'''epoch {epoch}
            Train set - loss: {np.round(train_loss.detach().numpy().astype('float'), 3)}, accuracy: {np.round(train_acc, 3)}
            Test  set - loss: {np.round(test_loss.detach().numpy().astype('float'), 3)}, accuracy: {np.round(test_acc, 3)}
            ''')

if False:
    torch.save(model, 'torch_sgd_model.pth')

epoch 0
            Train set - loss: 847.939, accuracy: 62.238
            Test  set - loss: 900.102, accuracy: 62.238
            
epoch 500
            Train set - loss: 2.958, accuracy: 89.51
            Test  set - loss: 1.459, accuracy: 89.51
            
epoch 1000
            Train set - loss: 2.65, accuracy: 90.909
            Test  set - loss: 1.263, accuracy: 90.909
            
epoch 1500
            Train set - loss: 2.465, accuracy: 90.909
            Test  set - loss: 1.146, accuracy: 90.909
            
epoch 2000
            Train set - loss: 2.303, accuracy: 90.909
            Test  set - loss: 1.053, accuracy: 90.909


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# plot loss function
ax1.plot(range(len(loss_list)), loss_list)
ax1.set_title('Training Loss', fontsize=14)
ax1.set_xlabel('Epoch #', fontsize=14)
ax1.set_ylabel('Loss', fontsize=14)

# print confusion matrix
pred_yt = np.round(sigmoid_activation(y_test_pred.detach().numpy()), 0)
conf_mat = confusion_matrix(y_true=test_yt.numpy(), y_pred=pred_yt)
ax2.matshow(conf_mat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(conf_mat.shape[0]):
    for j in range(conf_mat.shape[1]):
        ax2.text(x=j, y=i, s=conf_mat[i, j], va='center', ha='center', size='xx-large')

ax2.set_title('Confusion Matrix', fontsize=14)
ax2.set_xlabel('Predictions $\mathbf{\hat{y}}$', fontsize=14)
ax2.set_ylabel('Actuals $\mathbf{y}$', fontsize=14)
ax2.set_yticks([0, 1])
ax2.set_xticks([0, 1])
ax2.set_yticklabels(class_labels)
ax2.set_xticklabels(class_labels)
ax2.tick_params(top=False, bottom=True, labeltop=False, labelbottom=True)
plt.show()

Stochastic Gradient Descent (SGD)¶

Data acquisition¶

SGD theory¶

Optimization¶

Mini-Batching¶

Activation function¶

SGD implementation¶

Overfitting validation¶

PyTorch equivalent¶