A comparison of neural network calibration using a single Sigmoid output or dual SoftMax / Sigmoid outputs#

When constructing a neural network to distinguish between two classes (e.g. survived vs. died), we have two options for the output:

  1. Provide an output for each class, with the probability of each reported, e.g. died = 84%, survived = 16%. In this case we can use a Sigmoid output, which treats each class independently, or a SoftMax output, which ensures the probabilities sum to 1.0.

  2. Provide an output for just one of the classes, e.g. survived = 16%. In this case the final output uses a Sigmoid activation to scale the output between 0 and 1.

We would hope these three methods give comparable results. While the final classifications are very similar, in this notebook we show that the reported probabilities differ, and that the single Sigmoid output provides a much better calibrated model.
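As a minimal illustration of the difference (not part of the original notebook, and using made-up logits), SoftMax forces the two reported probabilities to sum to 1.0, whereas two independent Sigmoid outputs need not:

import torch
import torch.nn.functional as F

logits = torch.tensor([[1.5, -0.5]])      # example raw outputs for died / survived

softmax_probs = F.softmax(logits, dim=1)  # approx [[0.88, 0.12]] - sums to 1.0
sigmoid_probs = torch.sigmoid(logits)     # approx [[0.82, 0.38]] - classes treated independently

print(softmax_probs, softmax_probs.sum())
print(sigmoid_probs, sigmoid_probs.sum())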

Load modules#

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# sklearn for processing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.calibration import CalibrationDisplay

# pytorch
import torch
from torch.autograd import Variable
import torch.nn.functional as F

Download data if not previously downloaded#

download_required = True

if download_required:
    
    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
                '1804_python_healthcare/master/titanic/data/processed_data.csv'
    
    data = pd.read_csv(address)
    # Make all data 'float' type
    data = data.astype(float)

    # Create a data subfolder if one does not already exist
    import os
    data_directory ='./data/'
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    # Save data
    data.to_csv(data_directory + 'processed_data.csv', index=False)

Define function to scale data#

In neural networks it is common to scale input data to the range 0-1 rather than to standardise each feature (subtracting the mean and dividing by the standard deviation).

def scale_data(X_train, X_test):
    """Scale data 0-1 based on min and max in training set"""
    
    # Initialise a new scaling object for normalising input data
    sc = MinMaxScaler()

    # Set up the scaler just on the training set
    sc.fit(X_train)

    # Apply the scaler to the training and test sets
    train_sc = sc.transform(X_train)
    test_sc = sc.transform(X_test)
    
    return train_sc, test_sc
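As a quick illustration of what this does (a toy example, not from the original notebook): each feature is rescaled as (x - train_min) / (train_max - train_min) using the training set's minimum and maximum, so test values can fall slightly outside 0-1.

train_toy = np.array([[0.0], [5.0], [10.0]])
test_toy = np.array([[12.0]])
train_toy_sc, test_toy_sc = scale_data(train_toy, test_toy)
print(train_toy_sc.ravel())  # [0.  0.5 1. ]
print(test_toy_sc.ravel())   # [1.2] - outside 0-1 because 12 exceeds the training maximum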

Load data#

data = pd.read_csv('data/processed_data.csv')
data.drop('PassengerId', inplace=True, axis=1)
X = data.drop('Survived',axis=1) # X = all 'data' except the 'survived' column
y = data['Survived'] # y = 'survived' column from 'data'

# Split into training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

# Convert to NumPy arrays
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

# Scale data
X_train_sc, X_test_sc = scale_data(X_train, X_test)

1) Dual output with SoftMax activation#

Define neural net with Dual SoftMax output#

Dual output for died / survived, with SoftMax output.

class Net(torch.nn.Module):
    
    def __init__(self, number_features, expansion=2):
        # Define layers
        super(Net, self).__init__()
        self.fc1 = torch.nn.Linear(number_features, number_features * expansion)
        self.bn1 = torch.nn.BatchNorm1d(number_features * expansion)
        self.fc2 = torch.nn.Linear(number_features * expansion, number_features * expansion)
        self.bn2 = torch.nn.BatchNorm1d(number_features * expansion)
        self.fc3 = torch.nn.Linear(number_features * expansion, 2)
        
    def forward(self, x):
        # Define sequence of layers
        x = self.fc1(x) # Fully connected layer
        x = self.bn1(x) # Batch normalisation
        x = F.dropout(x, p=0.5, training=self.training) # Apply dropout (training mode only)
        x = F.relu(x) # ReLU activation
        x = self.fc2(x) # Fully connected layer
        x = self.bn2(x) # Batch normalisation
        x = F.dropout(x, p=0.5, training=self.training) # Apply dropout (training mode only)
        x = F.relu(x) # ReLU activation
        x = self.fc3(x) # Fully connected layer
        x = F.softmax(x, dim=1) # Softmax output (0-1, totals 1)
        
        return x
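A quick sanity check (an illustration only; the feature count of 10 is arbitrary here, the real value is taken from the data below): the dual-output network returns one row per passenger containing two probabilities that sum to 1.0.

check_net = Net(number_features=10)   # 10 is an arbitrary illustrative value
check_net.eval()
dummy_X = torch.rand(4, 10)           # 4 passengers, 10 features
dummy_out = check_net(dummy_X)
print(dummy_out.shape)                # torch.Size([4, 2])
print(dummy_out.sum(dim=1))           # each row sums to 1.0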

Train network#

# Define network
number_features = X_train_sc.shape[1]
net = Net(number_features)

# Set batch size (cases per batch - commonly 8-64)
batch_size = 16
# Epochs (number of times to pass over data)
num_epochs = 150
# Learning rate (how much each batch updates the model)
learning_rate = 0.001
# Calculate number of batches
batch_no = len(X_train_sc) // batch_size

# Set up optimizer for classification
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
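An aside on the loss function (not part of the original notebook): torch.nn.CrossEntropyLoss applies log-softmax internally and expects raw, unnormalised scores (logits), so the common PyTorch pattern is to leave the final SoftMax out of the forward pass and apply it only when probabilities are reported. Here the network applies SoftMax (and, in the next section, Sigmoid) before the loss, which is the output configuration being compared in this notebook. A minimal sketch of the standard pattern, with made-up numbers:

example_logits = torch.tensor([[2.0, 0.5]])        # raw scores for one passenger
example_target = torch.tensor([0])                 # true class index
example_loss = torch.nn.CrossEntropyLoss()(example_logits, example_target)
example_probs = F.softmax(example_logits, dim=1)   # probabilities for reporting only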

# Train model by passing through the data the required number of epochs

training_progress = []
test_progress = []

for epoch in range(num_epochs):
    
    # Shuffle the training data
    X_train_sc, y_train = shuffle(X_train_sc, y_train)

    # Set net to training mode
    net.train()

    for i in range(batch_no):
        
        # Get X and y batch data
        start = i * batch_size
        end = start + batch_size
        x_var = Variable(torch.FloatTensor(X_train_sc[start:end]))
        y_var = Variable(torch.LongTensor(y_train[start:end]))

        # These steps train the model: Forward + Backward + Optimize
        optimizer.zero_grad() # reset optimizer
        ypred_var = net(x_var) # predict y
        loss = criterion(ypred_var, y_var) # Calculate loss
        loss.backward() # Back propagate loss through network
        optimizer.step() # Update network to reduce loss

    # Set net to evaluation mode
    net.eval()

    # Calculate training set accuracy
    test_var = Variable(torch.FloatTensor(X_train_sc))
    result = net(test_var)
    values, labels = torch.max(result, 1)
    y_pred_train = labels.data.numpy()
    accuracy_train = np.mean(y_pred_train == y_train)
    training_progress.append(accuracy_train)

    # Calculate test set accuracy
    test_var = Variable(torch.FloatTensor(X_test_sc))
    result = net(test_var)
    values, labels = torch.max(result, 1)
    y_pred_test = labels.data.numpy()
    accuracy_test = np.mean(y_pred_test == y_test)
    test_progress.append(accuracy_test)

Show training

fig = plt.figure()
ax = fig.add_subplot()
x = np.arange(num_epochs)
ax.plot(x, training_progress, label='Training accuracy')
ax.plot(x, test_progress, label='Test accuracy')
ax.grid()
ax.legend()
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
plt.show()

Show frequency of probabilities, and show model calibration#

net.eval()
test_var = Variable(torch.FloatTensor(X_test_sc))
result = net(test_var)
softmax_probs = result[:,1].data.numpy()

fig = plt.figure(figsize=(10,5))

ax1 = fig.add_subplot(121)
ax1.hist(softmax_probs, bins=20)
ax1.grid()
ax1.set_xlabel('Probability')
ax1.set_ylabel('Frequency')

ax2 = fig.add_subplot(122)
disp1 = CalibrationDisplay.from_predictions(
    y_test, softmax_probs, ax=ax2, n_bins=4)
ax2.grid()

plt.show()

2) Dual output with Sigmoid activation#

Define neural net with Dual Sigmoid output#

Dual output for died / survived, with Sigmoid output.

class Net(torch.nn.Module):
    
    def __init__(self, number_features, expansion=2):
        # Define layers
        super(Net, self).__init__()
        self.fc1 = torch.nn.Linear(number_features, number_features * expansion)
        self.bn1 = torch.nn.BatchNorm1d(number_features * expansion)
        self.fc2 = torch.nn.Linear(number_features * expansion, number_features * expansion)
        self.bn2 = torch.nn.BatchNorm1d(number_features * expansion)
        self.fc3 = torch.nn.Linear(number_features * expansion, 2)
        
    def forward(self, x):
        # Define sequence of layers
        x = self.fc1(x) # Fully connected layer
        x = self.bn1(x) # Batch normalisation
        x = F.dropout(x, p=0.5, training=self.training) # Apply dropout (training mode only)
        x = F.relu(x) # ReLU activation
        x = self.fc2(x) # Fully connected layer
        x = self.bn2(x) # Batch normalisation
        x = F.dropout(x, p=0.5, training=self.training) # Apply dropout (training mode only)
        x = F.relu(x) # ReLU activation
        x = self.fc3(x) # Fully connected layer
        x = torch.sigmoid(x) # Sigmoid output (0-1)
        
        return x

Train network#

# Define network
number_features = X_train_sc.shape[1]
net = Net(number_features)

# Set batch size (cases per batch - commonly 8-64)
batch_size = 16
# Epochs (number of times to pass over data)
num_epochs = 150
# Learning rate (how much each batch updates the model)
learning_rate = 0.001
# Calculate number of batches
batch_no = len(X_train_sc) // batch_size

# Set up optimizer for classification
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

# Train model by passing through the data the required number of epochs

training_progress = []
test_progress = []

for epoch in range(num_epochs):
    
    # Shuffle the training data
    X_train_sc, y_train = shuffle(X_train_sc, y_train)

    # Set net to training mode
    net.train()

    for i in range(batch_no):
        
        # Get X and y batch data
        start = i * batch_size
        end = start + batch_size
        x_var = Variable(torch.FloatTensor(X_train_sc[start:end]))
        y_var = Variable(torch.LongTensor(y_train[start:end]))

        # These steps train the model: Forward + Backward + Optimize
        optimizer.zero_grad() # reset optimizer
        ypred_var = net(x_var) # predict y
        loss = criterion(ypred_var, y_var) # Calculate loss
        loss.backward() # Back propagate loss through network
        optimizer.step() # Update network to reduce loss

    # Set net to evaluation mode
    net.eval()

    # Calculate training set accuracy
    test_var = Variable(torch.FloatTensor(X_train_sc))
    result = net(test_var)
    values, labels = torch.max(result, 1)
    y_pred_train = labels.data.numpy()
    accuracy_train = np.mean(y_pred_train == y_train)
    training_progress.append(accuracy_train)

    # Calculate test set accuracy
    test_var = Variable(torch.FloatTensor(X_test_sc))
    result = net(test_var)
    values, labels = torch.max(result, 1)
    y_pred_test = labels.data.numpy()
    accuracy_test = np.mean(y_pred_test == y_test)
    test_progress.append(accuracy_test)

Show training

fig = plt.figure()
ax = fig.add_subplot()
x = np.arange(num_epochs)
ax.plot(x, training_progress, label='Training accuracy')
ax.plot(x, test_progress, label='Test accuracy')
ax.grid()
ax.legend()
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
plt.show()

Show frequency of probabilities, and show model calibration#

net.eval()
test_var = Variable(torch.FloatTensor(X_test_sc))
result = net(test_var)
dual_sigmoid_probs = result[:,1].data.numpy()

fig = plt.figure(figsize=(10,5))

ax1 = fig.add_subplot(121)
ax1.hist(dual_sigmoid_probs, bins=20)
ax1.grid()
ax1.set_xlabel('Probability')
ax1.set_ylabel('Frequency')

ax2 = fig.add_subplot(122)
disp1 = CalibrationDisplay.from_predictions(
    y_test, dual_sigmoid_probs, ax=ax2, n_bins=4)
ax2.grid()

plt.show()

3) Single Sigmoid output#

Define neural net with single Sigmoid output#

One output for the probability of survival, using Sigmoid activation.

class Net(torch.nn.Module):
    
    def __init__(self, number_features, expansion=2):
        # Define layers
        super(Net, self).__init__()
        self.fc1 = torch.nn.Linear(number_features, number_features * expansion)
        self.bn1 = torch.nn.BatchNorm1d(number_features * expansion)
        self.fc2 = torch.nn.Linear(number_features * expansion, number_features * expansion)
        self.bn2 = torch.nn.BatchNorm1d(number_features * expansion)
        self.fc3 = torch.nn.Linear(number_features * expansion, 1)
        
    def forward(self, x):
        # Define sequence of layers
        x = self.fc1(x) # Fully connected layer
        x = self.bn1(x) # Batch normalisation
        x = F.dropout(x, p=0.5, training=self.training) # Apply dropout (training mode only)
        x = F.relu(x) # ReLU activation
        x = self.fc2(x) # Fully connected layer
        x = self.bn2(x) # Batch normalisation
        x = F.dropout(x, p=0.5, training=self.training) # Apply dropout (training mode only)
        x = F.relu(x) # ReLU activation
        x = self.fc3(x) # Fully connected layer
        x = torch.sigmoid(x) # Sigmoid output (0-1)
        
        return x
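An aside (not used in this notebook): when a single output is used, a numerically more stable PyTorch pattern is to return the raw final-layer value (the logit) and train with BCEWithLogitsLoss, applying torch.sigmoid only when probabilities are needed. A minimal sketch with made-up numbers:

logit_criterion = torch.nn.BCEWithLogitsLoss()

example_logits = torch.tensor([[0.2], [-1.3]])   # raw final-layer outputs for two passengers
example_targets = torch.tensor([[1.0], [0.0]])   # survived / died
example_loss = logit_criterion(example_logits, example_targets)
example_probs = torch.sigmoid(example_logits)    # probabilities for reporting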

Train network#

# Define network
number_features = X_train_sc.shape[1]
net = Net(number_features)

# Set batch size (cases per batch - commonly 8-64)
batch_size = 16
# Epochs (number of times to pass over data)
num_epochs = 150
# Learning rate (how much each batch updates the model)
learning_rate = 0.001
# Calculate number of batches
batch_no = len(X_train_sc) // batch_size

# Set up optimizer for classification
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

# Train model by passing through the data the required number of epochs

training_progress = []
test_progress = []

for epoch in range(num_epochs):

    # Shuffle training data
    X_train_sc, y_train = shuffle(X_train_sc, y_train)

    # Set net to training mode
    net.train()

    for i in range(batch_no):
        
        # Get X and y batch data
        start = i * batch_size
        end = start + batch_size
        x_var = Variable(torch.FloatTensor(X_train_sc[start:end]))
        y_var = Variable(torch.FloatTensor(y_train[start:end]))

        # These steps train the model: Forward + Backward + Optimize
        optimizer.zero_grad() # reset optimizer
        ypred_var = net(x_var) # predict y
        loss = criterion(ypred_var, y_var.reshape(-1,1)) # Calculate loss
        loss.backward() # Back propagate loss through network
        optimizer.step() # Update network to reduce loss

    # Set net to evaluation mode
    net.eval()

    # Calculate training set accuracy
    test_var = Variable(torch.FloatTensor(X_train_sc))
    result = net(test_var)
    values = result.data.numpy().flatten()
    y_pred_train = values >= 0.5
    accuracy_train = np.mean(y_pred_train == y_train)
    training_progress.append(accuracy_train)

    # Calculate test set accuracy
    test_var = Variable(torch.FloatTensor(X_test_sc))
    result = net(test_var)
    values = result.data.numpy().flatten()
    y_pred_test = values >= 0.5
    accuracy_test = np.mean(y_pred_test == y_test)
    test_progress.append(accuracy_test)

Show training

fig = plt.figure()
ax = fig.add_subplot()
x = np.arange(num_epochs)
ax.plot(x, training_progress, label='Training accuracy')
ax.plot(x, test_progress, label='Test accuracy')
ax.grid()
ax.legend()
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
plt.show()

Show frequency of probabilities, and show model calibration#

net.eval()
test_var = Variable(torch.FloatTensor(X_test_sc))
result = net(test_var)
sigmoid_probs = result.data.numpy().flatten()

fig = plt.figure(figsize=(10,5))

ax1 = fig.add_subplot(121)
ax1.hist(sigmoid_probs, bins=20)
ax1.grid()
ax1.set_xlabel('Probability')
ax1.set_ylabel('Frequency')

ax2 = fig.add_subplot(122)
disp1 = CalibrationDisplay.from_predictions(
    y_test, sigmoid_probs, ax=ax2, n_bins=4)
ax2.grid()

plt.show()

Observations#

  • A single Sigmoid output produces the best-calibrated model; one possible way to quantify this is sketched below.
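One possible numeric check (a sketch, not part of the original notebook) is the Brier score: the mean squared difference between predicted probabilities and observed outcomes, where lower is better. It could be run after each of the three sections above, substituting that section's array of predicted probabilities.

from sklearn.metrics import brier_score_loss

# Example for the single Sigmoid model (section 3)
print('Brier score:', brier_score_loss(y_test, sigmoid_probs))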