Measuring uncertainty in probability predictions using bagging#

With bagging(‘boostrap aggregation’) we train multiple models based on multiple bootstrap samples (sampling with replacement) of the training data.

Here we use bagging to examine variation in predicted probability of survival.

For an introduction to bootstrapping see: https://youtu.be/Xz0x-8-cgaQ

Import packages#

# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Import machine learning methods
from xgboost import XGBClassifier

Download data if required#

download_required = True

if download_required:
    
    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
                '1804_python_healthcare/master/titanic/data/processed_data.csv'
    
    data = pd.read_csv(address)

    # Create a data subfolder if one does not already exist
    import os
    data_directory ='./data/'
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    # Save data
    data.to_csv(data_directory + 'processed_data.csv', index=False)

Load data and split into trainign and test sets#

data = pd.read_csv('data/processed_data.csv')
# Make all data 'float' type
data = data.astype(float)

# Shuffle data
data = data.sample(frac=1, random_state=42)

# Split into training and test sets
training_size = int(len(data) * 0.75)
train = data.iloc[0: training_size]
test = data.iloc[training_size:]

Create bootstrap sampled training sets#

splits = 30
train_set = []
for i in range(splits):
    train_set.append(train.sample(frac=1, replace=True))

Train models#

# Set up lists for models and probability predictions
models = []
results  = []
accuracies = []

for i in range(splits):

    # Get X and y
    X_train = train_set[i].drop('Survived', axis=1)
    X_test = test.drop('Survived', axis=1)
    y_train = train_set[i]['Survived']
    y_test = test['Survived']

    # Define and train model; use different random seed for each model
    model = XGBClassifier(verbosity=0, seed=42+i)
    model.fit(X_train, y_train)
    models.append(model)

    # Get predicted probabilities and class
    y_probs = model.predict_proba(X_test)[:,1]
    y_pred = y_probs > 0.5
    results.append([y_probs])

    # Show accuracy
    accuracy = np.mean(y_pred == y_test)
    accuracies.append(accuracy)

results = np.array(results)
results = results.T.reshape(-1, splits)
print (f'Mean accuracy: {np.mean(accuracies):0.3f}')
Mean accuracy: 0.799

Show boxplot of accuracy measurements.

fig = plt.figure()
ax = fig.add_subplot()
ax.boxplot(accuracies, whis=999)
ax.set_ylabel('Model accuracy')
ax.axes.xaxis.set_ticklabels([]) # Remove xtick labels
plt.show()
_images/53_bagging_13_0.png

Measure accuracy of consensus decision#

Consensus decision is the majority vote of the models.

classification = results >= 0.5
consensus = classification.sum(axis=1) >= splits/2
consensus_accuracy = np.mean(consensus == y_test)
print (f'Consensus accuracy: {consensus_accuracy:0.3f}')
Consensus accuracy: 0.821

Plot variation in probability prediction#

# Sort by mean survival
results = results[np.median(results,axis=1).argsort()]

Box plot#

fig = plt.figure()
ax = fig.add_subplot()
ax.boxplot(results.T, whis=999)
ax.axes.xaxis.set_ticklabels([])
ax.set_xlabel('Passenger')
ax.set_ylabel('Probability of survival')
ax.set_xticks([])
ax.grid()
plt.show()
_images/53_bagging_19_0.png

Plot of standard deviation#

# Sort by mean survival
results = results[np.mean(results,axis=1).argsort()]
results = results[np.mean(results,axis=1).argsort()]
mean = np.mean(results,axis=1)
stdev = np.std(results,axis=1)

fig = plt.figure()
ax = fig.add_subplot()
ax.errorbar(range(len(mean)), mean, yerr=stdev, label='Standard Deviation', zorder=1)
ax.plot(mean, 'o', c='r', markersize=2, label = 'Mean probability', zorder=2)
ax.axes.xaxis.set_ticklabels([])
ax.set_xlabel('Passenger')
ax.set_ylabel('Probability of survival')
ax.set_xticks([])
ax.grid()
ax.legend()
plt.show()
_images/53_bagging_22_0.png

Plot of standard error#

# Sort by mean survival
results = results[np.mean(results,axis=1).argsort()]
results = results[np.mean(results,axis=1).argsort()]
mean = np.mean(results,axis=1)
stdev = np.std(results,axis=1)
se = stdev / np.sqrt(splits)

fig = plt.figure()
ax = fig.add_subplot()
ax.errorbar(range(len(mean)), mean, yerr=se, label='Standard Error', zorder=1)
ax.plot(mean, 'o', c='r', markersize=2, label = 'Mean probability', zorder=2)
ax.axes.xaxis.set_ticklabels([])
ax.set_xlabel('Passenger')
ax.set_ylabel('Probability of survival')
ax.set_xticks([])
ax.grid()
ax.legend()
plt.show()
_images/53_bagging_25_0.png

Plot confidence limits#

import scipy.stats as st

# create 95% confidence interval
confidence = st.t.interval(alpha=0.95, df=len(results)-1, loc=mean, scale=se)
confidence_magnitude = confidence[1] - confidence[0]

fig = plt.figure()
ax = fig.add_subplot()
ax.errorbar(range(len(mean)), mean, yerr=(confidence_magnitude/2), 
    label='95% confidence limits' ,zorder=1)
ax.plot(mean, 'o', c='r', markersize=2, label = 'Mean probability', zorder=2)
ax.axes.xaxis.set_ticklabels([])
ax.set_xlabel('Passenger')
ax.set_ylabel('Probability of survival')
ax.set_xticks([])
ax.grid()
ax.legend()
plt.show()
_images/53_bagging_27_0.png
fig = plt.figure()
ax = fig.add_subplot()
ax.scatter(mean, confidence_magnitude)
ax.set_xlabel('Probability of survival')
ax.set_ylabel('95% Confidence interval of probability of survival')
ax.grid()
plt.show()
_images/53_bagging_28_0.png

Conclusions#

  • Bagging may be used to explore uncertainty of prediction

  • Uncerntainty is highest for cases with around 50% probability of survival