XGBoost as a replacement for Random Forest#

An alternative to scikit-learn's Random Forest is XGBoost (pip install xgboost if needed). Like a Random Forest, XGBoost builds an ensemble of decision trees, but it uses gradient boosting (each tree is fitted to correct the errors of the trees before it) rather than bagging; it is often faster and may give added accuracy. We can use XGBoost as a drop-in replacement for Random Forests, as shown here.
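
To illustrate what "drop-in replacement" means, here is a minimal sketch of the swap, assuming training and test arrays (X_train, y_train, X_test) have already been prepared as in the Random Forest notebook:

# Random Forest version:
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier(random_state=42)

# XGBoost version (same fit/predict interface):
from xgboost import XGBClassifier
model = XGBClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)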

This notebook is a repeat of the basic Random Forest model, but with Random Forest replaced by XGBoost.

Load modules#

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold

# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")

Download data#

Run the following code if data for Titanic survival has not been previously downloaded.

download_required = True

if download_required:
    
    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
                '1804_python_healthcare/master/titanic/data/processed_data.csv'
    
    data = pd.read_csv(address)

    # Create a data subfolder if one does not already exist
    import os
    data_directory = './data/'
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    # Save data
    data.to_csv(data_directory + 'processed_data.csv', index=False)
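
If you want to avoid re-downloading on every run, download_required can instead be set automatically by checking whether the processed file already exists. A small sketch, using the same ./data/processed_data.csv path as above:

import os

# Only download if the processed file is not already saved locally
download_required = not os.path.exists('./data/processed_data.csv')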

Load data#

# Load data & drop passenger ID
data = pd.read_csv('data/processed_data.csv')

# Make all data 'float' type
data = data.astype(float)

data.drop('PassengerId', inplace=True, axis=1)

# Split data into two DataFrames
X_df = data.drop('Survived',axis=1)
y_df = data['Survived']

# Convert DataFrames to NumPy arrays
X = X_df.values
y = y_df.values

Define function to calculate accuracy#

import numpy as np

def calculate_accuracy(observed, predicted):
    
    """
    Calculates a range of accuracy scores from observed and predicted classes.
    
    Takes two list or NumPy arrays (observed class values, and predicted class 
    values), and returns a dictionary of results.
    
     1) observed positive rate: proportion of observed cases that are +ve
     2) predicted positive rate: proportion of predicted cases that are +ve
     3) observed negative rate: proportion of observed cases that are -ve
     4) predicted negative rate: proportion of predicted cases that are -ve
     5) accuracy: proportion of predicted results that are correct
     6) precision: proportion of predicted +ve that are correct
     7) recall: proportion of true +ve correctly identified
     8) f1: harmonic mean of precision and recall
     9) sensitivity: same as recall
    10) specificity: proportion of true -ve correctly identified
    11) positive likelihood: increased probability of true +ve if test +ve
    12) negative likelihood: reduced probability of true +ve if test -ve
    13) false positive rate: proportion of false +ves in true -ve patients
    14) false negative rate: proportion of false -ves in true +ve patients
    15) true positive rate: same as recall
    16) true negative rate: same as specificity
    17) positive predictive value: chance of true +ve if test +ve
    18) negative predictive value: chance of true -ve if test -ve
    
    """
    
    # Convert lists to NumPy arrays
    if type(observed) == list:
        observed = np.array(observed)
    if type(predicted) == list:
        predicted = np.array(predicted)
    
    # Calculate accuracy scores
    observed_positives = observed == 1
    observed_negatives = observed == 0
    predicted_positives = predicted == 1
    predicted_negatives = predicted == 0
    
    true_positives = (predicted_positives == 1) & (observed_positives == 1)
    
    false_positives = (predicted_positives == 1) & (observed_positives == 0)
    
    true_negatives = (predicted_negatives == 1) & (observed_negatives == 1)
    
    false_negatives = (predicted_negatives == 1) & (observed_negatives == 0)
    
    accuracy = np.mean(predicted == observed)
    
    precision = (np.sum(true_positives) /
                 (np.sum(true_positives) + np.sum(false_positives)))
        
    recall = np.sum(true_positives) / np.sum(observed_positives)
    
    sensitivity = recall
    
    f1 = 2 * ((precision * recall) / (precision + recall))
    
    specificity = np.sum(true_negatives) / np.sum(observed_negatives)
    
    positive_likelihood = sensitivity / (1 - specificity)
    
    negative_likelihood = (1 - sensitivity) / specificity
    
    false_positive_rate = 1 - specificity
    
    false_negative_rate = 1 - sensitivity
    
    true_positive_rate = sensitivity
    
    true_negative_rate = specificity
    
    positive_predictive_value = (np.sum(true_positives) / 
                            (np.sum(true_positives) + np.sum(false_positives)))
    
    negative_predictive_value = (np.sum(true_negatives) / 
                            (np.sum(true_negatives) + np.sum(false_negatives)))
    
    # Create dictionary for results, and add results
    results = dict()
    
    results['observed_positive_rate'] = np.mean(observed_positives)
    results['observed_negative_rate'] = np.mean(observed_negatives)
    results['predicted_positive_rate'] = np.mean(predicted_positives)
    results['predicted_negative_rate'] = np.mean(predicted_negatives)
    results['accuracy'] = accuracy
    results['precision'] = precision
    results['recall'] = recall
    results['f1'] = f1
    results['sensitivity'] = sensitivity
    results['specificity'] = specificity
    results['positive_likelihood'] = positive_likelihood
    results['negative_likelihood'] = negative_likelihood
    results['false_positive_rate'] = false_positive_rate
    results['false_negative_rate'] = false_negative_rate
    results['true_positive_rate'] = true_positive_rate
    results['true_negative_rate'] = true_negative_rate
    results['positive_predictive_value'] = positive_predictive_value
    results['negative_predictive_value'] = negative_predictive_value
    
    return results
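
As a quick check that the function behaves as expected, it can be called on a small set of made-up observed and predicted classes (illustrative values only):

# Illustrative example with made-up classes
observed  = [1, 1, 1, 0, 0, 0, 0, 0]
predicted = [1, 1, 0, 0, 0, 0, 0, 1]

example_results = calculate_accuracy(observed, predicted)
print(example_results['accuracy'])   # 6 of 8 correct = 0.75
print(example_results['precision'])  # 2 true +ve of 3 predicted +ve = 0.667
print(example_results['recall'])     # 2 of 3 observed +ve identified = 0.667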

Run the model with k-fold validation#

# Set up lists to hold results for each k-fold run
replicate_accuracy = []
replicate_precision = []
replicate_recall = []
replicate_f1 = []
replicate_predicted_positive_rate = []
replicate_observed_positive_rate = []

# Set up DataFrame for feature importances
importances = pd.DataFrame(index = list(X_df))

# Convert DataFrames to NumPy arrays
X = X_df.values
y = y_df.values

# Set up splits
number_of_splits = 10
skf = StratifiedKFold(n_splits = number_of_splits)
skf.get_n_splits(X, y)

# Loop through the k-fold splits
k_fold_count = 0

for train_index, test_index in skf.split(X, y):
    
    # Get X and Y train/test
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Set up and fit model
    model = XGBClassifier(random_state=42, verbosity=0)
    model.fit(X_train, y_train)
    
    # Predict test set labels and get accuracy scores
    y_pred_test = model.predict(X_test)
    accuracy_scores = calculate_accuracy(y_test, y_pred_test)
    replicate_accuracy.append(accuracy_scores['accuracy'])
    replicate_precision.append(accuracy_scores['precision'])
    replicate_recall.append(accuracy_scores['recall'])
    replicate_f1.append(accuracy_scores['f1'])
    replicate_predicted_positive_rate.append(
        accuracy_scores['predicted_positive_rate'])
    replicate_observed_positive_rate.append(
        accuracy_scores['observed_positive_rate'])
    
    # Record feature importances
    col_title = 'split_' + str(k_fold_count)
    importances[col_title] = model.feature_importances_
    k_fold_count +=1
    
# Transfer average results to a Pandas Series
results = pd.Series(dtype='float64')
results['accuracy'] = np.mean(replicate_accuracy)
results['precision'] = np.mean(replicate_precision)
results['recall'] = np.mean(replicate_recall)
results['f1'] = np.mean(replicate_f1)
results['predicted positive rate'] = np.mean(replicate_predicted_positive_rate)
results['observed positive rate'] = np.mean(replicate_observed_positive_rate)

# Get average of feature importances, and sort
importance_mean = importances.mean(axis=1)
importance_mean.sort_values(ascending=False, inplace=True)
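
As an optional sanity check on the mean accuracy, scikit-learn's cross_val_score can be run over the same stratified splits; the detailed metrics and feature importances above still need the manual loop. A short sketch, re-using the skf, X and y objects defined above:

from sklearn.model_selection import cross_val_score

# Mean accuracy across the same stratified k-fold splits
cv_accuracy = cross_val_score(
    XGBClassifier(random_state=42, verbosity=0), X, y,
    cv=skf, scoring='accuracy')
print(cv_accuracy.mean())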

Show results and feature importance#

results
accuracy                   0.820449
precision                  0.776687
recall                     0.745630
f1                         0.758968
predicted positive rate    0.368102
observed positive rate     0.383833
dtype: float64
importance_mean
male                   0.409115
Pclass                 0.151310
CabinLetter_E          0.077558
CabinNumber            0.047122
SibSp                  0.042579
Embarked_S             0.036095
Embarked_C             0.029504
CabinLetter_A          0.028807
Fare                   0.027939
Age                    0.027254
CabinLetter_C          0.024650
Parch                  0.021589
CabinLetterImputed     0.020404
AgeImputed             0.018040
CabinLetter_B          0.013238
CabinLetter_D          0.013003
Embarked_Q             0.011792
Embarked_missing       0.000000
CabinNumberImputed     0.000000
EmbarkedImputed        0.000000
CabinLetter_F          0.000000
CabinLetter_G          0.000000
CabinLetter_T          0.000000
CabinLetter_missing    0.000000
dtype: float32
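
The sorted importances are often easier to read as a bar chart. A small sketch using pandas' built-in plotting, assuming matplotlib is installed:

import matplotlib.pyplot as plt

# Bar chart of mean feature importance across the k-fold splits
importance_mean.plot(kind='bar')
plt.ylabel('Mean importance')
plt.tight_layout()
plt.show()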