XGBoost as a replacement for Random Forest#
An alternative to scikit-learn's Random Forest is XGBoost (pip install xgboost if needed). XGBoost is a gradient boosting method that, like Random Forest, builds an ensemble of decision trees; it is often faster and may give improved accuracy. Because XGBoost follows the scikit-learn estimator interface, we can use it as a drop-in replacement for Random Forest, as shown here.
This notebook repeats the basic Random Forest model, but with Random Forest replaced by XGBoost.
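As a minimal sketch of the swap (illustrative only; the full pipeline follows below), only the model class changes, because both classifiers share the scikit-learn fit/predict interface:
# Random Forest version:
#   from sklearn.ensemble import RandomForestClassifier
#   model = RandomForestClassifier()
# XGBoost version:
from xgboost import XGBClassifier
model = XGBClassifier()
# Either model is then used identically:
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)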
Load modules#
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")
Download data#
Run the following code if data for Titanic survival has not been previously downloaded.
download_required = True

if download_required:

    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
        '1804_python_healthcare/master/titanic/data/processed_data.csv'

    data = pd.read_csv(address)

    # Create a data subfolder if one does not already exist
    import os
    data_directory = './data/'
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    # Save data
    data.to_csv(data_directory + 'processed_data.csv', index=False)
Load data#
# Load data & drop passenger ID
data = pd.read_csv('data/processed_data.csv')
# Make all data 'float' type
data = data.astype(float)
data.drop('PassengerId', inplace=True, axis=1)
# Split data into two DataFrames
X_df = data.drop('Survived',axis=1)
y_df = data['Survived']
# Convert DataFrames to NumPy arrays
X = X_df.values
y = y_df.values
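As a quick sanity check on the arrays (the values shown are indicative: the Kaggle Titanic training set has 891 passengers, and the feature count depends on the preprocessing used to create the processed data):
# Quick check on array shapes
print('X shape:', X.shape)  # e.g. (891, 24)
print('y shape:', y.shape)  # e.g. (891,)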
Define function to calculate accuracy#
import numpy as np

def calculate_accuracy(observed, predicted):
    """
    Calculates a range of accuracy scores from observed and predicted classes.

    Takes two lists or NumPy arrays (observed class values, and predicted class
    values), and returns a dictionary of results.

     1) observed positive rate: proportion of observed cases that are +ve
     2) predicted positive rate: proportion of predicted cases that are +ve
     3) observed negative rate: proportion of observed cases that are -ve
     4) predicted negative rate: proportion of predicted cases that are -ve
     5) accuracy: proportion of predicted results that are correct
     6) precision: proportion of predicted +ve that are correct
     7) recall: proportion of true +ve correctly identified
     8) f1: harmonic mean of precision and recall
     9) sensitivity: same as recall
    10) specificity: proportion of true -ve identified
    11) positive likelihood: increased probability of true +ve if test +ve
    12) negative likelihood: reduced probability of true +ve if test -ve
    13) false positive rate: proportion of false +ves in true -ve patients
    14) false negative rate: proportion of false -ves in true +ve patients
    15) true positive rate: same as recall
    16) true negative rate: same as specificity
    17) positive predictive value: chance of true +ve if test +ve
    18) negative predictive value: chance of true -ve if test -ve
    """

    # Convert lists to NumPy arrays
    if type(observed) == list:
        observed = np.array(observed)
    if type(predicted) == list:
        predicted = np.array(predicted)

    # Calculate accuracy scores
    observed_positives = observed == 1
    observed_negatives = observed == 0
    predicted_positives = predicted == 1
    predicted_negatives = predicted == 0

    true_positives = (predicted_positives == 1) & (observed_positives == 1)
    false_positives = (predicted_positives == 1) & (observed_positives == 0)
    true_negatives = (predicted_negatives == 1) & (observed_negatives == 1)
    false_negatives = (predicted_negatives == 1) & (observed_negatives == 0)

    accuracy = np.mean(predicted == observed)

    precision = (np.sum(true_positives) /
                 (np.sum(true_positives) + np.sum(false_positives)))

    recall = np.sum(true_positives) / np.sum(observed_positives)

    sensitivity = recall

    f1 = 2 * ((precision * recall) / (precision + recall))

    specificity = np.sum(true_negatives) / np.sum(observed_negatives)

    positive_likelihood = sensitivity / (1 - specificity)

    negative_likelihood = (1 - sensitivity) / specificity

    false_positive_rate = 1 - specificity

    false_negative_rate = 1 - sensitivity

    true_positive_rate = sensitivity

    true_negative_rate = specificity

    positive_predictive_value = (np.sum(true_positives) /
                                 (np.sum(true_positives) + np.sum(false_positives)))

    negative_predictive_value = (np.sum(true_negatives) /
                                 (np.sum(true_negatives) + np.sum(false_negatives)))

    # Create dictionary for results, and add results
    results = dict()

    results['observed_positive_rate'] = np.mean(observed_positives)
    results['observed_negative_rate'] = np.mean(observed_negatives)
    results['predicted_positive_rate'] = np.mean(predicted_positives)
    results['predicted_negative_rate'] = np.mean(predicted_negatives)
    results['accuracy'] = accuracy
    results['precision'] = precision
    results['recall'] = recall
    results['f1'] = f1
    results['sensitivity'] = sensitivity
    results['specificity'] = specificity
    results['positive_likelihood'] = positive_likelihood
    results['negative_likelihood'] = negative_likelihood
    results['false_positive_rate'] = false_positive_rate
    results['false_negative_rate'] = false_negative_rate
    results['true_positive_rate'] = true_positive_rate
    results['true_negative_rate'] = true_negative_rate
    results['positive_predictive_value'] = positive_predictive_value
    results['negative_predictive_value'] = negative_predictive_value

    return results
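As a quick illustration, the function may be called directly on short lists of class labels (made-up values here, not the Titanic data):
# Illustrative call with made-up labels
demo_observed  = [1, 0, 1, 1, 0, 0, 1, 0]
demo_predicted = [1, 0, 0, 1, 0, 1, 1, 0]
demo_results = calculate_accuracy(demo_observed, demo_predicted)
print(demo_results['accuracy'])   # 0.75 (6 of 8 predictions correct)
print(demo_results['precision'])  # 0.75 (3 true +ve of 4 predicted +ve)
print(demo_results['recall'])     # 0.75 (3 true +ve of 4 observed +ve)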
Run the model with k-fold validation#
# Set up lists to hold results for each k-fold run
replicate_accuracy = []
replicate_precision = []
replicate_recall = []
replicate_f1 = []
replicate_predicted_positive_rate = []
replicate_observed_positive_rate = []

# Set up DataFrame for feature importances
importances = pd.DataFrame(index = list(X_df))

# Convert DataFrames to NumPy arrays
X = X_df.values
y = y_df.values

# Set up splits
number_of_splits = 10
skf = StratifiedKFold(n_splits = number_of_splits)
skf.get_n_splits(X, y)

# Loop through the k-fold splits
k_fold_count = 0
for train_index, test_index in skf.split(X, y):

    # Get X and y train/test
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Set up and fit model (verbosity=0 suppresses XGBoost messages)
    model = XGBClassifier(random_state=42, verbosity=0)
    model.fit(X_train, y_train)

    # Predict test set labels and get accuracy scores
    y_pred_test = model.predict(X_test)
    accuracy_scores = calculate_accuracy(y_test, y_pred_test)
    replicate_accuracy.append(accuracy_scores['accuracy'])
    replicate_precision.append(accuracy_scores['precision'])
    replicate_recall.append(accuracy_scores['recall'])
    replicate_f1.append(accuracy_scores['f1'])
    replicate_predicted_positive_rate.append(
        accuracy_scores['predicted_positive_rate'])
    replicate_observed_positive_rate.append(
        accuracy_scores['observed_positive_rate'])

    # Record feature importances
    col_title = 'split_' + str(k_fold_count)
    importances[col_title] = model.feature_importances_
    k_fold_count += 1

# Transfer mean results to a Series
results = pd.Series(dtype=float)
results['accuracy'] = np.mean(replicate_accuracy)
results['precision'] = np.mean(replicate_precision)
results['recall'] = np.mean(replicate_recall)
results['f1'] = np.mean(replicate_f1)
results['predicted positive rate'] = np.mean(replicate_predicted_positive_rate)
results['observed positive rate'] = np.mean(replicate_observed_positive_rate)

# Get average of feature importances, and sort
importance_mean = importances.mean(axis=1)
importance_mean.sort_values(ascending=False, inplace=True)
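As an aside, if only the headline accuracy were needed, scikit-learn's cross_val_score could replace the manual loop (a sketch only; the loop above is still needed to collect the fuller set of metrics and the feature importances):
from sklearn.model_selection import cross_val_score

# Accuracy only, averaged over the same stratified 10-fold splits
cv_scores = cross_val_score(XGBClassifier(random_state=42, verbosity=0),
                            X, y, cv=StratifiedKFold(n_splits=10))
print(cv_scores.mean())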
Show results and feature importance#
results
accuracy 0.820449
precision 0.776687
recall 0.745630
f1 0.758968
predicted positive rate 0.368102
observed positive rate 0.383833
dtype: float64
importance_mean
male 0.409115
Pclass 0.151310
CabinLetter_E 0.077558
CabinNumber 0.047122
SibSp 0.042579
Embarked_S 0.036095
Embarked_C 0.029504
CabinLetter_A 0.028807
Fare 0.027939
Age 0.027254
CabinLetter_C 0.024650
Parch 0.021589
CabinLetterImputed 0.020404
AgeImputed 0.018040
CabinLetter_B 0.013238
CabinLetter_D 0.013003
Embarked_Q 0.011792
Embarked_missing 0.000000
CabinNumberImputed 0.000000
EmbarkedImputed 0.000000
CabinLetter_F 0.000000
CabinLetter_G 0.000000
CabinLetter_T 0.000000
CabinLetter_missing 0.000000
dtype: float32
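To visualise the averaged importances, a minimal matplotlib sketch (assuming matplotlib is installed) could be:
import matplotlib.pyplot as plt

# Horizontal bar chart of mean feature importance across the k-fold splits
fig, ax = plt.subplots(figsize=(6, 6))
importance_mean.plot.barh(ax=ax)
ax.invert_yaxis()  # show the most important feature at the top
ax.set_xlabel('Mean importance')
plt.show()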