In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
In [3]:
df = pd.read_csv('model_features.csv', header = None)
cols = ['quit','player_id','experience','winner','performance','performance_session','match','session','match_duration','cum_match_duration']
dummy = ['pl_' + str(i+1) for i in range(1120)]
cols.extend( dummy )
df.columns = cols
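
A quick sanity check (illustrative, not part of the original pipeline) that the constructed column list matches the width of the file:

In [ ]:
# illustrative: 10 named columns + 1120 player dummies = 1130 columns expected
assert len(cols) == df.shape[1]
print df.shape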
In [4]:
# feature engineering
df['log_cum_match_duration'] = np.log(df['cum_match_duration'])
df['winner'] = df['winner'].astype(int)
df['match'] = df['match'].astype(int)
df['session'] = df['session'].astype(int)
df['experience'] = df['experience'].astype(int)
df['match_duration'] = df['match_duration'].astype(float)
df[dummy] = df[dummy].astype(int)
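
The log transform assumes strictly positive cumulative durations; a minimal check (a sketch, not part of the original pipeline) is shown below. If zero durations could occur, np.log1p would be a safer transform.

In [ ]:
# illustrative check: np.log yields -inf for zero durations
print (df['cum_match_duration'] <= 0).sum()
# safer alternative if zeros can occur:
# df['log_cum_match_duration'] = np.log1p(df['cum_match_duration'])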
In [5]:
print df.tail()
print len(df.index)
print len(df[df['quit'] == 0].index)
print len(df[df['quit'] == 1].index)
        quit  player_id  experience  winner  performance  performance_session  \
406201     1   52211371          44       1          0.5                  0.5   
406202     0   52211371          45       0          0.0                  0.5   
406203     1   52211371          46       1          0.5                  0.5   
406204     0   52211371          47       0          0.0                  0.0   
406205     1   52211371          48       0          0.0                  0.0   

        match  session  match_duration  cum_match_duration  \
406201      2       18          2424.0              4960.0   
406202      1       19          2378.0              2378.0   
406203      2       19          1983.0              4361.0   
406204      1       20          2004.0              2004.0   
406205      2       20          1889.0              3893.0   

                 ...            pl_1112  pl_1113  pl_1114  pl_1115  pl_1116  \
406201           ...                  0        0        0        0        0   
406202           ...                  0        0        0        0        0   
406203           ...                  0        0        0        0        0   
406204           ...                  0        0        0        0        0   
406205           ...                  0        0        0        0        0   

        pl_1117  pl_1118  pl_1119  pl_1120  log_cum_match_duration  
406201        0        0        0        1                8.509161  
406202        0        0        0        1                7.774015  
406203        0        0        0        1                8.380457  
406204        0        0        0        1                7.602900  
406205        0        0        0        1                8.266935  

[5 rows x 1131 columns]
406206
261037
145169
In [6]:
mask = np.random.rand(len(df)) < 0.9
train = df[mask]
test = df[~mask]
In [7]:
df_drop = df[df['quit'] == 1]
df_nodrop = df[df['quit'] == 0]
df_nodrop2 = df_nodrop.sample(n = len(df_drop.index)).copy()
df_balanced = pd.concat([df_drop,df_nodrop2])
mask = np.random.rand(len(df_balanced)) < 0.9
train_balanced = df_balanced[mask]
test_balanced = df_balanced[~mask]
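
The downsampling above is unseeded, so the balanced set changes on every run; an equivalent, reproducible draw using sklearn.utils.resample is sketched below (the random_state value is arbitrary).

In [ ]:
# illustrative: reproducible downsampling of the majority (no-quit) class
from sklearn.utils import resample
df_nodrop2 = resample(df_nodrop, replace=False, n_samples=len(df_drop.index), random_state=42)
df_balanced = pd.concat([df_drop, df_nodrop2])
print df_balanced['quit'].value_counts()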
In [8]:
def performance(prediction_values, test_values):
    tp = 0 # true positives
    fn = 0 # false negatives
    fp = 0 # false positives
    tn = 0 # true negatives
    
    tpr = []
    fpr = []
    
    for i,j in zip(prediction_values, test_values):
        if (i == True) and (j == True):
            tp += 1.0
        if (i == False) and (j == True):
            fn += 1.0
        if (i == True) and (j == False):
            fp += 1.0  
        if (i == False) and (j == False):
            tn += 1.0
    
    results = dict()
    
    results['precision'] = tp / (tp + fp)
    results['recall'] = tp / (tp + fn)
    results['accuracy'] = (tp + tn) / (tp + tn + fp + fn)
    
    results['F1_score'] = 2 * (results['precision'] * results['recall']) / (results['precision'] + results['recall'])
    results['nobs'] = len(prediction_values)

    return results
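
For reference, the same four metrics are available in sklearn.metrics; a minimal cross-check, assuming binary 0/1 labels, is sketched below.

In [ ]:
# illustrative cross-check of the hand-rolled metrics against sklearn.metrics
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def performance_sklearn(prediction_values, test_values):
    return {'precision': precision_score(test_values, prediction_values),
            'recall': recall_score(test_values, prediction_values),
            'F1_score': f1_score(test_values, prediction_values),
            'accuracy': accuracy_score(test_values, prediction_values),
            'nobs': len(prediction_values)}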
In [9]:
features = ['match_duration','cum_match_duration']
#features.extend(dummy) # fixed effects for each player via dummy

n_estimators = 128

Random Forest Classifier

In [10]:
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score

cv_roc_auc_rf = []
for i in range(1):
    print (i+1)
    mask = np.random.rand(len(df)) < 0.9
    train = df[mask]
    test = df[~mask]
    
    clf = RandomForestClassifier(criterion='entropy', n_estimators=n_estimators, n_jobs=-1)
    clf.fit(train[features], train['quit'])

    # Compute ROC curve and ROC area for each class
    fpr_rf, tpr_rf, _ = roc_curve(np.array(test['quit'].astype(int)), clf.predict_proba(test[features])[:, 1])
    roc_auc_rf = auc(fpr_rf, tpr_rf)
    cv_roc_auc_rf.append(roc_auc_rf) 

roc_auc_rf = np.mean(cv_roc_auc_rf)
roc_auc_rf_sd = np.std(cv_roc_auc_rf)

print roc_auc_rf
print roc_auc_rf_sd

plt.figure()
lw = 2
plt.plot(fpr_rf, tpr_rf, color='#30a2da',
         lw=lw, label='Random Forest (AUC = %0.2f)' % roc_auc_rf)
plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='dashed')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()
1
0.888601339157
0.0
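
The loop above is set up for repeated splits but runs only once (hence the 0.0 standard deviation). A 5-fold cross-validated AUC via cross_val_score is sketched below (illustrative; the number of folds is arbitrary and the run can be slow on the full data set).

In [ ]:
# illustrative: k-fold cross-validated ROC AUC for the random forest
from sklearn.model_selection import cross_val_score
clf_cv = RandomForestClassifier(criterion='entropy', n_estimators=n_estimators, n_jobs=-1)
scores = cross_val_score(clf_cv, df[features], df['quit'], cv=5, scoring='roc_auc')
print scores.mean(), scores.std()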
In [11]:
# feature importance
for feature,value in zip(features, clf.feature_importances_):
    print feature, value
match_duration 0.201652538339
cum_match_duration 0.798347461661
In [12]:
from sklearn import metrics
cv_f1 = []
cv_precision = []
cv_recall = []
cv_accuracy = []

for i in range(5):
    print (i+1)
    mask = np.random.rand(len(df_balanced)) < 0.9
    train_balanced = df_balanced[mask]
    test_balanced = df_balanced[~mask]

    clf = RandomForestClassifier(criterion='entropy', max_features=None,
                                 n_estimators=n_estimators, n_jobs=-1)
    clf.fit(train_balanced[features], train_balanced['quit'])

    res = performance(clf.predict(test_balanced[features]), test_balanced['quit'])
    cv_f1.append(res['F1_score'])
    cv_precision.append(res['precision'])
    cv_recall.append(res['recall'])
    cv_accuracy.append(res['accuracy'])
    
print np.mean(cv_f1)
print np.std(cv_f1)
print
print np.mean(cv_precision)
print np.std(cv_precision)
print
print np.mean(cv_recall)
print np.std(cv_recall)
print
print np.mean(cv_accuracy)
print np.std(cv_accuracy)
print

rf_scores = [np.mean(cv_f1), np.mean(cv_recall), np.mean(cv_precision), np.mean(cv_accuracy) ]
rf_scores_std = [np.std(cv_f1), np.std(cv_recall), np.std(cv_precision), np.std(cv_accuracy) ]
1
2
3
4
5
0.811234170185
0.00174436109279

0.773737828195
0.00158394682195

0.852555782484
0.00295747734196

0.801186108622
0.00104033797009

In [13]:
# feature importance
for feature,value in zip(features, clf.feature_importances_):
    print feature, value
match_duration 0.294575908077
cum_match_duration 0.705424091923

Gradient Boosting Classifier

In [12]:
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score

cv_roc_auc_gb = []
for i in range(1):
    print (i+1)
    mask = np.random.rand(len(df)) < 0.9
    train = df[mask]
    test = df[~mask]

    clf = GradientBoostingClassifier(n_estimators=n_estimators)
    clf.fit(train[features], train['quit'])

    # Compute ROC curve and ROC area for each class

    fpr_gb, tpr_gb, _ = roc_curve(np.array(test['quit'].astype(int)), clf.predict_proba(test[features])[:, 1])
    roc_auc_gb = auc(fpr_gb, tpr_gb)
    cv_roc_auc_gb.append(roc_auc_gb) 

roc_auc_gb = np.mean(cv_roc_auc_gb)
roc_auc_gb_sd = np.std(cv_roc_auc_gb)

print roc_auc_gb
print roc_auc_gb_sd

plt.figure()
lw = 2
plt.plot(fpr_gb, tpr_gb, color='#fc4f30',
         lw=lw, label='Gradient Boosting (AUC = %0.2f)' % roc_auc_gb)
plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='dashed')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()
1
0.804713621589
0.0
In [13]:
# feature importance
for feature,value in zip(features, clf.feature_importances_):
    print feature, value
match_duration 0.283446306584
cum_match_duration 0.716553693416
In [1]:
import numpy as np
from sklearn import metrics
cv_f1 = []
cv_precision = []
cv_recall = []
cv_accuracy = []

for i in range(5):
    print (i+1)
    mask = np.random.rand(len(df_balanced)) < 0.9
    train_balanced = df_balanced[mask]
    test_balanced = df_balanced[~mask]
    
    clf = GradientBoostingClassifier(n_estimators=n_estimators)
    clf.fit(train_balanced[features], train_balanced['quit'])

    res = performance(clf.predict(test_balanced[features]), test_balanced['quit'])
    cv_f1.append(res['F1_score'])
    cv_precision.append(res['precision'])
    cv_recall.append(res['recall'])
    cv_accuracy.append(res['accuracy'])
    
print np.mean(cv_f1)
print np.std(cv_f1)
print
print np.mean(cv_precision)
print np.std(cv_precision)
print
print np.mean(cv_recall)
print np.std(cv_recall)
print
print np.mean(cv_accuracy)
print np.std(cv_accuracy)
print

gb_scores = [np.mean(cv_f1), np.mean(cv_recall), np.mean(cv_precision), np.mean(cv_accuracy) ]
gb_scores_std = [np.std(cv_f1), np.std(cv_recall), np.std(cv_precision), np.std(cv_accuracy) ]

Adaptive Boosting Classifier

In [15]:
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score

cv_roc_auc_ab = []
for i in range(1):
    print (i+1)
    mask = np.random.rand(len(df)) < 0.9
    train = df[mask]
    test = df[~mask]


    clf = AdaBoostClassifier(n_estimators=n_estimators)
    clf.fit(train[features], train['quit'])

    # Compute ROC curve and ROC area for each class

    fpr_ab, tpr_ab, _ = roc_curve(np.array(test['quit'].astype(int)), clf.predict_proba(test[features])[:, 1])
    roc_auc_ab = auc(fpr_ab, tpr_ab)
    cv_roc_auc_ab.append(roc_auc_ab) 

roc_auc_ab = np.mean(cv_roc_auc_ab)
roc_auc_ab_sd = np.std(cv_roc_auc_ab)

print roc_auc_ab
print roc_auc_ab_sd

plt.figure()
lw = 2
plt.plot(fpr_ab, tpr_ab, color='#7A68A6',
         lw=lw, label='Adaptive Boosting (AUC = %0.2f)' % roc_auc_ab)
plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='dashed')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()
1
0.777054468449
0.0
In [ ]:
# feature importance
for feature,value in zip(features, clf.feature_importances_):
    print feature, value
match_duration 0.171875
cum_match_duration 0.828125
In [ ]:
from sklearn import metrics
cv_f1 = []
cv_precision = []
cv_recall = []
cv_accuracy = []

for i in range(5):
    print (i+1)
    mask = np.random.rand(len(df_balanced)) < 0.9
    train_balanced = df_balanced[mask]
    test_balanced = df_balanced[~mask]
    
    clf = AdaBoostClassifier(n_estimators=n_estimators)
    clf.fit(train_balanced[features], train_balanced['quit'])

    res = performance(clf.predict(test_balanced[features]), test_balanced['quit'])
    cv_f1.append(res['F1_score'])
    cv_precision.append(res['precision'])
    cv_recall.append(res['recall'])
    cv_accuracy.append(res['accuracy'])
    
print np.mean(cv_f1)
print np.std(cv_f1)
print
print np.mean(cv_precision)
print np.std(cv_precision)
print
print np.mean(cv_recall)
print np.std(cv_recall)
print
print np.mean(cv_accuracy)
print np.std(cv_accuracy)
print

ab_scores = [np.mean(cv_f1), np.mean(cv_recall), np.mean(cv_precision), np.mean(cv_accuracy) ]
ab_scores_std = [np.std(cv_f1), np.std(cv_recall), np.std(cv_precision), np.std(cv_accuracy) ]
1
2
3
4

Comparison

In [ ]:
plt.figure(figsize = (5,5))
lw = 2

plt.plot(fpr_rf, tpr_rf, color='#30a2da',
         lw=lw, ls = 'solid', label='Random Forest (AUC = %0.2f)' % roc_auc_rf)
plt.plot(fpr_gb, tpr_gb, color='#fc4f30',
         lw=lw, ls = 'dashed', label='Gradient Boosting (AUC = %0.2f)' % roc_auc_gb)
plt.plot(fpr_ab, tpr_ab, color='#7A68A6',
         lw=lw, ls = 'dotted', label='Adaptive Boosting (AUC = %0.2f)' % roc_auc_ab)
plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='dashed', label='Random Guessing (AUC = 0.50)')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize = 14)
plt.ylabel('True Positive Rate', fontsize = 14)
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right', fontsize = 10)
plt.savefig('roc_auc.pdf')
plt.show()
In [ ]:
N = 4

ind = np.arange(N)  # the x locations for the groups
width = 0.15       # the width of the bars

fig, ax = plt.subplots(figsize=(10, 5))
ax.set_position([0.1,0.1,.75,0.65])
rects1 = ax.bar(ind + width, ab_scores, width, yerr = ab_scores_std, color='#7A68A6', hatch = '/', edgecolor = 'black', ecolor = 'black')
rects2 = ax.bar(ind - width, rf_scores, width, yerr = rf_scores_std,color='#30a2da', hatch= '\\', edgecolor = 'black',ecolor = 'black')
rects3 = ax.bar(ind, gb_scores, width, yerr = gb_scores_std,color='#fc4f30', edgecolor = 'black',ecolor = 'black')

# add some text for labels, title and axes ticks
ax.set_ylabel('Scores', fontsize = 14)
ax.set_title('')
ax.set_ylim(0,1)
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(('F1-measure','Recall','Precision','Accuracy'), fontsize = 14)

ax.legend(( rects2[0], rects3[0], rects1[0]), ('Random\nForest','Gradient\nBoosting', 'Adaptive\nBoosting'), ncol = 3, loc='upper center', 
          bbox_to_anchor=(0.5, 1.25))
plt.savefig('metrics.pdf')