In [2]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
In [3]:
# Location of model_features_solo_10.csv. The value is prepended verbatim to
# the file name when the CSV is read, so include the trailing separator
# (e.g. 'data/'). The empty string reads from the current working directory.
path = ''  # INSERT DATA PATH TO model_features_solo_10.csv HERE
In [4]:
# Load the feature table. `path` is concatenated directly with the file name,
# so it has to end with a path separator (or be empty for the working dir).
df = pd.read_csv(path+'model_features_solo_10.csv')
# The CSV header is replaced wholesale; this assumes the file has exactly
# these 23 columns in this order -- TODO confirm against the data export.
cols = ['kills','deaths','assists','KDA','cum_kills','cum_assists','cum_deaths','cum_KDA','mean_kills','mean_assists','mean_deaths','mean_KDA','quit','player_id','experience','winner','performance','performance_session','match','session','match_duration','cum_match_duration','mean_match_duration']
df.columns = cols

# Per-player win rate / number of rows / number of wins, best win rate first.
player_stats = df.groupby('player_id')[['winner']].agg( ['mean','count','sum'] )
player_stats = player_stats.sort_values(by = [('winner','mean')], ascending = False)
# Single-argument print: same text under Python 2, and also valid Python 3.
print('number of users: %s' % (len(player_stats.index),))

# feature engineering
# NOTE(review): np.log yields -inf for a zero cumulative duration -- confirm
# the column is strictly positive.
df['log_cum_match_duration'] = np.log(df['cum_match_duration'])
for int_col in ['winner', 'kills', 'assists', 'deaths', 'match', 'session', 'experience']:
    df[int_col] = df[int_col].astype(int)
df['match_duration'] = df['match_duration'].astype(float)

# Random (unseeded) split: each row goes to train with probability 0.9.
mask = np.random.rand(len(df)) < 0.9
train = df[mask]
test = df[~mask]

# Balanced data set: every quit == 1 row plus an equally sized random sample
# of quit == 0 rows. (The sample used to be drawn twice, with the first draw
# immediately discarded; it is now drawn once.)
df_drop = df[df['quit'] == 1]
df_nodrop = df[df['quit'] == 0]
df_nodrop2 = df_nodrop.sample(n = len(df_drop.index)).copy()
df_balanced = pd.concat([df_drop,df_nodrop2])
mask = np.random.rand(len(df_balanced)) < 0.9
train_balanced = df_balanced[mask]
test_balanced = df_balanced[~mask]

def performance(prediction_values, test_values):
    """Compute binary-classification metrics from predicted and true labels.

    Parameters
    ----------
    prediction_values : sequence
        Predicted labels (booleans or 0/1 values). Must support ``len()``.
    test_values : iterable
        True labels, aligned element-by-element with ``prediction_values``.

    Returns
    -------
    dict
        ``'precision'``, ``'recall'``, ``'accuracy'`` and ``'F1_score'`` as
        floats, plus ``'nobs'`` (``len(prediction_values)``).

    Notes
    -----
    A metric whose denominator is zero (no positive predictions, no positive
    labels, or no observations at all) is reported as ``0.0`` instead of
    raising ``ZeroDivisionError``.
    """
    def _ratio(numerator, denominator):
        # Guarded division: undefined metrics are reported as 0.0.
        if denominator == 0:
            return 0.0
        return numerator / denominator

    # Float counters keep every division a true division on Python 2 as well.
    tp = 0.0  # true positives
    fn = 0.0  # false negatives
    fp = 0.0  # false positives
    tn = 0.0  # true negatives

    for predicted, actual in zip(prediction_values, test_values):
        # Equality (not identity) so that 0/1 integers and numpy booleans are
        # accepted; values equal to neither True nor False are not counted.
        if (predicted == True) and (actual == True):
            tp += 1.0
        elif (predicted == False) and (actual == True):
            fn += 1.0
        elif (predicted == True) and (actual == False):
            fp += 1.0
        elif (predicted == False) and (actual == False):
            tn += 1.0

    results = dict()

    results['precision'] = _ratio(tp, tp + fp)
    results['recall'] = _ratio(tp, tp + fn)
    results['accuracy'] = _ratio(tp + tn, tp + tn + fp + fn)

    results['F1_score'] = _ratio(2 * (results['precision'] * results['recall']),
                                 results['precision'] + results['recall'])
    results['nobs'] = len(prediction_values)

    return results
number of users: 5046
In [10]:
def _cv_roc_auc(make_clf, features, n_rounds=10):
    """Fit/score a classifier on repeated random splits of the global ``df``.

    Each round assigns every row of ``df`` to the training part with
    probability 0.9, fits a fresh classifier from ``make_clf()`` on it to
    predict ``'quit'`` and computes the ROC curve on the remaining rows.

    Returns ``(fpr, tpr, aucs, clf)``: the ROC points and the fitted
    classifier of the LAST round, and the list of per-round AUC values.
    """
    from sklearn.metrics import roc_curve, auc

    aucs = []
    for i in range(n_rounds):
        print(i + 1)
        mask = np.random.rand(len(df)) < 0.9
        train = df[mask]
        test = df[~mask]

        clf = make_clf()
        clf.fit(train[features], train['quit'])

        # Score with the predicted probability of the positive class.
        fpr, tpr, _ = roc_curve(np.array(test['quit'].astype(int)),
                                clf.predict_proba(test[features])[:, 1])
        aucs.append(auc(fpr, tpr))
    return fpr, tpr, aucs, clf


def _cv_balanced_scores(make_clf, features, n_rounds=10):
    """Fit/score a classifier on repeated random splits of ``df_balanced``.

    Returns ``(scores, clf)`` where ``scores`` maps each key produced by
    ``performance`` ('F1_score', 'precision', 'recall', 'accuracy') to the
    list of per-round values and ``clf`` is the classifier of the last round.
    """
    scores = {'F1_score': [], 'precision': [], 'recall': [], 'accuracy': []}
    for i in range(n_rounds):
        print(i + 1)
        mask = np.random.rand(len(df_balanced)) < 0.9
        train_balanced = df_balanced[mask]
        test_balanced = df_balanced[~mask]

        clf = make_clf()
        clf.fit(train_balanced[features], train_balanced['quit'])

        res = performance(clf.predict(test_balanced[features]), test_balanced['quit'])
        for key in scores:
            scores[key].append(res[key])
    return scores, clf


def _print_cv_scores(cv_scores, labelled):
    """Print mean then std of F1, precision, recall and accuracy, in that order."""
    for key, name in (('F1_score', 'f1'), ('precision', 'precision'),
                      ('recall', 'recall'), ('accuracy', 'accuracy')):
        mean = np.mean(cv_scores[key])
        std = np.std(cv_scores[key])
        if labelled:
            print('cv_%s_mean: %s' % (name, mean))
            # The std line used to be printed under the '*_mean' label too.
            print('cv_%s_std: %s' % (name, std))
        else:
            print(mean)
            print(std)
        print('')


def _score_summary(cv_scores):
    """Return (means, stds) ordered F1, recall, precision, accuracy (bar-chart order)."""
    order = ('F1_score', 'recall', 'precision', 'accuracy')
    means = [np.mean(cv_scores[key]) for key in order]
    stds = [np.std(cv_scores[key]) for key in order]
    return means, stds


def _plot_single_roc(fpr, tpr, color, label):
    """Show one ROC curve together with the diagonal reference line."""
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color=color, lw=lw, label=label)
    plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='dashed')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right')
    plt.show()


def feature_add(feature_list,num):
    """Evaluate three ensemble classifiers on a feature set and plot the results.

    For Random Forest, Gradient Boosting and AdaBoost (512 estimators each):

    * ROC AUC over 10 random ~90/10 splits of the module-level ``df``;
    * F1 / precision / recall / accuracy (via ``performance``) over 10 random
      ~90/10 splits of the module-level ``df_balanced``;
    * the feature importances of one fitted model.

    Progress counters and the scores are printed. A combined ROC figure is
    saved to ``'roc_auc10_model' + num + '.png'`` and a bar chart of the
    balanced-data scores to ``'metrics10_model' + num + '.png'``.

    Parameters
    ----------
    feature_list : list of str
        Column names of ``df`` / ``df_balanced`` used as model inputs.
    num : str
        Suffix for the output file names (concatenated, so it must be a str).

    Returns
    -------
    None

    Notes
    -----
    The plotted ROC curves come from the last split of each model, while the
    AUC shown in the legend is the mean over all splits. No random seed is
    set, so results vary between runs.
    """
    features = feature_list
    n_estimators = 512

    # Random Forest Classifier
    print('Random Forest Classifier')

    fpr_rf, tpr_rf, cv_roc_auc_rf, _ = _cv_roc_auc(
        lambda: RandomForestClassifier(criterion='entropy', n_estimators=n_estimators, n_jobs=-1),
        features)

    roc_auc_rf = np.mean(cv_roc_auc_rf)
    roc_auc_rf_sd = np.std(cv_roc_auc_rf)

    print('roc_auc_rf: %s' % (roc_auc_rf,))
    print('roc_auc_rf_sd: %s' % (roc_auc_rf_sd,))

    _plot_single_roc(fpr_rf, tpr_rf, '#30a2da',
                     'Random Forest (AUC = %0.2f)' % roc_auc_rf)

    cv_rf, clf = _cv_balanced_scores(
        lambda: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                  max_depth=None, max_features=None, max_leaf_nodes=None,
                  min_impurity_decrease=1e-07,
                  n_estimators=n_estimators, n_jobs=-1, oob_score=False, random_state=None,
                  verbose=0, warm_start=False),
        features)

    _print_cv_scores(cv_rf, labelled=True)
    rf_scores, rf_scores_std = _score_summary(cv_rf)

    # feature importance (model of the last balanced split)
    print('')
    print('Random Forest classifier feature importance:')
    for feature,value in zip(features, clf.feature_importances_):
        print('%s %s' % (feature, value))


    # Gradient Boosting Classifier
    print('Gradient Boosting Classifier')

    fpr_gb, tpr_gb, cv_roc_auc_gb, clf = _cv_roc_auc(
        lambda: GradientBoostingClassifier(n_estimators=n_estimators),
        features)

    roc_auc_gb = np.mean(cv_roc_auc_gb)
    roc_auc_gb_sd = np.std(cv_roc_auc_gb)

    print(roc_auc_gb)
    print(roc_auc_gb_sd)

    _plot_single_roc(fpr_gb, tpr_gb, '#fc4f30',
                     'Gradient Boosting (AUC = %0.2f)' % roc_auc_gb)

    # feature importance (model of the last split of the full data set)
    for feature,value in zip(features, clf.feature_importances_):
        print('%s %s' % (feature, value))

    cv_gb, clf = _cv_balanced_scores(
        lambda: GradientBoostingClassifier(n_estimators=n_estimators),
        features)

    _print_cv_scores(cv_gb, labelled=False)
    gb_scores, gb_scores_std = _score_summary(cv_gb)

    # Adaptive Boosting Classifier
    print('Adaptive Boosting Classifier')

    fpr_ab, tpr_ab, cv_roc_auc_ab, _ = _cv_roc_auc(
        lambda: AdaBoostClassifier(n_estimators=n_estimators),
        features)

    roc_auc_ab = np.mean(cv_roc_auc_ab)
    roc_auc_ab_sd = np.std(cv_roc_auc_ab)

    print(roc_auc_ab)
    print(roc_auc_ab_sd)

    _plot_single_roc(fpr_ab, tpr_ab, '#7A68A6',
                     'Adaptive Boosting (AUC = %0.2f)' % roc_auc_ab)

    cv_ab, clf = _cv_balanced_scores(
        lambda: AdaBoostClassifier(n_estimators=n_estimators),
        features)

    _print_cv_scores(cv_ab, labelled=False)
    ab_scores, ab_scores_std = _score_summary(cv_ab)

    # feature importance (model of the last balanced split)
    for feature,value in zip(features, clf.feature_importances_):
        print('feature importance %s %s' % (feature, value))

    # Combined ROC figure for the three models.
    plt.figure(figsize = (5,5))
    lw = 2

    plt.plot(fpr_rf, tpr_rf, color='#30a2da',
             lw=lw, ls = 'solid', label='Random Forest (AUC = %0.2f)' % roc_auc_rf)
    plt.plot(fpr_gb, tpr_gb, color='#fc4f30',
             lw=lw, ls = 'dashed', label='Gradient Boosting (AUC = %0.2f)' % roc_auc_gb)
    plt.plot(fpr_ab, tpr_ab, color='#7A68A6',
             lw=lw, ls = 'dotted', label='Adaptive Boosting (AUC = %0.2f)' % roc_auc_ab)
    plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='dashed', label='Random Guessing (AUC = 0.50)')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize = 14)
    plt.ylabel('True Positive Rate', fontsize = 14)
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right', fontsize = 10)
    plt.savefig('roc_auc10_model'+num+'.png',dpi=300)
    plt.show()


    # Grouped bar chart of the balanced-data scores.
    N = 4

    ind = np.arange(N)  # the x locations for the groups
    width = 0.15       # the width of the bars

    # A separate plt.figure(figsize=(10,5)) call used to precede this line; it
    # only opened an extra empty figure, because plt.subplots() always creates
    # its own. The chart keeps the default size it was actually drawn with.
    fig, ax = plt.subplots()
    ax.set_position([0.1,0.1,.75,0.65])
    rects1 = ax.bar(ind + width, ab_scores, width, yerr = ab_scores_std, color='#7A68A6', hatch = '/', edgecolor = 'black', ecolor = 'black')
    rects2 = ax.bar(ind - width, rf_scores, width, yerr = rf_scores_std,color='#30a2da', hatch= '\\', edgecolor = 'black',ecolor = 'black')
    rects3 = ax.bar(ind, gb_scores, width, yerr = gb_scores_std,color='#fc4f30', edgecolor = 'black',ecolor = 'black')

    # add some text for labels, title and axes ticks
    ax.set_ylabel('Scores', fontsize = 14)
    ax.set_title('')
    ax.set_ylim(0,1)
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(('F1-measure','Recall','Precision','Accuracy'), fontsize = 14)

    ax.legend(( rects2[0], rects3[0], rects1[0]), ('Random\nForest','Gradient\nBoosting', 'Adaptive\nBoosting'), ncol = 3, loc='upper center', 
              bbox_to_anchor=(0.5, 1.25))
    # The '.' before the extension was missing from this file name.
    plt.savefig('metrics10_model'+num+'.png',dpi=300)

kills, deaths, assists, session cumulated kills, session cumulated deaths, session cumulated assists, session mean kills, session mean deaths, session mean assists

In [11]:
# Model 1: match, duration, session, player and experience columns only
# (no kills / deaths / assists features).
# NOTE(review): 'player_id' is passed to the models as an ordinary feature --
# confirm that using the identifier column is intended.
num = '1'
feature_list = [
    'match',
    'match_duration',
    'cum_match_duration',
    'mean_match_duration',
    'session',
    'player_id',
    'experience',
]
feature_add(feature_list, num)
Random Forest Classifier
1
2
3
4
5
6
7
8
9
10
roc_auc_rf: 0.829909858569
roc_auc_rf_sd: 0.00265840973699
1
2
3
4
5
6
7
8
9
10
cv_f1_mean: 0.803074875532
cv_f1_mean: 0.00236567787672

cv_precision_mean: 0.708941756367
cv_precision_mean: 0.00356547416177

cv_recall_mean: 0.926048136605
cv_recall_mean: 0.00157228443682

cv_accuracy_mean: 0.772811251405
cv_accuracy_mean: 0.0018504828758


Random Forest classifier feature importance:
match 0.368183197262
match_duration 0.131065981877
cum_match_duration 0.100590175435
mean_match_duration 0.104710971406
session 0.0786417167879
player_id 0.113444215549
experience 0.103363741683
Gradient Boosting Classifier
1
2
3
4
5
6
7
8
9
10
0.837213511944
0.00156974005757
match 0.0949511999529
match_duration 0.117250491287
cum_match_duration 0.249123598312
mean_match_duration 0.0957752444685
session 0.147755918051
player_id 0.112301756247
experience 0.182841791681
1
2
3
4
5
6
7
8
9
10
0.81838303444
0.00218710248609

0.702222182206
0.00333852705982

0.980607204247
0.0011872195355

0.782926036875
0.00246964738502

Adaptive Boosting Classifier
1
2
3
4
5
6
7
8
9
10
0.836545757513
0.00260804083649
1
2
3
4
5
6
7
8
9
10
0.818169767558
0.00261765621982

0.701410084796
0.00403275697655

0.98158837481
0.00133436392919

0.782513556478
0.00271241925726

feature importance match 0.013671875
feature importance match_duration 0.080078125
feature importance cum_match_duration 0.04296875
feature importance mean_match_duration 0.0625
feature importance session 0.353515625
feature importance player_id 0.076171875
feature importance experience 0.37109375
<matplotlib.figure.Figure at 0x7fa843823150>