In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
In [2]:
# Directory that contains model_features_solo_10.csv. The value is concatenated
# directly with the file name when the CSV is read, so it must end with a path
# separator (e.g. '/data/dota/'); the empty string means "current working directory".
path = ''  # INSERT DATA PATH TO model_features_solo_10.csv HERE
In [3]:
# Load the per-match feature table and assign canonical column names.
# NOTE(review): this overwrites whatever header the CSV carries, so it assumes the
# file has exactly these 23 columns in exactly this order -- confirm against the export.
df = pd.read_csv(path+'model_features_solo_10.csv')
cols = ['kills','deaths','assists','KDA','cum_kills','cum_assists','cum_deaths','cum_KDA','mean_kills','mean_assists','mean_deaths','mean_KDA','quit','player_id','experience','winner','performance','performance_session','match','session','match_duration','cum_match_duration','mean_match_duration']
df.columns = cols

# Per-player win rate / match count / win count, best win rate first.
player_stats = df.groupby('player_id')[['winner']].agg( ['mean','count','sum'] )
player_stats = player_stats.sort_values(by = [('winner','mean')], ascending = False)
# Single formatted print: same text as before, and valid under both Python 2 and 3.
print('number of users: %d' % len(player_stats.index))

# feature engineering
# NOTE(review): np.log yields -inf for a zero cum_match_duration -- verify the column is strictly positive.
df['log_cum_match_duration'] = np.log(df['cum_match_duration'])
for int_col in ['winner', 'kills', 'assists', 'deaths', 'match', 'session', 'experience']:
    df[int_col] = df[int_col].astype(int)
df['match_duration'] = df['match_duration'].astype(float)

# Random ~90/10 split of the full (imbalanced) data.
# NOTE(review): no random seed is set anywhere, so every run draws different splits.
mask = np.random.rand(len(df)) < 0.9
train = df[mask]
test = df[~mask]

# Balanced data set: every 'quit' row plus an equally sized random sample of
# non-quit rows. (The sample was previously drawn twice, the first draw being
# discarded immediately; it is now drawn once.)
df_drop = df[df['quit'] == 1]
df_nodrop = df[df['quit'] == 0]
df_nodrop2 = df_nodrop.sample(n = len(df_drop.index)).copy()
df_balanced = pd.concat([df_drop,df_nodrop2])

# Random ~90/10 split of the balanced data.
mask = np.random.rand(len(df_balanced)) < 0.9
train_balanced = df_balanced[mask]
test_balanced = df_balanced[~mask]

def performance(prediction_values, test_values):
    """Compute binary-classification metrics from paired predictions and labels.

    Parameters
    ----------
    prediction_values : sized iterable
        Predicted labels; entries equal to True/1 count as positive and entries
        equal to False/0 as negative. Entries equal to neither are ignored.
    test_values : iterable
        Ground-truth labels, compared the same way and paired positionally
        with ``prediction_values``.

    Returns
    -------
    dict
        Keys ``'precision'``, ``'recall'``, ``'accuracy'``, ``'F1_score'``
        (floats) and ``'nobs'`` (``len(prediction_values)``). A ratio whose
        denominator is zero (e.g. precision when nothing was predicted
        positive) is reported as 0.0 instead of raising ZeroDivisionError.
    """
    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0 rather than crashing the caller.
        return numerator / denominator if denominator else 0.0

    tp = 0.0 # true positives
    fn = 0.0 # false negatives
    fp = 0.0 # false positives
    tn = 0.0 # true negatives

    # Equality (not truthiness) is used on purpose so that only 0/1-like values
    # are counted, exactly as before.
    for predicted, actual in zip(prediction_values, test_values):
        if (predicted == True) and (actual == True):
            tp += 1.0
        elif (predicted == False) and (actual == True):
            fn += 1.0
        elif (predicted == True) and (actual == False):
            fp += 1.0
        elif (predicted == False) and (actual == False):
            tn += 1.0

    results = dict()

    results['precision'] = _ratio(tp, tp + fp)
    results['recall'] = _ratio(tp, tp + fn)
    results['accuracy'] = _ratio(tp + tn, tp + tn + fp + fn)

    results['F1_score'] = _ratio(2 * (results['precision'] * results['recall']),
                                 results['precision'] + results['recall'])
    results['nobs'] = len(prediction_values)

    return results
number of users: 5046
In [4]:
def _cv_roc_auc(make_clf, features, n_splits):
    """Fit `make_clf()` on `n_splits` random ~90/10 splits of `df`; return the last split's ROC curve, the mean/sd AUC and the last fitted model."""
    from sklearn.metrics import roc_curve, auc

    fold_aucs = []
    for i in range(n_splits):
        print(i + 1)
        mask = np.random.rand(len(df)) < 0.9
        train = df[mask]
        test = df[~mask]

        clf = make_clf()
        clf.fit(train[features], train['quit'])

        # Score with the predicted probability of the positive ('quit') class.
        fpr, tpr, _ = roc_curve(np.array(test['quit'].astype(int)),
                                clf.predict_proba(test[features])[:, 1])
        fold_aucs.append(auc(fpr, tpr))

    return fpr, tpr, np.mean(fold_aucs), np.std(fold_aucs), clf


def _cv_balanced_scores(make_clf, features, n_splits):
    """Fit `make_clf()` on `n_splits` random ~90/10 splits of `df_balanced`; return ([F1, recall, precision, accuracy] means, same for sds, last fitted model)."""
    cv_f1 = []
    cv_precision = []
    cv_recall = []
    cv_accuracy = []

    for i in range(n_splits):
        print(i + 1)
        mask = np.random.rand(len(df_balanced)) < 0.9
        train_balanced = df_balanced[mask]
        test_balanced = df_balanced[~mask]

        clf = make_clf()
        clf.fit(train_balanced[features], train_balanced['quit'])

        res = performance(clf.predict(test_balanced[features]), test_balanced['quit'])
        cv_f1.append(res['F1_score'])
        cv_precision.append(res['precision'])
        cv_recall.append(res['recall'])
        cv_accuracy.append(res['accuracy'])

    # Order matches the tick labels of the summary bar chart.
    means = [np.mean(cv_f1), np.mean(cv_recall), np.mean(cv_precision), np.mean(cv_accuracy)]
    stds = [np.std(cv_f1), np.std(cv_recall), np.std(cv_precision), np.std(cv_accuracy)]
    return means, stds, clf


def _print_cv_scores(means, stds):
    """Print mean and sd of F1, precision, recall and accuracy, each pair followed by a blank line."""
    f1_mean, recall_mean, precision_mean, accuracy_mean = means
    f1_sd, recall_sd, precision_sd, accuracy_sd = stds
    for name, mean, sd in [('f1', f1_mean, f1_sd),
                           ('precision', precision_mean, precision_sd),
                           ('recall', recall_mean, recall_sd),
                           ('accuracy', accuracy_mean, accuracy_sd)]:
        print('cv_%s_mean: %s' % (name, mean))
        print('cv_%s_sd: %s' % (name, sd))
        print('')


def _print_feature_importance(header, features, clf):
    """Print `header`, then one '<feature> <importance>' line per feature of the fitted model."""
    print(header)
    for feature, value in zip(features, clf.feature_importances_):
        print('%s %s' % (feature, value))


def _plot_roc(fpr, tpr, mean_auc, color, model_name):
    """Show one model's ROC curve against the chance diagonal."""
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color=color,
             lw=lw, label='%s (AUC = %0.2f)' % (model_name, mean_auc))
    plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='dashed')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right')
    plt.show()


def feature_add(feature_list, num, n_estimators=512, n_splits=10):
    """Evaluate Random Forest, Gradient Boosting and AdaBoost on a feature set.

    For each model the function
      1. estimates ROC AUC over `n_splits` random ~90/10 splits of the global
         `df` and shows the ROC curve, and
      2. estimates F1 / precision / recall / accuracy over `n_splits` random
         ~90/10 splits of the global `df_balanced` (via `performance`),
    printing the mean and standard deviation of every metric together with the
    model's feature importances. Finally it saves a combined ROC figure to
    'roc_auc10_model<num>.png' and a grouped bar chart of the balanced-data
    scores to 'metrics10_model<num>.png' in the current working directory.

    Parameters
    ----------
    feature_list : list of str
        Column names of `df` / `df_balanced` used as model inputs.
    num : str or int
        Model identifier embedded in the output file names.
    n_estimators : int, optional
        Number of estimators for every ensemble (default 512).
    n_splits : int, optional
        Number of random splits per evaluation (default 10); must be >= 1.

    Returns
    -------
    None

    Notes
    -----
    * The plotted ROC curves come from the LAST split only, while the AUC in
      the legend is the mean over all splits.
    * Feature importances are taken from the last balanced-data model for
      Random Forest and AdaBoost, but from the last full-data (ROC) model for
      Gradient Boosting; this asymmetry is kept from the original analysis.
    * The NumPy RNG is not seeded here, so results vary from run to run.
    """
    features = feature_list
    num = str(num)  # allow an int identifier as well as a string

    # ---- Random Forest -------------------------------------------------
    print('Random Forest Classifier')
    fpr_rf, tpr_rf, roc_auc_rf, roc_auc_rf_sd, _ = _cv_roc_auc(
        lambda: RandomForestClassifier(criterion='entropy', n_estimators=n_estimators, n_jobs=-1),
        features, n_splits)
    print('roc_auc_rf: %s' % roc_auc_rf)
    print('roc_auc_rf_sd: %s' % roc_auc_rf_sd)
    _plot_roc(fpr_rf, tpr_rf, roc_auc_rf, '#30a2da', 'Random Forest')

    # The balanced-data forest considers all features at every split
    # (max_features=None), unlike the ROC forest above.
    rf_scores, rf_scores_std, clf = _cv_balanced_scores(
        lambda: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                                       max_depth=None, max_features=None, max_leaf_nodes=None,
                                       min_impurity_decrease=1e-07,
                                       n_estimators=n_estimators, n_jobs=-1, oob_score=False,
                                       random_state=None, verbose=0, warm_start=False),
        features, n_splits)
    _print_cv_scores(rf_scores, rf_scores_std)
    _print_feature_importance('Random Forest classifier feature importance:', features, clf)

    # ---- Gradient Boosting ---------------------------------------------
    print('Gradient Boosting Classifier')
    fpr_gb, tpr_gb, roc_auc_gb, roc_auc_gb_sd, clf = _cv_roc_auc(
        lambda: GradientBoostingClassifier(n_estimators=n_estimators),
        features, n_splits)
    print('roc_auc_gb: %s' % roc_auc_gb)
    print('roc_auc_gb_sd: %s' % roc_auc_gb_sd)
    _plot_roc(fpr_gb, tpr_gb, roc_auc_gb, '#fc4f30', 'Gradient Boosting')
    # Importances of the last full-data model (see Notes in the docstring).
    _print_feature_importance('Gradient Boosting classifier feature importance:', features, clf)

    gb_scores, gb_scores_std, _ = _cv_balanced_scores(
        lambda: GradientBoostingClassifier(n_estimators=n_estimators),
        features, n_splits)
    _print_cv_scores(gb_scores, gb_scores_std)

    # ---- Adaptive Boosting ---------------------------------------------
    print('Adaptive Boosting Classifier')
    fpr_ab, tpr_ab, roc_auc_ab, roc_auc_ab_sd, _ = _cv_roc_auc(
        lambda: AdaBoostClassifier(n_estimators=n_estimators),
        features, n_splits)
    print('roc_auc_ab: %s' % roc_auc_ab)
    print('roc_auc_ab_sd: %s' % roc_auc_ab_sd)
    _plot_roc(fpr_ab, tpr_ab, roc_auc_ab, '#7A68A6', 'Adaptive Boosting')

    ab_scores, ab_scores_std, clf = _cv_balanced_scores(
        lambda: AdaBoostClassifier(n_estimators=n_estimators),
        features, n_splits)
    _print_cv_scores(ab_scores, ab_scores_std)
    _print_feature_importance('Adaptive Boosting classifier feature importance:', features, clf)

    # ---- Combined ROC figure -------------------------------------------
    plt.figure(figsize = (5,5))
    lw = 2

    plt.plot(fpr_rf, tpr_rf, color='#30a2da',
             lw=lw, ls = 'solid', label='Random Forest (AUC = %0.2f)' % roc_auc_rf)
    plt.plot(fpr_gb, tpr_gb, color='#fc4f30',
             lw=lw, ls = 'dashed', label='Gradient Boosting (AUC = %0.2f)' % roc_auc_gb)
    plt.plot(fpr_ab, tpr_ab, color='#7A68A6',
             lw=lw, ls = 'dotted', label='Adaptive Boosting (AUC = %0.2f)' % roc_auc_ab)
    plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='dashed', label='Random Guessing (AUC = 0.50)')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize = 14)
    plt.ylabel('True Positive Rate', fontsize = 14)
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right', fontsize = 10)
    plt.savefig('roc_auc10_model'+num+'.png',dpi=300)
    plt.show()

    # ---- Grouped bar chart of balanced-data scores ---------------------
    N = 4

    ind = np.arange(N)  # the x locations for the groups
    width = 0.15       # the width of the bars

    # plt.subplots() creates its own figure; the separate plt.figure() call that
    # used to precede it only produced an extra, empty figure.
    fig, ax = plt.subplots()
    ax.set_position([0.1,0.1,.75,0.65])
    rects1 = ax.bar(ind + width, ab_scores, width, yerr = ab_scores_std, color='#7A68A6', hatch = '/', edgecolor = 'black', ecolor = 'black')
    rects2 = ax.bar(ind - width, rf_scores, width, yerr = rf_scores_std,color='#30a2da', hatch= '\\', edgecolor = 'black',ecolor = 'black')
    rects3 = ax.bar(ind, gb_scores, width, yerr = gb_scores_std,color='#fc4f30', edgecolor = 'black',ecolor = 'black')

    # add some text for labels, title and axes ticks
    ax.set_ylabel('Scores', fontsize = 14)
    ax.set_title('')
    ax.set_ylim(0,1)
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(('F1-measure','Recall','Precision','Accuracy'), fontsize = 14)

    ax.legend(( rects2[0], rects3[0], rects1[0]), ('Random\nForest','Gradient\nBoosting', 'Adaptive\nBoosting'), ncol = 3, loc='upper center', 
              bbox_to_anchor=(0.5, 1.25))
    # The extension separator was missing here ('...png' instead of '.png').
    plt.savefig('metrics10_model'+num+'.png',dpi=300)
In [5]:
# Feature set for model 2: match/session context, player identity and experience,
# followed by per-match, cumulative and mean kill/death/assist statistics.
feature_list = (
    ['match', 'match_duration', 'cum_match_duration', 'mean_match_duration']
    + ['session', 'player_id', 'experience']
    + ['kills', 'deaths', 'assists']
    + ['cum_kills', 'cum_deaths', 'cum_assists']
    + ['mean_kills', 'mean_deaths', 'mean_assists']
)
num = '2'  # identifier embedded in the saved figure names
feature_add(feature_list, num)
Random Forest Classifier
1
2
3
4
5
6
7
8
9
10
roc_auc_rf: 0.827111032245
roc_auc_rf_sd: 0.00331446835434
1
2
3
4
5
6
7
8
9
10
cv_f1_mean: 0.813003489728
cv_f1_mean: 0.00237901626301

cv_precision_mean: 0.702639470168
cv_precision_mean: 0.00406854659904

cv_recall_mean: 0.964528553621
cv_recall_mean: 0.00242927373463

cv_accuracy_mean: 0.779073633963
cv_accuracy_mean: 0.00257291263612

Random Forest classifier feature importance:
match 0.364308282822
match_duration 0.0692089313264
cum_match_duration 0.0461620043543
mean_match_duration 0.0544880194381
session 0.0462422155079
player_id 0.0611320155431
experience 0.0599043377386
kills 0.0336994015271
deaths 0.029879850764
assists 0.0370191356337
cum_kills 0.0304117307699
cum_deaths 0.0284417199014
cum_assists 0.0325821267825
mean_kills 0.0350038991275
mean_deaths 0.0339885944659
mean_assists 0.0375277342971
Gradient Boosting Classifier
1
2
3
4
5
6
7
8
9
10
0.839307806542
0.00133576976105
match 0.0803812941366
match_duration 0.0780370294786
cum_match_duration 0.140597308131
mean_match_duration 0.0441140120723
session 0.138937522239
player_id 0.0906111538118
experience 0.140533104694
kills 0.0315520388863
deaths 0.020432148178
assists 0.0326173562074
cum_kills 0.045931212307
cum_deaths 0.0450314990806
cum_assists 0.0425699078541
mean_kills 0.0238024919343
mean_deaths 0.0250136894996
mean_assists 0.0198382314888
1
2
3
4
5
6
7
8
9
10
0.819311328608
0.00209776102967

0.704431400507
0.00297772471899

0.978972257099
0.00101367642341

0.783295221957
0.00206321685459

Adaptive Boosting Classifier
1
2
3
4
5
6
7
8
9
10
0.836312055381
0.00222304676143
1
2
3
4
5
6
7
8
9
10
0.817685824124
0.00226557780253

0.701037888951
0.00342699170771

0.980916361612
0.000766146877472

0.781723245739
0.0025034958691

feature importance match 0.0078125
feature importance match_duration 0.05859375
feature importance cum_match_duration 0.021484375
feature importance mean_match_duration 0.0390625
feature importance session 0.333984375
feature importance player_id 0.0625
feature importance experience 0.341796875
feature importance kills 0.01171875
feature importance deaths 0.0078125
feature importance assists 0.0078125
feature importance cum_kills 0.0234375
feature importance cum_deaths 0.01171875
feature importance cum_assists 0.021484375
feature importance mean_kills 0.015625
feature importance mean_deaths 0.015625
feature importance mean_assists 0.01953125
<matplotlib.figure.Figure at 0x7f02358ace50>