League of Legends

Paper Notebook

In [1]:
%matplotlib inline
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import networkx as nx
from collections import Counter
import itertools
import math
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
import ncp
from sktensor import dtensor, cp_als, tucker
import sktensor
import tensorflow as tf
from sklearn import linear_model
In [2]:
root = 'alessandro' # change this

# Absolute, user-specific location of the data directory, built from `root`.
path = '/Users/' + root + '/Dropbox/mitigate/lol/data/'

# read data in chunks
# The CSV has no header row (header = None); chunksize is the number of rows per chunk.
# NOTE(review): growing `df` with one concat per chunk re-copies every row read so far
# on each iteration. Before collapsing this into a single pd.concat, check that the
# resulting column dtypes are unchanged: the next cell cuts the last two characters
# off the stringified id columns, which presumably relies on them being floats ('....0').
df = pd.DataFrame()
reader = pd.read_csv(path + 'structured_data_new.csv', header = None, low_memory = False, chunksize = 100 * 1024)
for chunk in reader:
    df = pd.concat([df, chunk])
In [3]:
cols = ['player_id', 'match_id', 'match_datetime', 'match_creation','match_duration','map_id',
df.columns = cols
def _strip_float_suffix(text):
    """Return `text` without a trailing '.0'; any other string is returned unchanged."""
    return text[:-2] if text.endswith('.0') else text

# The id columns are turned into strings and the float artefact '.0' is removed
# (presumably the ids were parsed as floats -- verify against the raw file).
# Only a literal '.0' suffix is stripped, so ids that are already integer-formatted
# keep all of their digits.
for id_column in ['player_id', 'match_id', 'map_id', 'champion_id', 'team_id']:
    df[id_column] = df[id_column].astype(str).apply(_strip_float_suffix)
In [4]:
# Keep only players with at least `threshold` matches.
threshold = 100 # minimum number of matches per user

match_counts = df['player_id'].value_counts()
active_ids = match_counts[match_counts >= threshold].index

print('number of users: %d' % len(active_ids))
players = pd.DataFrame(active_ids, columns = ['id'])

# Restrict the match table to the retained players.
df = df[df['player_id'].isin(players['id'])]
number of users: 1120
In [5]:
# Number of distinct matches left after the player filter.
print('number of matches: %d' % df['match_id'].nunique())
number of matches: 435118
In [6]:
# Per player: mean, count and sum of the `winner` column, ordered by mean (descending)
# and restricted to players with at least `threshold` rows.
win_summary = df.groupby('player_id')[['winner']].agg(['mean', 'count', 'sum'])
win_summary = win_summary.sort_values(by = [('winner', 'mean')], ascending = False)
has_enough_matches = win_summary[('winner', 'count')] >= threshold
player_stats = win_summary[has_enough_matches]
print(player_stats.head())
               mean count  sum
60412503   0.641379   290  186
21262790   0.630952   336  212
21491589   0.630252   119   75
48013821   0.628571   105   66
34999026   0.627451   102   64
In [7]:
# player_stats['performance_20'] = np.zeros(len(player_stats.index))
# player_stats['performance_50'] = np.zeros(len(player_stats.index))
# player_stats['performance_100'] = np.zeros(len(player_stats.index))

# for player in player_stats.index:
#     matches = list(df[df['player_id'] == player]['winner'])
#     first_20 = np.mean(matches[:19])
#     first_50 = np.mean(matches[:49])
#     first_100 = np.mean(matches[:99])
#     player_stats.loc[player,'performance_20'] = first_20
#     player_stats.loc[player,'performance_50'] = first_50
#     player_stats.loc[player,'performance_100'] = first_100
In [8]:
# x = player_stats['performance_20']
# y = player_stats[('winner','count')]
# print np.corrcoef(x[:-1],y[:-1])

# plt.scatter(x, y, color = '#30a2da', alpha = .75)
# plt.title('')
# plt.ylim(0,2500)
# plt.xlim(0,1)
# plt.xlabel('performance in the first 20 matches')
# plt.ylabel('total matches')
# plt.show()
In [9]:
# x = player_stats['performance_50']
# y = player_stats[('winner','count')]
# print np.corrcoef(x[:-1],y[:-1])

# plt.scatter(x, y, color = '#30a2da', alpha = .75)
# plt.title('')
# plt.ylim(0,2500)
# plt.xlim(0,1)
# plt.xlabel('performance in the first 50 matches')
# plt.ylabel('total matches')
# plt.show()
In [10]:
# x = player_stats['performance_100']
# y = player_stats[('winner','count')]
# print np.corrcoef(x[:-1],y[:-1])

# plt.scatter(x, y, color = '#30a2da', alpha = .75)
# plt.title('')
# plt.ylim(0,2500)
# plt.xlim(0,1)
# plt.xlabel('performance in the first 100 matches')
# plt.ylabel('total matches')
# plt.show()
In [11]:
# player_stats.columns = player_stats.columns.get_level_values(0)
# player_stats.columns = ['mean','count','sum','performance_20','performance_50','performance_100']
# print player_stats.head()
In [12]:
# Histogram of the per-player match counts, followed by their median and mean.
data = np.array(player_stats[('winner','count')].astype(float))
plt.hist(data, bins = 50, histtype='stepfilled', alpha = .75, color = '#30a2da')
for summary in (np.median, np.mean):
    print(summary(data))
In [58]:
fig = plt.figure(figsize = (10,5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)

font_size = 16

matches_played = player_stats[('winner','count')]
win_rate = player_stats[('winner','mean')]

# Left panel: win rate against number of matches (log2 x axis).
ax1.scatter(matches_played, win_rate, marker = 'o', alpha = .25, color = '#30a2da')
ax1.set_xlabel('matches', fontsize=font_size)
ax1.set_ylabel('performance', fontsize=font_size)
# NOTE(review): newer Matplotlib releases spell this keyword `base` instead of `basex`.
ax1.set_xscale('log', basex=2)
# xmin/xmax of axhline are fractions of the axes width in [0, 1], not data values;
# the defaults already draw the reference line across the whole panel.
ax1.axhline(y = .5, linewidth=2, color = 'k', ls = 'dashed')

# Right panel: distribution of the per-player win rate.
data = np.array(win_rate.astype(float))
ax2.hist(data, bins = 20, histtype = 'stepfilled', alpha = .75, color = '#30a2da')
ax2.set_xlabel('performance', fontsize=font_size)
ax2.set_ylabel('counts', fontsize=font_size)
# Same remark for axvline: ymin/ymax are axes fractions, so the defaults are used.
ax2.axvline(x = .5, linewidth=2, color = 'k', ls = 'dashed')

print(np.mean(data))
print(np.std(data))

# Pearson correlation between matches played and win rate.
cor = np.corrcoef(matches_played, win_rate)
print(cor)
[[ 1.          0.02210729]
 [ 0.02210729  1.        ]]

Gaming Sessions

In [14]:
# computing aggregate interarrival times
# For each player: (start of match k+1 - start of match k) - duration of match k,
# pooled over all players.
# assumes each player's rows are in chronological order and match_duration is in
# seconds (so the result is in hours) -- TODO confirm
inter_arrival_times = []
matches_by_player = df.groupby('player_id')
for player_id in player_stats.index:
    temp = matches_by_player.get_group(player_id)
    start_times = pd.to_datetime(temp['match_datetime'])
    # Plain arrays make the subtraction positional: gap k is paired with the
    # duration of the earlier match, independent of how the pandas version in use
    # aligns index labels. 3600.0 keeps the division in floating point.
    gaps = start_times.diff().dt.total_seconds().values[1:] / 3600.0
    durations = temp['match_duration'].values[:-1] / 3600.0
    inter_arrival_times.extend(gaps - durations)
In [15]:
# Median of the pooled gaps; `m` is the session-break threshold used by the cells below.
m = np.median(inter_arrival_times)
print(m)
In [16]:
class Sequence(object):
    """A player id together with the list of that player's sessions.

    Parameters
    ----------
    player_id : optional
        Identifier of the player; defaults to None so that `Sequence()` followed
        by attribute assignment keeps working.
    sequences : iterable, optional
        Initial sessions; copied into a new list. Defaults to an empty list.
    """
    def __init__(self, player_id = None, sequences = None):
        self.player_id = player_id
        # Always a fresh list, so instances never share session storage.
        self.sequences = list(sequences) if sequences is not None else []
In [17]:
# compute original sessions
# For every player, walk over consecutive match pairs and close a session whenever
# the idle gap between the two matches reaches the threshold `m`.
sequences = []
for player_id in player_stats.index:
    old_id = 0  # row position at which the currently open session starts
    obj = Sequence()
    obj.player_id = player_id
    # This player's rows, renumbered from 0; the old index column is dropped.
    temp = df[df['player_id'] == player_id].reset_index(0).drop('index', 1)
    for i in range(1,len(temp.index)):
        # Label slice on the renumbered index: covers rows i-1 and i.
        dates = pd.to_datetime(temp['match_datetime'].ix[i - 1 : i])
        diff = dates.diff().astype('timedelta64[s]') / 3600
        duration = temp['match_duration'].ix[i - 1 : i] / 3600
        # Difference of the two start times minus the earlier match's duration.
        # NOTE(review): presumably relies on np.subtract pairing the two one-element
        # Series by position rather than by index label -- confirm for the pandas version in use.
        delta = float(np.subtract(diff[1:], duration[:-1]))
        if delta >= m:
            # Rows old_id..i-1 are the session that just ended; the next one starts at i.
            df_copy = temp.ix[old_id:i-1].copy()
            old_id = i
            if len(df_copy.index) > 0:
                # NOTE(review): the body of this `if` is missing from this copy of the notebook.

In [18]:
# compute randomized index sessions
# Same session splitting as for the original sessions, but the rows inside each
# session are shuffled before use.
randomized_sequences = []
for player_id in player_stats.index:
    old_id = 0  # row position at which the currently open session starts
    obj = Sequence()
    obj.player_id = player_id
    # This player's rows, renumbered from 0; the old index column is dropped.
    temp = df[df['player_id'] == player_id].reset_index(0).drop('index', 1)
    for i in range(1,len(temp.index)):
        # Label slice on the renumbered index: covers rows i-1 and i.
        dates = pd.to_datetime(temp['match_datetime'].ix[i - 1 : i])
        diff = dates.diff().astype('timedelta64[s]') / 3600
        duration = temp['match_duration'].ix[i - 1 : i] / 3600
        # Difference of the two start times minus the earlier match's duration.
        # NOTE(review): presumably relies on positional (not label-aligned) subtraction -- confirm.
        delta = float(np.subtract(diff[1:], duration[:-1]))
        if delta >= m:
            df_copy = temp.ix[old_id:i-1].copy()
            # Shuffle the rows of the session; no random_state is given, so the
            # order differs between runs.
            df_copy = df_copy.sample(frac=1).reset_index(drop=True)
            old_id = i
            if len(df_copy.index) > 0:
                # NOTE(review): the body of this `if` is missing from this copy of the notebook.

In [19]:
### code to generate figures
from collections import OrderedDict
def sequence_analysis(sequence_length, sequences, feature, verbose = True):
    """Summarise `feature` by match position over all sessions of a given length.

    Parameters
    ----------
    sequence_length : int
        Only sessions with exactly this many rows are used.
    sequences : iterable
        Objects exposing a `.sequences` list of DataFrames, one frame per session.
    feature : str
        Column to aggregate, e.g. 'winner'.
    verbose : bool
        When true, print the statistics of every position.

    Returns
    -------
    OrderedDict
        Maps the 1-based position (as a string) to a dict with 'mu' (mean),
        'sigma' (population standard deviation), 'n' (number of observations)
        and 'ci' (1.96 * sigma / sqrt(n)). Entries are ordered numerically by
        position. A position without observations gets n = 0 and NaN statistics.
    """
    positions = range(1, sequence_length + 1)
    data = dict((str(position), []) for position in positions)
    for seq in sequences:
        for session in seq.sequences:
            if len(session.index) != sequence_length:
                continue
            # Read the column once per session and pair values with positions
            # by row order, whatever the frame's index labels are.
            for position, value in zip(positions, session[feature].values):
                data[str(position)].append(value)

    # Filled in numeric position order, so '10' comes after '9' rather than after '1'.
    stats = OrderedDict()
    for position in positions:
        key = str(position)
        observations = data[key]
        n = len(observations)
        if n:
            mu = np.mean(observations)
            sigma = np.std(observations)
            ci = 1.96 * sigma / np.sqrt(n)
        else:
            mu = sigma = ci = float('nan')
        stats[key] = dict(mu = mu, sigma = sigma, n = n, ci = ci)

        if verbose:
            print('Session = %s | Sequence length = %s' % (position, sequence_length))
            print('mu = %s' % mu)
            print('sigma = %s' % sigma)
            print('n obs = %s' % n)
            print('n ci = %s' % ci)
            print('-' * 40)
    return stats
In [20]:
In [22]:
All players

In [26]:
feature = 'winner'

# Line style, marker and colour for session lengths 1..5.
ls = ['solid','solid','dashed','dashdot','dotted']
sym = ['o','*','s','v','^']
col = ['#30a2da', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b']

fig = plt.figure(figsize = (10,5))
orig = fig.add_subplot(121)
rand = fig.add_subplot(122, sharey = orig)

for panel in (orig, rand):
    panel.set_xlabel('match position', fontsize=font_size)
    panel.set_ylabel('performance', fontsize=font_size)
    # Only the tick positions are passed: depending on the Matplotlib version the
    # second positional argument of set_xticks is `minor` (older releases) or
    # `labels`, and the default labels already read 1..5.
    panel.set_xticks(np.arange(1,6))

rand_bar_y = []
bar_x = []
bar_y = []

# One curve per session length i = 1..5 on each panel (original vs randomized sessions).
for i in range(1,6):
    stats = sequence_analysis(i, sequences, feature, verbose = False)
    rand_stats = sequence_analysis(i, randomized_sequences, feature, verbose = False)
    x = range(1,i+1)
    y = []
    err = []
    rand_y = []
    rand_err = []
    # NOTE(review): both loops below have no body in this copy of the notebook, so
    # y, err, rand_y and rand_err are never filled in the visible code; likewise
    # nothing visible appends to bar_x.
    for item in stats:
    for item in stats:
    # alternative visualization
    # First and last position only (used by the commented-out errorbar calls below).
    xx = [x[0], x[-1]]
    yy = [y[0], y[-1]]
    eerr = [err[0], err[-1]]
    rand_yy = [rand_y[0], rand_y[-1]]
    rand_eerr = [rand_err[0], rand_err[-1]]
    # change plot data
    # Relative change (in percent) from the first to the last position.
    bar_y.append((y[-1] - y[0]) / float(y[0]) * 100)
    rand_bar_y.append( (rand_y[-1] - rand_y[0]) / float(rand_y[0]) * 100 )
#     orig.errorbar(xx, yy, yerr = eerr, fmt = sym[len(y)-1], linestyle = ls[len(y)-1], color = col[len(y)-1], 
#                 lw = 3, markersize = 8, capthick = 3, capsize = 6)
#     rand.errorbar(xx, rand_yy, yerr = rand_eerr, fmt = sym[len(y)-1], linestyle = ls[len(y)-1], color = col[len(y)-1],
#                 lw = 3, markersize = 8, capthick = 3, capsize = 6)

    # Marker, line style and colour are picked by the number of points in the curve.
    orig.errorbar(x, y, yerr = err, fmt = sym[len(y)-1], linestyle = ls[len(y)-1], color = col[len(y)-1], 
                lw = 3, markersize = 10, capthick = 3, capsize = 6)
    rand.errorbar(x, rand_y, yerr = rand_err, fmt = sym[len(y)-1], linestyle = ls[len(y)-1], color = col[len(y)-1],
                lw = 3, markersize = 10, capthick = 3, capsize = 6)

# orig.legend([str(i) + ' matches' for i in range(1,6)], 
#            loc = 'upper right', shadow = True, fancybox = True)
# Only the right-hand panel carries the legend and a title.
rand.legend([str(i) + ' matches' for i in range(1,6)], 
           loc = 'lower right', shadow = True, fancybox = True)

#orig.set_title('original sessions', fontsize=font_size)
rand.set_title('randomized index sessions', fontsize=font_size)
In [27]:
# visualize change (%)
fig = plt.figure(figsize = (5,5))
# fig.subplots_adjust(wspace=.25)
both = fig.add_subplot(111)

both.plot(bar_x, bar_y, color = '#30a2da', marker = 'o', lw = 2, label = 'original sessions')
both.plot(bar_x, rand_bar_y, color = '#fc4f30', marker = 's', lw = 2, label = 'randomized index sessions', ls = 'dashed')

both.set_ylabel(r'$\Delta \% $' + 'performance', fontsize = font_size)
both.set_xlabel('session length', fontsize = font_size)
# xmin/xmax of axhline are axes fractions in [0, 1]; the defaults span the whole panel.
both.axhline(y = 0, linewidth=2, color = 'k', ls = 'dotted')
# Tick positions only: a second positional argument is `minor` in older Matplotlib releases.
both.set_xticks(bar_x)
both.legend(loc = 'lower left', fontsize = 12)

# Change per session length: original sessions first, then the randomized ones.
for changes in (bar_y, rand_bar_y):
    for length, change in zip(bar_x, changes):
        print('matches %s -> %s' % (round(length, 2), round(change, 2)))
matches 1.0 -> 0.0
matches 2.0 -> -1.06
matches 3.0 -> -8.98
matches 4.0 -> -9.83
matches 5.0 -> -12.78

matches 1.0 -> 0.0
matches 2.0 -> -0.95
matches 3.0 -> -0.16
matches 4.0 -> -1.57
matches 5.0 -> 1.89

Above 95th percentile

In [28]:
# compute original sessions
# Players whose match count is at or above the 95th percentile of all match counts.
player_stats_top = player_stats[player_stats[('winner','count')] >= np.percentile(player_stats[('winner','count')], 95)]

sequences_top = []
for player_id in player_stats_top.index:
    old_id = 0  # row position at which the currently open session starts
    obj = Sequence()
    obj.player_id = player_id
    # This player's rows, renumbered from 0; the old index column is dropped.
    temp = df[df['player_id'] == player_id].reset_index(0).drop('index', 1)
    for i in range(1,len(temp.index)):
        # Label slice on the renumbered index: covers rows i-1 and i.
        dates = pd.to_datetime(temp['match_datetime'].ix[i - 1 : i])
        diff = dates.diff().astype('timedelta64[s]') / 3600
        duration = temp['match_duration'].ix[i - 1 : i] / 3600
        # Difference of the two start times minus the earlier match's duration.
        # NOTE(review): presumably relies on positional (not label-aligned) subtraction -- confirm.
        delta = float(np.subtract(diff[1:], duration[:-1]))
        if delta >= m:
            # Rows old_id..i-1 are the session that just ended; the next one starts at i.
            df_copy = temp.ix[old_id:i-1].copy()
            old_id = i
            if len(df_copy.index) > 0:
                # NOTE(review): the body of this `if` is missing from this copy of the notebook.

In [29]:
# compute randomized index sessions
# Same splitting as for sequences_top, with the rows of each session shuffled.
randomized_sequences_top = []
for player_id in player_stats_top.index:
    old_id = 0  # row position at which the currently open session starts
    obj = Sequence()
    obj.player_id = player_id
    # This player's rows, renumbered from 0; the old index column is dropped.
    temp = df[df['player_id'] == player_id].reset_index(0).drop('index', 1)
    for i in range(1,len(temp.index)):
        # Label slice on the renumbered index: covers rows i-1 and i.
        dates = pd.to_datetime(temp['match_datetime'].ix[i - 1 : i])
        diff = dates.diff().astype('timedelta64[s]') / 3600
        duration = temp['match_duration'].ix[i - 1 : i] / 3600
        # Difference of the two start times minus the earlier match's duration.
        # NOTE(review): presumably relies on positional (not label-aligned) subtraction -- confirm.
        delta = float(np.subtract(diff[1:], duration[:-1]))
        if delta >= m:
            df_copy = temp.ix[old_id:i-1].copy()
            # Shuffle the rows of the session; unseeded, so the order differs between runs.
            df_copy = df_copy.sample(frac=1).reset_index(drop=True)
            old_id = i
            if len(df_copy.index) > 0:
                # NOTE(review): the body of this `if` is missing from this copy of the notebook.

In [30]:
feature = 'winner'

# Line style, marker and colour for session lengths 1..5.
ls = ['solid','solid','dashed','dashdot','dotted']
sym = ['o','*','s','v','^']
col = ['#30a2da', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b']

fig = plt.figure(figsize = (10,5))
orig = fig.add_subplot(121)
rand = fig.add_subplot(122, sharey = orig)

for panel in (orig, rand):
    panel.set_xlabel('match position')
    # Only the tick positions are passed: depending on the Matplotlib version the
    # second positional argument of set_xticks is `minor` (older releases) or
    # `labels`, and the default labels already read 1..5.
    panel.set_xticks(np.arange(1,6))

rand_bar_y = []
bar_x = []
bar_y = []

# One curve per session length i = 1..5 on each panel, drawn from first to last position only.
for i in range(1,6):
    stats = sequence_analysis(i, sequences_top, feature, verbose = False)
    rand_stats = sequence_analysis(i, randomized_sequences_top, feature, verbose = False)
    x = range(1,i+1)
    y = []
    err = []
    rand_y = []
    rand_err = []
    # NOTE(review): both loops below have no body in this copy of the notebook, so
    # y, err, rand_y and rand_err are never filled in the visible code; likewise
    # nothing visible appends to bar_x.
    for item in stats:
    for item in stats:
    # alternative visualization
    # Keep only the first and the last position of each curve.
    xx = [x[0], x[-1]]
    yy = [y[0], y[-1]]
    eerr = [err[0], err[-1]]
    rand_yy = [rand_y[0], rand_y[-1]]
    rand_eerr = [rand_err[0], rand_err[-1]]
    # change plot data
    # Relative change (in percent) from the first to the last position.
    bar_y.append((y[-1] - y[0]) / float(y[0]) * 100)
    rand_bar_y.append( (rand_y[-1] - rand_y[0]) / float(rand_y[0]) * 100 )
    # Marker, line style and colour are picked by the number of points in the curve.
    orig.errorbar(xx, yy, yerr = eerr, fmt = sym[len(y)-1], linestyle = ls[len(y)-1], color = col[len(y)-1], 
                lw = 3, markersize = 12, capthick = 3, capsize = 6)
    rand.errorbar(xx, rand_yy, yerr = rand_eerr, fmt = sym[len(y)-1], linestyle = ls[len(y)-1], color = col[len(y)-1],
                lw = 3, markersize = 12, capthick = 3, capsize = 6)

orig.legend([str(i) + ' matches' for i in range(1,6)], 
           loc = 'upper right', shadow = True, fancybox = True)
rand.legend([str(i) + ' matches' for i in range(1,6)], 
           loc = 'lower right', shadow = True, fancybox = True)

orig.set_title('original sessions')
rand.set_title('randomized index sessions')
#plt.savefig('sessions_plos/' + feature + '.png')
In [31]:
# Keep this cohort's results under their own names: bar_y / rand_bar_y are
# rebound to new lists by the next cohort's cells.
above_bar_y = bar_y
above_rand_bar_y = rand_bar_y

# visualize change (%)
fig = plt.figure(figsize = (6,3))
both = fig.add_subplot(111)

both.plot(bar_x, bar_y, color = '#30a2da', marker = 'o', label = 'original sessions')
both.plot(bar_x, rand_bar_y, color = '#fc4f30', marker = 's', label = 'randomized index sessions', ls = 'dashed')

both.set_title('Above 95th percentile Experience')
both.set_ylabel(r'$\Delta \% $' + 'performance')
both.set_xlabel('session length')
# xmin/xmax of axhline are axes fractions in [0, 1]; the defaults span the whole panel.
both.axhline(y = 0, linewidth=2, color = 'k', ls = 'dotted')
# Tick positions only: a second positional argument is `minor` in older Matplotlib releases.
both.set_xticks(bar_x)
both.legend(loc = 'lower left')

# Change per session length: original sessions first, then the randomized ones.
for changes in (bar_y, rand_bar_y):
    for length, change in zip(bar_x, changes):
        print('matches %s -> %s' % (round(length, 2), round(change, 2)))
matches 1.0 -> 0.0
matches 2.0 -> -2.47
matches 3.0 -> -9.08
matches 4.0 -> -11.64
matches 5.0 -> -10.77

matches 1.0 -> 0.0
matches 2.0 -> -0.39
matches 3.0 -> 1.28
matches 4.0 -> 1.33
matches 5.0 -> 0.12

Below 5th percentile

In [32]:
# compute original sessions
# Players whose match count is at or below the 5th percentile of all match counts.
player_stats_worst = player_stats[player_stats[('winner','count')] <= np.percentile(player_stats[('winner','count')], 5)]

sequences_worst = []
for player_id in player_stats_worst.index:
    old_id = 0  # row position at which the currently open session starts
    obj = Sequence()
    obj.player_id = player_id
    # This player's rows, renumbered from 0; the old index column is dropped.
    temp = df[df['player_id'] == player_id].reset_index(0).drop('index', 1)
    for i in range(1,len(temp.index)):
        # Label slice on the renumbered index: covers rows i-1 and i.
        dates = pd.to_datetime(temp['match_datetime'].ix[i - 1 : i])
        diff = dates.diff().astype('timedelta64[s]') / 3600
        duration = temp['match_duration'].ix[i - 1 : i] / 3600
        # Difference of the two start times minus the earlier match's duration.
        # NOTE(review): presumably relies on positional (not label-aligned) subtraction -- confirm.
        delta = float(np.subtract(diff[1:], duration[:-1]))
        if delta >= m:
            # Rows old_id..i-1 are the session that just ended; the next one starts at i.
            df_copy = temp.ix[old_id:i-1].copy()
            old_id = i
            if len(df_copy.index) > 0:
                # NOTE(review): the body of this `if` is missing from this copy of the notebook.

In [33]:
# compute randomized index sessions
# Same splitting as for sequences_worst, with the rows of each session shuffled.
randomized_sequences_worst = []
for player_id in player_stats_worst.index:
    old_id = 0  # row position at which the currently open session starts
    obj = Sequence()
    obj.player_id = player_id
    # This player's rows, renumbered from 0; the old index column is dropped.
    temp = df[df['player_id'] == player_id].reset_index(0).drop('index', 1)
    for i in range(1,len(temp.index)):
        # Label slice on the renumbered index: covers rows i-1 and i.
        dates = pd.to_datetime(temp['match_datetime'].ix[i - 1 : i])
        diff = dates.diff().astype('timedelta64[s]') / 3600
        duration = temp['match_duration'].ix[i - 1 : i] / 3600
        # Difference of the two start times minus the earlier match's duration.
        # NOTE(review): presumably relies on positional (not label-aligned) subtraction -- confirm.
        delta = float(np.subtract(diff[1:], duration[:-1]))
        if delta >= m:
            df_copy = temp.ix[old_id:i-1].copy()
            # Shuffle the rows of the session; unseeded, so the order differs between runs.
            df_copy = df_copy.sample(frac=1).reset_index(drop=True)
            old_id = i
            if len(df_copy.index) > 0:
                # NOTE(review): the body of this `if` is missing from this copy of the notebook.

In [34]:
feature = 'winner'

# Line style, marker and colour for session lengths 1..5.
ls = ['solid','solid','dashed','dashdot','dotted']
sym = ['o','*','s','v','^']
col = ['#30a2da', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b']

fig = plt.figure(figsize = (10,5))
orig = fig.add_subplot(121)
rand = fig.add_subplot(122, sharey = orig)

for panel in (orig, rand):
    panel.set_xlabel('match position')
    # Only the tick positions are passed: depending on the Matplotlib version the
    # second positional argument of set_xticks is `minor` (older releases) or
    # `labels`, and the default labels already read 1..5.
    panel.set_xticks(np.arange(1,6))

rand_bar_y = []
bar_x = []
bar_y = []

# One curve per session length i = 1..5 on each panel, drawn from first to last position only.
for i in range(1,6):
    stats = sequence_analysis(i, sequences_worst, feature, verbose = False)
    rand_stats = sequence_analysis(i, randomized_sequences_worst, feature, verbose = False)
    x = range(1,i+1)
    y = []
    err = []
    rand_y = []
    rand_err = []
    # NOTE(review): both loops below have no body in this copy of the notebook, so
    # y, err, rand_y and rand_err are never filled in the visible code; likewise
    # nothing visible appends to bar_x.
    for item in stats:
    for item in stats:
    # alternative visualization
    # Keep only the first and the last position of each curve.
    xx = [x[0], x[-1]]
    yy = [y[0], y[-1]]
    eerr = [err[0], err[-1]]
    rand_yy = [rand_y[0], rand_y[-1]]
    rand_eerr = [rand_err[0], rand_err[-1]]
    # change plot data
    # Relative change (in percent) from the first to the last position.
    bar_y.append((y[-1] - y[0]) / float(y[0]) * 100)
    rand_bar_y.append( (rand_y[-1] - rand_y[0]) / float(rand_y[0]) * 100 )
    # Marker, line style and colour are picked by the number of points in the curve.
    orig.errorbar(xx, yy, yerr = eerr, fmt = sym[len(y)-1], linestyle = ls[len(y)-1], color = col[len(y)-1], 
                lw = 3, markersize = 12, capthick = 3, capsize = 6)
    rand.errorbar(xx, rand_yy, yerr = rand_eerr, fmt = sym[len(y)-1], linestyle = ls[len(y)-1], color = col[len(y)-1],
                lw = 3, markersize = 12, capthick = 3, capsize = 6)

orig.legend([str(i) + ' matches' for i in range(1,6)], 
           loc = 'upper right', shadow = True, fancybox = True)
rand.legend([str(i) + ' matches' for i in range(1,6)], 
           loc = 'lower right', shadow = True, fancybox = True)

orig.set_title('original sessions')
rand.set_title('randomized index sessions')
#plt.savefig('sessions_plos/' + feature + '.png')
In [35]:
# Keep this cohort's results under their own names for the combined figure.
below_bar_y = bar_y
below_rand_bar_y = rand_bar_y

# visualize change (%)
fig = plt.figure(figsize = (6,3))
both = fig.add_subplot(111)

both.plot(bar_x, bar_y, color = '#30a2da', marker = 'o', label = 'original sessions')
both.plot(bar_x, rand_bar_y, color = '#fc4f30', marker = 's', label = 'randomized index sessions', ls = 'dashed')

both.set_title('Below 5th percentile Experience')
both.set_ylabel(r'$\Delta \% $' + 'performance')
both.set_xlabel('session length')
# xmin/xmax of axhline are axes fractions in [0, 1]; the defaults span the whole panel.
both.axhline(y = 0, linewidth=2, color = 'k', ls = 'dotted')
# Tick positions only: a second positional argument is `minor` in older Matplotlib releases.
both.set_xticks(bar_x)
#both.legend(loc = 'lower left')

# Change per session length: original sessions first, then the randomized ones.
for changes in (bar_y, rand_bar_y):
    for length, change in zip(bar_x, changes):
        print('matches %s -> %s' % (round(length, 2), round(change, 2)))
matches 1.0 -> 0.0
matches 2.0 -> -11.58
matches 3.0 -> -22.71
matches 4.0 -> -25.0
matches 5.0 -> -32.26

matches 1.0 -> 0.0
matches 2.0 -> -1.11
matches 3.0 -> 6.93
matches 4.0 -> -11.54
matches 5.0 -> -3.7
In [59]:
fig = plt.figure(figsize = (10,5))
above = fig.add_subplot(121)
below = fig.add_subplot(122)

# Side-by-side bars per session length: original sessions (left bar) against
# randomized index sessions (right bar, hatched), one panel per cohort.
bar_centres = np.array(bar_x)
cohorts = [
    (above, 'High Experience Players', above_bar_y, above_rand_bar_y),
    (below, 'Low Experience Players', below_bar_y, below_rand_bar_y),
]
for panel, title, observed, shuffled in cohorts:
    panel.bar(bar_centres - .1, observed, color = '#30a2da', width = .25, label = 'original sessions')
    panel.bar(bar_centres + .1, shuffled, color = '#fc4f30', hatch = '//', width = .25, label = 'randomized index sessions')

    panel.set_title(title, fontsize = font_size)
    panel.set_ylabel(r'$\Delta \% $ performance', fontsize = font_size)
    panel.set_xlabel('session length', fontsize = font_size)
    # xmin/xmax of axhline are axes fractions in [0, 1]; the defaults span the whole panel.
    panel.axhline(y = 0, linewidth=2, color = 'k', ls = 'dashed')
    # Tick positions only: a second positional argument is `minor` in older Matplotlib releases.
    panel.set_xticks(bar_x)

# Only the left panel carries the legend.
above.legend(loc = 'lower left')


Session length analysis

In [60]:
# One np.mean(session_length) value per entry of sequences_top.
top_mean_sessions = []
for player in range(len(sequences_top)):
    session_length = []
    for seq in sequences_top[player].sequences:
        if len(seq.index) > 0:
            # NOTE(review): the body of this `if` is missing from this copy of the
            # notebook; in the visible code session_length is never filled.
    top_mean_sessions.append( np.mean(session_length) )

# Same computation for the entries of sequences_worst.
worst_mean_sessions = []
for player in range(len(sequences_worst)):
    session_length = []
    for seq in sequences_worst[player].sequences:
        if len(seq.index) > 0:
            # NOTE(review): `if` body missing here as well.
    worst_mean_sessions.append( np.mean(session_length) )

# boxplot
from scipy.stats import wilcoxon
# NOTE(review): wilcoxon is a signed-rank test for paired samples, while the two
# groups here are different players; the [:-1] presumably only equalises the two
# lengths. Consider an unpaired test such as scipy.stats.mannwhitneyu.
test = wilcoxon(worst_mean_sessions[:-1], top_mean_sessions)
print(test)
# The `else` keeps the two verdicts mutually exclusive.
if test.pvalue > 0.05:
    print('No significant difference :(')
else:
    print('Significant difference detected :)')

fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot([worst_mean_sessions, top_mean_sessions], patch_artist=True, widths = .75)
ax.set_xticklabels(['Low Experience\nPlayers', 'High Experience\nPlayers'])
ax.set_ylabel('average session length')

for box in bp['boxes']:
    # outline first, then the fill, applied as two separate calls
    box.set( color='black', linewidth=2)
    box.set( facecolor = 'white', alpha = .75 )

# whiskers, caps and medians: colour per part, common line width
for part, line_colour in [('whiskers', '#333333'), ('caps', '#333333'), ('medians', '#fc4f30')]:
    for line in bp[part]:
        line.set(color=line_colour, linewidth=2)

# outlier markers
for flier in bp['fliers']:
    flier.set(marker='s', markerfacecolor='#fc4f30')
WilcoxonResult(statistic=410.0, pvalue=0.0015511026381292636)
Significant difference detected :)

Considering only the first 100 matches

In [122]:
# One np.mean(session_length) value per entry of sequences_top.
# NOTE(review): [:100] keeps the first 100 elements of `.sequences`, i.e. 100
# session frames, whereas the section heading speaks of the first 100 matches -- confirm.
top_mean_sessions = []
for player in range(len(sequences_top)):
    session_length = []
    for seq in sequences_top[player].sequences[:100]:
        if len(seq.index) > 0:
            # NOTE(review): the body of this `if` is missing from this copy of the
            # notebook; in the visible code session_length is never filled.
    top_mean_sessions.append( np.mean(session_length) )

# Same computation for the entries of sequences_worst.
worst_mean_sessions = []
for player in range(len(sequences_worst)):
    session_length = []
    for seq in sequences_worst[player].sequences[:100]:
        if len(seq.index) > 0:
            # NOTE(review): `if` body missing here as well.
    worst_mean_sessions.append( np.mean(session_length) )

# boxplot
from scipy.stats import wilcoxon
# NOTE(review): wilcoxon is a signed-rank test for paired samples, while the two
# groups here are different players; the [:-1] presumably only equalises the two
# lengths. Consider an unpaired test such as scipy.stats.mannwhitneyu.
test = wilcoxon(worst_mean_sessions[:-1], top_mean_sessions)
print(test)
# The `else` keeps the two verdicts mutually exclusive.
if test.pvalue > 0.05:
    print('No significant difference :(')
else:
    print('Significant difference detected :)')

fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot([worst_mean_sessions, top_mean_sessions], patch_artist=True, widths = .75)
ax.set_xticklabels(['Below 5th percentile\nExperience', 'Above 95th percentile\nExperience'])
ax.set_ylabel('average session length')

for box in bp['boxes']:
    # outline first, then the fill, applied as two separate calls
    box.set( color='black', linewidth=2)
    box.set( facecolor = 'white', alpha = .75 )

# whiskers, caps and medians: colour per part, common line width
for part, line_colour in [('whiskers', '#333333'), ('caps', '#333333'), ('medians', '#fc4f30')]:
    for line in bp[part]:
        line.set(color=line_colour, linewidth=2)

# outlier markers
for flier in bp['fliers']:
    flier.set(marker='s', markerfacecolor='#fc4f30')
WilcoxonResult(statistic=457.0, pvalue=0.0054097364279446069)
Significant difference detected :)

Considering only the first 50 matches

In [48]:
# One np.mean(session_length) value per entry of sequences_top.
# NOTE(review): [:50] keeps the first 50 elements of `.sequences`, i.e. 50
# session frames, whereas the section heading speaks of the first 50 matches -- confirm.
top_mean_sessions = []
for player in range(len(sequences_top)):
    session_length = []
    for seq in sequences_top[player].sequences[:50]:
        if len(seq.index) > 0:
            # NOTE(review): the body of this `if` is missing from this copy of the
            # notebook; in the visible code session_length is never filled.
    top_mean_sessions.append( np.mean(session_length) )

# Same computation for the entries of sequences_worst.
worst_mean_sessions = []
for player in range(len(sequences_worst)):
    session_length = []
    for seq in sequences_worst[player].sequences[:50]:
        if len(seq.index) > 0:
            # NOTE(review): `if` body missing here as well.
    worst_mean_sessions.append( np.mean(session_length) )

# boxplot
from scipy.stats import wilcoxon
# NOTE(review): wilcoxon is a signed-rank test for paired samples, while the two
# groups here are different players; the [:-1] presumably only equalises the two
# lengths. Consider an unpaired test such as scipy.stats.mannwhitneyu.
test = wilcoxon(worst_mean_sessions[:-1], top_mean_sessions)
print(test)
# The `else` keeps the two verdicts mutually exclusive.
if test.pvalue > 0.05:
    print('No significant difference :(')
else:
    print('Significant difference detected :)')

fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot([worst_mean_sessions, top_mean_sessions], patch_artist=True, widths = .75)
ax.set_xticklabels(['Below 5th percentile\nExperience', 'Above 95th percentile\nExperience'])
ax.set_ylabel('average session length')

for box in bp['boxes']:
    # outline first, then the fill, applied as two separate calls
    box.set( color='#333333', linewidth=2)
    box.set( facecolor = '#30a2da', alpha = .75 )

# whiskers, caps and medians share one colour and line width in this figure
for part in ['whiskers', 'caps', 'medians']:
    for line in bp[part]:
        line.set(color='#333333', linewidth=2)

# outlier markers
for flier in bp['fliers']:
    flier.set(marker='s', markerfacecolor='#fc4f30', alpha=0.5)
WilcoxonResult(statistic=491.0, pvalue=0.012269831767843761)
Significant difference detected :)

Considering only the first 20 matches

In [61]:
# One np.mean(session_length) value per entry of sequences_top.
# NOTE(review): [:20] keeps the first 20 elements of `.sequences`, i.e. 20
# session frames, whereas the section heading speaks of the first 20 matches -- confirm.
top_mean_sessions = []
for player in range(len(sequences_top)):
    session_length = []
    for seq in sequences_top[player].sequences[:20]:
        if len(seq.index) > 0:
            # NOTE(review): the body of this `if` is missing from this copy of the
            # notebook; in the visible code session_length is never filled.
    top_mean_sessions.append( np.mean(session_length) )
# Same computation for the entries of sequences_worst.
worst_mean_sessions = []
for player in range(len(sequences_worst)):
    session_length = []
    for seq in sequences_worst[player].sequences[:20]:
        if len(seq.index) > 0:
            # NOTE(review): `if` body missing here as well.
    worst_mean_sessions.append( np.mean(session_length) )
# boxplot
from scipy.stats import wilcoxon
# NOTE(review): wilcoxon is a signed-rank test for paired samples, while the two
# groups here are different players; the [:-1] presumably only equalises the two
# lengths. Consider an unpaired test such as scipy.stats.mannwhitneyu.
test = wilcoxon(worst_mean_sessions[:-1], top_mean_sessions)
print(test)
# The `else` keeps the two verdicts mutually exclusive.
if test.pvalue > 0.05:
    print('No significant difference :(')
else:
    print('Significant difference detected :)')

fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot([worst_mean_sessions, top_mean_sessions], patch_artist=True, widths = .75)
ax.set_xticklabels(['Low Experience\nPlayers', 'High Experience\nPlayers'])
ax.set_ylabel('average session length')

for box in bp['boxes']:
    # outline first, then the fill, applied as two separate calls
    box.set( color='black', linewidth=2)
    box.set( facecolor = 'white' )

# whiskers, caps and medians: colour per part, common line width
for part, line_colour in [('whiskers', '#333333'), ('caps', '#333333'), ('medians', '#fc4f30')]:
    for line in bp[part]:
        line.set(color=line_colour, linewidth=2)

# outlier markers
for flier in bp['fliers']:
    flier.set(marker='s', markerfacecolor='#fc4f30')
WilcoxonResult(statistic=491.5, pvalue=0.030661742201220407)
Significant difference detected :)
