League of Legends

Paper Notebook

In [1]:
%matplotlib inline
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#plt.style.use('bessi')
import statsmodels.api as sm
import networkx as nx
from collections import Counter
import itertools
import math
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
import ncp
from sktensor import dtensor, cp_als, tucker
import sktensor
import tensorflow as tf
from sklearn import linear_model
In [2]:
root = 'alessandro' # change this

path = '/Users/' + root + '/Dropbox/mitigate/lol/data/'

# read data in chunks
df = pd.DataFrame()
reader = pd.read_csv(path + 'structured_data_new.csv', header = None, low_memory = False, chunksize = 100 * 1024)
for chunk in reader:
    df = pd.concat([df, chunk])
In [3]:
cols = ['player_id', 'match_id', 'match_datetime', 'match_creation','match_duration','map_id',
'champion_id','team_id','assists','deaths','champ_level','winner','combat_player_score','double_kills',
'gold_earned','gold_spent','inhibitor_kills','killing_sprees','kills','largest_critical_strike','largest_killing_spree',
'largest_multi_kill','magic_damage_dealt','magic_damage_dealt_to_champions','magic_damage_taken','minions_killed',
'neutral_minions_killed','neutral_minions_killed_enemy_jungle','neutral_minions_killed_team_jungle',
'objective_player_score','penta_kills','physical_damage_dealt','physical_damage_dealt_to_champions','physical_damage_taken',
'quadra_kills','total_damage_dealt','total_damage_dealt_to_champions','total_damage_taken','total_heal',
'total_player_score','total_score_rank','total_time_crowd_control_dealt','total_units_healed','triple_kills',
'true_damage_dealt','true_damage_dealt_to_champions','true_damage_taken','unreal_kills','vision_wards_bought_in_game',
'wards_killed','wards_placed']
df.columns = cols
df['player_id'] = df['player_id'].astype(str).apply(lambda x: x[:-2])
df['match_id'] = df['match_id'].astype(str).apply(lambda x: x[:-2])
df['map_id'] = df['map_id'].astype(str).apply(lambda x: x[:-2])
df['champion_id'] = df['champion_id'].astype(str).apply(lambda x: x[:-2])
df['team_id'] = df['team_id'].astype(str).apply(lambda x: x[:-2])
In [4]:
players = df['player_id'].value_counts()

threshold = 100 # minimum number of matches per user

print 'number of users:',
print len(players[players >= threshold])
players = pd.DataFrame(players[players >= threshold].index, columns = ['id'])

df = df[df['player_id'].isin(players['id'])]
number of users: 1120
In [5]:
print 'number of matches:',
print len(df.groupby(['match_id'])['match_id'].nunique().index)
number of matches: 435118
In [6]:
player_stats = df.groupby('player_id')[['winner']].agg( ['mean','count','sum'] )
player_stats = player_stats.sort_values(by = [('winner','mean')], ascending = False)
player_stats = player_stats[player_stats[('winner','count')] >= threshold]
print player_stats.head()
             winner           
               mean count  sum
player_id                     
60412503   0.641379   290  186
21262790   0.630952   336  212
21491589   0.630252   119   75
48013821   0.628571   105   66
34999026   0.627451   102   64
In [7]:
# player_stats['performance_20'] = np.zeros(len(player_stats.index))
# player_stats['performance_50'] = np.zeros(len(player_stats.index))
# player_stats['performance_100'] = np.zeros(len(player_stats.index))

# for player in player_stats.index:
#     matches = list(df[df['player_id'] == player]['winner'])
#     first_20 = np.mean(matches[:19])
#     first_50 = np.mean(matches[:49])
#     first_100 = np.mean(matches[:99])
#     player_stats.loc[player,'performance_20'] = first_20
#     player_stats.loc[player,'performance_50'] = first_50
#     player_stats.loc[player,'performance_100'] = first_100
In [ ]:
 
In [8]:
# x = player_stats['performance_20']
# y = player_stats[('winner','count')]
# print np.corrcoef(x[:-1],y[:-1])

# plt.scatter(x, y, color = '#30a2da', alpha = .75)
# plt.title('')
# plt.ylim(0,2500)
# plt.xlim(0,1)
# plt.xlabel('performance in the first 20 matches')
# plt.ylabel('total matches')
# plt.show()
In [9]:
# x = player_stats['performance_50']
# y = player_stats[('winner','count')]
# print np.corrcoef(x[:-1],y[:-1])

# plt.scatter(x, y, color = '#30a2da', alpha = .75)
# plt.title('')
# plt.ylim(0,2500)
# plt.xlim(0,1)
# plt.xlabel('performance in the first 50 matches')
# plt.ylabel('total matches')
# plt.show()
In [10]:
# x = player_stats['performance_100']
# y = player_stats[('winner','count')]
# print np.corrcoef(x[:-1],y[:-1])

# plt.scatter(x, y, color = '#30a2da', alpha = .75)
# plt.title('')
# plt.ylim(0,2500)
# plt.xlim(0,1)
# plt.xlabel('performance in the first 100 matches')
# plt.ylabel('total matches')
# plt.show()
In [11]:
# player_stats.columns = player_stats.columns.get_level_values(0)
# player_stats.columns = ['mean','count','sum','performance_20','performance_50','performance_100']
# print player_stats.head()
# player_stats.to_csv('player_stats_performance.csv')
In [12]:
# histogram
data = np.array(player_stats[('winner','count')].astype(float))
plt.hist(data, bins = 50, histtype='stepfilled', alpha = .75, color = '#30a2da')
plt.xlabel('matches')
plt.ylabel('counts')
plt.xlim(0,2500)
plt.savefig('distribution_matches.pdf')
print np.median(data)
print np.mean(data)
270.0
528.536607143
In [58]:
fig = plt.figure(figsize = (10,5))
fig.subplots_adjust(wspace=.25)
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)


font_size = 16
# ax1
ax1.scatter(player_stats[('winner','count')], player_stats[('winner','mean')], marker = 'o', alpha = .25, color = '#30a2da')
ax1.set_xlabel('matches', fontsize=font_size)
ax1.set_ylabel('performance', fontsize=font_size)
ax1.set_ylim(0,1)
#ax1.set_xlim(0,2500)
ax1.set_xscale('log', basex=2)
ax1.axhline(y = .5, xmin=0, xmax=2500, linewidth=2, color = 'k', ls = 'dashed')

#ax2
data = np.array(player_stats[('winner','mean')].astype(float))
ax2.hist(data, bins = 20, histtype = 'stepfilled', alpha = .75, color = '#30a2da')
ax2.set_xlabel('performance', fontsize=font_size)
ax2.set_ylabel('counts', fontsize=font_size)
ax2.set_xlim(0,1)
ax2.axvline(x = .5, ymin=0, ymax=300, linewidth=2, color = 'k', ls = 'dashed')

plt.savefig('performance_scatterplot.pdf')

print np.mean(data)
print np.std(data)

cor = np.corrcoef(player_stats[('winner','count')], player_stats[('winner','mean')])
print cor
0.505362330253
0.0352840610324
[[ 1.          0.02210729]
 [ 0.02210729  1.        ]]

Gaming Sessions

In [14]:
# computing aggregate interarrival times
inter_arrival_times = []
for player_id in player_stats.index:
    temp = df[df['player_id'] == player_id]
    datetime = pd.to_datetime(temp['match_datetime'])
    inter = datetime.diff().astype('timedelta64[s]')[1:] / 3600
    duration = temp['match_duration'] / 3600
    inter_arrival_times.extend(np.subtract(inter, duration[:-1]))
    break
In [15]:
print np.median(inter_arrival_times)
m = np.median(inter_arrival_times)
0.255833333333
In [16]:
class Sequence(object):
    def __init__(self):
        self.player_id = None
        self.sequences = list()
In [17]:
# compute original sessions
sequences = []
for player_id in player_stats.index:
    old_id = 0
    obj = Sequence()
    obj.player_id = player_id
    temp = df[df['player_id'] == player_id].reset_index(0).drop('index', 1)
    for i in range(1,len(temp.index)):
        dates = pd.to_datetime(temp['match_datetime'].ix[i - 1 : i])
        diff = dates.diff().astype('timedelta64[s]') / 3600
        duration = temp['match_duration'].ix[i - 1 : i] / 3600
        delta = float(np.subtract(diff[1:], duration[:-1]))
        
        if delta >= m:
            df_copy = temp.ix[old_id:i-1].copy()
            old_id = i
            if len(df_copy.index) > 0:
                obj.sequences.append(df_copy)

    sequences.append(obj)
In [18]:
# compute randomized index sessions
randomized_sequences = []
for player_id in player_stats.index:
    old_id = 0
    obj = Sequence()
    obj.player_id = player_id
    temp = df[df['player_id'] == player_id].reset_index(0).drop('index', 1)
    for i in range(1,len(temp.index)):
        dates = pd.to_datetime(temp['match_datetime'].ix[i - 1 : i])
        diff = dates.diff().astype('timedelta64[s]') / 3600
        duration = temp['match_duration'].ix[i - 1 : i] / 3600
        delta = float(np.subtract(diff[1:], duration[:-1]))
        
        if delta >= m:
            df_copy = temp.ix[old_id:i-1].copy()
            df_copy = df_copy.sample(frac=1).reset_index(drop=True)
            old_id = i
            if len(df_copy.index) > 0:
                obj.sequences.append(df_copy)

    randomized_sequences.append(obj)
In [19]:
### code to generate figures
from collections import OrderedDict
def sequence_analysis(sequence_length, sequences, feature, verbose = True):
    data = dict()
    for seq in sequences:
        for df in seq.sequences:
            if len(df.index) == sequence_length:
                for match in xrange(1,sequence_length+1):
                    temp = df[feature].reset_index(0).drop('index',1)
                    data.setdefault(str(match), list()).extend( [temp[feature].ix[match-1]] )
    
    stats = dict()
    for session in range(1,sequence_length+1):
        stats.setdefault(str(session), dict(mu = None, sigma = None, n = None, ci = None))
        stats[str(session)]['mu'] = np.mean(data[str(session)])
        stats[str(session)]['sigma'] = np.std(data[str(session)])
        stats[str(session)]['n'] = len(data[str(session)])
        stats[str(session)]['ci'] = 1.96 * np.std(data[str(session)]) / np.sqrt(len(data[str(session)]))

        if verbose == True:
            print 'Session =',
            print session,
            print '| Sequence length =',
            print sequence_length
            print 'mu =',
            print stats[str(session)]['mu']
            print 'sigma =',
            print stats[str(session)]['sigma']
            print 'n obs =',
            print stats[str(session)]['n']
            print 'n ci =',
            print stats[str(session)]['ci']
            print '-' * 40
    
    return OrderedDict(sorted(stats.items(), key=lambda t: t[0]))
In [20]:
# feature = 'winner'

# ls = ['solid','solid','dashed','dashdot','dotted']
# sym = ['o','*','s','v','^']
# plt.figure()
# plt.xlabel('match position')
# plt.ylabel(feature)
# plt.xlim(0.9,6.1)
# plt.xticks(np.arange(1,7), xrange(1,7))

# bar_x = []
# bar_y = []

# for i in range(1,6):
#     stats = sequence_analysis(i, sequences, feature, verbose = False)
#     x = range(1,i+1)
#     y = []
#     err = []
    
#     for item in stats:
#         y.append(stats[item]['mu'])
#         err.append(stats[item]['ci'])
    
#     # alternative visualization
#     xx = [x[0], x[-1]]
#     yy = [y[0], y[-1]]
#     eerr = [err[0], err[-1]]
    
#     # bar plot data
#     bar_y.append((y[-1] - y[0]) / float(y[0]) * 100)
#     bar_x.append(x[-1])
    
#     plt.errorbar(xx, yy, yerr = eerr, fmt = sym[len(y)-1], linestyle = ls[len(y)-1],
#                 lw = 3, markersize = 12, capthick = 3, capsize = 6)

# plt.legend([str(i) + ' matches' for i in range(1,6)], 
#            loc = 'upper right', shadow = True, fancybox = True)
# plt.title('original sessions')
# #plt.savefig('sessions_plos/' + feature + '.png')
# plt.show()
In [21]:
# plt.plot(bar_x, bar_y, color = '#30a2da', marker = 's')
# plt.title('original sessions')
# plt.ylabel(r'$\Delta \%$')
# plt.xlabel('session length')
# plt.xlim(0.5,5.5)
# plt.ylim(-15,2)
# ax1.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
# plt.xticks(bar_x, bar_x)
# plt.show()
In [22]:
# feature = 'winner'

# ls = ['solid','solid','dashed','dashdot','dotted']
# sym = ['o','*','s','v','^']
# plt.figure()
# plt.xlabel('match position')
# plt.ylabel(feature)
# plt.xlim(0.9,6.1)
# plt.xticks(np.arange(1,7), xrange(1,7))

# rand_bar_x = []
# rand_bar_y = []

# for i in range(1,6):
#     stats = sequence_analysis(i, randomized_sequences, feature, verbose = False)
#     x = range(1,i+1)
#     y = []
#     err = []
    
#     for item in stats:
#         y.append(stats[item]['mu'])
#         err.append(stats[item]['ci'])
    
#     # alternative visualization
#     xx = [x[0], x[-1]]
#     yy = [y[0], y[-1]]
#     eerr = [err[0], err[-1]]
    
#     # bar plot data
#     rand_bar_y.append((y[-1] - y[0]) / float(y[0]) * 100)
#     rand_bar_x.append(x[-1])
    
#     plt.errorbar(xx, yy, yerr = eerr, fmt = sym[len(y)-1], linestyle = ls[len(y)-1],
#                 lw = 3, markersize = 12, capthick = 3, capsize = 6)

# plt.legend([str(i) + ' matches' for i in range(1,6)], 
#            loc = 'lower right', shadow = True, fancybox = True)
# #plt.savefig('sessions_plos/' + feature + '.png')
# plt.title('randomized index sessions')
# plt.show()
In [23]:
# plt.plot(rand_bar_x, rand_bar_y, color = '#30a2da', marker = 's')
# plt.title('randomized index sessions')
# plt.ylabel(r'$\Delta \%$')
# plt.xlabel('session length')
# plt.xlim(0.5,5.5)
# plt.ylim(-15,2)
# ax1.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
# plt.xticks(rand_bar_x, rand_bar_x)
# plt.show()

All players

In [26]:
feature = 'winner'

ls = ['solid','solid','dashed','dashdot','dotted']
sym = ['o','*','s','v','^']
col = ['#30a2da', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b']

fig = plt.figure(figsize = (10,5))
fig.subplots_adjust(wspace=.25)
orig = fig.add_subplot(121)
rand = fig.add_subplot(122, sharey = orig)

orig.set_xlabel('match position', fontsize=font_size)
orig.set_ylabel('performance', fontsize=font_size)
orig.set_xlim(0.5,5.5)
orig.set_xticks(np.arange(1,6), xrange(1,6))

rand.set_xlabel('match position', fontsize=font_size)
rand.set_ylabel('performance', fontsize=font_size)
rand.set_xlim(0.5,5.5)
rand.set_xticks(np.arange(1,6), xrange(1,6))

rand_bar_y = []
bar_x = []
bar_y = []

for i in range(1,6):
    stats = sequence_analysis(i, sequences, feature, verbose = False)
    rand_stats = sequence_analysis(i, randomized_sequences, feature, verbose = False)
    
    x = range(1,i+1)
    y = []
    err = []
    
    rand_y = []
    rand_err = []
    
    for item in stats:
        y.append(stats[item]['mu'])
        err.append(stats[item]['ci'])
    
    for item in stats:
        rand_y.append(rand_stats[item]['mu'])
        rand_err.append(rand_stats[item]['ci'])
    
    # alternative visualization
    xx = [x[0], x[-1]]
    yy = [y[0], y[-1]]
    eerr = [err[0], err[-1]]
    
    rand_yy = [rand_y[0], rand_y[-1]]
    rand_eerr = [rand_err[0], rand_err[-1]]
    
    # change plot data
    bar_y.append((y[-1] - y[0]) / float(y[0]) * 100)
    bar_x.append(x[-1])
    rand_bar_y.append( (rand_y[-1] - rand_y[0]) / float(rand_y[0]) * 100 )
    
#     orig.errorbar(xx, yy, yerr = eerr, fmt = sym[len(y)-1], linestyle = ls[len(y)-1], color = col[len(y)-1], 
#                 lw = 3, markersize = 8, capthick = 3, capsize = 6)
#     rand.errorbar(xx, rand_yy, yerr = rand_eerr, fmt = sym[len(y)-1], linestyle = ls[len(y)-1], color = col[len(y)-1],
#                 lw = 3, markersize = 8, capthick = 3, capsize = 6)

    orig.errorbar(x, y, yerr = err, fmt = sym[len(y)-1], linestyle = ls[len(y)-1], color = col[len(y)-1], 
                lw = 3, markersize = 10, capthick = 3, capsize = 6)
    rand.errorbar(x, rand_y, yerr = rand_err, fmt = sym[len(y)-1], linestyle = ls[len(y)-1], color = col[len(y)-1],
                lw = 3, markersize = 10, capthick = 3, capsize = 6)

# orig.legend([str(i) + ' matches' for i in range(1,6)], 
#            loc = 'upper right', shadow = True, fancybox = True)
rand.legend([str(i) + ' matches' for i in range(1,6)], 
           loc = 'lower right', shadow = True, fancybox = True)

#orig.set_title('original sessions', fontsize=font_size)
rand.set_title('randomized index sessions', fontsize=font_size)
plt.savefig('deterioration.pdf')
plt.show()
In [27]:
# visualize change (%)
# fig = plt.figure(figsize = (10,3))
# fig.subplots_adjust(wspace=.5)
# orig = fig.add_subplot(121)
# rand = fig.add_subplot(122)

# orig.plot(bar_x, bar_y, color = '#30a2da', marker = 's')
# orig.set_title('original sessions')
# orig.set_ylabel(r'$\Delta \% $' + feature)
# orig.set_xlabel('session length')
# orig.set_xlim(0.5,5.5)
# orig.set_ylim(-15,2)
# orig.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
# orig.set_xticks(bar_x, bar_x)

# rand.plot(bar_x, rand_bar_y, color = '#30a2da', marker = 's')
# rand.set_title('randomized index sessions')
# rand.set_ylabel(r'$\Delta \% $' + feature)
# rand.set_xlabel('session length')
# rand.set_xlim(0.5,5.5)
# rand.set_ylim(-15,2)
# rand.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
# rand.set_xticks(bar_x, bar_x)

# visualize change (%)
fig = plt.figure(figsize = (5,5))
# fig.subplots_adjust(wspace=.25)
both = fig.add_subplot(111)

both.plot(bar_x, bar_y, color = '#30a2da', marker = 'o', lw = 2, label = 'original sessions')
both.plot(bar_x, rand_bar_y, color = '#fc4f30', marker = 's', lw = 2, label = 'randomized index sessions', ls = 'dashed')

both.set_title('')
both.set_ylabel(r'$\Delta \% $' + 'performance', fontsize = font_size)
both.set_xlabel('session length', fontsize = font_size)
both.set_xlim(0.5,5.5)
both.set_ylim(-35,15)
both.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dotted')
both.set_xticks(bar_x, bar_x)
both.legend(loc = 'lower left', fontsize = 12)
plt.savefig('randomized_deterioration_all.pdf')

print 'ORIGINAL SESSIONS'
for i in range(len(bar_x)):
    print 'matches',
    print round(bar_x[i],2),
    print '->',
    print round(bar_y[i],2)

print
print 'RANDOMIZED INDEX SESSIONS'
for i in range(len(bar_x)):
    print 'matches',
    print round(bar_x[i],2),
    print '->',
    print round(rand_bar_y[i],2)
ORIGINAL SESSIONS
matches 1.0 -> 0.0
matches 2.0 -> -1.06
matches 3.0 -> -8.98
matches 4.0 -> -9.83
matches 5.0 -> -12.78

RANDOMIZED INDEX SESSIONS
matches 1.0 -> 0.0
matches 2.0 -> -0.95
matches 3.0 -> -0.16
matches 4.0 -> -1.57
matches 5.0 -> 1.89

Above 95th percentile

In [28]:
# compute original sessions
player_stats_top = player_stats[player_stats[('winner','count')] >= np.percentile(player_stats[('winner','count')], 95)]

sequences_top = []
for player_id in player_stats_top.index:
    old_id = 0
    obj = Sequence()
    obj.player_id = player_id
    temp = df[df['player_id'] == player_id].reset_index(0).drop('index', 1)
    for i in range(1,len(temp.index)):
        dates = pd.to_datetime(temp['match_datetime'].ix[i - 1 : i])
        diff = dates.diff().astype('timedelta64[s]') / 3600
        duration = temp['match_duration'].ix[i - 1 : i] / 3600
        delta = float(np.subtract(diff[1:], duration[:-1]))
        
        if delta >= m:
            df_copy = temp.ix[old_id:i-1].copy()
            old_id = i
            if len(df_copy.index) > 0:
                obj.sequences.append(df_copy)

    sequences_top.append(obj)
In [29]:
# compute randomized index sessions
randomized_sequences_top = []
for player_id in player_stats_top.index:
    old_id = 0
    obj = Sequence()
    obj.player_id = player_id
    temp = df[df['player_id'] == player_id].reset_index(0).drop('index', 1)
    for i in range(1,len(temp.index)):
        dates = pd.to_datetime(temp['match_datetime'].ix[i - 1 : i])
        diff = dates.diff().astype('timedelta64[s]') / 3600
        duration = temp['match_duration'].ix[i - 1 : i] / 3600
        delta = float(np.subtract(diff[1:], duration[:-1]))
        
        if delta >= m:
            df_copy = temp.ix[old_id:i-1].copy()
            df_copy = df_copy.sample(frac=1).reset_index(drop=True)
            old_id = i
            if len(df_copy.index) > 0:
                obj.sequences.append(df_copy)

    randomized_sequences_top.append(obj)
In [30]:
feature = 'winner'

ls = ['solid','solid','dashed','dashdot','dotted']
sym = ['o','*','s','v','^']
col = ['#30a2da', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b']

fig = plt.figure(figsize = (10,5))
fig.subplots_adjust(wspace=.5)
orig = fig.add_subplot(121)
rand = fig.add_subplot(122, sharey = orig)

orig.set_xlabel('match position')
orig.set_ylabel('performance')
orig.set_xlim(0.5,5.5)
orig.set_xticks(np.arange(1,6), xrange(1,6))

rand.set_xlabel('match position')
rand.set_ylabel('performance')
rand.set_xlim(0.5,5.5)
rand.set_xticks(np.arange(1,6), xrange(1,6))

rand_bar_y = []
bar_x = []
bar_y = []

for i in range(1,6):
    stats = sequence_analysis(i, sequences_top, feature, verbose = False)
    rand_stats = sequence_analysis(i, randomized_sequences_top, feature, verbose = False)
    
    x = range(1,i+1)
    y = []
    err = []
    
    rand_y = []
    rand_err = []
    
    for item in stats:
        y.append(stats[item]['mu'])
        err.append(stats[item]['ci'])
    
    for item in stats:
        rand_y.append(rand_stats[item]['mu'])
        rand_err.append(rand_stats[item]['ci'])
    
    # alternative visualization
    xx = [x[0], x[-1]]
    yy = [y[0], y[-1]]
    eerr = [err[0], err[-1]]
    
    rand_yy = [rand_y[0], rand_y[-1]]
    rand_eerr = [rand_err[0], rand_err[-1]]
    
    # change plot data
    bar_y.append((y[-1] - y[0]) / float(y[0]) * 100)
    bar_x.append(x[-1])
    rand_bar_y.append( (rand_y[-1] - rand_y[0]) / float(rand_y[0]) * 100 )
    
    orig.errorbar(xx, yy, yerr = eerr, fmt = sym[len(y)-1], linestyle = ls[len(y)-1], color = col[len(y)-1], 
                lw = 3, markersize = 12, capthick = 3, capsize = 6)
    rand.errorbar(xx, rand_yy, yerr = rand_eerr, fmt = sym[len(y)-1], linestyle = ls[len(y)-1], color = col[len(y)-1],
                lw = 3, markersize = 12, capthick = 3, capsize = 6)

orig.legend([str(i) + ' matches' for i in range(1,6)], 
           loc = 'upper right', shadow = True, fancybox = True)
rand.legend([str(i) + ' matches' for i in range(1,6)], 
           loc = 'lower right', shadow = True, fancybox = True)

orig.set_title('original sessions')
rand.set_title('randomized index sessions')
#plt.savefig('sessions_plos/' + feature + '.png')
plt.show()
In [31]:
# visualize change (%)
# fig = plt.figure(figsize = (6,3))
# fig.subplots_adjust(wspace=.5)
# orig = fig.add_subplot(121)
# rand = fig.add_subplot(122)

# orig.plot(bar_x, bar_y, color = '#30a2da', marker = 's')
# orig.set_title('original sessions')
# orig.set_ylabel(r'$\Delta \% $' + feature)
# orig.set_xlabel('session length')
# orig.set_xlim(0.5,5.5)
# orig.set_ylim(-15,2)
# orig.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
# orig.set_xticks(bar_x, bar_x)

# rand.plot(bar_x, rand_bar_y, color = '#30a2da', marker = 's')
# rand.set_title('randomized index sessions')
# rand.set_ylabel(r'$\Delta \% $' + feature)
# rand.set_xlabel('session length')
# rand.set_xlim(0.5,5.5)
# rand.set_ylim(-15,2)
# rand.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
# rand.set_xticks(bar_x, bar_x)

above_bar_y = bar_y
above_rand_bar_y = rand_bar_y

# visualize change (%)
fig = plt.figure(figsize = (6,3))
fig.subplots_adjust(wspace=.5)
both = fig.add_subplot(111)

both.plot(bar_x, bar_y, color = '#30a2da', marker = 'o', label = 'original sessions')
both.plot(bar_x, rand_bar_y, color = '#fc4f30', marker = 's', label = 'randomized index sessions', ls = 'dashed')

both.set_title('Above 95th percentile Experience')
both.set_ylabel(r'$\Delta \% $' + 'performance')
both.set_xlabel('session length')
both.set_xlim(0.5,5.5)
both.set_ylim(-35,15)
both.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dotted')
both.set_xticks(bar_x, bar_x)
both.legend(loc = 'lower left')


print 'ORIGINAL SESSIONS'
for i in range(len(bar_x)):
    print 'matches',
    print round(bar_x[i],2),
    print '->',
    print round(bar_y[i],2)

print
print 'RANDOMIZED INDEX SESSIONS'
for i in range(len(bar_x)):
    print 'matches',
    print round(bar_x[i],2),
    print '->',
    print round(rand_bar_y[i],2)
ORIGINAL SESSIONS
matches 1.0 -> 0.0
matches 2.0 -> -2.47
matches 3.0 -> -9.08
matches 4.0 -> -11.64
matches 5.0 -> -10.77

RANDOMIZED INDEX SESSIONS
matches 1.0 -> 0.0
matches 2.0 -> -0.39
matches 3.0 -> 1.28
matches 4.0 -> 1.33
matches 5.0 -> 0.12

Below 5th percentile

In [32]:
# compute original sessions
player_stats_worst = player_stats[player_stats[('winner','count')] <= np.percentile(player_stats[('winner','count')], 5)]

sequences_worst = []
for player_id in player_stats_worst.index:
    old_id = 0
    obj = Sequence()
    obj.player_id = player_id
    temp = df[df['player_id'] == player_id].reset_index(0).drop('index', 1)
    for i in range(1,len(temp.index)):
        dates = pd.to_datetime(temp['match_datetime'].ix[i - 1 : i])
        diff = dates.diff().astype('timedelta64[s]') / 3600
        duration = temp['match_duration'].ix[i - 1 : i] / 3600
        delta = float(np.subtract(diff[1:], duration[:-1]))
        
        if delta >= m:
            df_copy = temp.ix[old_id:i-1].copy()
            old_id = i
            if len(df_copy.index) > 0:
                obj.sequences.append(df_copy)

    sequences_worst.append(obj)
In [33]:
# compute randomized index sessions
randomized_sequences_worst = []
for player_id in player_stats_worst.index:
    old_id = 0
    obj = Sequence()
    obj.player_id = player_id
    temp = df[df['player_id'] == player_id].reset_index(0).drop('index', 1)
    for i in range(1,len(temp.index)):
        dates = pd.to_datetime(temp['match_datetime'].ix[i - 1 : i])
        diff = dates.diff().astype('timedelta64[s]') / 3600
        duration = temp['match_duration'].ix[i - 1 : i] / 3600
        delta = float(np.subtract(diff[1:], duration[:-1]))
        
        if delta >= m:
            df_copy = temp.ix[old_id:i-1].copy()
            df_copy = df_copy.sample(frac=1).reset_index(drop=True)
            old_id = i
            if len(df_copy.index) > 0:
                obj.sequences.append(df_copy)

    randomized_sequences_worst.append(obj)
In [34]:
feature = 'winner'

ls = ['solid','solid','dashed','dashdot','dotted']
sym = ['o','*','s','v','^']
col = ['#30a2da', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b']

fig = plt.figure(figsize = (10,5))
fig.subplots_adjust(wspace=.5)
orig = fig.add_subplot(121)
rand = fig.add_subplot(122, sharey = orig)

orig.set_xlabel('match position')
orig.set_ylabel('performance')
orig.set_xlim(0.5,5.5)
orig.set_xticks(np.arange(1,6), xrange(1,6))

rand.set_xlabel('match position')
rand.set_ylabel('performance')
rand.set_xlim(0.5,5.5)
rand.set_xticks(np.arange(1,6), xrange(1,6))

rand_bar_y = []
bar_x = []
bar_y = []

for i in range(1,6):
    stats = sequence_analysis(i, sequences_worst, feature, verbose = False)
    rand_stats = sequence_analysis(i, randomized_sequences_worst, feature, verbose = False)
    
    x = range(1,i+1)
    y = []
    err = []
    
    rand_y = []
    rand_err = []
    
    for item in stats:
        y.append(stats[item]['mu'])
        err.append(stats[item]['ci'])
    
    for item in stats:
        rand_y.append(rand_stats[item]['mu'])
        rand_err.append(rand_stats[item]['ci'])
    
    # alternative visualization
    xx = [x[0], x[-1]]
    yy = [y[0], y[-1]]
    eerr = [err[0], err[-1]]
    
    rand_yy = [rand_y[0], rand_y[-1]]
    rand_eerr = [rand_err[0], rand_err[-1]]
    
    # change plot data
    bar_y.append((y[-1] - y[0]) / float(y[0]) * 100)
    bar_x.append(x[-1])
    rand_bar_y.append( (rand_y[-1] - rand_y[0]) / float(rand_y[0]) * 100 )
    
    orig.errorbar(xx, yy, yerr = eerr, fmt = sym[len(y)-1], linestyle = ls[len(y)-1], color = col[len(y)-1], 
                lw = 3, markersize = 12, capthick = 3, capsize = 6)
    rand.errorbar(xx, rand_yy, yerr = rand_eerr, fmt = sym[len(y)-1], linestyle = ls[len(y)-1], color = col[len(y)-1],
                lw = 3, markersize = 12, capthick = 3, capsize = 6)

orig.legend([str(i) + ' matches' for i in range(1,6)], 
           loc = 'upper right', shadow = True, fancybox = True)
rand.legend([str(i) + ' matches' for i in range(1,6)], 
           loc = 'lower right', shadow = True, fancybox = True)

orig.set_title('original sessions')
rand.set_title('randomized index sessions')
#plt.savefig('sessions_plos/' + feature + '.png')
plt.show()
In [35]:
# visualize change (%)
# fig = plt.figure(figsize = (10,3))
# fig.subplots_adjust(wspace=.5)
# orig = fig.add_subplot(121)
# rand = fig.add_subplot(122)

# orig.plot(bar_x, bar_y, color = '#30a2da', marker = 's')
# orig.set_title('original sessions')
# orig.set_ylabel(r'$\Delta \% $' + feature)
# orig.set_xlabel('session length')
# orig.set_xlim(0.5,5.5)
# orig.set_ylim(-35,2)
# orig.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
# orig.set_xticks(bar_x, bar_x)

# rand.plot(bar_x, rand_bar_y, color = '#30a2da', marker = 's')
# rand.set_title('randomized index sessions')
# rand.set_ylabel(r'$\Delta \% $' + feature)
# rand.set_xlabel('session length')
# rand.set_xlim(0.5,5.5)
# rand.set_ylim(-35,2)
# rand.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
# rand.set_xticks(bar_x, bar_x)


below_bar_y = bar_y
below_rand_bar_y = rand_bar_y

# visualize change (%)
fig = plt.figure(figsize = (6,3))
fig.subplots_adjust(wspace=.5)
both = fig.add_subplot(111)

both.plot(bar_x, bar_y, color = '#30a2da', marker = 'o', label = 'original sessions')
both.plot(bar_x, rand_bar_y, color = '#fc4f30', marker = 's', label = 'randomized index sessions', ls = 'dashed')

both.set_title('Below 5th percentile Experience')
both.set_ylabel(r'$\Delta \% $' + 'performance')
both.set_xlabel('session length')
both.set_xlim(0.5,5.5)
both.set_ylim(-35,15)
both.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dotted')
both.set_xticks(bar_x, bar_x)
#both.legend(loc = 'lower left')

print 'ORIGINAL SESSIONS'
for i in range(len(bar_x)):
    print 'matches',
    print round(bar_x[i],2),
    print '->',
    print round(bar_y[i],2)

print
print 'RANDOMIZED INDEX SESSIONS'
for i in range(len(bar_x)):
    print 'matches',
    print round(bar_x[i],2),
    print '->',
    print round(rand_bar_y[i],2)
ORIGINAL SESSIONS
matches 1.0 -> 0.0
matches 2.0 -> -11.58
matches 3.0 -> -22.71
matches 4.0 -> -25.0
matches 5.0 -> -32.26

RANDOMIZED INDEX SESSIONS
matches 1.0 -> 0.0
matches 2.0 -> -1.11
matches 3.0 -> 6.93
matches 4.0 -> -11.54
matches 5.0 -> -3.7
In [59]:
fig = plt.figure(figsize = (10,5))
fig.subplots_adjust(wspace=.25)
above = fig.add_subplot(121)
below = fig.add_subplot(122)

# above.plot(bar_x, above_bar_y, color = '#30a2da', marker = 'o', label = 'original sessions')
# above.plot(bar_x, above_rand_bar_y, color = '#fc4f30', marker = 's', label = 'randomized index sessions', ls = 'dashed')

above.bar(np.array(bar_x) - .1, above_bar_y, color = '#30a2da', width = .25, label = 'original sessions')
above.bar(np.array(bar_x) + .1, above_rand_bar_y, color = '#fc4f30', hatch = '//', width = .25, label = 'randomized index sessions')


above.set_title('High Experience Players', fontsize = font_size)
above.set_ylabel(r'$\Delta \% $ performance', fontsize = font_size)
above.set_xlabel('session length', fontsize = font_size)
above.set_xlim(0.5,5.5)
above.set_ylim(-35,15)
above.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
above.set_xticks(bar_x, bar_x)
above.legend(loc = 'lower left')

# below.plot(bar_x, below_bar_y, color = '#30a2da', marker = 'o', lw = 2, label = 'original sessions')
# below.plot(bar_x, below_rand_bar_y, color = '#fc4f30', marker = 's', lw = 2, label = 'randomized index sessions', ls = 'dashed')
below.bar(np.array(bar_x) - .1, below_bar_y, color = '#30a2da', width = .25, label = 'original sessions')
below.bar(np.array(bar_x) + .1, below_rand_bar_y, color = '#fc4f30', hatch = '//', width = .25, label = 'randomized index sessions')


below.set_title('Low Experience Players', fontsize = font_size)
below.set_ylabel(r'$\Delta \% $ performance', fontsize = font_size)
below.set_xlabel('session length', fontsize = font_size)
below.set_xlim(0.5,5.5)
below.set_ylim(-35,15)
below.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
below.set_xticks(bar_x, bar_x)

plt.savefig('deterioration_above95below5.pdf')

Session length analysis

In [60]:
top_mean_sessions = []
for player in range(len(sequences_top)):
    session_length = []
    for seq in sequences_top[player].sequences:
        if len(seq.index) > 0:
            session_length.append(len(seq.index)) 
    top_mean_sessions.append( np.mean(session_length) )

worst_mean_sessions = []
for player in range(len(sequences_worst)):
    session_length = []
    for seq in sequences_worst[player].sequences:
        if len(seq.index) > 0:
            session_length.append(len(seq.index)) 
    worst_mean_sessions.append( np.mean(session_length) )

# boxplot
from scipy.stats import wilcoxon
test = wilcoxon(worst_mean_sessions[:-1], top_mean_sessions)
print test
if test.pvalue > 0.05:
    print 'No significant difference :('
else:
    print 'Significant difference detected :)'

fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot([worst_mean_sessions, top_mean_sessions], patch_artist=True, widths = .75)
ax.set_xticklabels(['Low Experience\nPlayers', 'High Experience\nPlayers'])
ax.set_ylabel('average session length')

for box in bp['boxes']:
    # change outline color
    box.set( color='black', linewidth=2)
    # change fill color
    box.set( facecolor = 'white', alpha = .75 )

## change color and linewidth of the whiskers
for whisker in bp['whiskers']:
    whisker.set(color='#333333', linewidth=2)

## change color and linewidth of the caps
for cap in bp['caps']:
    cap.set(color='#333333', linewidth=2)

## change color and linewidth of the medians
for median in bp['medians']:
    median.set(color='#fc4f30', linewidth=2)

## change the style of fliers and their fill
for flier in bp['fliers']:
    flier.set(marker='s', markerfacecolor='#fc4f30')
plt.savefig('average_session_all.pdf')
plt.show()
WilcoxonResult(statistic=410.0, pvalue=0.0015511026381292636)
Significant difference detected :)

Considering only the first 100 matches

In [122]:
top_mean_sessions = []
for player in range(len(sequences_top)):
    session_length = []
    for seq in sequences_top[player].sequences[:100]:
        if len(seq.index) > 0:
            session_length.append(len(seq.index)) 
    top_mean_sessions.append( np.mean(session_length) )

worst_mean_sessions = []
for player in range(len(sequences_worst)):
    session_length = []
    for seq in sequences_worst[player].sequences[:100]:
        if len(seq.index) > 0:
            session_length.append(len(seq.index)) 
    worst_mean_sessions.append( np.mean(session_length) )

# boxplot
from scipy.stats import wilcoxon
test = wilcoxon(worst_mean_sessions[:-1], top_mean_sessions)
print test
if test.pvalue > 0.05:
    print 'No significant difference :('
else:
    print 'Significant difference detected :)'

fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot([worst_mean_sessions, top_mean_sessions], patch_artist=True, widths = .75)
ax.set_xticklabels(['Below 5th percentile\nExperience', 'Above 95th percentile\nExperience'])
ax.set_ylabel('average session length')

for box in bp['boxes']:
    # change outline color
    box.set( color='black', linewidth=2)
    # change fill color
    box.set( facecolor = 'white', alpha = .75 )

## change color and linewidth of the whiskers
for whisker in bp['whiskers']:
    whisker.set(color='#333333', linewidth=2)

## change color and linewidth of the caps
for cap in bp['caps']:
    cap.set(color='#333333', linewidth=2)

## change color and linewidth of the medians
for median in bp['medians']:
    median.set(color='#fc4f30', linewidth=2)

## change the style of fliers and their fill
for flier in bp['fliers']:
    flier.set(marker='s', markerfacecolor='#fc4f30')
plt.show()
WilcoxonResult(statistic=457.0, pvalue=0.0054097364279446069)
Significant difference detected :)

Considering only the first 50 matches

In [48]:
top_mean_sessions = []
for player in range(len(sequences_top)):
    session_length = []
    for seq in sequences_top[player].sequences[:50]:
        if len(seq.index) > 0:
            session_length.append(len(seq.index)) 
    top_mean_sessions.append( np.mean(session_length) )

worst_mean_sessions = []
for player in range(len(sequences_worst)):
    session_length = []
    for seq in sequences_worst[player].sequences[:50]:
        if len(seq.index) > 0:
            session_length.append(len(seq.index)) 
    worst_mean_sessions.append( np.mean(session_length) )

# boxplot
from scipy.stats import wilcoxon
test = wilcoxon(worst_mean_sessions[:-1], top_mean_sessions)
print test
if test.pvalue > 0.05:
    print 'No significant difference :('
else:
    print 'Significant difference detected :)'

fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot([worst_mean_sessions, top_mean_sessions], patch_artist=True, widths = .75)
ax.set_xticklabels(['Below 5th percentile\nExperience', 'Above 95th percentile\nExperience'])
ax.set_ylabel('average session length')

for box in bp['boxes']:
    # change outline color
    box.set( color='#333333', linewidth=2)
    # change fill color
    box.set( facecolor = '#30a2da', alpha = .75 )

## change color and linewidth of the whiskers
for whisker in bp['whiskers']:
    whisker.set(color='#333333', linewidth=2)

## change color and linewidth of the caps
for cap in bp['caps']:
    cap.set(color='#333333', linewidth=2)

## change color and linewidth of the medians
for median in bp['medians']:
    median.set(color='#333333', linewidth=2)

## change the style of fliers and their fill
for flier in bp['fliers']:
    flier.set(marker='s', markerfacecolor='#fc4f30', alpha=0.5)
plt.show()
WilcoxonResult(statistic=491.0, pvalue=0.012269831767843761)
Significant difference detected :)

Considering only the first 20 matches

In [61]:
top_mean_sessions = []
for player in range(len(sequences_top)):
    session_length = []
    for seq in sequences_top[player].sequences[:20]:
        if len(seq.index) > 0:
            session_length.append(len(seq.index)) 
    top_mean_sessions.append( np.mean(session_length) )
    
worst_mean_sessions = []
for player in range(len(sequences_worst)):
    session_length = []
    for seq in sequences_worst[player].sequences[:20]:
        if len(seq.index) > 0:
            session_length.append(len(seq.index)) 
    worst_mean_sessions.append( np.mean(session_length) )
    
# boxplot
from scipy.stats import wilcoxon
test = wilcoxon(worst_mean_sessions[:-1], top_mean_sessions)
print test
if test.pvalue > 0.05:
    print 'No significant difference :('
else:
    print 'Significant difference detected :)'

fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot([worst_mean_sessions, top_mean_sessions], patch_artist=True, widths = .75)
ax.set_xticklabels(['Low Experience\nPlayers', 'High Experience\nPlayers'])
ax.set_ylabel('average session length')
ax.set_title('')

for box in bp['boxes']:
    # change outline color
    box.set( color='black', linewidth=2)
    # change fill color
    box.set( facecolor = 'white' )

## change color and linewidth of the whiskers
for whisker in bp['whiskers']:
    whisker.set(color='#333333', linewidth=2)

## change color and linewidth of the caps
for cap in bp['caps']:
    cap.set(color='#333333', linewidth=2)

## change color and linewidth of the medians
for median in bp['medians']:
    median.set(color='#fc4f30', linewidth=2)

## change the style of fliers and their fill
for flier in bp['fliers']:
    flier.set(marker='s', markerfacecolor='#fc4f30')
plt.savefig('average_session_20.pdf')
plt.show()
WilcoxonResult(statistic=491.5, pvalue=0.030661742201220407)
Significant difference detected :)
In [ ]: