%matplotlib inline
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#plt.style.use('bessi')
import statsmodels.api as sm
import networkx as nx
from collections import Counter
import itertools
import math
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
import ncp
from sktensor import dtensor, cp_als, tucker
import sktensor
import tensorflow as tf
from sklearn import linear_model
# Location of the raw match export; `root` is the local user folder name.
root = 'alessandro' # change this
path = '/Users/' + root + '/Dropbox/mitigate/lol/data/'
# read data in chunks
reader = pd.read_csv(path + 'structured_data_new.csv', header = None, low_memory = False, chunksize = 100 * 1024)
# Concatenate all chunks in ONE call. Growing the frame with a pd.concat per
# chunk copies every previously loaded row again on each iteration, so the
# load time grows quadratically with the number of chunks.
# NOTE(review): the empty seed frame is kept on purpose. Depending on the
# pandas version, concatenating with an empty frame may influence the dtype
# of the resulting columns, and the id clean-up that follows relies on how
# the ids are rendered as strings - confirm before dropping the seed.
chunks = [pd.DataFrame()]
chunks.extend(reader)
df = pd.concat(chunks)
# The CSV has no header row; these are the column names, in file order.
cols = ['player_id', 'match_id', 'match_datetime', 'match_creation','match_duration','map_id',
        'champion_id','team_id','assists','deaths','champ_level','winner','combat_player_score','double_kills',
        'gold_earned','gold_spent','inhibitor_kills','killing_sprees','kills','largest_critical_strike','largest_killing_spree',
        'largest_multi_kill','magic_damage_dealt','magic_damage_dealt_to_champions','magic_damage_taken','minions_killed',
        'neutral_minions_killed','neutral_minions_killed_enemy_jungle','neutral_minions_killed_team_jungle',
        'objective_player_score','penta_kills','physical_damage_dealt','physical_damage_dealt_to_champions','physical_damage_taken',
        'quadra_kills','total_damage_dealt','total_damage_dealt_to_champions','total_damage_taken','total_heal',
        'total_player_score','total_score_rank','total_time_crowd_control_dealt','total_units_healed','triple_kills',
        'true_damage_dealt','true_damage_dealt_to_champions','true_damage_taken','unreal_kills','vision_wards_bought_in_game',
        'wards_killed','wards_placed']
df.columns = cols


def _strip_float_suffix(text):
    """Return `text` without a trailing '.0'; any other text is returned unchanged."""
    return text[:-2] if text.endswith('.0') else text


# Turn the identifier columns into strings. The previous code always cut the
# last two characters, which is only correct when every id is rendered like
# '1234.0'; if a column is loaded as integers that silently removes two real
# digits. Only an actual '.0' suffix is removed now.
for id_column in ['player_id', 'match_id', 'map_id', 'champion_id', 'team_id']:
    df[id_column] = df[id_column].astype(str).apply(_strip_float_suffix)
# Keep only the players with at least `threshold` rows (one row per match played).
threshold = 100 # minimum number of matches per user
matches_per_player = df['player_id'].value_counts()
frequent_ids = matches_per_player[matches_per_player >= threshold].index
print('number of users:' + ' ' + str(len(frequent_ids)))
players = pd.DataFrame(frequent_ids, columns = ['id'])
df = df[df['player_id'].isin(players['id'])]
print('number of matches:' + ' ' + str(df['match_id'].nunique()))
# Per-player mean / count / sum of `winner`, ordered by mean (highest first).
player_stats = df.groupby('player_id')[['winner']].agg(['mean', 'count', 'sum'])
player_stats = player_stats.sort_values(by = [('winner','mean')], ascending = False)
has_enough_matches = player_stats[('winner','count')] >= threshold
player_stats = player_stats[has_enough_matches]
print(player_stats.head())
# player_stats['performance_20'] = np.zeros(len(player_stats.index))
# player_stats['performance_50'] = np.zeros(len(player_stats.index))
# player_stats['performance_100'] = np.zeros(len(player_stats.index))
# for player in player_stats.index:
# matches = list(df[df['player_id'] == player]['winner'])
# first_20 = np.mean(matches[:19])
# first_50 = np.mean(matches[:49])
# first_100 = np.mean(matches[:99])
# player_stats.loc[player,'performance_20'] = first_20
# player_stats.loc[player,'performance_50'] = first_50
# player_stats.loc[player,'performance_100'] = first_100
# x = player_stats['performance_20']
# y = player_stats[('winner','count')]
# print np.corrcoef(x[:-1],y[:-1])
# plt.scatter(x, y, color = '#30a2da', alpha = .75)
# plt.title('')
# plt.ylim(0,2500)
# plt.xlim(0,1)
# plt.xlabel('performance in the first 20 matches')
# plt.ylabel('total matches')
# plt.show()
# x = player_stats['performance_50']
# y = player_stats[('winner','count')]
# print np.corrcoef(x[:-1],y[:-1])
# plt.scatter(x, y, color = '#30a2da', alpha = .75)
# plt.title('')
# plt.ylim(0,2500)
# plt.xlim(0,1)
# plt.xlabel('performance in the first 50 matches')
# plt.ylabel('total matches')
# plt.show()
# x = player_stats['performance_100']
# y = player_stats[('winner','count')]
# print np.corrcoef(x[:-1],y[:-1])
# plt.scatter(x, y, color = '#30a2da', alpha = .75)
# plt.title('')
# plt.ylim(0,2500)
# plt.xlim(0,1)
# plt.xlabel('performance in the first 100 matches')
# plt.ylabel('total matches')
# plt.show()
# player_stats.columns = player_stats.columns.get_level_values(0)
# player_stats.columns = ['mean','count','sum','performance_20','performance_50','performance_100']
# print player_stats.head()
# player_stats.to_csv('player_stats_performance.csv')
# histogram of the per-player match counts
match_counts = player_stats[('winner','count')].astype(float)
data = np.array(match_counts)
plt.hist(data, bins = 50, histtype='stepfilled', alpha = .75, color = '#30a2da')
plt.xlabel('matches')
plt.ylabel('counts')
plt.xlim(0,2500)
plt.savefig('distribution_matches.pdf')
# summary of the same distribution: median first, then mean
for summary in (np.median, np.mean):
    print(summary(data))
# Two panels: matches played vs. mean of `winner` (left) and the histogram
# of the per-player mean of `winner` (right).
fig = plt.figure(figsize = (10,5))
fig.subplots_adjust(wspace=.25)
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
font_size = 16
# ax1
ax1.scatter(player_stats[('winner','count')], player_stats[('winner','mean')], marker = 'o', alpha = .25, color = '#30a2da')
ax1.set_xlabel('matches', fontsize=font_size)
ax1.set_ylabel('performance', fontsize=font_size)
ax1.set_ylim(0,1)
#ax1.set_xlim(0,2500)
ax1.set_xscale('log', basex=2)
# xmin/xmax of axhline are fractions of the axes width (0-1), not data
# values, so the former xmax=2500 was out of range. The defaults already
# span the whole axes.
ax1.axhline(y = .5, linewidth=2, color = 'k', ls = 'dashed')
#ax2
data = np.array(player_stats[('winner','mean')].astype(float))
ax2.hist(data, bins = 20, histtype = 'stepfilled', alpha = .75, color = '#30a2da')
ax2.set_xlabel('performance', fontsize=font_size)
ax2.set_ylabel('counts', fontsize=font_size)
ax2.set_xlim(0,1)
# Same for axvline: ymin/ymax are axes fractions, so ymax=300 is dropped.
ax2.axvline(x = .5, linewidth=2, color = 'k', ls = 'dashed')
plt.savefig('performance_scatterplot.pdf')
print(np.mean(data))
print(np.std(data))
# Correlation between the number of matches and the mean of `winner`.
cor = np.corrcoef(player_stats[('winner','count')], player_stats[('winner','mean')])
print(cor)
# computing aggregate interarrival times
# For consecutive matches of a player the idle time is
#   (start of match k - start of match k-1) - duration of match k-1,
# expressed in hours (timestamps and durations are divided by 3600, i.e.
# `match_duration` is treated as seconds).
# NOTE(review): the previous loop ended with a `break`, so despite the
# "aggregate" comment the median came from the first player in
# `player_stats` only. All players are pooled now; this changes `m` and
# therefore every session boundary computed from it - confirm this is the
# intended definition.
inter_arrival_times = []
matches_by_player = df.groupby('player_id')
for player_id in player_stats.index:
    played = matches_by_player.get_group(player_id)
    started = pd.to_datetime(played['match_datetime'])
    # Plain arrays are subtracted position by position; subtracting two
    # Series whose indices are offset by one row is pandas-version dependent.
    gap_hours = started.diff().dt.total_seconds().values[1:] / 3600.0
    duration_hours = played['match_duration'].astype(float).values[:-1] / 3600.0
    inter_arrival_times.extend(gap_hours - duration_hours)
# `m` is the break length (hours) used further down to split matches into sessions.
m = np.median(inter_arrival_times)
print(m)
class Sequence(object):
    """Holds the sessions that belong to one player.

    Attributes:
        player_id: identifier of the player, or None when not set yet.
        sequences: list with one entry per session.
    """

    def __init__(self, player_id=None, sequences=None):
        """Create a container; both arguments are optional.

        Args:
            player_id: identifier to store (default None, as before).
            sequences: optional iterable of sessions; it is copied into a
                new list so instances never share the same list object.
        """
        self.player_id = player_id
        self.sequences = [] if sequences is None else list(sequences)

    def __repr__(self):
        return '%s(player_id=%r, sessions=%d)' % (
            type(self).__name__, self.player_id, len(self.sequences))
def _split_into_sessions(matches, min_break_hours):
    """Cut one player's matches into sessions.

    A session ends right before match k when the idle time between the end
    of match k-1 and the start of match k is at least `min_break_hours`.

    Args:
        matches: rows of a single player, in the order they should be read;
            uses the 'match_datetime' and 'match_duration' columns.
        min_break_hours: minimum idle time, in hours, that closes a session.

    Returns:
        A list of DataFrame copies, one per closed session, in order.
    """
    matches = matches.reset_index(drop=True)
    if len(matches.index) < 2:
        return []
    started = pd.to_datetime(matches['match_datetime'])
    # Arrays (not Series) so the subtraction is positional on every pandas
    # version; 3600.0 keeps the division a true division on Python 2 too.
    gap_hours = started.diff().dt.total_seconds().values[1:] / 3600.0
    duration_hours = matches['match_duration'].astype(float).values[:-1] / 3600.0
    idle_hours = gap_hours - duration_hours
    sessions = []
    first = 0
    for offset in np.flatnonzero(idle_hours >= min_break_hours):
        # idle_hours[offset] is the pause before the match at position
        # offset + 1, which therefore opens the next session.
        next_first = int(offset) + 1
        sessions.append(matches.iloc[first:next_first].copy())
        first = next_first
    # NOTE(review): as in the previous implementation, the matches after the
    # last qualifying pause are not returned as a session - confirm that
    # dropping this open-ended tail is intended.
    return sessions


# compute original sessions and randomized index sessions
# Both lists hold one Sequence per player, in `player_stats.index` order.
# The randomized version contains the same sessions with their rows
# shuffled and re-indexed from 0.
sequences = []
randomized_sequences = []
matches_by_player = df.groupby('player_id')
for player_id in player_stats.index:
    sessions = _split_into_sessions(matches_by_player.get_group(player_id), m)
    obj = Sequence()
    obj.player_id = player_id
    obj.sequences.extend(sessions)
    sequences.append(obj)
    shuffled = Sequence()
    shuffled.player_id = player_id
    for session in sessions:
        shuffled.sequences.append(session.sample(frac=1).reset_index(drop=True))
    randomized_sequences.append(shuffled)
### code to generate figures
from collections import OrderedDict
def sequence_analysis(sequence_length, sequences, feature, verbose = True):
    """Summarise `feature` per match position over sessions of one length.

    Only sessions with exactly `sequence_length` rows are used. For every
    match position 1..sequence_length the values of `feature` found at that
    position are pooled over all players.

    Args:
        sequence_length: number of matches a session must have to be used.
        sequences: iterable of objects with a `sequences` attribute holding
            a list of DataFrames (one DataFrame per session).
        feature: name of the DataFrame column to summarise.
        verbose: when True, print the statistics of every position.

    Returns:
        An OrderedDict keyed by the position as a string ('1', '2', ...), in
        increasing numeric order. Each value is a dict with 'mu' (mean),
        'sigma' (population standard deviation), 'n' (number of
        observations) and 'ci' (1.96 * sigma / sqrt(n)). When no session has
        the requested length, 'n' is 0 and the other entries are NaN.
    """
    # One bucket of observed values per match position (0-based here).
    observed = [[] for _ in range(sequence_length)]
    for seq in sequences:
        for session in seq.sequences:
            if len(session.index) != sequence_length:
                continue
            # Positional access: the row labels of a session are irrelevant.
            values = session[feature].values
            for position in range(sequence_length):
                observed[position].append(values[position])
    # Filled in numeric order. Sorting the string keys instead would put
    # '10' before '2' as soon as sequence_length reaches 10.
    stats = OrderedDict()
    for session in range(1, sequence_length + 1):
        values = observed[session - 1]
        n = len(values)
        if n > 0:
            mu = np.mean(values)
            sigma = np.std(values)
            ci = 1.96 * sigma / np.sqrt(n)
        else:
            # No session of this length: report NaN instead of failing.
            mu = sigma = ci = float('nan')
        stats[str(session)] = dict(mu = mu, sigma = sigma, n = n, ci = ci)
        if verbose == True:
            print(' '.join(['Session =', str(session), '| Sequence length =', str(sequence_length)]))
            print(' '.join(['mu =', str(mu)]))
            print(' '.join(['sigma =', str(sigma)]))
            print(' '.join(['n obs =', str(n)]))
            print(' '.join(['n ci =', str(ci)]))
            print('-' * 40)
    return stats
# feature = 'winner'
# ls = ['solid','solid','dashed','dashdot','dotted']
# sym = ['o','*','s','v','^']
# plt.figure()
# plt.xlabel('match position')
# plt.ylabel(feature)
# plt.xlim(0.9,6.1)
# plt.xticks(np.arange(1,7), xrange(1,7))
# bar_x = []
# bar_y = []
# for i in range(1,6):
# stats = sequence_analysis(i, sequences, feature, verbose = False)
# x = range(1,i+1)
# y = []
# err = []
# for item in stats:
# y.append(stats[item]['mu'])
# err.append(stats[item]['ci'])
# # alternative visualization
# xx = [x[0], x[-1]]
# yy = [y[0], y[-1]]
# eerr = [err[0], err[-1]]
# # bar plot data
# bar_y.append((y[-1] - y[0]) / float(y[0]) * 100)
# bar_x.append(x[-1])
# plt.errorbar(xx, yy, yerr = eerr, fmt = sym[len(y)-1], linestyle = ls[len(y)-1],
# lw = 3, markersize = 12, capthick = 3, capsize = 6)
# plt.legend([str(i) + ' matches' for i in range(1,6)],
# loc = 'upper right', shadow = True, fancybox = True)
# plt.title('original sessions')
# #plt.savefig('sessions_plos/' + feature + '.png')
# plt.show()
# plt.plot(bar_x, bar_y, color = '#30a2da', marker = 's')
# plt.title('original sessions')
# plt.ylabel(r'$\Delta \%$')
# plt.xlabel('session length')
# plt.xlim(0.5,5.5)
# plt.ylim(-15,2)
# ax1.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
# plt.xticks(bar_x, bar_x)
# plt.show()
# feature = 'winner'
# ls = ['solid','solid','dashed','dashdot','dotted']
# sym = ['o','*','s','v','^']
# plt.figure()
# plt.xlabel('match position')
# plt.ylabel(feature)
# plt.xlim(0.9,6.1)
# plt.xticks(np.arange(1,7), xrange(1,7))
# rand_bar_x = []
# rand_bar_y = []
# for i in range(1,6):
# stats = sequence_analysis(i, randomized_sequences, feature, verbose = False)
# x = range(1,i+1)
# y = []
# err = []
# for item in stats:
# y.append(stats[item]['mu'])
# err.append(stats[item]['ci'])
# # alternative visualization
# xx = [x[0], x[-1]]
# yy = [y[0], y[-1]]
# eerr = [err[0], err[-1]]
# # bar plot data
# rand_bar_y.append((y[-1] - y[0]) / float(y[0]) * 100)
# rand_bar_x.append(x[-1])
# plt.errorbar(xx, yy, yerr = eerr, fmt = sym[len(y)-1], linestyle = ls[len(y)-1],
# lw = 3, markersize = 12, capthick = 3, capsize = 6)
# plt.legend([str(i) + ' matches' for i in range(1,6)],
# loc = 'lower right', shadow = True, fancybox = True)
# #plt.savefig('sessions_plos/' + feature + '.png')
# plt.title('randomized index sessions')
# plt.show()
# plt.plot(rand_bar_x, rand_bar_y, color = '#30a2da', marker = 's')
# plt.title('randomized index sessions')
# plt.ylabel(r'$\Delta \%$')
# plt.xlabel('session length')
# plt.xlim(0.5,5.5)
# plt.ylim(-15,2)
# ax1.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
# plt.xticks(rand_bar_x, rand_bar_x)
# plt.show()
# Mean of `feature` by match position for sessions of 1..5 matches:
# original sessions on the left, randomized index sessions on the right.
feature = 'winner'
ls = ['solid','solid','dashed','dashdot','dotted']
sym = ['o','*','s','v','^']
col = ['#30a2da', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b']
fig = plt.figure(figsize = (10,5))
fig.subplots_adjust(wspace=.25)
orig = fig.add_subplot(121)
rand = fig.add_subplot(122, sharey = orig)
for panel in (orig, rand):
    panel.set_xlabel('match position', fontsize=font_size)
    panel.set_ylabel('performance', fontsize=font_size)
    panel.set_xlim(0.5,5.5)
    # Only the tick positions are passed: the meaning of a second positional
    # argument of set_xticks depends on the matplotlib version (minor flag
    # vs. labels), and the default labels already show 1..5.
    panel.set_xticks(np.arange(1,6))
rand_bar_y = []
bar_x = []
bar_y = []
for i in range(1,6):
    stats = sequence_analysis(i, sequences, feature, verbose = False)
    rand_stats = sequence_analysis(i, randomized_sequences, feature, verbose = False)
    x = list(range(1,i+1))
    y = [stats[item]['mu'] for item in stats]
    err = [stats[item]['ci'] for item in stats]
    rand_y = [rand_stats[item]['mu'] for item in stats]
    rand_err = [rand_stats[item]['ci'] for item in stats]
    # change plot data: relative change (%) between first and last match
    bar_y.append((y[-1] - y[0]) / float(y[0]) * 100)
    bar_x.append(x[-1])
    rand_bar_y.append( (rand_y[-1] - rand_y[0]) / float(rand_y[0]) * 100 )
    # One marker / line style / colour per session length, same on both panels.
    style = dict(fmt = sym[i-1], linestyle = ls[i-1], color = col[i-1],
                 lw = 3, markersize = 10, capthick = 3, capsize = 6)
    orig.errorbar(x, y, yerr = err, **style)
    rand.errorbar(x, rand_y, yerr = rand_err, **style)
# orig.legend([str(i) + ' matches' for i in range(1,6)],
#             loc = 'upper right', shadow = True, fancybox = True)
rand.legend([str(i) + ' matches' for i in range(1,6)],
            loc = 'lower right', shadow = True, fancybox = True)
#orig.set_title('original sessions', fontsize=font_size)
rand.set_title('randomized index sessions', fontsize=font_size)
plt.savefig('deterioration.pdf')
plt.show()
# visualize change (%)
# fig = plt.figure(figsize = (10,3))
# fig.subplots_adjust(wspace=.5)
# orig = fig.add_subplot(121)
# rand = fig.add_subplot(122)
# orig.plot(bar_x, bar_y, color = '#30a2da', marker = 's')
# orig.set_title('original sessions')
# orig.set_ylabel(r'$\Delta \% $' + feature)
# orig.set_xlabel('session length')
# orig.set_xlim(0.5,5.5)
# orig.set_ylim(-15,2)
# orig.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
# orig.set_xticks(bar_x, bar_x)
# rand.plot(bar_x, rand_bar_y, color = '#30a2da', marker = 's')
# rand.set_title('randomized index sessions')
# rand.set_ylabel(r'$\Delta \% $' + feature)
# rand.set_xlabel('session length')
# rand.set_xlim(0.5,5.5)
# rand.set_ylim(-15,2)
# rand.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
# rand.set_xticks(bar_x, bar_x)
# visualize change (%)
# Relative change between first and last match of a session, by session
# length, for original and randomized index sessions.
fig = plt.figure(figsize = (5,5))
# fig.subplots_adjust(wspace=.25)
both = fig.add_subplot(111)
both.plot(bar_x, bar_y, color = '#30a2da', marker = 'o', lw = 2, label = 'original sessions')
both.plot(bar_x, rand_bar_y, color = '#fc4f30', marker = 's', lw = 2, label = 'randomized index sessions', ls = 'dashed')
both.set_title('')
both.set_ylabel(r'$\Delta \% $' + 'performance', fontsize = font_size)
both.set_xlabel('session length', fontsize = font_size)
both.set_xlim(0.5,5.5)
both.set_ylim(-35,15)
# xmin/xmax of axhline are axes fractions (0-1); the defaults span the axes.
both.axhline(y = 0, linewidth=2, color = 'k', ls = 'dotted')
# Tick positions only: a second positional argument of set_xticks means
# different things across matplotlib versions.
both.set_xticks(bar_x)
both.legend(loc = 'lower left', fontsize = 12)
plt.savefig('randomized_deterioration_all.pdf')
print('ORIGINAL SESSIONS')
for length, change in zip(bar_x, bar_y):
    print(' '.join(['matches', str(round(length,2)), '->', str(round(change,2))]))
print('')
print('RANDOMIZED INDEX SESSIONS')
for length, change in zip(bar_x, rand_bar_y):
    print(' '.join(['matches', str(round(length,2)), '->', str(round(change,2))]))
# compute original sessions
# Players whose match count is at or above the 95th percentile.
match_counts = player_stats[('winner','count')]
player_stats_top = player_stats[match_counts >= np.percentile(match_counts, 95)]
top_ids = set(player_stats_top.index)
# `sequences` already holds the sessions of every player in `player_stats`
# (built from the same `df` and threshold `m`), so the selected players'
# sessions are taken from it instead of being cut a second time row by row.
sequences_top = []
# compute randomized index sessions
randomized_sequences_top = []
for source in sequences:
    if source.player_id not in top_ids:
        continue
    obj = Sequence()
    obj.player_id = source.player_id
    obj.sequences = list(source.sequences)
    sequences_top.append(obj)
    shuffled = Sequence()
    shuffled.player_id = source.player_id
    # Same sessions, rows in random order and re-indexed from 0.
    shuffled.sequences = [session.sample(frac=1).reset_index(drop=True)
                          for session in source.sequences]
    randomized_sequences_top.append(shuffled)
# First vs. last match of sessions of 1..5 matches for `sequences_top`:
# original sessions on the left, randomized index sessions on the right.
feature = 'winner'
ls = ['solid','solid','dashed','dashdot','dotted']
sym = ['o','*','s','v','^']
col = ['#30a2da', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b']
fig = plt.figure(figsize = (10,5))
fig.subplots_adjust(wspace=.5)
orig = fig.add_subplot(121)
rand = fig.add_subplot(122, sharey = orig)
for panel in (orig, rand):
    panel.set_xlabel('match position')
    panel.set_ylabel('performance')
    panel.set_xlim(0.5,5.5)
    # Tick positions only: a second positional argument of set_xticks is
    # read as the minor flag or as labels depending on the matplotlib version.
    panel.set_xticks(np.arange(1,6))
rand_bar_y = []
bar_x = []
bar_y = []
for i in range(1,6):
    stats = sequence_analysis(i, sequences_top, feature, verbose = False)
    rand_stats = sequence_analysis(i, randomized_sequences_top, feature, verbose = False)
    x = list(range(1,i+1))
    y = [stats[item]['mu'] for item in stats]
    err = [stats[item]['ci'] for item in stats]
    rand_y = [rand_stats[item]['mu'] for item in stats]
    rand_err = [rand_stats[item]['ci'] for item in stats]
    # alternative visualization: only the first and the last match are drawn
    xx = [x[0], x[-1]]
    yy = [y[0], y[-1]]
    eerr = [err[0], err[-1]]
    rand_yy = [rand_y[0], rand_y[-1]]
    rand_eerr = [rand_err[0], rand_err[-1]]
    # change plot data: relative change (%) between first and last match
    bar_y.append((y[-1] - y[0]) / float(y[0]) * 100)
    bar_x.append(x[-1])
    rand_bar_y.append( (rand_y[-1] - rand_y[0]) / float(rand_y[0]) * 100 )
    style = dict(fmt = sym[i-1], linestyle = ls[i-1], color = col[i-1],
                 lw = 3, markersize = 12, capthick = 3, capsize = 6)
    orig.errorbar(xx, yy, yerr = eerr, **style)
    rand.errorbar(xx, rand_yy, yerr = rand_eerr, **style)
orig.legend([str(i) + ' matches' for i in range(1,6)],
            loc = 'upper right', shadow = True, fancybox = True)
rand.legend([str(i) + ' matches' for i in range(1,6)],
            loc = 'lower right', shadow = True, fancybox = True)
orig.set_title('original sessions')
rand.set_title('randomized index sessions')
#plt.savefig('sessions_plos/' + feature + '.png')
plt.show()
# visualize change (%)
# fig = plt.figure(figsize = (6,3))
# fig.subplots_adjust(wspace=.5)
# orig = fig.add_subplot(121)
# rand = fig.add_subplot(122)
# orig.plot(bar_x, bar_y, color = '#30a2da', marker = 's')
# orig.set_title('original sessions')
# orig.set_ylabel(r'$\Delta \% $' + feature)
# orig.set_xlabel('session length')
# orig.set_xlim(0.5,5.5)
# orig.set_ylim(-15,2)
# orig.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
# orig.set_xticks(bar_x, bar_x)
# rand.plot(bar_x, rand_bar_y, color = '#30a2da', marker = 's')
# rand.set_title('randomized index sessions')
# rand.set_ylabel(r'$\Delta \% $' + feature)
# rand.set_xlabel('session length')
# rand.set_xlim(0.5,5.5)
# rand.set_ylim(-15,2)
# rand.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
# rand.set_xticks(bar_x, bar_x)
# Keep the values of this group under their own names for the combined figure.
above_bar_y = bar_y
above_rand_bar_y = rand_bar_y
# visualize change (%)
fig = plt.figure(figsize = (6,3))
fig.subplots_adjust(wspace=.5)
both = fig.add_subplot(111)
both.plot(bar_x, bar_y, color = '#30a2da', marker = 'o', label = 'original sessions')
both.plot(bar_x, rand_bar_y, color = '#fc4f30', marker = 's', label = 'randomized index sessions', ls = 'dashed')
both.set_title('Above 95th percentile Experience')
both.set_ylabel(r'$\Delta \% $' + 'performance')
both.set_xlabel('session length')
both.set_xlim(0.5,5.5)
both.set_ylim(-35,15)
# xmin/xmax of axhline are axes fractions (0-1); the defaults span the axes.
both.axhline(y = 0, linewidth=2, color = 'k', ls = 'dotted')
# Tick positions only: a second positional argument of set_xticks means
# different things across matplotlib versions.
both.set_xticks(bar_x)
both.legend(loc = 'lower left')
print('ORIGINAL SESSIONS')
for length, change in zip(bar_x, bar_y):
    print(' '.join(['matches', str(round(length,2)), '->', str(round(change,2))]))
print('')
print('RANDOMIZED INDEX SESSIONS')
for length, change in zip(bar_x, rand_bar_y):
    print(' '.join(['matches', str(round(length,2)), '->', str(round(change,2))]))
# compute original sessions
# Players whose match count is at or below the 5th percentile.
match_counts = player_stats[('winner','count')]
player_stats_worst = player_stats[match_counts <= np.percentile(match_counts, 5)]
worst_ids = set(player_stats_worst.index)
# `sequences` already holds the sessions of every player in `player_stats`
# (built from the same `df` and threshold `m`), so the selected players'
# sessions are taken from it instead of being cut a second time row by row.
sequences_worst = []
# compute randomized index sessions
randomized_sequences_worst = []
for source in sequences:
    if source.player_id not in worst_ids:
        continue
    obj = Sequence()
    obj.player_id = source.player_id
    obj.sequences = list(source.sequences)
    sequences_worst.append(obj)
    shuffled = Sequence()
    shuffled.player_id = source.player_id
    # Same sessions, rows in random order and re-indexed from 0.
    shuffled.sequences = [session.sample(frac=1).reset_index(drop=True)
                          for session in source.sequences]
    randomized_sequences_worst.append(shuffled)
# First vs. last match of sessions of 1..5 matches for `sequences_worst`:
# original sessions on the left, randomized index sessions on the right.
feature = 'winner'
ls = ['solid','solid','dashed','dashdot','dotted']
sym = ['o','*','s','v','^']
col = ['#30a2da', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b']
fig = plt.figure(figsize = (10,5))
fig.subplots_adjust(wspace=.5)
orig = fig.add_subplot(121)
rand = fig.add_subplot(122, sharey = orig)
for panel in (orig, rand):
    panel.set_xlabel('match position')
    panel.set_ylabel('performance')
    panel.set_xlim(0.5,5.5)
    # Tick positions only: a second positional argument of set_xticks is
    # read as the minor flag or as labels depending on the matplotlib version.
    panel.set_xticks(np.arange(1,6))
rand_bar_y = []
bar_x = []
bar_y = []
for i in range(1,6):
    stats = sequence_analysis(i, sequences_worst, feature, verbose = False)
    rand_stats = sequence_analysis(i, randomized_sequences_worst, feature, verbose = False)
    x = list(range(1,i+1))
    y = [stats[item]['mu'] for item in stats]
    err = [stats[item]['ci'] for item in stats]
    rand_y = [rand_stats[item]['mu'] for item in stats]
    rand_err = [rand_stats[item]['ci'] for item in stats]
    # alternative visualization: only the first and the last match are drawn
    xx = [x[0], x[-1]]
    yy = [y[0], y[-1]]
    eerr = [err[0], err[-1]]
    rand_yy = [rand_y[0], rand_y[-1]]
    rand_eerr = [rand_err[0], rand_err[-1]]
    # change plot data: relative change (%) between first and last match
    bar_y.append((y[-1] - y[0]) / float(y[0]) * 100)
    bar_x.append(x[-1])
    rand_bar_y.append( (rand_y[-1] - rand_y[0]) / float(rand_y[0]) * 100 )
    style = dict(fmt = sym[i-1], linestyle = ls[i-1], color = col[i-1],
                 lw = 3, markersize = 12, capthick = 3, capsize = 6)
    orig.errorbar(xx, yy, yerr = eerr, **style)
    rand.errorbar(xx, rand_yy, yerr = rand_eerr, **style)
orig.legend([str(i) + ' matches' for i in range(1,6)],
            loc = 'upper right', shadow = True, fancybox = True)
rand.legend([str(i) + ' matches' for i in range(1,6)],
            loc = 'lower right', shadow = True, fancybox = True)
orig.set_title('original sessions')
rand.set_title('randomized index sessions')
#plt.savefig('sessions_plos/' + feature + '.png')
plt.show()
# visualize change (%)
# fig = plt.figure(figsize = (10,3))
# fig.subplots_adjust(wspace=.5)
# orig = fig.add_subplot(121)
# rand = fig.add_subplot(122)
# orig.plot(bar_x, bar_y, color = '#30a2da', marker = 's')
# orig.set_title('original sessions')
# orig.set_ylabel(r'$\Delta \% $' + feature)
# orig.set_xlabel('session length')
# orig.set_xlim(0.5,5.5)
# orig.set_ylim(-35,2)
# orig.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
# orig.set_xticks(bar_x, bar_x)
# rand.plot(bar_x, rand_bar_y, color = '#30a2da', marker = 's')
# rand.set_title('randomized index sessions')
# rand.set_ylabel(r'$\Delta \% $' + feature)
# rand.set_xlabel('session length')
# rand.set_xlim(0.5,5.5)
# rand.set_ylim(-35,2)
# rand.axhline(y = 0, xmin=0, xmax = 6, linewidth=2, color = 'k', ls = 'dashed')
# rand.set_xticks(bar_x, bar_x)
# Keep the values of this group under their own names for the combined figure.
below_bar_y = bar_y
below_rand_bar_y = rand_bar_y
# visualize change (%)
fig = plt.figure(figsize = (6,3))
fig.subplots_adjust(wspace=.5)
both = fig.add_subplot(111)
both.plot(bar_x, bar_y, color = '#30a2da', marker = 'o', label = 'original sessions')
both.plot(bar_x, rand_bar_y, color = '#fc4f30', marker = 's', label = 'randomized index sessions', ls = 'dashed')
both.set_title('Below 5th percentile Experience')
both.set_ylabel(r'$\Delta \% $' + 'performance')
both.set_xlabel('session length')
both.set_xlim(0.5,5.5)
both.set_ylim(-35,15)
# xmin/xmax of axhline are axes fractions (0-1); the defaults span the axes.
both.axhline(y = 0, linewidth=2, color = 'k', ls = 'dotted')
# Tick positions only: a second positional argument of set_xticks means
# different things across matplotlib versions.
both.set_xticks(bar_x)
#both.legend(loc = 'lower left')
print('ORIGINAL SESSIONS')
for length, change in zip(bar_x, bar_y):
    print(' '.join(['matches', str(round(length,2)), '->', str(round(change,2))]))
print('')
print('RANDOMIZED INDEX SESSIONS')
for length, change in zip(bar_x, rand_bar_y):
    print(' '.join(['matches', str(round(length,2)), '->', str(round(change,2))]))
# Side-by-side bar charts of the relative change (%) by session length:
# `above_*` values on the left panel, `below_*` values on the right panel.
fig = plt.figure(figsize = (10,5))
fig.subplots_adjust(wspace=.25)
above = fig.add_subplot(121)
below = fig.add_subplot(122)
bar_positions = np.array(bar_x)
panels = [
    (above, above_bar_y, above_rand_bar_y, 'High Experience Players'),
    (below, below_bar_y, below_rand_bar_y, 'Low Experience Players'),
]
for panel, original_change, randomized_change, title in panels:
    # The two bar series are shifted by -.1 / +.1 around each session length.
    panel.bar(bar_positions - .1, original_change, color = '#30a2da', width = .25, label = 'original sessions')
    panel.bar(bar_positions + .1, randomized_change, color = '#fc4f30', hatch = '//', width = .25, label = 'randomized index sessions')
    panel.set_title(title, fontsize = font_size)
    panel.set_ylabel(r'$\Delta \% $ performance', fontsize = font_size)
    panel.set_xlabel('session length', fontsize = font_size)
    panel.set_xlim(0.5,5.5)
    panel.set_ylim(-35,15)
    # xmin/xmax of axhline are axes fractions (0-1); the defaults span the axes.
    panel.axhline(y = 0, linewidth=2, color = 'k', ls = 'dashed')
    # Tick positions only: a second positional argument of set_xticks means
    # different things across matplotlib versions.
    panel.set_xticks(bar_x)
# Only the left panel carries the legend.
above.legend(loc = 'lower left')
plt.savefig('deterioration_above95below5.pdf')
# Mean session length (number of matches) per player, using all sessions.
top_mean_sessions = [
    np.mean([len(seq.index) for seq in player.sequences if len(seq.index) > 0])
    for player in sequences_top
]
worst_mean_sessions = [
    np.mean([len(seq.index) for seq in player.sequences if len(seq.index) > 0])
    for player in sequences_worst
]
# boxplot
# The two lists describe different players, i.e. independent samples that
# may differ in size. The paired Wilcoxon signed-rank test used before
# needed one value to be dropped to equalise the lengths and paired
# unrelated players; the Mann-Whitney U test compares the groups directly.
# NOTE(review): this changes the reported statistic and p-value.
from scipy.stats import mannwhitneyu
test = mannwhitneyu(worst_mean_sessions, top_mean_sessions, alternative = 'two-sided')
print(test)
if test.pvalue > 0.05:
    print('No significant difference :(')
else:
    print('Significant difference detected :)')
fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot([worst_mean_sessions, top_mean_sessions], patch_artist=True, widths = .75)
ax.set_xticklabels(['Low Experience\nPlayers', 'High Experience\nPlayers'])
ax.set_ylabel('average session length')
for box in bp['boxes']:
    # change outline color
    box.set( color='black', linewidth=2)
    # change fill color
    box.set( facecolor = 'white', alpha = .75 )
## change color and linewidth of the whiskers
for whisker in bp['whiskers']:
    whisker.set(color='#333333', linewidth=2)
## change color and linewidth of the caps
for cap in bp['caps']:
    cap.set(color='#333333', linewidth=2)
## change color and linewidth of the medians
for median in bp['medians']:
    median.set(color='#fc4f30', linewidth=2)
## change the style of fliers and their fill
for flier in bp['fliers']:
    flier.set(marker='s', markerfacecolor='#fc4f30')
plt.savefig('average_session_all.pdf')
plt.show()
# Mean session length per player, restricted to each player's first 100 sessions.
top_mean_sessions = [
    np.mean([len(seq.index) for seq in player.sequences[:100] if len(seq.index) > 0])
    for player in sequences_top
]
worst_mean_sessions = [
    np.mean([len(seq.index) for seq in player.sequences[:100] if len(seq.index) > 0])
    for player in sequences_worst
]
# boxplot
# The two lists describe different players, i.e. independent samples that
# may differ in size. The paired Wilcoxon signed-rank test used before
# needed one value to be dropped to equalise the lengths and paired
# unrelated players; the Mann-Whitney U test compares the groups directly.
# NOTE(review): this changes the reported statistic and p-value.
from scipy.stats import mannwhitneyu
test = mannwhitneyu(worst_mean_sessions, top_mean_sessions, alternative = 'two-sided')
print(test)
if test.pvalue > 0.05:
    print('No significant difference :(')
else:
    print('Significant difference detected :)')
fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot([worst_mean_sessions, top_mean_sessions], patch_artist=True, widths = .75)
ax.set_xticklabels(['Below 5th percentile\nExperience', 'Above 95th percentile\nExperience'])
ax.set_ylabel('average session length')
for box in bp['boxes']:
    # change outline color
    box.set( color='black', linewidth=2)
    # change fill color
    box.set( facecolor = 'white', alpha = .75 )
## change color and linewidth of the whiskers
for whisker in bp['whiskers']:
    whisker.set(color='#333333', linewidth=2)
## change color and linewidth of the caps
for cap in bp['caps']:
    cap.set(color='#333333', linewidth=2)
## change color and linewidth of the medians
for median in bp['medians']:
    median.set(color='#fc4f30', linewidth=2)
## change the style of fliers and their fill
for flier in bp['fliers']:
    flier.set(marker='s', markerfacecolor='#fc4f30')
plt.show()
# Mean session length per player, restricted to each player's first 50 sessions.
top_mean_sessions = [
    np.mean([len(seq.index) for seq in player.sequences[:50] if len(seq.index) > 0])
    for player in sequences_top
]
worst_mean_sessions = [
    np.mean([len(seq.index) for seq in player.sequences[:50] if len(seq.index) > 0])
    for player in sequences_worst
]
# boxplot
# The two lists describe different players, i.e. independent samples that
# may differ in size. The paired Wilcoxon signed-rank test used before
# needed one value to be dropped to equalise the lengths and paired
# unrelated players; the Mann-Whitney U test compares the groups directly.
# NOTE(review): this changes the reported statistic and p-value.
from scipy.stats import mannwhitneyu
test = mannwhitneyu(worst_mean_sessions, top_mean_sessions, alternative = 'two-sided')
print(test)
if test.pvalue > 0.05:
    print('No significant difference :(')
else:
    print('Significant difference detected :)')
fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot([worst_mean_sessions, top_mean_sessions], patch_artist=True, widths = .75)
ax.set_xticklabels(['Below 5th percentile\nExperience', 'Above 95th percentile\nExperience'])
ax.set_ylabel('average session length')
for box in bp['boxes']:
    # change outline color
    box.set( color='#333333', linewidth=2)
    # change fill color
    box.set( facecolor = '#30a2da', alpha = .75 )
## change color and linewidth of the whiskers
for whisker in bp['whiskers']:
    whisker.set(color='#333333', linewidth=2)
## change color and linewidth of the caps
for cap in bp['caps']:
    cap.set(color='#333333', linewidth=2)
## change color and linewidth of the medians
for median in bp['medians']:
    median.set(color='#333333', linewidth=2)
## change the style of fliers and their fill
for flier in bp['fliers']:
    flier.set(marker='s', markerfacecolor='#fc4f30', alpha=0.5)
plt.show()
# Mean session length per player, restricted to each player's first 20 sessions.
top_mean_sessions = [
    np.mean([len(seq.index) for seq in player.sequences[:20] if len(seq.index) > 0])
    for player in sequences_top
]
worst_mean_sessions = [
    np.mean([len(seq.index) for seq in player.sequences[:20] if len(seq.index) > 0])
    for player in sequences_worst
]
# boxplot
# The two lists describe different players, i.e. independent samples that
# may differ in size. The paired Wilcoxon signed-rank test used before
# needed one value to be dropped to equalise the lengths and paired
# unrelated players; the Mann-Whitney U test compares the groups directly.
# NOTE(review): this changes the reported statistic and p-value.
from scipy.stats import mannwhitneyu
test = mannwhitneyu(worst_mean_sessions, top_mean_sessions, alternative = 'two-sided')
print(test)
if test.pvalue > 0.05:
    print('No significant difference :(')
else:
    print('Significant difference detected :)')
fig = plt.figure()
ax = fig.add_subplot(111)
bp = ax.boxplot([worst_mean_sessions, top_mean_sessions], patch_artist=True, widths = .75)
ax.set_xticklabels(['Low Experience\nPlayers', 'High Experience\nPlayers'])
ax.set_ylabel('average session length')
ax.set_title('')
for box in bp['boxes']:
    # change outline color
    box.set( color='black', linewidth=2)
    # change fill color
    box.set( facecolor = 'white' )
## change color and linewidth of the whiskers
for whisker in bp['whiskers']:
    whisker.set(color='#333333', linewidth=2)
## change color and linewidth of the caps
for cap in bp['caps']:
    cap.set(color='#333333', linewidth=2)
## change color and linewidth of the medians
for median in bp['medians']:
    median.set(color='#fc4f30', linewidth=2)
## change the style of fliers and their fill
for flier in bp['fliers']:
    flier.set(marker='s', markerfacecolor='#fc4f30')
plt.savefig('average_session_20.pdf')
plt.show()