import numpy as np
import matplotlib.pyplot as pl
import pymc
import ipdb
import seaborn as sns
#import daft
%matplotlib inline

After a football season finishes, we can use the results of all the matches to deduce the quality of each team. This has been done with the italian league in the paper 'Bayesian hierarchical model for the prediction of football results' by Gianluca Baio and Marta A. Blangiardo. This work has been applied to the Premier A league in http://danielweitzenfeld.github.io/passtheroc/blog/2014/10/28/bayes-premier-league/

The study is done using a quite simple Bayesian hierarchical model that hopefully captures all the dependencies. The model starts by assuming that the number of goals $y_{ij}$ in the $i$-match for each team acting as $j=h$ (home) or $j=a$ (away) team are given by:

$$y_{ij} = \mathrm{Poisson}(\theta_{ij})$$

In other words, the number of goals of a given team in a given match is extracted from a Poisson distribution with a mean that depends on the match. These means are modeled as a combination of attack and defense quality for each team:

$$\log \theta_{ih} = \mathrm{intercept} + \mathrm{home} + \mathrm{att}_{ih} + \mathrm{def}_{ia}$$$$\log \theta_{ia} = \mathrm{intercept} + \mathrm{att}_{ia} + \mathrm{def}_{ih}$$

This model assumes that the home team has an advantage (that is constant for all teams). This model assumes that the number of goals scored by the home team in game $i$ depends on the sum of the attack quality of the home team and the defense quality of the away team (obviously defense terms will usually be negative unless the away team helps the home team win) by scoring goals by themselves. The same applies for the away team in game $i$.

In order to constraint all the attack and defense parameters, we apply a Bayesian hierarchical model by assuming that all of them are extracted from a common distribution. We use simple normal distributions with zero mean:

$$\mathrm{att}_t \sim \mathrm{Normal}(0,\tau_\mathrm{att})$$$$\mathrm{def}_t \sim \mathrm{Normal}(0,\tau_\mathrm{def})$$

Finally, we set quite uninformative priors for the parameters and hyperparameters

$$\mathrm{intercept} \sim \mathrm{Normal}(0,10^{-4})$$$$\mathrm{home} \sim \mathrm{Normal}(0,10^{-4})$$$$\tau_\mathrm{att} \sim \mathrm{Gamma}(0.1,0.1)$$$$\tau_\mathrm{def} \sim \mathrm{Gamma}(0.1,0.1)$$

In principle, this model might produce posterior for the attack and defense parameters that can be improper. For this reason, we use the conditions that:

$$\sum_t \mathrm{att}_t = 0$$$$\sum_t \mathrm{def}_t = 0$$

The graphical model for this model is the following:

pgm = daft.PGM([4.7, 6.35], origin=[0, 0.2])
pgm.add_node(daft.Node("tau_att", r"$\tau_\mathrm{att}$", 1, 3))
pgm.add_node(daft.Node("tau_def", r"$\tau_\mathrm{def}$", 1, 4))
pgm.add_node(daft.Node("home", r"$\mathrm{home}$", 1, 5))
pgm.add_node(daft.Node("intercept", r"$\mathrm{intercept}$", 1, 6))
pgm.add_node(daft.Node("atth", r"$att_{ih}$", 2, 4.5))
pgm.add_node(daft.Node("defa", r"$def_{ia}$", 2, 3.5))
pgm.add_node(daft.Node("atta", r"$att_{ia}$", 2, 2.5))
pgm.add_node(daft.Node("defh", r"$def_{ih}$", 2, 1.5))
pgm.add_node(daft.Node("theta_a", r"$\theta_{ia}$", 3, 4.5))
pgm.add_node(daft.Node("theta_h", r"$\theta_{ih}$", 3, 2.5))
pgm.add_node(daft.Node("y_a", r"$y_{ia}$", 4, 4.5, observed=True))
pgm.add_node(daft.Node("y_h", r"$y_{ih}$", 4, 2.5, observed=True))
pgm.add_plate(daft.Plate([1.5, 1, 3, 4], label=r"game $i$"))
pgm.add_edge("intercept", "theta_a")
pgm.add_edge("intercept", "theta_h")
pgm.add_edge("home", "theta_a")
pgm.add_edge("tau_att", "atth")
pgm.add_edge("tau_att", "atta")
pgm.add_edge("tau_def", "defh")
pgm.add_edge("tau_def", "defa")
pgm.add_edge("atth", "theta_h")
pgm.add_edge("defa", "theta_h")
pgm.add_edge("atta", "theta_a")
pgm.add_edge("defh", "theta_a")
pgm.add_edge("theta_a", "y_a")
pgm.add_edge("theta_h", "y_h")
pgm.render()

<matplotlib.axes.Axes at 0xa909224c>

We read the results of all the games of the 2013-2014 season of the Liga BBVA in Spain from Wikipedia and build the model using PyMC.

# Read team names
season = []
home = []
away = []
homeGoals = []
awayGoals = []
for i in range(15):
	fileName = '/scratch/Dropbox/THEORY/footballData/results_{0:02d}{1:02d}'.format(i,i+1)
	f = open(fileName)
	lines = f.readlines()
	for l in lines[1:]:
		vals = l.split(',')
		season.append(i)
		home.append(vals[2])
		away.append(vals[3])
		homeGoals.append(vals[4])
		awayGoals.append(vals[5])
seasonAll = np.asarray(season)
homeAll = np.asarray(home)
awayAll = np.asarray(away)
homeAllGoals = np.asarray(homeGoals)
awayAllGoals = np.asarray(awayGoals)

# Choose season 2013-2014
ind = np.where(seasonAll == 14)
nGames = len(ind[0])
home = homeAll[ind]
away = awayAll[ind]
homeGoals = homeAllGoals[ind]
awayGoals = awayAllGoals[ind]
uniq, uniqIndex = np.unique(home, return_index=True)
nTeams = len(np.unique(home))
for i in range(nTeams):
    ind = np.where(home == uniq[i])
    home[ind] = i
    ind = np.where(away == uniq[i])
    away[ind] = i

# Read league results and put it in two arrays
results = np.zeros((2,nTeams,nTeams))-1
info = {'home': np.zeros(nGames, dtype=np.int8), 'away': np.zeros(nGames, dtype=np.int8), 
		  'homeScore': np.zeros(nGames, dtype=np.int8), 'awayScore': np.zeros(nGames, dtype=np.int8)}

for loop in range(nGames):
    info['homeScore'][loop] = homeGoals[loop]
    info['awayScore'][loop] = awayGoals[loop]
    info['home'][loop] = home[loop]
    info['away'][loop] = away[loop]

attStartingPoints = np.log(info['awayScore'].mean())
defStartingPoints = -np.log(info['homeScore'].mean())
			
# Hierarchical model
# Hyperpriors
home = pymc.Normal('home', 0, .0001, value=0)
tau_att = pymc.Gamma('tau_att', .1, .1, value=10)
tau_def = pymc.Gamma('tau_def', .1, .1, value=10)
intercept = pymc.Normal('intercept', 0, .0001, value=0)

#team-specific parameters
atts_star = pymc.Normal("atts_star", 
                        mu=0, 
                        tau=tau_att, 
                        size=nTeams, 
                        value=np.ones(nTeams)*attStartingPoints)
defs_star = pymc.Normal("defs_star", 
                        mu=0, 
                        tau=tau_def, 
                        size=nTeams, 
                        value=np.ones(nTeams)*defStartingPoints) 


# trick to code the sum to zero contraint
@pymc.deterministic
def atts(atts_star=atts_star):
    atts = atts_star.copy()
    atts = atts - np.mean(atts_star)
    return atts

@pymc.deterministic
def defs(defs_star=defs_star):
    defs = defs_star.copy()
    defs = defs - np.mean(defs_star)
    return defs

@pymc.deterministic
def home_theta(home_team=info['home'], 
               away_team=info['away'], 
               home=home, 
               atts=atts, 
               defs=defs, 
               intercept=intercept): 
    return np.exp(intercept + 
                  home + 
                  atts[home_team] + 
                  defs[away_team])
  
@pymc.deterministic
def away_theta(home_team=info['home'], 
               away_team=info['away'], 
               home=home, 
               atts=atts, 
               defs=defs, 
               intercept=intercept): 
    return np.exp(intercept + 
                  atts[away_team] + 
                  defs[home_team])   


home_goals = pymc.Poisson('home_goals', 
                          mu=home_theta, 
                          value=info['homeScore'], 
                          observed=True)
away_goals = pymc.Poisson('away_goals', 
                          mu=away_theta, 
                          value=info['awayScore'], 
                          observed=True)

mcmc = pymc.MCMC([home, intercept, tau_att, tau_def, 
                  home_theta, away_theta, 
                  atts_star, defs_star, atts, defs, 
                  home_goals, away_goals])
map_ = pymc.MAP( mcmc )
map_.fit()
mcmc.sample(20000, 4000, 20)

 [-----------------100%-----------------] 20000 of 20000 complete in 54.7 sec

Now we produce some plots once we verify that the Markov chain is converged. The first one shows the posterior median and 95% credible intervals for the attack and defense parameters for each team for the whole season.

# Defense/atack efficiency
teamNames = uniq
fig, ax = pl.subplots(ncols=1, nrows=2, figsize=(10,8))
medianA = atts.stats()['quantiles'][50]
intervalALow = medianA - atts.stats()['95% HPD interval'][:,0]
intervalAUp = atts.stats()['95% HPD interval'][:,1] - medianA
ax[0].errorbar(np.arange(nTeams, dtype=np.int8) + .5, medianA, 
             yerr=[intervalALow,intervalAUp], 
             fmt='o')
ax[0].set_title('HPD of Attack Strength, by Team')
ax[0].set_xlabel('Team')
ax[0].set_ylabel('Posterior Attack Strength')
_= ax[0].set_xticks(np.arange(nTeams, dtype=np.int8) + .5)
_= ax[0].set_xticklabels(teamNames, rotation=45)

medianD = defs.stats()['quantiles'][50]
intervalDLow = medianD - defs.stats()['95% HPD interval'][:,0]
intervalDUp = defs.stats()['95% HPD interval'][:,1] - medianD
ax[1].errorbar(np.arange(nTeams, dtype=np.int8) + .5, medianD, 
             yerr=[intervalDLow,intervalDUp], 
             fmt='o')
ax[1].set_title('HPD of Defense Strength, by Team')
ax[1].set_xlabel('Team')
ax[1].set_ylabel('Posterior Defense Strength')
_= ax[1].set_xticks(np.arange(nTeams, dtype=np.int8) + .5)
_= ax[1].set_xticklabels(teamNames, rotation=45)
pl.tight_layout()

/usr/pkg/python/Python-2.7.6/lib/python2.7/site-packages/matplotlib-1.4.3-py2.7-linux-x86_64.egg/matplotlib/font_manager.py:1282: UserWarning: findfont: Font family [u'Arial'] not found. Falling back to Bitstream Vera Sans
  (prop.get_family(), self.defaultFamily[fontext]))

The next plot shows the attack vs. defense parameters showing the 95% credible interval for each team.

fig, ax = pl.subplots(figsize=(8,6))
ax.errorbar(medianA, medianD, xerr=[intervalALow,intervalAUp], yerr=[intervalDLow,intervalDUp], fmt='o')
for label, x, y in zip(teamNames, medianA, medianD):
	ax.annotate(label, xy=(x,y), xytext = (1,-10), textcoords = 'offset points')
ax.set_title('Attack vs Defense avg effect: 13-14 liga BBVA')
ax.set_xlabel('Attack effect')
ax.set_ylabel('Defense effect')

<matplotlib.text.Text at 0x74fe4d0>

Let's simulate the final position, points and goals of many seasons using the values from the posterior.

def simulateSeason():
    """
    Simulate a season once, using one random draw from the mcmc chain. 
    """
    numSamples = atts.trace().shape[0]
    draw = np.random.randint(0, numSamples)
    attsDraw = atts.trace()[draw, :]
    defsDraw = defs.trace()[draw, :]
    homeDraw = home.trace()[draw]
    interceptDraw = intercept.trace()[draw]
    teams = np.zeros((2,nTeams), dtype=np.int16)
    for h in range(nTeams):
        for a in range(nTeams):
            if (h != a):
                thetah = np.exp(interceptDraw + homeDraw + attsDraw[h] + defsDraw[a])
                thetaa = np.exp(interceptDraw + attsDraw[a] + defsDraw[h])
                goalsh = np.random.poisson(thetah)
                goalsa = np.random.poisson(thetaa)
                                
                teams[0,h] += goalsh
                teams[0,a] += goalsa
                
                if (goalsh > goalsa):
                    teams[1,h] += 3
                if (goalsh == goalsa):
                    teams[1,h] += 1
                    teams[1,a] += 1
                if (goalsh < goalsa):
                    teams[1,a] += 3
                                        
    return teams

nSeasons = 100
goals = np.zeros((nTeams,nSeasons))
points = np.zeros((nTeams,nSeasons))
for i in range(nSeasons):
    res = simulateSeason()
    goals[:,i] = res[0,:]
    points[:,i] = res[1,:]

fig, ax = pl.subplots(nrows=4, ncols=5, figsize=(12,9))
ax = ax.flatten()
for i in range(nTeams):
    ax[i].hist(goals[i,:])
    ax[i].set_title(teamNames[i])
    median = np.median(goals[i,:])
    ax[i].plot([median, median], ax[i].get_ylim())
    ax[i].annotate('Median: %s' % median, xy=(median + 1, ax[i].get_ylim()[1]-10))
pl.tight_layout()

fig, ax = pl.subplots(nrows=4, ncols=5, figsize=(12,9))
ax = ax.flatten()
for i in range(nTeams):
    ax[i].hist(points[i,:])
    ax[i].set_title(teamNames[i])
    median = np.median(points[i,:])
    ax[i].plot([median, median], ax[i].get_ylim())
    ax[i].annotate('Median: %s' % median, xy=(median + 1, ax[i].get_ylim()[1]-10))
pl.tight_layout()

Finally, let's make histograms of the position for each team in the simulated seasons.

fig, ax = pl.subplots(nrows=4, ncols=5, figsize=(12,9))
ax = ax.flatten()
ind = np.argsort(points, axis=0)[::-1]
for i in range(nTeams):
    pos = np.zeros(nSeasons, dtype=np.int16)
    for j in range(nSeasons):
        pos[j] = np.where(ind[:,j] == i)[0]
    ax[i].hist(pos, width=1)
    ax[i].set_title(teamNames[i])
pl.tight_layout()

ligaBBVAHierarchical