Read matches of Italian soccer league 18/19 from www.football-data.co.uk. Then, using pipe %>%
and only one sequence of commands, do the following:
FTHG
and FTAG
)dmy
functionHomeTeamId
and AwayTeamId
with numeric identifiers for the teams (Hint: build a vector of teams and then use the match
function)Finally, write a function that computes team ratings (wins, draws, loses, points, goals for, goals against) from the team matches and use to create a rating data frame sorted by points.
library(readr)
library(dplyr)
library(ggplot2)
library(lubridate)
library(modelr)
# read data from CSV online
matches = read_csv("http://www.football-data.co.uk/mmz4281/1819/I1.csv")
# get teams
teams = sort(unique(matches$HomeTeam))
# transform matches
matches = matches %>%
filter(!is.na(FTHG), !is.na(FTAG)) %>%
select(Date:AR) %>%
mutate(Date = dmy(Date)) %>%
mutate(HomeTeamId = match(HomeTeam, teams), AwayTeamId = match(AwayTeam, teams)) %>%
arrange(Date)
# compute team ratings
# Input:
# df -- data frame with match statistics
# teams -- vector with team names
# output:
# a tibble with ratings for each team
compute_football_ratings = function(df, teams) {
# number of matches
m = nrow(df)
# number of teams
n = length(teams)
# rating vectors
# wins
V = vector("integer", n)
# draws
P = vector("integer", n)
# loses
S = vector("integer", n)
# goals for
GF = vector("integer", n)
# goals against
GS = vector("integer", n)
# points
PT = vector("integer", n)
for (i in 1:m) {
# home team wins
if (df[[i, "FTHG"]] > df[[i, "FTAG"]]) {
V[df[[i, "HomeTeamId"]]] = V[df[[i, "HomeTeamId"]]] + 1L
S[df[[i, "AwayTeamId"]]] = S[df[[i, "AwayTeamId"]]] + 1L
}
# away team wins
if (df[[i, "FTHG"]] < df[[i, "FTAG"]]) {
S[df[[i, "HomeTeamId"]]] = S[df[[i, "HomeTeamId"]]] + 1L
V[df[[i, "AwayTeamId"]]] = V[df[[i, "AwayTeamId"]]] + 1L
}
# draw
if (df[[i, "FTHG"]] == df[[i, "FTAG"]]) {
P[df[[i, "HomeTeamId"]]] = P[df[[i, "HomeTeamId"]]] + 1L
P[df[[i, "AwayTeamId"]]] = P[df[[i, "AwayTeamId"]]] + 1L
}
# goals for
GF[df[[i, "HomeTeamId"]]] = GF[df[[i, "HomeTeamId"]]] + df[[i, "FTHG"]]
GF[df[[i, "AwayTeamId"]]] = GF[df[[i, "AwayTeamId"]]] + df[[i, "FTAG"]]
# goals against
GS[df[[i, "HomeTeamId"]]] = GS[df[[i, "HomeTeamId"]]] + df[[i, "FTAG"]]
GS[df[[i, "AwayTeamId"]]] = GS[df[[i, "AwayTeamId"]]] + df[[i, "FTHG"]]
# points
PT[df[[i, "HomeTeamId"]]] = 3L * V[df[[i, "HomeTeamId"]]] + P[df[[i, "HomeTeamId"]]]
PT[df[[i, "AwayTeamId"]]] = 3L * V[df[[i, "AwayTeamId"]]] + P[df[[i, "AwayTeamId"]]]
}
return(tibble(Team = teams, PT, V = V, P = P, S = S, GF = GF, GS = GS))
}
rating = compute_football_ratings(matches, teams) %>%
arrange(desc(PT))
modPTGF = lm(PT ~ GF, data = rating)
# plot observed and predicted values
rating = add_predictions(rating, modPTGF)
ggplot(rating, aes(x = GF)) +
geom_point(aes(y = PT)) + # observed values
geom_line(aes(y = pred), colour = "red") # predicted values
# add and plot residuals
rating = add_residuals(rating, modPTGF)
ggplot(rating, aes(GF, resid)) +
geom_ref_line(h = 0) +
geom_point()
# sort by residuals
rating %>%
arrange(desc(resid)) %>%
head(3)
## # A tibble: 3 x 9
## Team PT V P S GF GS pred resid
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Juventus 88 28 4 2 68 24 71.6 16.4
## 2 Torino 56 14 14 6 44 29 44.8 11.2
## 3 Cagliari 40 10 10 14 32 47 31.5 8.55
rating %>%
arrange(resid) %>%
head(3)
## # A tibble: 3 x 9
## Team PT V P S GF GS pred resid
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Empoli 29 7 8 19 43 66 43.7 -14.7
## 2 Atalanta 56 16 8 9 66 42 69.4 -13.4
## 3 Sassuolo 38 8 14 11 47 52 48.2 -10.2
# remove residulas
rating = select(rating, -resid, -pred)
# compute model quality
corPTGF = cor(rating$PT, rating$GF)
quality = list()
quality$PFGF = corPTGF^2
quality
## $PFGF
## [1] 0.7476105
modPTGS = lm(PT ~ GS, data = rating)
# plot observed and predicted values
ggplot(rating, aes(x = GS, y = PT)) +
geom_point() +
geom_abline(intercept = modPTGS$coefficients[1],
slope = modPTGS$coefficients[2],
color = "red")
corPTGS = cor(rating$PT, rating$GS)
quality$PFGS = corPTGS^2
quality
## $PFGF
## [1] 0.7476105
##
## $PFGS
## [1] 0.8081161
# add goal spread
rating = mutate(rating, DG = GF - GS)
modPTDG = lm(PT ~ DG, data = rating)
# plot observed and predicted values
ggplot(rating, aes(x = DG, y = PT)) +
geom_point() +
geom_abline(intercept = modPTDG$coefficients[1],
slope = modPTDG$coefficients[2],
color = "red")
corPTDG = cor(rating$PT, rating$DG)
quality$PFDG = corPTDG^2
quality
## $PFGF
## [1] 0.7476105
##
## $PFGS
## [1] 0.8081161
##
## $PFDG
## [1] 0.9321075
modPTGFGS = lm(PT ~ GF + GS, data = rating)
summary(modPTGFGS)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.3449457 9.0561966 5.890436 1.780922e-05
## GF 0.6162210 0.1073007 5.742933 2.390090e-05
## GS -0.7818483 0.1119883 -6.981516 2.212118e-06
coe = abs(summary(modPTGFGS)$coefficients[2:3])
names(coe) = c("attack", "defense")
round(coe / sum(coe), 2)
## attack defense
## 0.44 0.56
quality$PTGFGS = summary(modPTGFGS)$r.squared
quality
## $PFGF
## [1] 0.7476105
##
## $PFGS
## [1] 0.8081161
##
## $PFDG
## [1] 0.9321075
##
## $PTGFGS
## [1] 0.934735