Prolog

Read matches of Italian soccer league 18/19 from www.football-data.co.uk. Then, using pipe %>%and only one sequence of commands, do the following:

  1. filter valid matches (those with a value for variables FTHG and FTAG)
  2. select variables of interest, that is those from Date to AR
  3. create Date objects from Date variable with dmy function
  4. add columns HomeTeamId and AwayTeamId with numeric identifiers for the teams (Hint: build a vector of teams and then use the match function)
  5. arrange the data frame by Date

Finally, write a function that computes team ratings (wins, draws, loses, points, goals for, goals against) from the team matches and use to create a rating data frame sorted by points.

library(readr)
library(dplyr)
library(ggplot2)
library(lubridate)
library(modelr)
# read data from CSV online 
matches = read_csv("http://www.football-data.co.uk/mmz4281/1819/I1.csv")


# get teams
teams = sort(unique(matches$HomeTeam)) 

# transform matches
matches = matches %>% 
  filter(!is.na(FTHG), !is.na(FTAG)) %>% 
  select(Date:AR) %>%
  mutate(Date = dmy(Date)) %>% 
  mutate(HomeTeamId = match(HomeTeam, teams), AwayTeamId = match(AwayTeam, teams)) %>% 
  arrange(Date)
# compute team ratings
# Input: 
# df -- data frame with match statistics
# teams -- vector with team names
# output:
# a tibble with ratings for each team
compute_football_ratings = function(df, teams) {
  
  # number of matches
  m = nrow(df)
  
  # number of teams
  n = length(teams)
  
  # rating vectors
  # wins
  V = vector("integer", n)
  # draws
  P = vector("integer", n)
  # loses
  S = vector("integer", n)
  # goals for
  GF = vector("integer", n)
  # goals against
  GS = vector("integer", n)
  # points
  PT = vector("integer", n)
  
  for (i in 1:m) {
    # home team wins
    if (df[[i, "FTHG"]] > df[[i, "FTAG"]]) {
      V[df[[i, "HomeTeamId"]]] = V[df[[i, "HomeTeamId"]]] + 1L
      S[df[[i, "AwayTeamId"]]] = S[df[[i, "AwayTeamId"]]] + 1L
    }
    # away team wins
    if (df[[i, "FTHG"]] < df[[i, "FTAG"]]) {
      S[df[[i, "HomeTeamId"]]] = S[df[[i, "HomeTeamId"]]] + 1L
      V[df[[i, "AwayTeamId"]]] = V[df[[i, "AwayTeamId"]]] + 1L
    }
    # draw
    if (df[[i, "FTHG"]] == df[[i, "FTAG"]]) {
      P[df[[i, "HomeTeamId"]]] = P[df[[i, "HomeTeamId"]]] + 1L
      P[df[[i, "AwayTeamId"]]] = P[df[[i, "AwayTeamId"]]] + 1L
    }
    # goals for
    GF[df[[i, "HomeTeamId"]]] = GF[df[[i, "HomeTeamId"]]] + df[[i, "FTHG"]]
    GF[df[[i, "AwayTeamId"]]] = GF[df[[i, "AwayTeamId"]]] + df[[i, "FTAG"]]
    
    # goals against
    GS[df[[i, "HomeTeamId"]]] = GS[df[[i, "HomeTeamId"]]] + df[[i, "FTAG"]]
    GS[df[[i, "AwayTeamId"]]] = GS[df[[i, "AwayTeamId"]]] + df[[i, "FTHG"]]
  
    # points
    PT[df[[i, "HomeTeamId"]]] = 3L * V[df[[i, "HomeTeamId"]]] + P[df[[i, "HomeTeamId"]]]
    PT[df[[i, "AwayTeamId"]]] = 3L * V[df[[i, "AwayTeamId"]]] + P[df[[i, "AwayTeamId"]]]
  }
  
  return(tibble(Team = teams, PT, V = V, P = P, S = S, GF = GF, GS = GS))
}
rating = compute_football_ratings(matches, teams) %>% 
  arrange(desc(PT))

Linear regression modeling

  1. Write simple linear regression models over the rating data frame explaining variable points in terms of variables:
    1. goals for,
    2. goals against,
    3. goals spread (goals for minus goals against).
    Find out which is the best model.
  2. Model points in terms of both goals for and goals against using multiple linear regression. Which among goals for and goals against contribute more to points? Use the answer to suggest a market strategy for a team.
modPTGF = lm(PT ~ GF, data = rating)

# plot observed and predicted values
rating = add_predictions(rating, modPTGF) 
ggplot(rating, aes(x = GF)) +
  geom_point(aes(y = PT)) + # observed values
  geom_line(aes(y = pred), colour = "red") # predicted values

# add and plot residuals
rating = add_residuals(rating, modPTGF)
ggplot(rating, aes(GF, resid)) + 
  geom_ref_line(h = 0) +
  geom_point()

# sort by residuals
rating %>%
  arrange(desc(resid)) %>% 
  head(3)
## # A tibble: 3 x 9
##   Team        PT     V     P     S    GF    GS  pred resid
##   <chr>    <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Juventus    88    28     4     2    68    24  71.6 16.4 
## 2 Torino      56    14    14     6    44    29  44.8 11.2 
## 3 Cagliari    40    10    10    14    32    47  31.5  8.55
rating %>%
  arrange(resid) %>% 
  head(3)
## # A tibble: 3 x 9
##   Team        PT     V     P     S    GF    GS  pred resid
##   <chr>    <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Empoli      29     7     8    19    43    66  43.7 -14.7
## 2 Atalanta    56    16     8     9    66    42  69.4 -13.4
## 3 Sassuolo    38     8    14    11    47    52  48.2 -10.2
# remove residulas
rating = select(rating, -resid, -pred)

# compute model quality
corPTGF = cor(rating$PT, rating$GF)
quality = list()
quality$PFGF = corPTGF^2
quality
## $PFGF
## [1] 0.7476105
modPTGS = lm(PT ~ GS, data = rating)

# plot observed and predicted values
ggplot(rating, aes(x = GS, y = PT)) + 
  geom_point() + 
  geom_abline(intercept = modPTGS$coefficients[1], 
              slope = modPTGS$coefficients[2], 
              color = "red")

corPTGS = cor(rating$PT, rating$GS)
quality$PFGS = corPTGS^2
quality
## $PFGF
## [1] 0.7476105
## 
## $PFGS
## [1] 0.8081161
# add goal spread
rating = mutate(rating, DG = GF - GS)
modPTDG = lm(PT ~ DG, data = rating)

# plot observed and predicted values
ggplot(rating, aes(x = DG, y = PT)) + 
  geom_point() + 
  geom_abline(intercept = modPTDG$coefficients[1], 
              slope = modPTDG$coefficients[2], 
              color = "red")

corPTDG = cor(rating$PT, rating$DG)
quality$PFDG = corPTDG^2
quality
## $PFGF
## [1] 0.7476105
## 
## $PFGS
## [1] 0.8081161
## 
## $PFDG
## [1] 0.9321075
modPTGFGS = lm(PT ~ GF + GS, data = rating)

summary(modPTGFGS)$coefficients
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 53.3449457  9.0561966  5.890436 1.780922e-05
## GF           0.6162210  0.1073007  5.742933 2.390090e-05
## GS          -0.7818483  0.1119883 -6.981516 2.212118e-06
coe = abs(summary(modPTGFGS)$coefficients[2:3])
names(coe) = c("attack", "defense")
round(coe / sum(coe), 2)
##  attack defense 
##    0.44    0.56
quality$PTGFGS = summary(modPTGFGS)$r.squared
quality
## $PFGF
## [1] 0.7476105
## 
## $PFGS
## [1] 0.8081161
## 
## $PFDG
## [1] 0.9321075
## 
## $PTGFGS
## [1] 0.934735