This challenge explores all matches of the Italian Serie A soccer league from 1993 to 2021.
library(readr)
library(tidyr)
library(dplyr)
library(lubridate)
library(stringr)
# prepare link structure
# Every season file is addressed as <link1><season code><link2>, where the
# season code is two consecutive two-digit year labels glued together
# (e.g. "9394").
link1 <- "http://www.football-data.co.uk/mmz4281/"
link2 <- "/I1.csv"
# Calendar years covered: the first season starts in first_year and the last
# season ends in last_year.
first_year <- 1993L
last_year <- 2021L
# Two-digit, zero-padded year labels ("93", ..., "99", "00", ..., "21").
# Taking the year modulo 100 handles the century wrap in a single expression,
# so the labels are computed exactly once.
years <- sprintf("%02d", (first_year:last_year) %% 100L)
# link vector
# A season joins two consecutive years, so there is one season fewer than
# there are year labels.
nseasons <- length(years) - 1
# Pair every year label (all but the last) with its successor (all but the
# first) to form the season codes.
links <- paste0(link1, years[-length(years)], years[-1], link2)
# scrape data
# Read every season file into a list with one element per season, in the same
# order as `links`. lapply() returns a list of the right length directly, so
# no preallocation or 1:n index bookkeeping is needed.
l <- lapply(links, read_csv)
# name the seasons with years
# Each element gets the first of the two year labels that form its link.
names(l) <- years[seq_along(l)]
# clean data (filter columns and rows)
# For every season: keep only the date, the two team columns and the FTHG/FTAG
# columns (presumably full-time home/away goals -- confirm against the data
# source notes), drop rows where the date or either of those two values is
# missing, and parse the day-month-year date strings with dmy().
# lapply() visits each season without index arithmetic and preserves the
# names of `l`.
l <- lapply(l, function(season) {
  season %>%
    select(Date, HomeTeam, AwayTeam, FTHG, FTAG) %>%
    filter(!is.na(Date), !is.na(FTHG), !is.na(FTAG)) %>%
    mutate(Date = dmy(Date))
})
# combine seasons in a unique data frame
# bind_rows() stacks all season tables in a single call, matching columns by
# name. Growing `history` with rbind() inside a loop re-copies the rows
# accumulated so far on every iteration.
history <- bind_rows(l)
#View(history)
# save data on disk
#write_csv(history, "history.csv")