Since the beginning of the age of the Enlightenment and over the course of modernization, the mortality of children below 5 years of age has declined rapidly. Read more at Our World in Data.
## Analysis
library(tidyverse)
library(modelr)
library(broom)
# read data from CSV
#mortality = read_csv("infant-mortality.csv")
mortality = read_csv("child-mortality.csv")
continents = read_csv("Countries-Continents.csv")
mortality = mortality %>%
rename(Country = Entity, Mortality = 'Under five mortality rate (% of children that die before they are 5 years old)') %>%
filter(Year >= 1950) %>%
left_join(continents, by="Country")
(mortality = select(mortality, Continent, Country, Year, Mortality))
## # A tibble: 13,464 x 4
## Continent Country Year Mortality
## <chr> <chr> <dbl> <dbl>
## 1 Asia Afghanistan 1950 42.0
## 2 Asia Afghanistan 1951 41.6
## 3 Asia Afghanistan 1952 40.9
## 4 Asia Afghanistan 1953 40.2
## 5 Asia Afghanistan 1954 39.6
## 6 Asia Afghanistan 1955 38.9
## 7 Asia Afghanistan 1956 38.3
## 8 Asia Afghanistan 1957 37.7
## 9 Asia Afghanistan 1958 37.1
## 10 Asia Afghanistan 1959 36.5
## # … with 13,454 more rows
# Is child mortality decreasing over time?
ggplot(mortality, aes(Year, Mortality)) +
geom_line(aes(group = Country), alpha = 1/3) +
theme_minimal()
# Is there a difference among continents?
ggplot(mortality, aes(Year, Mortality, color = Continent)) +
geom_line(aes(group = Country), alpha = 1/3) +
theme_minimal()
ggplot(mortality, aes(Year, Mortality)) +
geom_line(aes(group = Country), alpha = 1/3) +
facet_wrap(~Continent) +
theme_minimal()
# Which are the countries with the largest variability in mortality?
mortality_stats = group_by(mortality, Country) %>%
summarise(sd = sd(Mortality))
arrange(mortality_stats, desc(sd))
## # A tibble: 195 x 2
## Country sd
## <chr> <dbl>
## 1 Yemen 73.0
## 2 Egypt 69.6
## 3 Nepal 64.8
## 4 Maldives 64.3
## 5 Oman 63.6
## 6 Turkey 56.6
## 7 Iraq 55.7
## 8 Afghanistan 54.7
## 9 Pakistan 54.3
## 10 Malawi 53.6
## # … with 185 more rows
volatile_countries =
filter(mortality_stats, rank(desc(sd)) <= 10) %>%
select(Country, sd) %>%
arrange(-sd)
volatile_countries
## # A tibble: 10 x 2
## Country sd
## <chr> <dbl>
## 1 Yemen 73.0
## 2 Egypt 69.6
## 3 Nepal 64.8
## 4 Maldives 64.3
## 5 Oman 63.6
## 6 Turkey 56.6
## 7 Iraq 55.7
## 8 Afghanistan 54.7
## 9 Pakistan 54.3
## 10 Malawi 53.6
mortality %>%
semi_join(volatile_countries) %>%
ggplot(aes(Year, Mortality, group = Country, color = Continent)) +
geom_line(alpha = 1/3)
# Which are the countries whose child mortality over time show the largest deviation from a linear pattern?
# nest data by countries
by_country = mortality %>%
group_by(Country, Continent) %>%
nest()
by_country
## # A tibble: 204 x 3
## Country Continent data
## <chr> <chr> <list>
## 1 Afghanistan Asia <tibble [66 × 2]>
## 2 Albania Europe <tibble [66 × 2]>
## 3 Algeria Africa <tibble [66 × 2]>
## 4 Angola Africa <tibble [66 × 2]>
## 5 Antigua and Barbuda North America <tibble [66 × 2]>
## 6 Argentina South America <tibble [66 × 2]>
## 7 Armenia Europe <tibble [66 × 2]>
## 8 Aruba <NA> <tibble [66 × 2]>
## 9 Australia Oceania <tibble [66 × 2]>
## 10 Austria Europe <tibble [66 × 2]>
## # … with 194 more rows
# add models
country_model = function(df) {
lm(Mortality ~ Year, data = df)
}
by_country = by_country %>%
mutate(model = map(data, country_model))
by_country
## # A tibble: 204 x 4
## Country Continent data model
## <chr> <chr> <list> <list>
## 1 Afghanistan Asia <tibble [66 × 2]> <S3: lm>
## 2 Albania Europe <tibble [66 × 2]> <S3: lm>
## 3 Algeria Africa <tibble [66 × 2]> <S3: lm>
## 4 Angola Africa <tibble [66 × 2]> <S3: lm>
## 5 Antigua and Barbuda North America <tibble [66 × 2]> <S3: lm>
## 6 Argentina South America <tibble [66 × 2]> <S3: lm>
## 7 Armenia Europe <tibble [66 × 2]> <S3: lm>
## 8 Aruba <NA> <tibble [66 × 2]> <S3: lm>
## 9 Australia Oceania <tibble [66 × 2]> <S3: lm>
## 10 Austria Europe <tibble [66 × 2]> <S3: lm>
## # … with 194 more rows
# add residuals
by_country =
by_country %>%
mutate(data = map2(data, model, add_residuals))
by_country
## # A tibble: 204 x 4
## Country Continent data model
## <chr> <chr> <list> <list>
## 1 Afghanistan Asia <tibble [66 × 3]> <S3: lm>
## 2 Albania Europe <tibble [66 × 3]> <S3: lm>
## 3 Algeria Africa <tibble [66 × 3]> <S3: lm>
## 4 Angola Africa <tibble [66 × 3]> <S3: lm>
## 5 Antigua and Barbuda North America <tibble [66 × 3]> <S3: lm>
## 6 Argentina South America <tibble [66 × 3]> <S3: lm>
## 7 Armenia Europe <tibble [66 × 3]> <S3: lm>
## 8 Aruba <NA> <tibble [66 × 3]> <S3: lm>
## 9 Australia Oceania <tibble [66 × 3]> <S3: lm>
## 10 Austria Europe <tibble [66 × 3]> <S3: lm>
## # … with 194 more rows
# unnest residuals
resids = unnest(by_country, data)
# plot residuals
ggplot(resids, aes(Year, resid)) +
geom_line(aes(group = Country), alpha = 1/3) +
geom_smooth(se = FALSE)
ggplot(resids, aes(Year, resid)) +
geom_line(aes(group = Country), alpha = 1/3) +
facet_wrap(~Continent)
# add model quality
glance =
by_country %>%
mutate(glance = map(model, broom::glance)) %>%
unnest(glance, .drop = TRUE)
group_by(glance, Continent) %>%
summarise(quality = median(r.squared)) %>%
arrange(quality)
## # A tibble: 7 x 2
## Continent quality
## <chr> <dbl>
## 1 Europe 0.871
## 2 <NA> 0.873
## 3 North America 0.902
## 4 Asia 0.913
## 5 Oceania 0.949
## 6 Africa 0.955
## 7 South America 0.968
# countries that deviate from linearity
arrange(glance, r.squared)
## # A tibble: 204 x 13
## Country Continent r.squared adj.r.squared sigma statistic p.value df
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Rwanda Africa 0.274 0.263 7.40 24.1 6.52e- 6 2
## 2 Cambod… Asia 0.422 0.413 7.46 46.7 3.61e- 9 2
## 3 Ukraine Europe 0.566 0.560 1.88 83.6 3.20e-13 2
## 4 Russia Asia 0.596 0.590 2.34 94.4 3.24e-14 2
## 5 Latvia Europe 0.605 0.599 1.74 98.0 1.56e-14 2
## 6 Belarus Europe 0.620 0.614 2.14 105. 4.34e-15 2
## 7 Lithua… Europe 0.631 0.625 2.32 109. 1.74e-15 2
## 8 North … Asia 0.634 0.628 3.13 111. 1.34e-15 2
## 9 Estonia Europe 0.646 0.641 1.79 117. 4.42e-16 2
## 10 Bulgar… Europe 0.663 0.658 1.80 126. 9.08e-17 2
## # … with 194 more rows, and 5 more variables: logLik <dbl>, AIC <dbl>,
## # BIC <dbl>, deviance <dbl>, df.residual <int>
bad_fit = filter(glance, r.squared < 0.7)
semi_join(mortality, bad_fit, by = "Country") %>%
ggplot(aes(Year, Mortality, colour = Country)) +
geom_line()
semi_join(resids, bad_fit, by = "Country") %>%
ggplot(aes(Year, resid, colour = Country)) +
geom_line()