Since the beginning of the age of the Enlightenment and over the course of modernization, the mortality of children below 5 years of age has declined rapidly. Read more at Our World in Data.

Downloads

Challenges

  1. Is child mortality decreasing over time?
  2. Is there a difference among continents?
  3. Which are the countries with the largest variability in child mortality?
  4. Which are the countries whose child mortality over time show the largest deviation from a linear pattern?
## Analysis
library(tidyverse)
library(modelr)
library(broom)
# read data from CSV 
#mortality = read_csv("infant-mortality.csv")
mortality = read_csv("child-mortality.csv") 
continents = read_csv("Countries-Continents.csv")
mortality = mortality %>% 
  rename(Country = Entity, Mortality = 'Under five mortality rate (% of children that die before they are 5 years old)') %>% 
  filter(Year >= 1950) %>% 
  left_join(continents, by="Country")
(mortality = select(mortality, Continent, Country, Year, Mortality))
## # A tibble: 13,464 x 4
##    Continent Country      Year Mortality
##    <chr>     <chr>       <dbl>     <dbl>
##  1 Asia      Afghanistan  1950      42.0
##  2 Asia      Afghanistan  1951      41.6
##  3 Asia      Afghanistan  1952      40.9
##  4 Asia      Afghanistan  1953      40.2
##  5 Asia      Afghanistan  1954      39.6
##  6 Asia      Afghanistan  1955      38.9
##  7 Asia      Afghanistan  1956      38.3
##  8 Asia      Afghanistan  1957      37.7
##  9 Asia      Afghanistan  1958      37.1
## 10 Asia      Afghanistan  1959      36.5
## # … with 13,454 more rows
# Is child mortality decreasing over time?

ggplot(mortality, aes(Year, Mortality)) +
  geom_line(aes(group = Country), alpha = 1/3) +
  theme_minimal()

# Is there a difference among continents?

ggplot(mortality, aes(Year, Mortality, color = Continent)) +
  geom_line(aes(group = Country), alpha = 1/3) +
  theme_minimal()

ggplot(mortality, aes(Year, Mortality)) +
  geom_line(aes(group = Country), alpha = 1/3) +
  facet_wrap(~Continent)  +
  theme_minimal()

# Which are the countries with the largest variability in mortality?

mortality_stats = group_by(mortality, Country) %>%
  summarise(sd = sd(Mortality))

arrange(mortality_stats, desc(sd))
## # A tibble: 195 x 2
##    Country        sd
##    <chr>       <dbl>
##  1 Yemen        73.0
##  2 Egypt        69.6
##  3 Nepal        64.8
##  4 Maldives     64.3
##  5 Oman         63.6
##  6 Turkey       56.6
##  7 Iraq         55.7
##  8 Afghanistan  54.7
##  9 Pakistan     54.3
## 10 Malawi       53.6
## # … with 185 more rows
volatile_countries = 
  filter(mortality_stats, rank(desc(sd)) <= 10) %>% 
  select(Country, sd) %>%
  arrange(-sd)
volatile_countries
## # A tibble: 10 x 2
##    Country        sd
##    <chr>       <dbl>
##  1 Yemen        73.0
##  2 Egypt        69.6
##  3 Nepal        64.8
##  4 Maldives     64.3
##  5 Oman         63.6
##  6 Turkey       56.6
##  7 Iraq         55.7
##  8 Afghanistan  54.7
##  9 Pakistan     54.3
## 10 Malawi       53.6
mortality %>%
  semi_join(volatile_countries) %>%
  ggplot(aes(Year, Mortality, group = Country, color = Continent)) +
    geom_line(alpha = 1/3)

# Which are the countries whose child mortality over time show the largest deviation from a linear pattern?

# nest data by countries
by_country = mortality %>% 
  group_by(Country, Continent) %>% 
  nest()

by_country
## # A tibble: 204 x 3
##    Country             Continent     data             
##    <chr>               <chr>         <list>           
##  1 Afghanistan         Asia          <tibble [66 × 2]>
##  2 Albania             Europe        <tibble [66 × 2]>
##  3 Algeria             Africa        <tibble [66 × 2]>
##  4 Angola              Africa        <tibble [66 × 2]>
##  5 Antigua and Barbuda North America <tibble [66 × 2]>
##  6 Argentina           South America <tibble [66 × 2]>
##  7 Armenia             Europe        <tibble [66 × 2]>
##  8 Aruba               <NA>          <tibble [66 × 2]>
##  9 Australia           Oceania       <tibble [66 × 2]>
## 10 Austria             Europe        <tibble [66 × 2]>
## # … with 194 more rows
# add models
country_model = function(df) {
  lm(Mortality ~ Year, data = df)
}

by_country = by_country %>% 
  mutate(model = map(data, country_model))

by_country
## # A tibble: 204 x 4
##    Country             Continent     data              model   
##    <chr>               <chr>         <list>            <list>  
##  1 Afghanistan         Asia          <tibble [66 × 2]> <S3: lm>
##  2 Albania             Europe        <tibble [66 × 2]> <S3: lm>
##  3 Algeria             Africa        <tibble [66 × 2]> <S3: lm>
##  4 Angola              Africa        <tibble [66 × 2]> <S3: lm>
##  5 Antigua and Barbuda North America <tibble [66 × 2]> <S3: lm>
##  6 Argentina           South America <tibble [66 × 2]> <S3: lm>
##  7 Armenia             Europe        <tibble [66 × 2]> <S3: lm>
##  8 Aruba               <NA>          <tibble [66 × 2]> <S3: lm>
##  9 Australia           Oceania       <tibble [66 × 2]> <S3: lm>
## 10 Austria             Europe        <tibble [66 × 2]> <S3: lm>
## # … with 194 more rows
# add residuals
by_country = 
  by_country %>% 
  mutate(data = map2(data, model, add_residuals))

by_country
## # A tibble: 204 x 4
##    Country             Continent     data              model   
##    <chr>               <chr>         <list>            <list>  
##  1 Afghanistan         Asia          <tibble [66 × 3]> <S3: lm>
##  2 Albania             Europe        <tibble [66 × 3]> <S3: lm>
##  3 Algeria             Africa        <tibble [66 × 3]> <S3: lm>
##  4 Angola              Africa        <tibble [66 × 3]> <S3: lm>
##  5 Antigua and Barbuda North America <tibble [66 × 3]> <S3: lm>
##  6 Argentina           South America <tibble [66 × 3]> <S3: lm>
##  7 Armenia             Europe        <tibble [66 × 3]> <S3: lm>
##  8 Aruba               <NA>          <tibble [66 × 3]> <S3: lm>
##  9 Australia           Oceania       <tibble [66 × 3]> <S3: lm>
## 10 Austria             Europe        <tibble [66 × 3]> <S3: lm>
## # … with 194 more rows
# unnest residuals
resids = unnest(by_country, data)

# plot residuals
ggplot(resids, aes(Year, resid)) +
    geom_line(aes(group = Country), alpha = 1/3) + 
    geom_smooth(se = FALSE)

ggplot(resids, aes(Year, resid)) +
    geom_line(aes(group = Country), alpha = 1/3) + 
    facet_wrap(~Continent)

# add model quality
glance = 
  by_country %>% 
  mutate(glance = map(model, broom::glance)) %>% 
  unnest(glance, .drop = TRUE)

group_by(glance, Continent) %>%
  summarise(quality = median(r.squared)) %>%
  arrange(quality)
## # A tibble: 7 x 2
##   Continent     quality
##   <chr>           <dbl>
## 1 Europe          0.871
## 2 <NA>            0.873
## 3 North America   0.902
## 4 Asia            0.913
## 5 Oceania         0.949
## 6 Africa          0.955
## 7 South America   0.968
# countries that deviate from linearity
arrange(glance, r.squared)
## # A tibble: 204 x 13
##    Country Continent r.squared adj.r.squared sigma statistic  p.value    df
##    <chr>   <chr>         <dbl>         <dbl> <dbl>     <dbl>    <dbl> <int>
##  1 Rwanda  Africa        0.274         0.263  7.40      24.1 6.52e- 6     2
##  2 Cambod… Asia          0.422         0.413  7.46      46.7 3.61e- 9     2
##  3 Ukraine Europe        0.566         0.560  1.88      83.6 3.20e-13     2
##  4 Russia  Asia          0.596         0.590  2.34      94.4 3.24e-14     2
##  5 Latvia  Europe        0.605         0.599  1.74      98.0 1.56e-14     2
##  6 Belarus Europe        0.620         0.614  2.14     105.  4.34e-15     2
##  7 Lithua… Europe        0.631         0.625  2.32     109.  1.74e-15     2
##  8 North … Asia          0.634         0.628  3.13     111.  1.34e-15     2
##  9 Estonia Europe        0.646         0.641  1.79     117.  4.42e-16     2
## 10 Bulgar… Europe        0.663         0.658  1.80     126.  9.08e-17     2
## # … with 194 more rows, and 5 more variables: logLik <dbl>, AIC <dbl>,
## #   BIC <dbl>, deviance <dbl>, df.residual <int>
bad_fit = filter(glance, r.squared < 0.7)

semi_join(mortality, bad_fit, by = "Country") %>% 
  ggplot(aes(Year, Mortality, colour = Country)) +
    geom_line()

semi_join(resids, bad_fit, by = "Country") %>% 
  ggplot(aes(Year, resid, colour = Country)) +
    geom_line()