ggplot2 is a system for declaratively creating graphics. You provide:
- the data,
- what graphical primitives to use,
- how to map variables to aesthetics,
and it takes care of the details.
ggplot2 is a system for declaratively creating graphics. You provide:
and it takes care of the details.
The following is a reusable template for making graphs with ggplot2:
ggplot(data = <DATA FRAME>) + <GEOMETRIC OBJECT>(mapping = aes(<MAPPINGS>))
We will work with the mpg dataset that contains fuel economy data and use the following variables:
# Load the plotting and data-manipulation packages used in the examples.
library(ggplot2)
library(dplyr)

# Print the mpg fuel-economy dataset.
mpg
## # A tibble: 234 × 11 ## manufacturer model displ year cyl trans drv cty hwy fl class ## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> ## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp… ## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp… ## 3 audi a4 2 2008 4 manu… f 20 31 p comp… ## 4 audi a4 2 2008 4 auto… f 21 30 p comp… ## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp… ## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp… ## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp… ## 8 audi a4 quattro 1.8 1999 4 manu… 4 18 26 p comp… ## 9 audi a4 quattro 1.8 1999 4 auto… 4 16 25 p comp… ## 10 audi a4 quattro 2 2008 4 manu… 4 20 28 p comp… ## # ℹ 224 more rows
Let us stick to the well-known scatterplot geometric object geom_point()
for the moment.
library(ggplot2)

# scatterplot of displ and hwy
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy))

# add drv as color
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, color = drv))
We will discover different geometric objects by exploring variation and covariation of qualitative and quantitative variables:
The diamonds dataset contains the prices and other attributes of almost 54,000 diamonds. The main attributes are:
# Bar chart with one bar per level of cut.
diamonds %>%
  ggplot() +
  geom_bar(mapping = aes(x = cut))

# Number of diamonds per level of cut, as a table.
diamonds %>%
  count(cut)
## # A tibble: 5 × 2 ## cut n ## <ord> <int> ## 1 Fair 1610 ## 2 Good 4906 ## 3 Very Good 12082 ## 4 Premium 13791 ## 5 Ideal 21551
# Coarse histogram of carat (bins of width 0.2).
ggplot(data = diamonds) +
  geom_histogram(mapping = aes(x = carat), binwidth = 0.2)

# Fine histogram (bins of width 0.01), restricted to diamonds up to 2.5 carats.
ggplot(data = filter(diamonds, carat <= 2.5)) +
  geom_histogram(mapping = aes(x = carat), binwidth = 0.01)

# Ten most populated carat intervals of width 0.05, most frequent first.
diamonds$carat %>%
  cut_interval(length = 0.05) %>%
  table() %>%
  sort(decreasing = TRUE) %>%
  head(10)
## ## (0.3,0.35] (1,1.05] (0.5,0.55] (0.25,0.3] (0.35,0.4] (0.7,0.75] (0.4,0.45] ## 6855 4484 3774 3418 3333 3121 2898 ## (0.65,0.7] (0.95,1] (0.55,0.6] ## 2128 1774 1742
# Histogram of carat for diamonds up to 2.5 carats, written as a pipeline.
diamonds %>%
  filter(carat <= 2.5) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = carat), binwidth = 0.01)

# Frequency polygon of carat (bins of width 0.2).
diamonds %>%
  ggplot() +
  geom_freqpoly(mapping = aes(x = carat), binwidth = 0.2)
# Bars per cut, filled by diamond color.
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, fill = color))

# putting the bars beside one another
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, fill = color), position = "dodge")
# Boxplot of hwy for each class.
ggplot(data = mpg) +
  geom_boxplot(mapping = aes(x = class, y = hwy))

# reorder qualitative variable class by its median hwy
ggplot(data = mpg) +
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))
# Frequency polygons of price, one coloured line per cut.
ggplot(data = diamonds) +
  geom_freqpoly(mapping = aes(x = price, colour = cut), binwidth = 500)

# Scatterplot of price against carat.
ggplot(data = diamonds) +
  geom_point(mapping = aes(x = carat, y = price))

# use transparency to avoid overlapping of points
ggplot(data = diamonds) +
  geom_point(mapping = aes(x = carat, y = price), alpha = 0.05)
We overlay geom_point and geom_smooth geometries.
By default, for data sets with fewer than 1,000 observations, geom_smooth uses local polynomial regression (loess), also known as moving regression, a generalization of moving average and polynomial regression.
# Points with a smooth curve on top; both layers share the x and y mappings.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth()

# color is global
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  geom_smooth(se = FALSE)

# color is local to geom_point
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = drv)) +
  geom_smooth(se = FALSE)

# color is local to geom_smooth
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth(mapping = aes(color = drv), se = FALSE)
ggplot2 offers much more that you will need when turning an exploratory graphic into an expository graphic:
# filter best model in each class (the row with the highest hwy)
best_in_class <- mpg %>%
  group_by(class) %>%
  filter(row_number(desc(hwy)) == 1) %>%
  # drop the grouping so later verbs do not silently work per class
  ungroup()

# or: sort by descending hwy and keep the first row of each class
best_in_class <- mpg %>%
  group_by(class) %>%
  arrange(desc(hwy)) %>%
  filter(row_number() == 1) %>%
  ungroup()

ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(colour = class)) +
  # mark the best model of each class with a larger, differently shaped point
  geom_point(data = best_in_class, size = 3, shape = 1) +
  # label those points with the model name
  ggrepel::geom_label_repel(
    data = best_in_class,
    mapping = aes(label = model)
  ) +
  geom_smooth(se = FALSE) +
  labs(
    title = "Fuel efficiency generally decreases with engine size",
    subtitle = "Two seaters (sports cars) are an exception because of their light weight",
    caption = "Data from fueleconomy.gov",
    x = "Engine displacement (L)",
    y = "Highway fuel economy (mpg)",
    colour = "Car class"
  ) +
  theme_classic()
The presidential
data set in the ggplot2 package contains the name, the start and end dates of the term, and the party of each of 11 US presidents from Eisenhower to Obama.
# Print the presidential data set.
presidential
## # A tibble: 11 × 4 ## name start end party ## <chr> <date> <date> <chr> ## 1 Eisenhower 1953-01-20 1961-01-20 Republican ## 2 Kennedy 1961-01-20 1963-11-22 Democratic ## 3 Johnson 1963-11-22 1969-01-20 Democratic ## 4 Nixon 1969-01-20 1974-08-09 Republican ## 5 Ford 1974-08-09 1977-01-20 Republican ## 6 Carter 1977-01-20 1981-01-20 Democratic ## 7 Reagan 1981-01-20 1989-01-20 Republican ## 8 Bush 1989-01-20 1993-01-20 Republican ## 9 Clinton 1993-01-20 2001-01-20 Democratic ## 10 Bush 2001-01-20 2009-01-20 Republican ## 11 Obama 2009-01-20 2017-01-20 Democratic
Draw the following plot:
using the following scaffolding:
presidential %>%
  # add id of the president starting from 34
  mutate(id = 33 + ___) %>%
  # map start, id, and party to x, y, and colour
  ggplot(aes(___, ___, colour = ___)) +
  # add a point marking the start of the term
  geom_point() +
  # add a segment with length the duration of the presidential term
  geom_segment(aes(xend = ___, yend = ___)) +
  # set manually the colors (Republican = red, Democratic = blue)
  scale_colour_manual(values = c(___ = ___, ___ = ___)) +
  # add black and white theme
  ___
presidential %>%
  # number the rows consecutively so that the first row gets id 34;
  # row_number() works on the piped data instead of re-reading the global
  mutate(id = 33 + row_number()) %>%
  ggplot(aes(start, id, colour = party)) +
  # point marking the start of each term
  geom_point() +
  # horizontal segment from the start to the end of each term
  geom_segment(aes(xend = end, yend = id)) +
  scale_colour_manual(values = c(Republican = "red", Democratic = "blue")) +
  theme_bw()
The gapminder dataset is an excerpt of the Gapminder data on life expectancy, GDP per capita, and population by country.
# Load the gapminder package and print its gapminder dataset.
library(gapminder)
gapminder
## # A tibble: 1,704 × 6 ## country continent year lifeExp pop gdpPercap ## <fct> <fct> <int> <dbl> <int> <dbl> ## 1 Afghanistan Asia 1952 28.8 8425333 779. ## 2 Afghanistan Asia 1957 30.3 9240934 821. ## 3 Afghanistan Asia 1962 32.0 10267083 853. ## 4 Afghanistan Asia 1967 34.0 11537966 836. ## 5 Afghanistan Asia 1972 36.1 13079460 740. ## 6 Afghanistan Asia 1977 38.4 14880372 786. ## 7 Afghanistan Asia 1982 39.9 12881816 978. ## 8 Afghanistan Asia 1987 40.8 13867957 852. ## 9 Afghanistan Asia 1992 41.7 16317921 649. ## 10 Afghanistan Asia 1997 41.8 22227415 635. ## # … with 1,694 more rows
# Exercise scaffold: replace every ___ to complete the function.
gaplot <- function(cont, year1, year2) {
  gapminder %>%
    filter(___) %>%
    ggplot(aes(___, ___, size = ___, colour = country)) +
    geom_point(alpha = 0.5, show.legend = FALSE) +
    scale_colour_manual(values = country_colors) +
    scale_size(range = c(2, 20)) +
    scale_x_log10() +
    labs(title = ___) +
    theme_minimal()
}
# Look up the entry of the named vector country_colors whose name is "Italy".
country_colors[names(country_colors) == "Italy"]
## Italy ## "#3B7D1D"
#' Bubble chart of life expectancy against GDP per capita for one continent
#'
#' Keeps the gapminder rows of continent `cont` whose year lies in
#' `[year1, year2]` (both ends included) and plots lifeExp against gdpPercap
#' on a log10 x axis, with point size mapped to pop and colour to country.
#'
#' @param cont Continent to keep, compared with the `continent` column.
#' @param year1 First year to keep (inclusive).
#' @param year2 Last year to keep (inclusive).
#' @return A ggplot object titled "<cont> <year1> - <year2>".
gaplot <- function(cont, year1, year2) {
  gapminder %>%
    filter(continent == cont, year >= year1, year <= year2) %>%
    ggplot(aes(gdpPercap, lifeExp, size = pop, colour = country)) +
    # no legend is drawn for the points
    geom_point(alpha = 0.5, show.legend = FALSE) +
    # colours come from the country_colors vector
    scale_colour_manual(values = country_colors) +
    scale_size(range = c(2, 20)) +
    scale_x_log10() +
    labs(title = paste(cont, year1, "-", year2)) +
    theme_minimal()
}

gaplot("Africa", 1960, 1970)
# Africa over three further periods
gaplot("Africa", 1971, 1980)
gaplot("Africa", 1981, 1990)
gaplot("Africa", 1991, 2000)
# The other continents over the period 1991-2000
gaplot("Asia", 1991, 2000)
gaplot("Europe", 1991, 2000)
gaplot("Americas", 1991, 2000)
gaplot("Oceania", 1991, 2000)