We explore the qualities of diamonds and their association with price. Why are low quality diamonds (with respect to cut, color and clarity) more expensive?

## Analysis
library(dplyr)
library(ggplot2)
library(modelr)

Here is the dataset:

select(diamonds, price, carat:clarity) 
## # A tibble: 53,940 x 5
##    price carat cut       color clarity
##    <int> <dbl> <ord>     <ord> <ord>  
##  1   326 0.23  Ideal     E     SI2    
##  2   326 0.21  Premium   E     SI1    
##  3   327 0.23  Good      E     VS1    
##  4   334 0.290 Premium   I     VS2    
##  5   335 0.31  Good      J     SI2    
##  6   336 0.24  Very Good J     VVS2   
##  7   336 0.24  Very Good I     VVS1   
##  8   337 0.26  Very Good H     SI1    
##  9   337 0.22  Fair      E     VS2    
## 10   338 0.23  Very Good H     VS1    
## # … with 53,930 more rows

We observe that, apparently, low quality diamonds are more expensive:

# best cut is Ideal, worst is Foor
group_by(diamonds, cut) %>% 
  summarise(median_price = median(price))
## # A tibble: 5 x 2
##   cut       median_price
##   <ord>            <dbl>
## 1 Fair             3282 
## 2 Good             3050.
## 3 Very Good        2648 
## 4 Premium          3185 
## 5 Ideal            1810
ggplot(diamonds, aes(x = cut, y = price)) + 
  geom_boxplot()

# best color is D, worst is J
group_by(diamonds, color) %>% 
  summarise(median_price = median(price))
## # A tibble: 7 x 2
##   color median_price
##   <ord>        <dbl>
## 1 D            1838 
## 2 E            1739 
## 3 F            2344.
## 4 G            2242 
## 5 H            3460 
## 6 I            3730 
## 7 J            4234
ggplot(diamonds, aes(x = color, y = price)) + 
  geom_boxplot()

# best clarity is IF, worst is I1
group_by(diamonds, clarity) %>% 
  summarise(median_price = median(price))
## # A tibble: 8 x 2
##   clarity median_price
##   <ord>          <dbl>
## 1 I1              3344
## 2 SI2             4072
## 3 SI1             2822
## 4 VS2             2054
## 5 VS1             2005
## 6 VVS2            1311
## 7 VVS1            1093
## 8 IF              1080
ggplot(diamonds, aes(x = clarity, y = price)) + 
  geom_boxplot()

We first observe that best quality diamonds are smaller:

# best cut is Ideal, worst is Foor
ggplot(diamonds, aes(x = cut, y = carat)) + 
  geom_boxplot()

# best color is D, worst is J
ggplot(diamonds, aes(x = color, y = carat)) + 
  geom_boxplot()

# best clarity is IF, worst is I1
ggplot(diamonds, aes(x = clarity, y = carat)) + 
  geom_boxplot()

We next remove the effect of carat on price:

# Log-transform price and carat. The log-transformation is particularly useful here because it makes the pattern linear, and linear patterns are the easiest to work with.
diamonds = diamonds %>% 
  filter(carat <= 2.5) %>% 
  mutate(lprice = log2(price), lcarat = log2(carat))

# model prince in terms of carat
mod = lm(lprice ~ lcarat, data = diamonds)
cor(diamonds$lprice, diamonds$lcarat)
## [1] 0.9661276
ggplot(diamonds, aes(x = lcarat, y = lprice)) +
  geom_hex()

# add residuals
diamonds = diamonds %>% add_residuals(mod)
cor(diamonds$resid, diamonds$lcarat)
## [1] 3.292059e-13
ggplot(diamonds, aes(x = lcarat, y = resid)) +
  geom_hex()

We check again the correlation of quality on the residulas. Now low quality diamonds are less expensive.

# best cut is Ideal, worst is Foor
ggplot(diamonds, aes(x = cut, y = price)) + 
  geom_boxplot()

ggplot(diamonds, aes(x = cut, y = resid)) + 
  geom_boxplot()

# best color is D, worst is J
ggplot(diamonds, aes(x = color, y = price)) + 
  geom_boxplot()

ggplot(diamonds, aes(x = color, y = resid)) + 
  geom_boxplot()

# best clarity is IF, worst is I1
ggplot(diamonds, aes(x = clarity, y = price)) + 
  geom_boxplot()

ggplot(diamonds, aes(x = clarity, y = resid)) + 
  geom_boxplot()