#VISUALIZATION IN R: an introduction
# Version II
#November 9, 2018
setwd("C:/Users/arodriguez/Dropbox/RPractice/visualization") #make sure you change the wd to YOUR wd, not mine
dir()
## [1] " (DESKTOP-NMMG0TL's conflicted copy 2018-11-08).Rhistory"
## [2] "base-r.pdf"
## [3] "climate-change-warnings-over-the-years.jpg"
## [4] "corruption-hd.png"
## [5] "EconomistData.csv"
## [6] "ggplot_tutorial.pdf"
## [7] "ggplot_xrcises.pdf"
## [8] "ggplot2-cheatsheet-2.1.pdf"
## [9] "hullman_journal.pone.0142444.pdf"
## [10] "Moser_CHI2017.pdf"
## [11] "paper_BELIV_evaluating_uncertainty_vis.pdf"
## [12] "R-Intro.R"
## [13] "R - A Gentle Introduction.pdf"
## [14] "R - A Gentle Introduction_REV.pdf"
## [15] "stateData_aer.csv"
## [16] "Visualising Residuals.pdf"
## [17] "Visualization in R an Introduction.R"
## [18] "Visualization_In_R_II.R"
## [19] "Visualization_In_R_II.spin.R"
## [20] "Visualization_In_R_II.spin.Rmd"
####### housekeeping ###########
options(digits = 3, scipen = 999)
remove(list = ls()) #this clears your workspace
#######################################################################################################################
#There are many ways to do things with R, especially visualization. Here we will demonstrate two of them: the plotting
#capabilities of a popular package called ggplot.
#The array of tools built into R are known as Base-R tools. Among them is a plotting function
#called plot(). Philosophically, the plot function mimics our approach to building a plot manually: that is to say,
#you start with a blank page and build up from there.
#
#For instance, plot(x,y) or plot(y~x) produces a scatterplot of the numbers in y versus the numbers in x.
#The function also allows for the controlling of how the resulting graph is displayed. Thus, if your are interested
# providing labels to the x and y axes as well as a title you would add to the
#initial representation as follows plot(y~x, xlab="this is the x-axis", ylab = "this is the y-axis",
# main = "This is a plot of y versus x")
#
#
#############################
#Load all required packages
#install.packages("tidyverse")
#install.packages("dplyr")
##install.packages("ggplot2")
#install.packages("gapminder")
#install.packages("ggthemes")
#install.packages("datasets")
#install.packages("ggrepel")
#install.packages("grid")
#install.packages("splines")
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.5
## v tidyr 0.8.1 v stringr 1.3.0
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
library(ggthemes)
library(datasets)
library(ggrepel)
library(grid)
library(splines)
#==========================================================================================================
# Plotting in Base-R
state = read.csv("stateData_aer.csv", header = TRUE)
str(state)
## 'data.frame': 50 obs. of 19 variables:
## $ X : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ personal_rate : num 0.0402 0 0.0454 0.069 0.133 0.0463 0.0699 0.079 0 0.06 ...
## $ corporate_rate : num 0.0423 0.094 0.055 0.065 0.088 0.0463 0.09 0.117 0.055 0.06 ...
## $ progressivity : num -2.16 0 10.65 14.5 38.67 ...
## $ property_tax : num 15.2 37.1 27.5 18 28.6 ...
## $ sales_tax : num 25.01 5.71 37.16 35.64 24.24 ...
## $ remaing_tax : num 22.4 13.6 11.8 16.9 18.2 ...
## $ estate_tax : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 2 1 1 ...
## $ tax_changes : num -0.04 0.19 -0.21 -0.3 -0.71 -0.03 3.5 1.19 -0.6 2.1 ...
## $ debt : num 0.085 0.062 0.083 0.05 0.09 0.11 0.071 0.1 0.09 0.08 ...
## $ employees : num 584 739 427 535 452 ...
## $ liability_survey: num 55.1 68.1 65.4 57.7 49.9 67.8 65.9 76.5 56 62.4 ...
## $ min_wage : num 7.25 9.75 8.05 8 10 8.31 9.6 8.25 8.05 7.25 ...
## $ workers_comp : num 1.81 2.68 1.6 1.08 3.48 1.5 2.87 2.31 1.82 1.75 ...
## $ rtw : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 1 1 2 2 ...
## $ tax_limits : int 0 1 2 1 2 3 1 2 2 0 ...
## $ gdp_cumm : num 0.347 0.595 0.4 0.407 0.403 0.49 0.26 0.25 0.32 0.35 ...
## $ migration : int 103580 -21018 536269 63430 -1265447 315015 -153918 41162 834966 406863 ...
## $ emp_growth_cum : num 0.012 0.106 0.068 0.029 0.068 0.132 0.011 0.03 0.05 0.07 ...
head(state)
## X personal_rate corporate_rate progressivity property_tax
## 1 Alabama 0.0402 0.0423 -2.16 15.2
## 2 Alaska 0.0000 0.0940 0.00 37.1
## 3 Arizona 0.0454 0.0550 10.65 27.5
## 4 Arkansas 0.0690 0.0650 14.50 18.0
## 5 California 0.1330 0.0880 38.67 28.6
## 6 Colorado 0.0463 0.0463 6.39 28.8
## sales_tax remaing_tax estate_tax tax_changes debt employees
## 1 25.01 22.4 No -0.04 0.085 584
## 2 5.71 13.6 No 0.19 0.062 739
## 3 37.16 11.8 No -0.21 0.083 427
## 4 35.64 16.9 No -0.30 0.050 535
## 5 24.24 18.2 No -0.71 0.090 452
## 6 25.08 13.8 No -0.03 0.110 524
## liability_survey min_wage workers_comp rtw tax_limits gdp_cumm migration
## 1 55.1 7.25 1.81 Yes 0 0.347 103580
## 2 68.1 9.75 2.68 No 1 0.595 -21018
## 3 65.4 8.05 1.60 Yes 2 0.400 536269
## 4 57.7 8.00 1.08 Yes 1 0.407 63430
## 5 49.9 10.00 3.48 No 2 0.403 -1265447
## 6 67.8 8.31 1.50 No 3 0.490 315015
## emp_growth_cum
## 1 0.012
## 2 0.106
## 3 0.068
## 4 0.029
## 5 0.068
## 6 0.132
colnames(state)[1] = c("State") #Replace the name of column 1 from X to State
names(state)
## [1] "State" "personal_rate" "corporate_rate"
## [4] "progressivity" "property_tax" "sales_tax"
## [7] "remaing_tax" "estate_tax" "tax_changes"
## [10] "debt" "employees" "liability_survey"
## [13] "min_wage" "workers_comp" "rtw"
## [16] "tax_limits" "gdp_cumm" "migration"
## [19] "emp_growth_cum"
attach(state)
plot(debt ~ log10(gdp_cumm), xlab = "Log of Cumulative GDP", ylab = "State Debt, per-capita",
main = "Debt vs. GDP\n Source: Alec-Laffer Rankings, 2016")
abline(lm(debt~ log10(gdp_cumm)))

# Exercise 1 ##############################################################################
# Is Cumulative Employment Growth higher in Right-to-Work States?
# Answer the question by displaying a plot of Cumulative Employment Growth (emp_growth_cum)
# versus Right-to-Work (rtw)
# Label properly and include a title
# #########################################################################################
#==========================================================================================================
##############################################################################################
# Introducing ggplot
# GGPLOT is a plotting system based on the grammar of graphics.
# GGPLOT is/are a set of tools to map data to visual elements on your plot, to specify the kind of plot you want,
# and last, to control the details of how it will be displayed.
# In ggplot2 a plot is divided into various fundamental/constituent parts : Plot = data + Aesthetics + Geometry.
#Elements of GGPLOT
# 1. dataframe organized as tidydata
# 2. aesthetics mappings or aesthetics or aes : (i) what is your data (ii) how does your data map onto the page
# 3. geom: what sort of plot you want. e.g. scatterplot (geom_point()), boxplot (geom_boxplot()), etc.
# note: aesthetics and geoms combined by adding, i.e. using "+"
# Layering
# Adding geometric, scale, facet and statistics layers to a ggplot() object allows
# control of virtually every visual aspect of the plot.
##############################################################################################
# We use the dataset mtcars available in Base-R
?mtcars
## starting httpd help server ...
## done
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
#Display your data using aesthetics (WHAT YOU WANT TO SEE)
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg))

#and select the geom (layer 1) type you want: scatterplot (i.e. geom_point) (HOW YOU WANT TO SEE IT)
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
geom_point()

#You can control the shape, size, transparency of the points in the scatter point
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
geom_point(shape = 1, size = 4, col = "red") # add shape = 1 to the geom_point arguments

#And you can continue layering on geoms as needed. For instance, add a smoother.
ggplot(mtcars, aes(wt,mpg)) +
geom_point() +
geom_smooth() #try geom_smooth(method = "lm")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#and here is the real power of ggplot; you can add third variable into the existing 2-dim plane
# In this case, information on the number of cylinders of a particular car is added via color gradations of the
# data points;
# In addition to color, you can use size and shape
ggplot(mtcars, aes(wt,mpg))+
geom_point(aes(color = cyl), size = 2) +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Rerun the last command but change smoother; i.e. geom_smooth(method = "lm")
#rerun the previous command after changing the type of variable for cyl; from numeric to factor
# as follows
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
mtcars$cyl = as.factor(mtcars$cyl)
#'cause ggplot is interpreting the variable cyl as numeric; its a factor; no such thing as 3 or 5 cyl
#rerun the previous command with size = cyl; and then shape = cyl
# you could place the col shading in the global aesthetic within ggplot() but that informs
# all subsequent geoms
ggplot(mtcars, aes(wt,mpg, col = cyl))+
geom_point( size = 2) +
geom_smooth(method = "lm")

# You can also examine individual variables
ggplot(mtcars, aes(x = mpg)) +
geom_density()

# change line colors by cylinder
ggplot(mtcars, aes(x = mpg)) +
geom_density(aes(color = cyl))

# Change fill color by cyl
# Use semi-transparent fill: alpha = 0.4
ggplot(mtcars, aes(x = mpg)) +
geom_density(aes(fill = cyl), alpha=0.4)

#EXERCISE 2 ##########################################################################################
# This time using ggplot recreate the plots from Exercise 1 above. Sort of.
# Create a scatter plot of the relationship between the log of cumulative gdp and
# the corporate marginal tax rate (corporate_rate)
# Add a straight line (representing the relationship) onto the scatterplot
# Label properly and include a title
# Rerun and include rtw (Right-to-Work) as an aesthetic in geom_point mapping
#####################################################################################################
#EXERCISE 3 ##########################################################################################
# This time using ggplot recreate the boxplots from Exercise 1 above. Sort of.
# Create a boxplot using geom_boxplot of the relationship between net migration and
# Right-to-Work. Label properly - including a Title
# Rerun and include rtw mapping into color in the aesthetic in geom_boxplot
#####################################################################################################
#EXERCISE 4 ##########################################################################################
# Load the data set "faithful" - the Old Faithful Geyser Data
# Create a scatter plot of the relationship between eruptions and waiting time
# Add a straight line (representing the relationship) onto the scatterplot
# Can you identify the number of clusters visible?
# rerun and add a title and add countours to the "clusters" via geom_density_2d()
#####################################################################################################
###########################################################################################
# Display the relatiionship between miles-per-gallon (mpg) and a cars cylinders (cyl);
# Use a boxplot as your "geom_"
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
geom_point(aes(size = hp)) +
geom_boxplot()

#Now add information on horsepower (hp) as an aesthetic in geom_point mapping hp into size
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
geom_point(aes(size = hp)) +
geom_boxplot()

#Or even better, a violin plot, instead of a boxplot; i.e. geom_violin()
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
geom_point() +
geom_violin()

#add a little "jitter" to the points; to enhance their presence; i.e. geom_jitter()
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
geom_point() +
geom_violin() +
geom_jitter() # jitter throws a little random distance to each point so it appears offset a bit

# geom_text and geom_lable allows you to add text into the graph
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
geom_point() +
geom_violin() +
# geom_jitter(width = 0.5, height = 0.5)+ # control the jitter here or in the geom_label
geom_label(aes(label = rownames(mtcars)), fontface = "bold",
col = "red",position=position_jitter(width=0.25,height=0.55))

# like this
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
geom_point() +
geom_violin() +
geom_jitter(width = 0.5, height = 0.5)+ # control the jitter here o in the geom_lable
geom_label(aes(label = rownames(mtcars)), fontface = "bold",
col = "red")

# Still looks "scrunched-up"
# use geom_text_repel() in the package ggrepel to display the names better
# i.e. geom_text_repel(aes(label = rownames(mtcars)), fontface = "bold",
library("ggrepel")
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
geom_point() +
geom_boxplot() +
geom_jitter(width=.25,height=.5) +
geom_text_repel(aes(label = rownames(mtcars)), fontface = "bold",
col = "red")

# the use of "themes" allows a consistent presentation style
# theme economist reproduces the style of The Economist magazine
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
geom_point() +
geom_boxplot() +
geom_jitter(width=.25,height=.5) +
geom_text_repel(aes(label = rownames(mtcars)), fontface = "bold",
col = "red") +
labs(x = "Cylinders", y = "Miles-per-Gallon", title = "Miles per Gallon v. Cylinders")+
theme_economist()

# Nate Silver's 538 theme
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
geom_point() +
geom_boxplot() +
geom_jitter(width=.25,height=.5) +
geom_text_repel(aes(label = rownames(mtcars)), fontface = "bold",
col = "red") +
labs(x = "Cylinders", y = "Miles-per-Gallon", title = "Miles per Gallon v. Cylinders")+
theme_fivethirtyeight()

# Still too cluttered; reduce to only Lincoln Continental, the Camaro and the Dodge Challenger
pointsToLabel <- c("Camaro Z28", "Lincoln Continental", "Dodge Challenger")
mtcars1 = rownames_to_column(mtcars, var = "Brand") #rownames must be a column
str(mtcars1)
## 'data.frame': 32 obs. of 12 variables:
## $ Brand: chr "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
## $ disp : num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat : num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec : num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear : num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb : num 4 4 1 1 2 1 4 2 2 4 ...
mtcars1$Brand
## [1] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710"
## [4] "Hornet 4 Drive" "Hornet Sportabout" "Valiant"
## [7] "Duster 360" "Merc 240D" "Merc 230"
## [10] "Merc 280" "Merc 280C" "Merc 450SE"
## [13] "Merc 450SL" "Merc 450SLC" "Cadillac Fleetwood"
## [16] "Lincoln Continental" "Chrysler Imperial" "Fiat 128"
## [19] "Honda Civic" "Toyota Corolla" "Toyota Corona"
## [22] "Dodge Challenger" "AMC Javelin" "Camaro Z28"
## [25] "Pontiac Firebird" "Fiat X1-9" "Porsche 914-2"
## [28] "Lotus Europa" "Ford Pantera L" "Ferrari Dino"
## [31] "Maserati Bora" "Volvo 142E"
ggplot(data = mtcars1, mapping = aes(x = cyl, y = mpg)) +
geom_point() +
geom_boxplot() +
geom_jitter(width=.25,height=.5) +
geom_text_repel(aes(label = Brand), force = 5,fontface = "bold",
col = "red", data = subset(mtcars1, Brand %in% pointsToLabel) ) +
labs(x = "Cylinders", y = "Miles-per-Gallon", title = "Miles per Gallon v. Cylinders")+
theme_fivethirtyeight()

#EXERCISE 5 ##########################################################################################
# go back to the state data set and the boxplot
# Create a boxplot using geom_boxplot of the relationship between employment growth and
# the presence of an estate tax in the state
# Rerun and include estate_tax mapping into color in the aesthetic in geom_boxplot
# Rerun and include the individual data points; throw in some jitter; note that the label is now in State
# add the theme doing homage to Edward Tufte; i.e. theme_tufte()
# Still too cluttered; reduce to CT, Florida, and Texas and change color to "gray20"
#####################################################################################################
# The Grand Finale
#####################################################################
# Reproduce the famous graph displaying the relationship between the Human Development Index
# and the Corruptions Perception Index that appeared in the Economist in December 2011.
# The data is available in the csv file titled "EconomistData."
dat <- read.csv("EconomistData.csv")
str(dat)
## 'data.frame': 173 obs. of 6 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Country : Factor w/ 173 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ HDI.Rank: int 172 70 96 148 45 86 2 19 91 53 ...
## $ HDI : num 0.398 0.739 0.698 0.486 0.797 0.716 0.929 0.885 0.7 0.771 ...
## $ CPI : num 1.5 3.1 2.9 2 3 2.6 8.8 7.8 2.4 7.3 ...
## $ Region : Factor w/ 6 levels "Americas","Asia Pacific",..: 2 3 5 6 1 3 2 4 3 1 ...
names(dat)
## [1] "X" "Country" "HDI.Rank" "HDI" "CPI" "Region"
attach(dat)
#### QED ##########################################################################################################
#SOME RESOURCES
# https://flowingdata.com/2016/03/22/comparing-ggplot2-and-r-base-graphics/
# https://www.statmethods.net/advgraphs/axes.html
# http://www.r-graph-gallery.com/all-graphs/
#====================================================================================================================
# Exercise 1 ##############################################################################
# Is Cumulative Employment Growth higher in Right-to-Work States?
# Answer the question by displaying a plot of Cumulative Employment Growth (emp_growth_cum)
# versus Right-to-Work (rtw)
# Label properly and include a title
plot(emp_growth_cum ~ rtw, xlab = "Right-to-Work State", ylab = "Cumulative Employment Growth",
main = "Employment Growth vs. Right-to-Work Status\n Source: Alex-Laffer Rankings, 2016")

# #########################################################################################
#EXERCISE 2 ##########################################################################################
# This time using ggplot recreate the plots from Exercise 1 above. Sort of.
# Create a scatter plot of the relationship between the log of cumulative gdp and
# the corporate marginal tax rate (corporate_rate)
# Add a straight line (representing the relationship) onto the scatterplot
# Label properly and include a title
ggplot(state, aes(x = personal_rate, y = log10(gdp_cumm))) +
geom_point(size = 4) +
geom_smooth(method = "lm") +
labs(x = "Corporate Marginal Tax Rate", y = "Log of Cumulative GDP",
title = "Corporate Tax Rate vs. GDP\nSource: Alec-Laffer Rankings, 2016")

# Rerun and include rtw (Right-to-Work) as an aesthetic in geom_point mapping
ggplot(state, aes(x = personal_rate, y = log10(gdp_cumm))) +
geom_point(aes(shape = rtw), size = 4) +
geom_smooth(method = "lm") +
labs(x = "Corporate Marginal Tax Rate", y = "Log of Cumulative GDP",
title = "Corporate Tax Rate vs. GDP\nSource: Alec-Laffer Rankings, 2016")

#####################################################################################################
#EXERCISE 3 ##########################################################################################
# This time using ggplot recreate the boxplots from Exercise 1 above. Sort of.
# Create a boxplot using geom_boxplot of the relationship between net migration and
# Right-to-Work. Label properly - including a Title
ggplot(state, aes(x = rtw, y = migration)) +
geom_boxplot() +
labs(x = "Rigth to Work State", y = "Cumulative Net Migration",
title = "Cumulative Net Migration vs. Right-to-Work\nSource: Alec-Laffer Rankings, 2016")

# Rerun and include rtw mapping into color in the aesthetic in geom_boxplot
ggplot(state, aes(x = rtw, y = migration)) +
geom_boxplot( aes(col = rtw)) +
labs(x = "Rigth to Work State", y = "Cumulative Net Migration",
title = "Cumulative Net Migration vs. Right-to-Work\nSource: Alec-Laffer Rankings, 2016")

#####################################################################################################
#EXERCISE 4 ##########################################################################################
# Load the data set "faithful" - the Old Faithful Geyser Data
# Create a scatter plot of the relationship between eruptions and waiting time
# Add a straight line (representing the relationship) onto the scatterplot
# Can you identify the number of clusters visible?
ggplot(faithful, aes(x = waiting, y = eruptions)) +
geom_point() +
geom_smooth(method = "lm")

# rerun and add a title and add countours to the "clusters" via geom_density_2d()
ggplot(faithful, aes(x = waiting, y = eruptions)) +
geom_point() +
geom_smooth(method = "lm")+
labs(title = "Old Faithful\nEruptions v. Waiting Time")+
geom_density_2d()

####################################################################################################
#EXERCISE 5 ##########################################################################################
# go back to the state data set and the boxplot
# Create a boxplot using geom_boxplot of the relationship between employment growth and
# the presence of an estate tax in the state
ggplot(state, aes(x = estate_tax, y = emp_growth_cum)) +
geom_boxplot() +
labs(x = "Estate Tax Present", y = "Cumulative Employment Growth",
title = "Cumulative Employment Growth vs. Estate Tax\nSource: Alec-Laffer Rankings, 2016")

# Rerun and include estate_tax mapping into color in the aesthetic in geom_boxplot
ggplot(state, aes(x = estate_tax, y = migration)) +
geom_boxplot( aes(col = estate_tax)) +
labs(x = "Estate Tax Present", y = "Cumulative Employment Growth",
title = "Cumulative Employment Growth vs. Estate Tax\nSource: Alec-Laffer Rankings, 2016")

# Rerun and include the individual data points; throw in some jitter; note that the label is now in State
ggplot(state, aes(x = estate_tax, y = migration)) +
geom_boxplot( aes(col = estate_tax)) +
labs(x = "Estate Tax Present", y = "Cumulative Employment Growth",
title = "Cumulative Employment Growth vs. Estate Tax\nSource: Alec-Laffer Rankings, 2016") +
geom_jitter() +
geom_text_repel(aes(label = State), force = 10,
col = "gray20")

# add the theme doing homage to Edward Tufte; i.e. theme_tufte()
ggplot(state, aes(x = estate_tax, y = migration)) +
geom_boxplot( aes(col = estate_tax)) +
labs(x = "Estate Tax Present", y = "Cumulative Employment Growth",
title = "Cumulative Employment Growth vs. Estate Tax\nSource: Alec-Laffer Rankings, 2016") +
geom_jitter() +
geom_text_repel(aes(label = State), fontface = "bold",
col = "purple", force = 10) +
theme_tufte()

# Still too cluttered; reduce to CT, Florida, and Texas and change color to "gray20"
pointsToLabel <- c("Connecticut", "Florida", "Texas")
ggplot(state, aes(x = estate_tax, y = migration)) +
geom_boxplot( aes(col = estate_tax)) +
labs(x = "Estate Tax Present", y = "Cumulative Employment Growth",
title = "Cumulative Employment Growth vs. Estate Tax\nSource: Alec-Laffer Rankings, 2016") +
geom_jitter() +
geom_text_repel(aes(label = State), force = 5, fontface = "bold",
col = "gray20", data = subset(state, State %in% pointsToLabel)) +
theme_tufte()

#####################################################################################################
# The Grand Finale
#####################################################################
# Reproduce the famous (?) graph displaying the relationship between the Human Development Index
# and the Corruptions Perception Index that appeared in the Economist in December 2011.
# The data is available in the csv file titled "EconomistData."
dat <- read.csv("EconomistData.csv")
str(dat)
## 'data.frame': 173 obs. of 6 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Country : Factor w/ 173 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ HDI.Rank: int 172 70 96 148 45 86 2 19 91 53 ...
## $ HDI : num 0.398 0.739 0.698 0.486 0.797 0.716 0.929 0.885 0.7 0.771 ...
## $ CPI : num 1.5 3.1 2.9 2 3 2.6 8.8 7.8 2.4 7.3 ...
## $ Region : Factor w/ 6 levels "Americas","Asia Pacific",..: 2 3 5 6 1 3 2 4 3 1 ...
names(dat)
## [1] "X" "Country" "HDI.Rank" "HDI" "CPI" "Region"
attach(dat)
## The following objects are masked from dat (pos = 3):
##
## Country, CPI, HDI, HDI.Rank, Region, X
ggplot(dat, aes(x = CPI, y = HDI, color = Region)) +
geom_point(shape = 1, size = 4)+
geom_smooth(method = "lm", formula = y ~ log(x), se = TRUE, color = "red")

pointsToLabel <- c("Russia", "Venezuela", "Iraq", "Myanmar", "Sudan", "Afghanistan",
"Congo", "Greece", "Argentina", "Brazil", "India", "Italy", "China", "South Africa",
"Spain", "Botswana", "Cape Verde", "Bhutan", "Rwanda", "France", "United States",
"Germany", "Britain", "Barbados", "Norway", "New Zealand", "Singapore")
ggplot(dat, aes(x = CPI, y = HDI, color = Region)) +
geom_point(shape = 1, size = 4)+
geom_smooth(method = "lm", formula = y ~ log(x), se = TRUE, color = "red") +
geom_text_repel(aes(label = Country), color = "gray20",
data = subset(dat, Country %in% pointsToLabel),force = 10)+
theme_economist()+
labs(x = "Corruption Perceptions Index, 2011 (10=least corrupt)", y ="Human Development Index, 2011 (1=Best)",
title = "Corruption and Human development")
#We are still missing the R2; the measure of fit. R2 describes the quality linear model fit on the data.
#We are still missing the "Source" text
model_1 = lm(HDI~log10(CPI), dat)
summary(model_1)
##
## Call:
## lm(formula = HDI ~ log10(CPI), data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3167 -0.0713 0.0171 0.0751 0.2585
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3167 0.0267 11.9 <0.0000000000000002 ***
## log10(CPI) 0.6138 0.0450 13.6 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.122 on 171 degrees of freedom
## Multiple R-squared: 0.521, Adjusted R-squared: 0.518
## F-statistic: 186 on 1 and 171 DF, p-value: <0.0000000000000002
objects(summary(model_1))
## [1] "adj.r.squared" "aliased" "call" "coefficients"
## [5] "cov.unscaled" "df" "fstatistic" "r.squared"
## [9] "residuals" "sigma" "terms"
mR2 = summary(model_1)$r.squared
library(grid) # the package grid has numerous tools to place text on a graph
grid.text("Sources: Transparency International; UN Human Development Report",x = .02, y = .01,just = "left",draw = TRUE)
grid.segments(x0 = 0.81, x1 = 0.825,y0 = 0.90, y1 = 0.90, gp = gpar(col = "red"),draw = TRUE)
grid.text(paste0("R² = ", as.integer(mR2*100), "%"),x = 0.835, y = 0.90,gp = gpar(col = "gray20"),draw = TRUE,just = "left")

#====================================================================================================================