#VISUALIZATION IN R: an introduction
# Version II

#November 9, 2018

setwd("C:/Users/arodriguez/Dropbox/RPractice/visualization")  #make sure you change the wd to YOUR wd, not mine
dir()
##  [1] " (DESKTOP-NMMG0TL's conflicted copy 2018-11-08).Rhistory"
##  [2] "base-r.pdf"                                              
##  [3] "climate-change-warnings-over-the-years.jpg"              
##  [4] "corruption-hd.png"                                       
##  [5] "EconomistData.csv"                                       
##  [6] "ggplot_tutorial.pdf"                                     
##  [7] "ggplot_xrcises.pdf"                                      
##  [8] "ggplot2-cheatsheet-2.1.pdf"                              
##  [9] "hullman_journal.pone.0142444.pdf"                        
## [10] "Moser_CHI2017.pdf"                                       
## [11] "paper_BELIV_evaluating_uncertainty_vis.pdf"              
## [12] "R-Intro.R"                                               
## [13] "R - A Gentle Introduction.pdf"                           
## [14] "R - A Gentle Introduction_REV.pdf"                       
## [15] "stateData_aer.csv"                                       
## [16] "Visualising Residuals.pdf"                               
## [17] "Visualization in R an Introduction.R"                    
## [18] "Visualization_In_R_II.R"                                 
## [19] "Visualization_In_R_II.spin.R"                            
## [20] "Visualization_In_R_II.spin.Rmd"
####### housekeeping ###########
options(digits = 3, scipen = 999)
remove(list = ls())  #this clears your workspace

#######################################################################################################################
#There are many ways to do things with R, especially visualization.  Here we will demonstrate two of them: the plotting
#capabilities of a popular package called ggplot.

#The array of tools built into R are known as Base-R tools.  Among them is a plotting function
#called plot().  Philosophically, the plot function mimics our approach to building a plot manually: that is to say,
#you start with a blank page and build up from there.
#
#For instance, plot(x,y) or plot(y~x) produces a scatterplot of the numbers in y versus the numbers in x. 
#The function also allows for the controlling of how the resulting graph is displayed. Thus, if your are interested 
# providing labels to the x and y axes as well as a title you would add to the 
#initial representation as follows  plot(y~x, xlab="this is the x-axis", ylab = "this is the y-axis", 
#      main = "This is a plot of y versus x")
#
#
#############################      
#Load all required packages
#install.packages("tidyverse")
#install.packages("dplyr")
##install.packages("ggplot2")
#install.packages("gapminder")
#install.packages("ggthemes")
#install.packages("datasets")
#install.packages("ggrepel")
#install.packages("grid")
#install.packages("splines")

          library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.5
## v tidyr   0.8.1     v stringr 1.3.0
## v readr   1.1.1     v forcats 0.3.0
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
          library(dplyr)
          library(ggplot2)
          library(ggthemes)
          library(datasets)
          library(ggrepel) 
          library(grid) 
          library(splines)

  #==========================================================================================================
                    # Plotting in Base-R
                    state = read.csv("stateData_aer.csv", header =  TRUE)
                      str(state)
## 'data.frame':    50 obs. of  19 variables:
##  $ X               : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ personal_rate   : num  0.0402 0 0.0454 0.069 0.133 0.0463 0.0699 0.079 0 0.06 ...
##  $ corporate_rate  : num  0.0423 0.094 0.055 0.065 0.088 0.0463 0.09 0.117 0.055 0.06 ...
##  $ progressivity   : num  -2.16 0 10.65 14.5 38.67 ...
##  $ property_tax    : num  15.2 37.1 27.5 18 28.6 ...
##  $ sales_tax       : num  25.01 5.71 37.16 35.64 24.24 ...
##  $ remaing_tax     : num  22.4 13.6 11.8 16.9 18.2 ...
##  $ estate_tax      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 2 1 1 ...
##  $ tax_changes     : num  -0.04 0.19 -0.21 -0.3 -0.71 -0.03 3.5 1.19 -0.6 2.1 ...
##  $ debt            : num  0.085 0.062 0.083 0.05 0.09 0.11 0.071 0.1 0.09 0.08 ...
##  $ employees       : num  584 739 427 535 452 ...
##  $ liability_survey: num  55.1 68.1 65.4 57.7 49.9 67.8 65.9 76.5 56 62.4 ...
##  $ min_wage        : num  7.25 9.75 8.05 8 10 8.31 9.6 8.25 8.05 7.25 ...
##  $ workers_comp    : num  1.81 2.68 1.6 1.08 3.48 1.5 2.87 2.31 1.82 1.75 ...
##  $ rtw             : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 1 1 2 2 ...
##  $ tax_limits      : int  0 1 2 1 2 3 1 2 2 0 ...
##  $ gdp_cumm        : num  0.347 0.595 0.4 0.407 0.403 0.49 0.26 0.25 0.32 0.35 ...
##  $ migration       : int  103580 -21018 536269 63430 -1265447 315015 -153918 41162 834966 406863 ...
##  $ emp_growth_cum  : num  0.012 0.106 0.068 0.029 0.068 0.132 0.011 0.03 0.05 0.07 ...
                        head(state)
##            X personal_rate corporate_rate progressivity property_tax
## 1    Alabama        0.0402         0.0423         -2.16         15.2
## 2     Alaska        0.0000         0.0940          0.00         37.1
## 3    Arizona        0.0454         0.0550         10.65         27.5
## 4   Arkansas        0.0690         0.0650         14.50         18.0
## 5 California        0.1330         0.0880         38.67         28.6
## 6   Colorado        0.0463         0.0463          6.39         28.8
##   sales_tax remaing_tax estate_tax tax_changes  debt employees
## 1     25.01        22.4         No       -0.04 0.085       584
## 2      5.71        13.6         No        0.19 0.062       739
## 3     37.16        11.8         No       -0.21 0.083       427
## 4     35.64        16.9         No       -0.30 0.050       535
## 5     24.24        18.2         No       -0.71 0.090       452
## 6     25.08        13.8         No       -0.03 0.110       524
##   liability_survey min_wage workers_comp rtw tax_limits gdp_cumm migration
## 1             55.1     7.25         1.81 Yes          0    0.347    103580
## 2             68.1     9.75         2.68  No          1    0.595    -21018
## 3             65.4     8.05         1.60 Yes          2    0.400    536269
## 4             57.7     8.00         1.08 Yes          1    0.407     63430
## 5             49.9    10.00         3.48  No          2    0.403  -1265447
## 6             67.8     8.31         1.50  No          3    0.490    315015
##   emp_growth_cum
## 1          0.012
## 2          0.106
## 3          0.068
## 4          0.029
## 5          0.068
## 6          0.132
                          colnames(state)[1] = c("State")  #Replace the name of column 1 from X to State
                            names(state)
##  [1] "State"            "personal_rate"    "corporate_rate"  
##  [4] "progressivity"    "property_tax"     "sales_tax"       
##  [7] "remaing_tax"      "estate_tax"       "tax_changes"     
## [10] "debt"             "employees"        "liability_survey"
## [13] "min_wage"         "workers_comp"     "rtw"             
## [16] "tax_limits"       "gdp_cumm"         "migration"       
## [19] "emp_growth_cum"
                              attach(state)
                    
                    plot(debt ~ log10(gdp_cumm), xlab = "Log of Cumulative GDP", ylab = "State Debt, per-capita", 
                      main = "Debt vs. GDP\n Source: Alec-Laffer Rankings, 2016")
                        abline(lm(debt~ log10(gdp_cumm)))

                                # Exercise 1 ##############################################################################
                                # Is Cumulative Employment Growth higher in Right-to-Work States?
                                # Answer the question by displaying a plot of Cumulative Employment Growth (emp_growth_cum)
                                # versus Right-to-Work (rtw)
                                # Label properly and include a title
                                
                                # #########################################################################################

  #==========================================================================================================

##############################################################################################
# Introducing ggplot
# GGPLOT is a plotting system based on the grammar of graphics.
# GGPLOT is/are a set of tools to map data to visual elements on your plot, to specify the kind of plot you want,
# and last, to control the details of how it will be displayed.
# In ggplot2 a plot is divided into various fundamental/constituent parts : Plot = data + Aesthetics + Geometry.

#Elements of GGPLOT
# 1. dataframe organized as tidydata
# 2. aesthetics mappings or aesthetics or aes : (i) what is your data (ii) how does your data map onto the page
# 3. geom: what sort of plot you want. e.g. scatterplot (geom_point()), boxplot (geom_boxplot()), etc.
# note: aesthetics and geoms combined by adding, i.e. using "+"

# Layering
# Adding geometric, scale, facet and statistics layers to a ggplot() object allows 
# control of virtually every visual aspect of the plot.

##############################################################################################
          
          # We use the dataset mtcars available in Base-R
          ?mtcars
## starting httpd help server ...
##  done
          data(mtcars)
          str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
          #Display your data using aesthetics      (WHAT YOU WANT TO SEE)
          ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) 

#and select the geom (layer 1) type you want: scatterplot (i.e. geom_point)      (HOW YOU WANT TO SEE IT)
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
geom_point() 

              #You can control the shape, size, transparency of the points in the scatter point
              ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
              geom_point(shape = 1, size = 4, col = "red")  # add shape = 1 to the geom_point arguments

                      #And you can continue layering on geoms as needed. For instance, add a smoother.   
                      ggplot(mtcars, aes(wt,mpg)) +
                      geom_point() +
                      geom_smooth()   #try geom_smooth(method = "lm")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

      #and here is the real power of ggplot; you can add  third variable into the existing 2-dim plane
      # In this case, information on the number of cylinders of a particular car is added via color gradations of the
      # data points; 
      # In addition to color,  you can use size and shape
      ggplot(mtcars, aes(wt,mpg))+
      geom_point(aes(color = cyl), size = 2) +
      geom_smooth()   
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

                      #Rerun the last command but change smoother; i.e.  geom_smooth(method = "lm")

              #rerun the previous command after changing the type of variable for cyl; from numeric to factor
              # as follows 
              str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
              mtcars$cyl = as.factor(mtcars$cyl)
              #'cause ggplot is interpreting the variable cyl as numeric; its a factor; no such thing as 3 or 5 cyl
              #rerun the previous command with size = cyl; and then shape = cyl
              
                      #  you could place the col shading in the global aesthetic within ggplot() but that informs 
                      # all subsequent geoms
                      ggplot(mtcars, aes(wt,mpg, col = cyl))+
                      geom_point( size = 2) +
                      geom_smooth(method = "lm")   

                # You can also examine individual variables 
                ggplot(mtcars, aes(x = mpg)) + 
                geom_density()

                # change line colors by cylinder
                ggplot(mtcars, aes(x = mpg)) + 
                geom_density(aes(color = cyl)) 

                # Change fill color by cyl
                # Use semi-transparent fill: alpha = 0.4
                ggplot(mtcars, aes(x = mpg)) + 
                geom_density(aes(fill = cyl), alpha=0.4)

                #EXERCISE 2 ##########################################################################################
                # This time using ggplot recreate the plots from Exercise 1 above.  Sort of.
                # Create a scatter plot of the relationship between the log of cumulative gdp and 
                # the corporate marginal tax rate (corporate_rate)
                # Add a straight line (representing the relationship) onto the scatterplot
                # Label properly and include a title
                
                
                # Rerun and include rtw (Right-to-Work) as an aesthetic in geom_point mapping 
                
                #####################################################################################################
                
                
                #EXERCISE 3 ##########################################################################################
                # This time using ggplot recreate the boxplots from Exercise 1 above.  Sort of.
                # Create a boxplot using geom_boxplot of the relationship between net migration and 
                # Right-to-Work. Label properly - including a Title
                
                
                # Rerun and include rtw mapping into color in the  aesthetic in geom_boxplot 
                
                #####################################################################################################
              
                          
      #EXERCISE 4 ##########################################################################################
      # Load the data set "faithful" - the Old Faithful Geyser Data
      # Create a scatter plot of the relationship between eruptions and waiting time
      # Add a straight line (representing the relationship) onto the scatterplot
      # Can you identify the number of clusters visible?
        
      # rerun and add a title and add countours to the "clusters" via geom_density_2d()          
      
      #####################################################################################################
      
      ###########################################################################################
        # Display the relatiionship between miles-per-gallon (mpg) and a cars cylinders (cyl); 
        # Use a boxplot as your "geom_"     
        ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
          geom_point(aes(size = hp))  + 
          geom_boxplot()

                #Now add information on horsepower (hp) as an aesthetic in geom_point mapping hp into size
                ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
                  geom_point(aes(size = hp))  + 
                  geom_boxplot()

                    #Or even better, a violin plot, instead of a boxplot; i.e.  geom_violin()
                    ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
                    geom_point()  + 
                    geom_violin()

                        #add a little "jitter" to the points; to enhance their presence; i.e. geom_jitter()
                    ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
                      geom_point()  + 
                      geom_violin() +
                          geom_jitter()  # jitter throws a little random distance to each point so it appears offset a bit

                    # geom_text and geom_lable allows you to add text into the graph
                    ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
                      geom_point()  + 
                      geom_violin() +
                    #  geom_jitter(width = 0.5, height = 0.5)+   # control the jitter here or in the geom_label
                      geom_label(aes(label = rownames(mtcars)), fontface = "bold", 
                                col = "red",position=position_jitter(width=0.25,height=0.55)) 

                    # like this
                    ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
                      geom_point()  + 
                      geom_violin() +
                      geom_jitter(width = 0.5, height = 0.5)+   # control the jitter here o in the geom_lable
                      geom_label(aes(label = rownames(mtcars)), fontface = "bold", 
                                 col = "red") 

                    # Still looks "scrunched-up"
                    # use geom_text_repel() in the package ggrepel to display the names better
                    # i.e. geom_text_repel(aes(label = rownames(mtcars)), fontface = "bold", 
                    
                    library("ggrepel")
                    ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
                      geom_point()  + 
                      geom_boxplot() +
                      geom_jitter(width=.25,height=.5) +
                      geom_text_repel(aes(label = rownames(mtcars)), fontface = "bold", 
                                      col = "red") 

                  # the use of "themes" allows a consistent presentation style
                  # theme economist reproduces the style of The Economist magazine
                    
                    ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
                      geom_point()  + 
                      geom_boxplot() +
                      geom_jitter(width=.25,height=.5) +
                      geom_text_repel(aes(label = rownames(mtcars)), fontface = "bold", 
                                      col = "red") +
                      labs(x = "Cylinders", y = "Miles-per-Gallon", title = "Miles per Gallon v. Cylinders")+
                      theme_economist()

                 # Nate Silver's 538 theme
                      ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
                        geom_point()  + 
                        geom_boxplot() +
                        geom_jitter(width=.25,height=.5) +
                        geom_text_repel(aes(label = rownames(mtcars)), fontface = "bold", 
                                        col = "red") +
                        labs(x = "Cylinders", y = "Miles-per-Gallon", title = "Miles per Gallon v. Cylinders")+
                      theme_fivethirtyeight()

                # Still too cluttered; reduce to only Lincoln Continental, the Camaro and the Dodge Challenger      
                      pointsToLabel <- c("Camaro Z28", "Lincoln Continental", "Dodge Challenger")
                      
                      mtcars1 = rownames_to_column(mtcars, var = "Brand") #rownames must be a column
                      str(mtcars1)
## 'data.frame':    32 obs. of  12 variables:
##  $ Brand: chr  "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
##  $ mpg  : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl  : Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
##  $ disp : num  160 160 108 258 360 ...
##  $ hp   : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat : num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt   : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec : num  16.5 17 18.6 19.4 17 ...
##  $ vs   : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am   : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear : num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb : num  4 4 1 1 2 1 4 2 2 4 ...
                          mtcars1$Brand
##  [1] "Mazda RX4"           "Mazda RX4 Wag"       "Datsun 710"         
##  [4] "Hornet 4 Drive"      "Hornet Sportabout"   "Valiant"            
##  [7] "Duster 360"          "Merc 240D"           "Merc 230"           
## [10] "Merc 280"            "Merc 280C"           "Merc 450SE"         
## [13] "Merc 450SL"          "Merc 450SLC"         "Cadillac Fleetwood" 
## [16] "Lincoln Continental" "Chrysler Imperial"   "Fiat 128"           
## [19] "Honda Civic"         "Toyota Corolla"      "Toyota Corona"      
## [22] "Dodge Challenger"    "AMC Javelin"         "Camaro Z28"         
## [25] "Pontiac Firebird"    "Fiat X1-9"           "Porsche 914-2"      
## [28] "Lotus Europa"        "Ford Pantera L"      "Ferrari Dino"       
## [31] "Maserati Bora"       "Volvo 142E"
                      ggplot(data = mtcars1, mapping = aes(x = cyl, y = mpg)) +
                        geom_point()  + 
                        geom_boxplot() +
                        geom_jitter(width=.25,height=.5) +
                        geom_text_repel(aes(label = Brand), force = 5,fontface = "bold", 
                                        col = "red", data = subset(mtcars1, Brand %in% pointsToLabel)  ) +
                        labs(x = "Cylinders", y = "Miles-per-Gallon", title = "Miles per Gallon v. Cylinders")+
                        theme_fivethirtyeight()

                    #EXERCISE 5 ##########################################################################################
                    # go back to the state data set and the boxplot 
                    # Create a boxplot using geom_boxplot of the relationship between employment growth and 
                    # the presence of an estate tax in the state
                    
                    # Rerun and include estate_tax mapping into color in the  aesthetic in geom_boxplot 
                    
                    # Rerun and include the individual data points; throw in some jitter; note that the label is now in State
                     
                    
                    # add the theme doing homage to Edward Tufte; i.e. theme_tufte()
                     
                    # Still too cluttered; reduce to CT, Florida, and Texas  and change color to "gray20"  
                    
#####################################################################################################

# The Grand Finale
#####################################################################
#       Reproduce the famous graph displaying the relationship between the Human Development Index
#       and the Corruptions Perception Index that appeared in the Economist in December 2011.
#       The data is available in the csv file titled "EconomistData."

          dat <- read.csv("EconomistData.csv")
          str(dat)
## 'data.frame':    173 obs. of  6 variables:
##  $ X       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Country : Factor w/ 173 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ HDI.Rank: int  172 70 96 148 45 86 2 19 91 53 ...
##  $ HDI     : num  0.398 0.739 0.698 0.486 0.797 0.716 0.929 0.885 0.7 0.771 ...
##  $ CPI     : num  1.5 3.1 2.9 2 3 2.6 8.8 7.8 2.4 7.3 ...
##  $ Region  : Factor w/ 6 levels "Americas","Asia Pacific",..: 2 3 5 6 1 3 2 4 3 1 ...
          names(dat)
## [1] "X"        "Country"  "HDI.Rank" "HDI"      "CPI"      "Region"
          attach(dat)



#### QED ##########################################################################################################
#SOME RESOURCES
# https://flowingdata.com/2016/03/22/comparing-ggplot2-and-r-base-graphics/
# https://www.statmethods.net/advgraphs/axes.html
# http://www.r-graph-gallery.com/all-graphs/ 














































































































































#====================================================================================================================

# Exercise 1 ##############################################################################
# Is Cumulative Employment Growth higher in Right-to-Work States?
# Answer the question by displaying a plot of Cumulative Employment Growth (emp_growth_cum)
# versus Right-to-Work (rtw)
# Label properly and include a title
plot(emp_growth_cum ~ rtw, xlab = "Right-to-Work State", ylab = "Cumulative Employment Growth",
     main = "Employment Growth vs. Right-to-Work Status\n Source: Alex-Laffer Rankings, 2016")

# #########################################################################################

#EXERCISE 2 ##########################################################################################
# This time using ggplot recreate the plots from Exercise 1 above.  Sort of.
# Create a scatter plot of the relationship between the log of cumulative gdp and 
# the corporate marginal tax rate (corporate_rate)
# Add a straight line (representing the relationship) onto the scatterplot
# Label properly and include a title

ggplot(state, aes(x = personal_rate, y = log10(gdp_cumm))) +
  geom_point(size = 4) +
  geom_smooth(method = "lm") +
  labs(x = "Corporate Marginal Tax Rate", y = "Log of Cumulative GDP", 
       title = "Corporate Tax Rate vs. GDP\nSource: Alec-Laffer Rankings, 2016")

# Rerun and include rtw (Right-to-Work) as an aesthetic in geom_point mapping 

ggplot(state, aes(x = personal_rate, y = log10(gdp_cumm))) +
  geom_point(aes(shape = rtw), size = 4) +
  geom_smooth(method = "lm") +
  labs(x = "Corporate Marginal Tax Rate", y = "Log of Cumulative GDP", 
       title = "Corporate Tax Rate vs. GDP\nSource: Alec-Laffer Rankings, 2016")

#####################################################################################################


#EXERCISE 3 ##########################################################################################
# This time using ggplot recreate the boxplots from Exercise 1 above.  Sort of.
# Create a boxplot using geom_boxplot of the relationship between net migration and 
# Right-to-Work. Label properly - including a Title

ggplot(state, aes(x = rtw, y = migration)) +
  geom_boxplot() +
  labs(x = "Rigth to Work State", y = "Cumulative Net Migration", 
       title = "Cumulative Net Migration vs. Right-to-Work\nSource: Alec-Laffer Rankings, 2016")

# Rerun and include rtw mapping into color in the  aesthetic in geom_boxplot 

ggplot(state, aes(x = rtw, y = migration)) +
  geom_boxplot( aes(col = rtw)) +
  labs(x = "Rigth to Work State", y = "Cumulative Net Migration", 
       title = "Cumulative Net Migration vs. Right-to-Work\nSource: Alec-Laffer Rankings, 2016")

#####################################################################################################


#EXERCISE 4 ##########################################################################################
# Load the data set "faithful" - the Old Faithful Geyser Data
# Create a scatter plot of the relationship between eruptions and waiting time
# Add a straight line (representing the relationship) onto the scatterplot
# Can you identify the number of clusters visible?
ggplot(faithful, aes(x = waiting,  y = eruptions)) +
  geom_point() +
  geom_smooth(method = "lm")

# rerun and add a title and add countours to the "clusters" via geom_density_2d()          

ggplot(faithful, aes(x = waiting,  y = eruptions)) +
  geom_point() +
  geom_smooth(method = "lm")+
  labs(title = "Old Faithful\nEruptions v. Waiting Time")+
  geom_density_2d()

####################################################################################################

#EXERCISE 5 ##########################################################################################
# go back to the state data set and the boxplot 
# Create a boxplot using geom_boxplot of the relationship between employment growth and 
# the presence of an estate tax in the state

ggplot(state, aes(x = estate_tax, y = emp_growth_cum)) +
  geom_boxplot() +
  labs(x = "Estate Tax Present", y = "Cumulative Employment Growth", 
       title = "Cumulative Employment Growth vs. Estate Tax\nSource: Alec-Laffer Rankings, 2016")

# Rerun and include estate_tax mapping into color in the  aesthetic in geom_boxplot 

ggplot(state, aes(x = estate_tax, y = migration)) +
  geom_boxplot( aes(col = estate_tax)) +
  labs(x = "Estate Tax Present", y = "Cumulative Employment Growth", 
       title = "Cumulative Employment Growth vs. Estate Tax\nSource: Alec-Laffer Rankings, 2016")

# Rerun and include the individual data points; throw in some jitter; note that the label is now in State
ggplot(state, aes(x = estate_tax, y = migration)) +
  geom_boxplot( aes(col = estate_tax)) +
  labs(x = "Estate Tax Present", y = "Cumulative Employment Growth", 
       title = "Cumulative Employment Growth vs. Estate Tax\nSource: Alec-Laffer Rankings, 2016") +
  geom_jitter() +
  geom_text_repel(aes(label = State), force = 10, 
                  col = "gray20") 

# add the theme doing homage to Edward Tufte; i.e. theme_tufte()
ggplot(state, aes(x = estate_tax, y = migration)) +
  geom_boxplot( aes(col = estate_tax)) +
  labs(x = "Estate Tax Present", y = "Cumulative Employment Growth", 
       title = "Cumulative Employment Growth vs. Estate Tax\nSource: Alec-Laffer Rankings, 2016") +
  geom_jitter() +
  geom_text_repel(aes(label = State), fontface = "bold", 
                  col = "purple", force = 10) +
  theme_tufte()

# Still too cluttered; reduce to CT, Florida, and Texas  and change color to "gray20"  
pointsToLabel <- c("Connecticut", "Florida", "Texas")

ggplot(state, aes(x = estate_tax, y = migration)) +
  geom_boxplot( aes(col = estate_tax)) +
  labs(x = "Estate Tax Present", y = "Cumulative Employment Growth", 
       title = "Cumulative Employment Growth vs. Estate Tax\nSource: Alec-Laffer Rankings, 2016") +
  geom_jitter() +
  geom_text_repel(aes(label = State), force = 5, fontface = "bold", 
                  col = "gray20", data = subset(state, State %in% pointsToLabel)) +
  theme_tufte()

#####################################################################################################
# The Grand Finale

#####################################################################
#       Reproduce the famous (?) graph displaying the relationship between the Human Development Index
#       and the Corruptions Perception Index that appeared in the Economist in December 2011.
#       The data is available in the csv file titled "EconomistData."

dat <- read.csv("EconomistData.csv")
str(dat)
## 'data.frame':    173 obs. of  6 variables:
##  $ X       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Country : Factor w/ 173 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ HDI.Rank: int  172 70 96 148 45 86 2 19 91 53 ...
##  $ HDI     : num  0.398 0.739 0.698 0.486 0.797 0.716 0.929 0.885 0.7 0.771 ...
##  $ CPI     : num  1.5 3.1 2.9 2 3 2.6 8.8 7.8 2.4 7.3 ...
##  $ Region  : Factor w/ 6 levels "Americas","Asia Pacific",..: 2 3 5 6 1 3 2 4 3 1 ...
names(dat)
## [1] "X"        "Country"  "HDI.Rank" "HDI"      "CPI"      "Region"
attach(dat)
## The following objects are masked from dat (pos = 3):
## 
##     Country, CPI, HDI, HDI.Rank, Region, X
ggplot(dat, aes(x = CPI, y = HDI, color = Region)) +
  geom_point(shape = 1, size = 4)+
  geom_smooth(method = "lm", formula = y ~ log(x), se = TRUE, color = "red") 

pointsToLabel <- c("Russia", "Venezuela", "Iraq", "Myanmar", "Sudan", "Afghanistan", 
                   "Congo", "Greece", "Argentina", "Brazil", "India", "Italy", "China", "South Africa", 
                   "Spain", "Botswana", "Cape Verde", "Bhutan", "Rwanda", "France", "United States", 
                   "Germany", "Britain", "Barbados", "Norway", "New Zealand", "Singapore")

ggplot(dat, aes(x = CPI, y = HDI, color = Region)) +
  geom_point(shape = 1, size = 4)+
  geom_smooth(method = "lm", formula = y ~ log(x), se = TRUE, color = "red") +
  geom_text_repel(aes(label = Country), color = "gray20",
                  data = subset(dat, Country %in% pointsToLabel),force = 10)+
  theme_economist()+
  labs(x = "Corruption Perceptions Index, 2011 (10=least corrupt)", y ="Human Development Index, 2011 (1=Best)",
       title = "Corruption and Human development")

#We are still missing the R2; the measure of fit.  R2 describes the quality linear model fit on the data.
#We are still missing the "Source" text 
model_1 = lm(HDI~log10(CPI), dat)
summary(model_1)
## 
## Call:
## lm(formula = HDI ~ log10(CPI), data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3167 -0.0713  0.0171  0.0751  0.2585 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   0.3167     0.0267    11.9 <0.0000000000000002 ***
## log10(CPI)    0.6138     0.0450    13.6 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.122 on 171 degrees of freedom
## Multiple R-squared:  0.521,  Adjusted R-squared:  0.518 
## F-statistic:  186 on 1 and 171 DF,  p-value: <0.0000000000000002
objects(summary(model_1))
##  [1] "adj.r.squared" "aliased"       "call"          "coefficients" 
##  [5] "cov.unscaled"  "df"            "fstatistic"    "r.squared"    
##  [9] "residuals"     "sigma"         "terms"
mR2 = summary(model_1)$r.squared

library(grid) # the package grid has numerous tools to place text on a graph
grid.text("Sources: Transparency International; UN Human Development Report",x = .02, y = .01,just = "left",draw = TRUE) 
grid.segments(x0 = 0.81, x1 = 0.825,y0 = 0.90, y1 = 0.90, gp = gpar(col = "red"),draw = TRUE) 
grid.text(paste0("R² = ", as.integer(mR2*100), "%"),x = 0.835, y = 0.90,gp = gpar(col = "gray20"),draw = TRUE,just = "left")

#====================================================================================================================