Managing Data
Arranging and rearranging data is key to successful work.

# ON SUBSETTING
#

rm(list = ls())  # Start from an empty workspace
data("mtcars")   # Load the built-in mtcars data set
# attach() puts the columns of mtcars on the search path, so they can be
# referred to by bare name (e.g. `cyl`) without the `mtcars$` prefix.
attach(mtcars)

colnames(mtcars)
##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"
# Subsetting variables
# Build a smaller data frame that keeps only the columns mpg, cyl, and qsec
# (columns 1, 2, and 7 of mtcars), selected here by name.

  mtcars_small_1 <- mtcars[, c("mpg", "cyl", "qsec")]
  mtcars_small_1
##                      mpg cyl  qsec
## Mazda RX4           21.0   6 16.46
## Mazda RX4 Wag       21.0   6 17.02
## Datsun 710          22.8   4 18.61
## Hornet 4 Drive      21.4   6 19.44
## Hornet Sportabout   18.7   8 17.02
## Valiant             18.1   6 20.22
## Duster 360          14.3   8 15.84
## Merc 240D           24.4   4 20.00
## Merc 230            22.8   4 22.90
## Merc 280            19.2   6 18.30
## Merc 280C           17.8   6 18.90
## Merc 450SE          16.4   8 17.40
## Merc 450SL          17.3   8 17.60
## Merc 450SLC         15.2   8 18.00
## Cadillac Fleetwood  10.4   8 17.98
## Lincoln Continental 10.4   8 17.82
## Chrysler Imperial   14.7   8 17.42
## Fiat 128            32.4   4 19.47
## Honda Civic         30.4   4 18.52
## Toyota Corolla      33.9   4 19.90
## Toyota Corona       21.5   4 20.01
## Dodge Challenger    15.5   8 16.87
## AMC Javelin         15.2   8 17.30
## Camaro Z28          13.3   8 15.41
## Pontiac Firebird    19.2   8 17.05
## Fiat X1-9           27.3   4 18.90
## Porsche 914-2       26.0   4 16.70
## Lotus Europa        30.4   4 16.90
## Ford Pantera L      15.8   8 14.50
## Ferrari Dino        19.7   6 15.50
## Maserati Bora       15.0   8 14.60
## Volvo 142E          21.4   4 18.60
  # Now keep columns 3 through 6 (disp, hp, drat, wt), selected by name.
  mtcars_small_2 <- mtcars[, c("disp", "hp", "drat", "wt")]
  mtcars_small_2
##                      disp  hp drat    wt
## Mazda RX4           160.0 110 3.90 2.620
## Mazda RX4 Wag       160.0 110 3.90 2.875
## Datsun 710          108.0  93 3.85 2.320
## Hornet 4 Drive      258.0 110 3.08 3.215
## Hornet Sportabout   360.0 175 3.15 3.440
## Valiant             225.0 105 2.76 3.460
## Duster 360          360.0 245 3.21 3.570
## Merc 240D           146.7  62 3.69 3.190
## Merc 230            140.8  95 3.92 3.150
## Merc 280            167.6 123 3.92 3.440
## Merc 280C           167.6 123 3.92 3.440
## Merc 450SE          275.8 180 3.07 4.070
## Merc 450SL          275.8 180 3.07 3.730
## Merc 450SLC         275.8 180 3.07 3.780
## Cadillac Fleetwood  472.0 205 2.93 5.250
## Lincoln Continental 460.0 215 3.00 5.424
## Chrysler Imperial   440.0 230 3.23 5.345
## Fiat 128             78.7  66 4.08 2.200
## Honda Civic          75.7  52 4.93 1.615
## Toyota Corolla       71.1  65 4.22 1.835
## Toyota Corona       120.1  97 3.70 2.465
## Dodge Challenger    318.0 150 2.76 3.520
## AMC Javelin         304.0 150 3.15 3.435
## Camaro Z28          350.0 245 3.73 3.840
## Pontiac Firebird    400.0 175 3.08 3.845
## Fiat X1-9            79.0  66 4.08 1.935
## Porsche 914-2       120.3  91 4.43 2.140
## Lotus Europa         95.1 113 3.77 1.513
## Ford Pantera L      351.0 264 4.22 3.170
## Ferrari Dino        145.0 175 3.62 2.770
## Maserati Bora       301.0 335 3.54 3.570
## Volvo 142E          121.0 109 4.11 2.780
  # Subsetting observations

  # Row indices 1 through 10 keep the first 10 observations; the empty
  # column slot after the comma keeps every column.
  mt_rows_3 <- mtcars_small_1[seq_len(10), ]
  mt_rows_3
##                    mpg cyl  qsec
## Mazda RX4         21.0   6 16.46
## Mazda RX4 Wag     21.0   6 17.02
## Datsun 710        22.8   4 18.61
## Hornet 4 Drive    21.4   6 19.44
## Hornet Sportabout 18.7   8 17.02
## Valiant           18.1   6 20.22
## Duster 360        14.3   8 15.84
## Merc 240D         24.4   4 20.00
## Merc 230          22.8   4 22.90
## Merc 280          19.2   6 18.30
  # Subsetting using logical queries
  # In this case choose all cars that have 4 cylinders.
  # The condition names the column explicitly (mtcars$cyl) instead of a bare
  # `cyl`, so it does not depend on attach() and cannot be hijacked by an
  # unrelated workspace variable that happens to be called `cyl`.

  mtcars_cyl <- mtcars[mtcars$cyl == 4, ]  # keep the rows where the condition is TRUE
  mtcars_cyl
##                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Fiat X1-9      27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa   30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Volvo 142E     21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
    # Keep the cars whose cylinder count is one of 2, 4, or 6. The column is
    # referenced as mtcars$cyl so the filter does not rely on attach().
    mtcars_small_3 <- mtcars[mtcars$cyl %in% c(2, 4, 6), ]

    # Keep the cars with 4 cylinders AND more than 100 horsepower.
    # with() evaluates exactly one expression; a second condition written
    # after a comma is silently ignored, so the two conditions must be
    # combined with `&` to both take effect.
    mtcars_small_4 <- mtcars[with(mtcars, cyl == 4 & hp > 100), ]
  
    # Using the function subset():

    # Rows with fuel economy below 20 mpg.
    mtcars_small_5 <- subset(mtcars, subset = mpg < 20)

    # Same object name, stricter filter: below 20 mpg and 8 cylinders.
    # This replaces the result of the previous line.
    mtcars_small_5 <- subset(mtcars, subset = mpg < 20 & cyl == 8)
    
    # Using the `select` argument of subset() to choose columns

    # A single column, given by name.
    mtcars_small_6 <- subset(
      mtcars,
      subset = mpg < 20 & cyl == 8,
      select = hp
    )

    # Columns 4 and 6 (hp and wt) given by position; this replaces the
    # previous mtcars_small_6.
    mtcars_small_6 <- subset(
      mtcars,
      subset = mpg < 20 & cyl == 8,
      select = c(4, 6)
    )
    # The same selection, stored under a new name.
    mtcars_small_7 <- subset(
      mtcars,
      subset = mpg < 20 & cyl == 8,
      select = c(4, 6)
    )

    # A range of adjacent columns, from hp through am.
    mtcars_small_8 <- subset(
      mtcars,
      subset = mpg < 20 & cyl == 8,
      select = hp:am
    )
    
    # Subsetting BOTH variables and observations

    names(mtcars)
##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"
    # Two equivalent ways to keep the 4-cylinder cars together with columns
    # 1-3 and 7 (mpg, cyl, disp, qsec).
    # subset() looks up `cyl` inside the data frame it is given.
    mtcars_small_9 <- subset(mtcars, cyl == 4, select = c(1:3, 7))
    # Bracket indexing does not look inside the data frame, so the column is
    # written as mtcars$cyl rather than relying on an attach()ed `cyl`.
    mtcars_small_10 <- mtcars[mtcars$cyl == 4, c(1:3, 7)]

Homework or Classwork

Use the data set state_data_csv.

Create a smaller data set consisting only of Population, Income, and Life Expectancy.
Using the smaller data set, show a plot of Income and Life Expectancy.
Make sure the plot has a line through the data showing the best fit OLS line.