#========================================================#

# CLASSIFICATION TREE PROJECT

# Develop a decision tree model - decision rule - to advise the oncology department on
# whether a biopsy is malignant or benign.

# Use the Wisconsin Breast Cancer data set.
# Report the confusion matrix for every iteration of your modeling,
# especially the final one.

###########################################################################################

        # NOTE(review): clearing the workspace and hard-coding a personal
        # setwd() path are script anti-patterns -- they break the script for
        # any other user or machine.  Consider an RStudio project or the
        # here package instead.
        remove(list = ls())
        setwd("C:/Users/arodriguez/Dropbox/classes/DataMining/DecisionTrees/DT_Project")
        # Print with 3 significant digits and suppress scientific notation.
        options(digits = 3, scipen = 9999)

# ##############################################################################
          library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0     v purrr   0.2.5
## v tibble  2.0.1     v dplyr   0.7.8
## v tidyr   0.8.2     v stringr 1.3.1
## v readr   1.3.1     v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
          #special libraries
          library(mdsr)
## Loading required package: lattice
## Loading required package: ggformula
## Loading required package: ggstance
## 
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
## 
##     geom_errorbarh, GeomErrorbarh
## 
## New to ggformula?  Try the tutorials: 
##  learnr::run_tutorial("introduction", package = "ggformula")
##  learnr::run_tutorial("refining", package = "ggformula")
## Loading required package: mosaicData
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
## 
## In accordance with CRAN policy, the 'mdsr' package 
##            no longer attaches
## the 'tidyverse' package automatically.
## You may need to 'library(tidyverse)' in order to 
##            use certain functions.
          library(rpart)
          library(partykit)  
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
          library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:mosaic':
## 
##     dotPlot
## The following object is masked from 'package:purrr':
## 
##     lift
          library(partykit)  # NOTE(review): duplicate -- partykit was already attached above; harmless but removable

  # Read the Wisconsin Breast Cancer dataset: one row per biopsy; `diagnosis`
  # is the class label (M = malignant, B = benign).
  # Use TRUE rather than the reassignable shorthand T, and <- for assignment.
  cancer <- read.csv("wisc_bc_data.csv", header = TRUE)
  head(cancer)
##         id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1   842302         M        18.0         10.4          122.8      1001
## 2   842517         M        20.6         17.8          132.9      1326
## 3 84300903         M        19.7         21.2          130.0      1203
## 4 84348301         M        11.4         20.4           77.6       386
## 5 84358402         M        20.3         14.3          135.1      1297
## 6   843786         M        12.4         15.7           82.6       477
##   smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1          0.1184           0.2776         0.3001              0.1471
## 2          0.0847           0.0786         0.0869              0.0702
## 3          0.1096           0.1599         0.1974              0.1279
## 4          0.1425           0.2839         0.2414              0.1052
## 5          0.1003           0.1328         0.1980              0.1043
## 6          0.1278           0.1700         0.1578              0.0809
##   symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1         0.242                 0.0787     1.095      0.905         8.59
## 2         0.181                 0.0567     0.543      0.734         3.40
## 3         0.207                 0.0600     0.746      0.787         4.58
## 4         0.260                 0.0974     0.496      1.156         3.44
## 5         0.181                 0.0588     0.757      0.781         5.44
## 6         0.209                 0.0761     0.335      0.890         2.22
##   area_se smoothness_se compactness_se concavity_se concave.points_se
## 1   153.4       0.00640         0.0490       0.0537            0.0159
## 2    74.1       0.00522         0.0131       0.0186            0.0134
## 3    94.0       0.00615         0.0401       0.0383            0.0206
## 4    27.2       0.00911         0.0746       0.0566            0.0187
## 5    94.4       0.01149         0.0246       0.0569            0.0188
## 6    27.2       0.00751         0.0335       0.0367            0.0114
##   symmetry_se fractal_dimension_se radius_worst texture_worst
## 1      0.0300              0.00619         25.4          17.3
## 2      0.0139              0.00353         25.0          23.4
## 3      0.0225              0.00457         23.6          25.5
## 4      0.0596              0.00921         14.9          26.5
## 5      0.0176              0.00511         22.5          16.7
## 6      0.0216              0.00508         15.5          23.8
##   perimeter_worst area_worst smoothness_worst compactness_worst
## 1           184.6       2019            0.162             0.666
## 2           158.8       1956            0.124             0.187
## 3           152.5       1709            0.144             0.424
## 4            98.9        568            0.210             0.866
## 5           152.2       1575            0.137             0.205
## 6           103.4        742            0.179             0.525
##   concavity_worst concave.points_worst symmetry_worst
## 1           0.712                0.265          0.460
## 2           0.242                0.186          0.275
## 3           0.450                0.243          0.361
## 4           0.687                0.258          0.664
## 5           0.400                0.163          0.236
## 6           0.535                0.174          0.399
##   fractal_dimension_worst
## 1                  0.1189
## 2                  0.0890
## 3                  0.0876
## 4                  0.1730
## 5                  0.0768
## 6                  0.1244
            # Create training (80%) and testing (20%) subsets.
            # set.seed() makes the random split reproducible -- without it,
            # every run produces a different partition and a different
            # confusion matrix.
            set.seed(1234)
            cancer_df <- cancer %>% mutate(ID = row_number())
            train <- cancer_df %>% sample_frac(0.8)
            # Rows not sampled into `train` form the test set.
            test <- cancer_df %>% anti_join(train, by = "ID")

                # Establish the null model (majority-class baseline).
                names(train)
##  [1] "id"                      "diagnosis"              
##  [3] "radius_mean"             "texture_mean"           
##  [5] "perimeter_mean"          "area_mean"              
##  [7] "smoothness_mean"         "compactness_mean"       
##  [9] "concavity_mean"          "concave.points_mean"    
## [11] "symmetry_mean"           "fractal_dimension_mean" 
## [13] "radius_se"               "texture_se"             
## [15] "perimeter_se"            "area_se"                
## [17] "smoothness_se"           "compactness_se"         
## [19] "concavity_se"            "concave.points_se"      
## [21] "symmetry_se"             "fractal_dimension_se"   
## [23] "radius_worst"            "texture_worst"          
## [25] "perimeter_worst"         "area_worst"             
## [27] "smoothness_worst"        "compactness_worst"      
## [29] "concavity_worst"         "concave.points_worst"   
## [31] "symmetry_worst"          "fractal_dimension_worst"
## [33] "ID"
                # Null-model baseline: always predicting the majority class
                # ("B") would be right ~62% of the time; the fitted tree must
                # beat this accuracy to be useful.
                prop.table(table(train$diagnosis))
## 
##    B    M 
## 0.62 0.38
                #equivalent (mosaic/mdsr syntax)
                tally(~diagnosis, data = train, format = "percent")                      
## diagnosis
##  B  M 
## 62 38
  # Fit a classification tree on the ten "_mean" predictors.
  # A plain formula object is clearer (and easier to edit) than calling
  # as.formula() on a long pasted string.
  form <- diagnosis ~ radius_mean + texture_mean + perimeter_mean + area_mean +
    smoothness_mean + compactness_mean + concavity_mean + concave.points_mean +
    symmetry_mean + fractal_dimension_mean
  
  # Grow the tree with rpart's default control settings (cp = 0.01).
  mod_tree <- rpart(form, data = train)
  mod_tree
## n= 455 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 455 173 B (0.6198 0.3802)  
##    2) concave.points_mean< 0.0489 270  16 B (0.9407 0.0593)  
##      4) area_mean< 696 257   6 B (0.9767 0.0233) *
##      5) area_mean>=696 13   3 M (0.2308 0.7692) *
##    3) concave.points_mean>=0.0489 185  28 M (0.1514 0.8486)  
##      6) area_mean< 676 54  25 M (0.4630 0.5370)  
##       12) texture_mean< 20.2 33  10 B (0.6970 0.3030)  
##         24) texture_mean< 15.4 12   0 B (1.0000 0.0000) *
##         25) texture_mean>=15.4 21  10 B (0.5238 0.4762)  
##           50) texture_mean>=17.2 13   4 B (0.6923 0.3077) *
##           51) texture_mean< 17.2 8   2 M (0.2500 0.7500) *
##       13) texture_mean>=20.2 21   2 M (0.0952 0.9048) *
##      7) area_mean>=676 131   3 M (0.0229 0.9771) *
  # Record the root-node split point on concave.points_mean so it can be drawn
  # as a reference line in the ggplot graphs below.
  # BUG FIX: the printed tree splits the root at 0.0489, not 0.0514 -- the
  # original constant drew the vline in the wrong place.
  concave.points_mean_split <- 0.0489
  
  # Plot the fitted tree; converting to a partykit object gives a richer
  # rendering than plot.rpart().
  mod_tree %>% as.party() %>% plot()

  # Predict class labels ("B"/"M") for the held-out testing data.
  diagnosis_tree <- predict(mod_tree, type = "class", newdata = test)

  # Visual Display
  # First add new variables to the dataset "test".  The variables are the
  # binary splits identified by the tree.  For instance, if node 2 splits on
  # "age" at age = 54, create a new variable old = age >= 54 in a mutate
  # command and introduce it into a ggplot graph.  Thus the ggplot uses shape
  # and faceting to convey information on 3 of the predictive variables the
  # tree algorithm identified as important.
  #
  # BUG FIX: the tree splits texture_mean at 20.2 (nodes 12/13).  The original
  # used texture_mean < 16.8, which matched no split in the printed tree and
  # made the LOW-texture side TRUE, inverting the later "High Mean Texture"
  # label.  area_mean >= 696 matches the node 4/5 split as printed.
  test <- test %>%
    mutate(hi_area_mean = area_mean >= 696,
           hi_text_mean = texture_mean >= 20.2,
           diagnosis_tree)

        str(train)
## 'data.frame':    455 obs. of  33 variables:
##  $ id                     : int  901315 913063 869931 9113156 871149 853401 89143602 875099 854941 8810955 ...
##  $ diagnosis              : Factor w/ 2 levels "B","M": 1 1 1 1 1 2 1 1 1 2 ...
##  $ radius_mean            : num  10.6 12.4 13.7 14.4 10.9 ...
##  $ texture_mean           : num  20.2 16.4 17.9 27 13 ...
##  $ perimeter_mean         : num  70.2 82.8 88.1 92.2 68.7 ...
##  $ area_mean              : num  338 477 585 646 367 ...
##  $ smoothness_mean        : num  0.0907 0.0951 0.0794 0.0699 0.0751 ...
##  $ compactness_mean       : num  0.166 0.1511 0.0638 0.0522 0.0372 ...
##  $ concavity_mean         : num  0.228 0.1544 0.02881 0.03476 0.00309 ...
##  $ concave.points_mean    : num  0.05941 0.04846 0.01329 0.01737 0.00659 ...
##  $ symmetry_mean          : num  0.219 0.208 0.147 0.171 0.144 ...
##  $ fractal_dimension_mean : num  0.0845 0.0732 0.0558 0.0543 0.0574 ...
##  $ radius_se              : num  0.112 0.392 0.25 0.232 0.282 ...
##  $ texture_se             : num  1.231 1.207 0.757 0.911 0.761 ...
##  $ perimeter_se           : num  2.36 5 1.57 1.73 1.81 ...
##  $ area_se                : num  7.23 30.19 21.47 20.52 18.54 ...
##  $ smoothness_se          : num  0.0085 0.00723 0.00284 0.00536 0.00614 ...
##  $ compactness_se         : num  0.07643 0.07471 0.01592 0.01679 0.00613 ...
##  $ concavity_se           : num  0.1535 0.1114 0.0178 0.01971 0.00184 ...
##  $ concave.points_se      : num  0.02919 0.02721 0.00583 0.00637 0.00358 ...
##  $ symmetry_se            : num  0.0162 0.0323 0.0133 0.0141 0.0164 ...
##  $ fractal_dimension_se   : num  0.0122 0.00963 0.00198 0.00189 0.00266 ...
##  $ radius_worst           : num  10.8 13.8 15.3 15.4 12.4 ...
##  $ texture_worst          : num  22.8 21 22.5 32 18.2 ...
##  $ perimeter_worst        : num  76.5 97.8 97.2 100.4 78.1 ...
##  $ area_worst             : num  352 581 726 735 470 ...
##  $ smoothness_worst       : num  0.1143 0.1175 0.0971 0.1017 0.1171 ...
##  $ compactness_worst      : num  0.3619 0.4061 0.1824 0.146 0.0829 ...
##  $ concavity_worst        : num  0.603 0.4896 0.1564 0.1472 0.0185 ...
##  $ concave.points_worst   : num  0.1465 0.1342 0.0602 0.0556 0.0395 ...
##  $ symmetry_worst         : num  0.26 0.323 0.235 0.234 0.274 ...
##  $ fractal_dimension_worst: num  0.12 0.1034 0.0701 0.0646 0.0769 ...
##  $ ID                     : int  377 486 150 463 160 31 291 193 38 215 ...
        prop.table(table(test$hi_area_mean))
## 
## FALSE  TRUE 
## 0.711 0.289
        prop.table(table(test$hi_text_mean))
## 
## FALSE  TRUE 
## 0.711 0.289
              # Jittered class map: x = root-split variable, color = tree
              # prediction, shape = texture split, facets = area split.
              # FIX: size is a constant, so it belongs OUTSIDE aes(); mapping
              # it inside aes() pushes 3 through a size scale and adds a
              # spurious "3" legend entry.
              ggplot(test, aes(x = concave.points_mean, y = diagnosis)) +
                geom_count(aes(color = diagnosis_tree, shape = hi_text_mean),
                           size = 3,
                           position = position_jitter(width = 0, height = 0.1),
                           alpha = 0.5) +
                # dashed reference line at the root-node split
                geom_vline(xintercept = concave.points_mean_split,
                           color = "black", lty = 2) +
                facet_grid(~hi_area_mean)

  # Same display with the roles swapped: shape = area split, facets = texture
  # split.  As above, constant size = 3 is set outside aes().
  ggplot(test, aes(x = concave.points_mean, y = diagnosis)) +
    geom_count(aes(color = diagnosis_tree, shape = hi_area_mean),
               size = 3,
               position = position_jitter(width = 0, height = 0.1),
               alpha = 0.5) +
    geom_vline(xintercept = concave.points_mean_split,
               color = "black", lty = 2) +
    facet_grid(~hi_text_mean)

  # Relabel the newly created binary predictive variables so the ggplot facet
  # strips and legends read clearly.
  table(test$hi_text_mean)
## 
## FALSE  TRUE 
##    81    33
  # factor() with explicit levels/labels does the as.factor() + recode()
  # conversion in one step: FALSE -> "Other", TRUE -> "High Mean Texture",
  # yielding the same factor levels in the same order.
  test$hi_text_mean <- factor(test$hi_text_mean,
                              levels = c(FALSE, TRUE),
                              labels = c("Other", "High Mean Texture"))

  table(test$hi_area_mean)
## 
## FALSE  TRUE 
##    81    33
  test$hi_area_mean <- factor(test$hi_area_mean,
                              levels = c(FALSE, TRUE),
                              labels = c("Other", "High Mean Area"))

              # Calculate the performance via the confusion matrix.  
              # Focus on the false positives.
              # NOTE(review): caret treats the FIRST factor level ("B") as the
              # positive class by default, so Sensitivity here is benign
              # detection.  For a cancer screen, confusionMatrix(...,
              # positive = "M") is likely intended -- confirm which convention
              # the report should use.
              confusionMatrix(test$diagnosis_tree, test$diagnosis)              
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  B  M
##          B 69  3
##          M  6 36
##                                          
##                Accuracy : 0.921          
##                  95% CI : (0.855, 0.963) 
##     No Information Rate : 0.658          
##     P-Value [Acc > NIR] : 0.0000000000399
##                                          
##                   Kappa : 0.828          
##  Mcnemar's Test P-Value : 0.505          
##                                          
##             Sensitivity : 0.920          
##             Specificity : 0.923          
##          Pos Pred Value : 0.958          
##          Neg Pred Value : 0.857          
##              Prevalence : 0.658          
##          Detection Rate : 0.605          
##    Detection Prevalence : 0.632          
##       Balanced Accuracy : 0.922          
##                                          
##        'Positive' Class : B              
## 
  # Try to reduce the "complexity" of the tree - by pruning at least one node.
  # in other words - reduce the node and then determine the reduction in accuracy 
  # is acceptable.  What about the false positives?  is the associated increase
  # troubling?  in other words - is the reduction in decision making complexity
  # worth the incremental false positives.
  
  # or determine whether the pruning "improves" the accuracy - and reduces the 
  # false positives.
  # printcp() lists the cross-validated error (xerror) at each complexity
  # parameter (CP) value; a good pruning point is where xerror stops
  # improving.  Here xerror is flat from nsplit = 2 onward, suggesting the
  # extra texture splits add complexity without predictive gain.
  printcp(mod_tree)
## 
## Classification tree:
## rpart(formula = form, data = train)
## 
## Variables actually used in tree construction:
## [1] area_mean           concave.points_mean texture_mean       
## 
## Root node error: 173/455 = 0.4
## 
## n= 455 
## 
##     CP nsplit rel error xerror xstd
## 1 0.75      0       1.0    1.0 0.06
## 2 0.04      1       0.3    0.3 0.04
## 3 0.04      2       0.2    0.3 0.04
## 4 0.01      4       0.1    0.3 0.04
## 5 0.01      6       0.1    0.3 0.04
  # plotcp() displays the same CP table graphically.
  plotcp(mod_tree)

    # Refit with cp = 0.03: this prunes the texture sub-splits below node 12
    # (nodes 24/25/50/51 in the first tree), giving a simpler decision rule.
    mod_tree2 <- rpart(form, data = train, control = rpart.control(cp = 0.03))
    mod_tree2
## n= 455 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 455 173 B (0.6198 0.3802)  
##    2) concave.points_mean< 0.0489 270  16 B (0.9407 0.0593)  
##      4) area_mean< 696 257   6 B (0.9767 0.0233) *
##      5) area_mean>=696 13   3 M (0.2308 0.7692) *
##    3) concave.points_mean>=0.0489 185  28 M (0.1514 0.8486)  
##      6) area_mean< 676 54  25 M (0.4630 0.5370)  
##       12) texture_mean< 20.2 33  10 B (0.6970 0.3030) *
##       13) texture_mean>=20.2 21   2 M (0.0952 0.9048) *
##      7) area_mean>=676 131   3 M (0.0229 0.9771) *
      # Plot the pruned tree and its CP profile for comparison with the
      # full tree above.
      plot(as.party(mod_tree2))

      plotcp(mod_tree2)

          # Evaluate the pruned tree on the same test set; compare its
          # confusion matrix (especially the false positives) with the
          # full tree's matrix above.
          diagnosis_tree2 <- predict(mod_tree2, type = "class", newdata = test)
          confusionMatrix(diagnosis_tree2, test$diagnosis)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  B  M
##          B 69  3
##          M  6 36
##                                          
##                Accuracy : 0.921          
##                  95% CI : (0.855, 0.963) 
##     No Information Rate : 0.658          
##     P-Value [Acc > NIR] : 0.0000000000399
##                                          
##                   Kappa : 0.828          
##  Mcnemar's Test P-Value : 0.505          
##                                          
##             Sensitivity : 0.920          
##             Specificity : 0.923          
##          Pos Pred Value : 0.958          
##          Neg Pred Value : 0.857          
##              Prevalence : 0.658          
##          Detection Rate : 0.605          
##    Detection Prevalence : 0.632          
##       Balanced Accuracy : 0.922          
##                                          
##        'Positive' Class : B              
## 
#=========================================================================#