#========================================================#
# CLASSIFICATION TREE PROJECT
# Develop a decision tree model - a decision rule - to advise the oncology
# department on whether a biopsy is malignant or benign.
# Use the Wisconsin Breast Cancer data set.
# Report the confusion matrix for each iteration of the modeling, especially
# the final one.
###########################################################################################
remove(list = ls())
setwd("C:/Users/arodriguez/Dropbox/classes/DataMining/DecisionTrees/DT_Project")
options(digits = 3, scipen = 9999)
# ##############################################################################
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 2.0.1 v dplyr 0.7.8
## v tidyr 0.8.2 v stringr 1.3.1
## v readr 1.3.1 v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#special libraries
library(mdsr)
## Loading required package: lattice
## Loading required package: ggformula
## Loading required package: ggstance
##
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
##
## geom_errorbarh, GeomErrorbarh
##
## New to ggformula? Try the tutorials:
## learnr::run_tutorial("introduction", package = "ggformula")
## learnr::run_tutorial("refining", package = "ggformula")
## Loading required package: mosaicData
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
##
## In accordance with CRAN policy, the 'mdsr' package
## no longer attaches
## the 'tidyverse' package automatically.
## You may need to 'library(tidyverse)' in order to
## use certain functions.
library(rpart)
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:mosaic':
##
## dotPlot
## The following object is masked from 'package:purrr':
##
## lift
# Read the dataset
cancer = read.csv("wisc_bc_data.csv", header = TRUE)
head(cancer)
## id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1 842302 M 18.0 10.4 122.8 1001
## 2 842517 M 20.6 17.8 132.9 1326
## 3 84300903 M 19.7 21.2 130.0 1203
## 4 84348301 M 11.4 20.4 77.6 386
## 5 84358402 M 20.3 14.3 135.1 1297
## 6 843786 M 12.4 15.7 82.6 477
## smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1 0.1184 0.2776 0.3001 0.1471
## 2 0.0847 0.0786 0.0869 0.0702
## 3 0.1096 0.1599 0.1974 0.1279
## 4 0.1425 0.2839 0.2414 0.1052
## 5 0.1003 0.1328 0.1980 0.1043
## 6 0.1278 0.1700 0.1578 0.0809
## symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1 0.242 0.0787 1.095 0.905 8.59
## 2 0.181 0.0567 0.543 0.734 3.40
## 3 0.207 0.0600 0.746 0.787 4.58
## 4 0.260 0.0974 0.496 1.156 3.44
## 5 0.181 0.0588 0.757 0.781 5.44
## 6 0.209 0.0761 0.335 0.890 2.22
## area_se smoothness_se compactness_se concavity_se concave.points_se
## 1 153.4 0.00640 0.0490 0.0537 0.0159
## 2 74.1 0.00522 0.0131 0.0186 0.0134
## 3 94.0 0.00615 0.0401 0.0383 0.0206
## 4 27.2 0.00911 0.0746 0.0566 0.0187
## 5 94.4 0.01149 0.0246 0.0569 0.0188
## 6 27.2 0.00751 0.0335 0.0367 0.0114
## symmetry_se fractal_dimension_se radius_worst texture_worst
## 1 0.0300 0.00619 25.4 17.3
## 2 0.0139 0.00353 25.0 23.4
## 3 0.0225 0.00457 23.6 25.5
## 4 0.0596 0.00921 14.9 26.5
## 5 0.0176 0.00511 22.5 16.7
## 6 0.0216 0.00508 15.5 23.8
## perimeter_worst area_worst smoothness_worst compactness_worst
## 1 184.6 2019 0.162 0.666
## 2 158.8 1956 0.124 0.187
## 3 152.5 1709 0.144 0.424
## 4 98.9 568 0.210 0.866
## 5 152.2 1575 0.137 0.205
## 6 103.4 742 0.179 0.525
## concavity_worst concave.points_worst symmetry_worst
## 1 0.712 0.265 0.460
## 2 0.242 0.186 0.275
## 3 0.450 0.243 0.361
## 4 0.687 0.258 0.664
## 5 0.400 0.163 0.236
## 6 0.535 0.174 0.399
## fractal_dimension_worst
## 1 0.1189
## 2 0.0890
## 3 0.0876
## 4 0.1730
## 5 0.0768
## 6 0.1244
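# Defensive step: under R >= 4.0, read.csv() no longer converts strings to factors,
# so make the outcome an explicit factor before modeling (harmless on older R,
# where diagnosis is already a factor, as the str() output below shows):
cancer$diagnosis = factor(cancer$diagnosis, levels = c("B", "M"))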
#Create training and testing subsets (an 80/20 split)
set.seed(1)  # fix the RNG so the random split is reproducible (any fixed seed works)
cancer_df = cancer %>% mutate(ID = row_number())
train = cancer_df %>% sample_frac(0.8)
test = cancer_df %>% anti_join(train, by = "ID")
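# Note: sample_frac() splits without regard to class balance. Since caret is
# loaded, a stratified 80/20 split is one alternative; a minimal sketch
# (not run here - it would replace the split above):
# idx   = createDataPartition(cancer_df$diagnosis, p = 0.8, list = FALSE)
# train = cancer_df[idx, ]
# test  = cancer_df[-idx, ]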
#Establish Null Model
names(train)
## [1] "id" "diagnosis"
## [3] "radius_mean" "texture_mean"
## [5] "perimeter_mean" "area_mean"
## [7] "smoothness_mean" "compactness_mean"
## [9] "concavity_mean" "concave.points_mean"
## [11] "symmetry_mean" "fractal_dimension_mean"
## [13] "radius_se" "texture_se"
## [15] "perimeter_se" "area_se"
## [17] "smoothness_se" "compactness_se"
## [19] "concavity_se" "concave.points_se"
## [21] "symmetry_se" "fractal_dimension_se"
## [23] "radius_worst" "texture_worst"
## [25] "perimeter_worst" "area_worst"
## [27] "smoothness_worst" "compactness_worst"
## [29] "concavity_worst" "concave.points_worst"
## [31] "symmetry_worst" "fractal_dimension_worst"
## [33] "ID"
prop.table(table(train$diagnosis))
##
## B M
## 0.62 0.38
#equivalent
tally(~diagnosis, data = train, format = "percent")
## diagnosis
## B M
## 62 38
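# The null model always predicts the majority class (benign), so its accuracy
# equals the majority-class share - the baseline any fitted tree must beat:
max(prop.table(table(train$diagnosis)))  # ~0.62 on this split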
# Fit a tree
form = as.formula("diagnosis ~ radius_mean + texture_mean + perimeter_mean + area_mean + smoothness_mean +
compactness_mean +concavity_mean+concave.points_mean + symmetry_mean+ fractal_dimension_mean")
mod_tree = rpart(form, data = train)
mod_tree
## n= 455
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 455 173 B (0.6198 0.3802)
## 2) concave.points_mean< 0.0489 270 16 B (0.9407 0.0593)
## 4) area_mean< 696 257 6 B (0.9767 0.0233) *
## 5) area_mean>=696 13 3 M (0.2308 0.7692) *
## 3) concave.points_mean>=0.0489 185 28 M (0.1514 0.8486)
## 6) area_mean< 676 54 25 M (0.4630 0.5370)
## 12) texture_mean< 20.2 33 10 B (0.6970 0.3030)
## 24) texture_mean< 15.4 12 0 B (1.0000 0.0000) *
## 25) texture_mean>=15.4 21 10 B (0.5238 0.4762)
## 50) texture_mean>=17.2 13 4 B (0.6923 0.3077) *
## 51) texture_mean< 17.2 8 2 M (0.2500 0.7500) *
## 13) texture_mean>=20.2 21 2 M (0.0952 0.9048) *
## 7) area_mean>=676 131 3 M (0.0229 0.9771) *
# Make a note of the position of the split in the first node,
# so you can map it below in a ggplot.
concave.points_mean_split = 0.0489
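# Rather than hard-coding, the threshold can be read off the fitted object; a
# sketch, assuming the root's primary split occupies the first row of the
# splits matrix (the "index" column holds the cut point for continuous variables):
# concave.points_mean_split = mod_tree$splits[1, "index"]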
#Plot the fitted tree
plot(as.party(mod_tree))

#Predict using the testing data set
diagnosis_tree = predict(mod_tree, type = "class", newdata = test)
# Visual Display
# First add new variables to the dataset "test". The variables should be
# the binary ones identified by the tree. For instance, if node 2 splits
# on "age" at age = 54, then create a new variable
# old = age >= 54 in a mutate command,
# then introduce that variable into a ggplot graph.
# Thus, you will have a ggplot that uses shape and faceting to convey information
# on 3 of the predictive variables identified as important by the tree algorithm
test = test %>% mutate(hi_area_mean = area_mean >= 696,    # node 2 split from the tree
                       hi_text_mean = texture_mean >= 20.2, # node 13 split from the tree
                       diagnosis_tree)
str(train)
## 'data.frame': 455 obs. of 33 variables:
## $ id : int 901315 913063 869931 9113156 871149 853401 89143602 875099 854941 8810955 ...
## $ diagnosis : Factor w/ 2 levels "B","M": 1 1 1 1 1 2 1 1 1 2 ...
## $ radius_mean : num 10.6 12.4 13.7 14.4 10.9 ...
## $ texture_mean : num 20.2 16.4 17.9 27 13 ...
## $ perimeter_mean : num 70.2 82.8 88.1 92.2 68.7 ...
## $ area_mean : num 338 477 585 646 367 ...
## $ smoothness_mean : num 0.0907 0.0951 0.0794 0.0699 0.0751 ...
## $ compactness_mean : num 0.166 0.1511 0.0638 0.0522 0.0372 ...
## $ concavity_mean : num 0.228 0.1544 0.02881 0.03476 0.00309 ...
## $ concave.points_mean : num 0.05941 0.04846 0.01329 0.01737 0.00659 ...
## $ symmetry_mean : num 0.219 0.208 0.147 0.171 0.144 ...
## $ fractal_dimension_mean : num 0.0845 0.0732 0.0558 0.0543 0.0574 ...
## $ radius_se : num 0.112 0.392 0.25 0.232 0.282 ...
## $ texture_se : num 1.231 1.207 0.757 0.911 0.761 ...
## $ perimeter_se : num 2.36 5 1.57 1.73 1.81 ...
## $ area_se : num 7.23 30.19 21.47 20.52 18.54 ...
## $ smoothness_se : num 0.0085 0.00723 0.00284 0.00536 0.00614 ...
## $ compactness_se : num 0.07643 0.07471 0.01592 0.01679 0.00613 ...
## $ concavity_se : num 0.1535 0.1114 0.0178 0.01971 0.00184 ...
## $ concave.points_se : num 0.02919 0.02721 0.00583 0.00637 0.00358 ...
## $ symmetry_se : num 0.0162 0.0323 0.0133 0.0141 0.0164 ...
## $ fractal_dimension_se : num 0.0122 0.00963 0.00198 0.00189 0.00266 ...
## $ radius_worst : num 10.8 13.8 15.3 15.4 12.4 ...
## $ texture_worst : num 22.8 21 22.5 32 18.2 ...
## $ perimeter_worst : num 76.5 97.8 97.2 100.4 78.1 ...
## $ area_worst : num 352 581 726 735 470 ...
## $ smoothness_worst : num 0.1143 0.1175 0.0971 0.1017 0.1171 ...
## $ compactness_worst : num 0.3619 0.4061 0.1824 0.146 0.0829 ...
## $ concavity_worst : num 0.603 0.4896 0.1564 0.1472 0.0185 ...
## $ concave.points_worst : num 0.1465 0.1342 0.0602 0.0556 0.0395 ...
## $ symmetry_worst : num 0.26 0.323 0.235 0.234 0.274 ...
## $ fractal_dimension_worst: num 0.12 0.1034 0.0701 0.0646 0.0769 ...
## $ ID : int 377 486 150 463 160 31 291 193 38 215 ...
prop.table(table(test$hi_area_mean))
##
## FALSE TRUE
## 0.711 0.289
prop.table(table(test$hi_text_mean))
##
## FALSE TRUE
## 0.711 0.289
ggplot(test, aes(x = concave.points_mean, y = diagnosis)) +
  geom_count(aes(color = diagnosis_tree, shape = hi_text_mean), size = 3,
             position = position_jitter(width = 0, height = 0.1),
             alpha = 0.5) +
  geom_vline(xintercept = concave.points_mean_split, color = "black", lty = 2) +
  facet_grid(~hi_area_mean)

ggplot(test, aes(x = concave.points_mean, y = diagnosis)) +
  geom_count(aes(color = diagnosis_tree, shape = hi_area_mean), size = 3,
             position = position_jitter(width = 0, height = 0.1),
             alpha = 0.5) +
  geom_vline(xintercept = concave.points_mean_split, color = "black", lty = 2) +
  facet_grid(~hi_text_mean)

#Relabel the newly created binary predictive variables to improve the ggplot
#graphs
table(test$hi_text_mean)
##
## FALSE TRUE
## 81 33
test$hi_text_mean = as.factor(test$hi_text_mean)
test$hi_text_mean = recode(test$hi_text_mean, "TRUE" = "High Mean Texture", "FALSE" = "Other")
table(test$hi_area_mean)
##
## FALSE TRUE
## 81 33
test$hi_area_mean = as.factor(test$hi_area_mean)
test$hi_area_mean = recode(test$hi_area_mean, "TRUE" = "High Mean Area", "FALSE" = "Other")
# Calculate the performance via the confusion matrix.
# Focus on the false positives.
confusionMatrix(test$diagnosis_tree, test$diagnosis)
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 69 3
## M 6 36
##
## Accuracy : 0.921
## 95% CI : (0.855, 0.963)
## No Information Rate : 0.658
## P-Value [Acc > NIR] : 0.0000000000399
##
## Kappa : 0.828
## Mcnemar's Test P-Value : 0.505
##
## Sensitivity : 0.920
## Specificity : 0.923
## Pos Pred Value : 0.958
## Neg Pred Value : 0.857
## Prevalence : 0.658
## Detection Rate : 0.605
## Detection Prevalence : 0.632
## Balanced Accuracy : 0.922
##
## 'Positive' Class : B
##
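# Checking the headline numbers by hand from the four cells above
# (positive class = B):
69 / (69 + 6)  # sensitivity: predicted B among all actual B -> 0.920
36 / (36 + 3)  # specificity: predicted M among all actual M -> 0.923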
# Try to reduce the "complexity" of the tree by pruning at least one node.
# In other words, prune and then determine whether the resulting loss of
# accuracy is acceptable. What about the false positives? Is the associated
# increase troubling? That is, is the reduction in decision-making complexity
# worth the incremental false positives? Alternatively, determine whether the
# pruning "improves" accuracy and reduces the false positives.
printcp(mod_tree)
##
## Classification tree:
## rpart(formula = form, data = train)
##
## Variables actually used in tree construction:
## [1] area_mean concave.points_mean texture_mean
##
## Root node error: 173/455 = 0.4
##
## n= 455
##
## CP nsplit rel error xerror xstd
## 1 0.75 0 1.0 1.0 0.06
## 2 0.04 1 0.3 0.3 0.04
## 3 0.04 2 0.2 0.3 0.04
## 4 0.01 4 0.1 0.3 0.04
## 5 0.01 6 0.1 0.3 0.04
plotcp(mod_tree)

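# An alternative to refitting with a fixed cp is to prune the tree already in
# hand at the cp that minimizes cross-validated error (xerror); a sketch using
# rpart's own cptable (not run here, since the refit below does the same job):
# best_cp = mod_tree$cptable[which.min(mod_tree$cptable[, "xerror"]), "CP"]
# mod_tree_pruned = prune(mod_tree, cp = best_cp)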
# Refit with a larger complexity parameter: under cp = 0.03, rpart drops any
# split that fails to reduce the overall lack of fit by a factor of 0.03, which
# prunes the deep texture_mean splits (nodes 24/25 and 50/51) seen above.
mod_tree2 = rpart(form, data = train, control = rpart.control(cp = 0.03))
mod_tree2
## n= 455
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 455 173 B (0.6198 0.3802)
## 2) concave.points_mean< 0.0489 270 16 B (0.9407 0.0593)
## 4) area_mean< 696 257 6 B (0.9767 0.0233) *
## 5) area_mean>=696 13 3 M (0.2308 0.7692) *
## 3) concave.points_mean>=0.0489 185 28 M (0.1514 0.8486)
## 6) area_mean< 676 54 25 M (0.4630 0.5370)
## 12) texture_mean< 20.2 33 10 B (0.6970 0.3030) *
## 13) texture_mean>=20.2 21 2 M (0.0952 0.9048) *
## 7) area_mean>=676 131 3 M (0.0229 0.9771) *
plot(as.party(mod_tree2))

plotcp(mod_tree2)

diagnosis_tree2 = predict(mod_tree2, type = "class", newdata = test)
confusionMatrix(diagnosis_tree2, test$diagnosis)
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 69 3
## M 6 36
##
## Accuracy : 0.921
## 95% CI : (0.855, 0.963)
## No Information Rate : 0.658
## P-Value [Acc > NIR] : 0.0000000000399
##
## Kappa : 0.828
## Mcnemar's Test P-Value : 0.505
##
## Sensitivity : 0.920
## Specificity : 0.923
## Pos Pred Value : 0.958
## Neg Pred Value : 0.857
## Prevalence : 0.658
## Detection Rate : 0.605
## Detection Prevalence : 0.632
## Balanced Accuracy : 0.922
##
## 'Positive' Class : B
##
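# The clinically costly errors are malignant biopsies called benign ("false
# positives" with B as the positive class). Per the matrices above, both trees
# miss the same 3 malignant cases on this test set; a quick check, using the
# predictions computed earlier:
sum(test$diagnosis == "M" & test$diagnosis_tree == "B")  # full tree:   3
sum(test$diagnosis == "M" & diagnosis_tree2 == "B")      # pruned tree: 3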
#=========================================================================#