# NAIVE BAYES PROJECT
# The Fruit Detector
# Below are data on several types of fruit, described by 3 attributes:
# Long, Sweet, Yellow
#PART I #############################
# Fit a Naive Bayes and a Logistic Regression classification model,
# and display ROC curves to establish which model performs better;
# also, calculate the respective confusion matrices.
# Collapse the fruit types Orange and Other into a single
# factor level called "Other." This reduces the exercise to fitting
# a two-class model, as we did in class.
# Use the entire data set for both training and testing.
# Provide graphical displays of Type of Fruit by its Attributes.
#PART II #############################
# Fit a Naive Bayes model and a MULTINOMIAL Logistic Regression
# classification model on the fruit data.
# This time, however, do not collapse the factor levels.
# In other words, fit a 3-class model using Naive Bayes and
# Multinomial Logistic Regression. For this, use the function
# multinom() from the package "nnet."
# Plot the ROC curves and calculate the confusion matrices.
# Interpret the results (briefly).
#################################################################
setwd("C:/Users/arodriguez/Dropbox/classes/DataMining/NaiveBayes")
options(digits = 3, scipen = 99999)
remove(list = ls())
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 2.0.1 v dplyr 0.7.8
## v tidyr 0.8.2 v stringr 1.3.1
## v readr 1.3.1 v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(e1071)
library(epitools)
library(DescTools)
library(sjPlot)
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
##
## MAE, RMSE
## The following object is masked from 'package:purrr':
##
## lift
library(caTools)
#########################################################
Fruit = matrix(
  c(7, 15, 5,
    2, 3, 3,
    7, 7, 5),    # the data elements
  nrow = 3,      # number of rows
  ncol = 3,      # number of columns
  byrow = TRUE)  # fill matrix by rows
Fruit
## [,1] [,2] [,3]
## [1,] 7 15 5
## [2,] 2 3 3
## [3,] 7 7 5
dimnames(Fruit) = list(
  c("Banana", "Orange", "Other"),  # row names
  c("Long", "Sweet", "Yellow"))    # column names
#Note that prop.table() computes each cell as a
#proportion of the TOTAL count, NOT as a proportion of
#the marginal (row or column) totals
addmargins(Fruit)
## Long Sweet Yellow Sum
## Banana 7 15 5 27
## Orange 2 3 3 8
## Other 7 7 5 19
## Sum 16 25 13 54
prop.table(Fruit)
## Long Sweet Yellow
## Banana 0.130 0.2778 0.0926
## Orange 0.037 0.0556 0.0556
## Other 0.130 0.1296 0.0926
addmargins(prop.table(Fruit))
## Long Sweet Yellow Sum
## Banana 0.130 0.2778 0.0926 0.500
## Orange 0.037 0.0556 0.0556 0.148
## Other 0.130 0.1296 0.0926 0.352
## Sum 0.296 0.4630 0.2407 1.000
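#If row-conditional proportions are wanted instead, prop.table()
#takes a margin argument. A quick side check (not part of the
#original output): margin = 1 divides each cell by its row total,
#so the Banana row becomes 7/27, 15/27, 5/27 = 0.259, 0.556, 0.185
prop.table(Fruit, margin = 1)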
#Use the expand.table() function from the epitools package
#to expand the contingency table into one row per fruit
fruit = expand.table(Fruit)
head(fruit,3)
## Var2 Var1
## 1 Banana Long
## 2 Banana Long
## 3 Banana Long
#Rename the variables
colnames(fruit) = c("Type", "Attribute")
#Collapse Orange and Other into a single factor level (called "Other");
#Hint: use fct_collapse() from forcats
table(fruit$Type)
##
## Banana Orange Other
## 27 8 19
fruit$Type <- fct_collapse(fruit$Type,
  Banana = c("Banana"),
  Other  = c("Orange", "Other")
)
table(fruit$Type, fruit$Attribute)
##
## Long Sweet Yellow
## Banana 7 15 5
## Other 9 10 8
head(fruit,3)
## Type Attribute
## 1 Banana Long
## 2 Banana Long
## 3 Banana Long
#Grouping Exercises
fruit %>%
  group_by(Type) %>%
  summarize(n = n())
## # A tibble: 2 x 2
## Type n
## <fct> <int>
## 1 Banana 27
## 2 Other 27
fruit %>%
  group_by(Attribute) %>%
  summarize(n = n())
## # A tibble: 3 x 2
## Attribute n
## <fct> <int>
## 1 Long 16
## 2 Sweet 25
## 3 Yellow 13
#Use this construction to obtain the row-conditional
#proportions, i.e. P(Attribute | Type)
fruit %>%
  count(Type, Attribute) %>%
  group_by(Type) %>%
  mutate(prop = n / sum(n)) %>%
  select(-n) %>%
  spread(Attribute, prop, fill = 0)
## # A tibble: 2 x 4
## # Groups: Type [2]
## Type Long Sweet Yellow
## <fct> <dbl> <dbl> <dbl>
## 1 Banana 0.259 0.556 0.185
## 2 Other 0.333 0.370 0.296
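#These row proportions are exactly the class-conditional
#likelihoods P(Attribute | Type) that Naive Bayes multiplies by
#the class priors. A hand check of one posterior, using counts
#read off the collapsed table above (a sketch, not part of the
#assignment):
p_long_given_banana = 7/27   # P(Long | Banana)
p_banana            = 27/54  # P(Banana), the prior
p_long              = 16/54  # P(Long), the marginal
p_long_given_banana * p_banana / p_long  # P(Banana | Long) = 7/16 = 0.4375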
#Again, note that prop.table() divides by the grand total,
#not by the row sums
fruit_t = table(fruit$Type, fruit$Attribute)
addmargins(fruit_t)
##
## Long Sweet Yellow Sum
## Banana 7 15 5 27
## Other 9 10 8 27
## Sum 16 25 13 54
prop.table(fruit_t)
##
## Long Sweet Yellow
## Banana 0.1296 0.2778 0.0926
## Other 0.1667 0.1852 0.1481
addmargins(prop.table(fruit_t))
##
## Long Sweet Yellow Sum
## Banana 0.1296 0.2778 0.0926 0.5000
## Other 0.1667 0.1852 0.1481 0.5000
## Sum 0.2963 0.4630 0.2407 1.0000
#Explore the commands in the package sjPlot for creating graphs of
#the cross-tables
sjp.grpfrq(fruit$Type, fruit$Attribute)

sjp.grpfrq(fruit$Attribute, fruit$Type)

sjp.frq(fruit$Type)

#Fit Naive Bayes
model1 = naiveBayes(Type ~ Attribute, data = fruit)
model1_predict = predict(model1, newdata = fruit, type = "class")
table(model1_predict)
## model1_predict
## Banana Other
## 25 29
confusionMatrix(model1_predict, fruit$Type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Banana Other
## Banana 15 10
## Other 12 17
##
## Accuracy : 0.593
## 95% CI : (0.45, 0.724)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.110
##
## Kappa : 0.185
## Mcnemar's Test P-Value : 0.831
##
## Sensitivity : 0.556
## Specificity : 0.630
## Pos Pred Value : 0.600
## Neg Pred Value : 0.586
## Prevalence : 0.500
## Detection Rate : 0.278
## Detection Prevalence : 0.463
## Balanced Accuracy : 0.593
##
## 'Positive' Class : Banana
##
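#Sanity check on the posteriors behind those class predictions:
#predict() with type = "raw" returns P(Type | Attribute) for each
#row, so for the Long rows P(Banana | Long) should match the
#7/16 = 0.4375 hand calculation above (a quick check, not required)
head(predict(model1, newdata = fruit, type = "raw"), 3)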
#Fit Logistic Model
#glm() models the probability of the second factor level,
#so the fitted values here are P(Type = "Other")
model2 = glm(Type ~ Attribute, family = "binomial", data = fruit)
model2_predict = predict(model2, newdata = fruit, type = "response")
model2_predict = ifelse(model2_predict > 0.5, "Other", "Banana")
model2_predict = as.factor(model2_predict)
table(model2_predict)
## model2_predict
## Banana Other
## 25 29
confusionMatrix(model2_predict, fruit$Type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Banana Other
## Banana 15 10
## Other 12 17
##
## Accuracy : 0.593
## 95% CI : (0.45, 0.724)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.110
##
## Kappa : 0.185
## Mcnemar's Test P-Value : 0.831
##
## Sensitivity : 0.556
## Specificity : 0.630
## Pos Pred Value : 0.600
## Neg Pred Value : 0.586
## Prevalence : 0.500
## Detection Rate : 0.278
## Detection Prevalence : 0.463
## Balanced Accuracy : 0.593
##
## 'Positive' Class : Banana
##
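#For interpretation, the exponentiated coefficients are odds
#ratios for "Other" vs "Banana", relative to the baseline
#attribute "Long" (a quick look, not part of the assignment):
exp(coef(model2))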
plot_model(model2)

plot_model(model2, type = "pred")
## $Attribute

#Create the ROC curves for the two models
#(colAUC() scores the hard class labels, coerced to numeric)
preds = cbind(Logit = model2_predict, Naive_Bayes = model1_predict)
colAUC(preds, fruit$Type, plotROC = TRUE)

##                  Logit Naive_Bayes
## Banana vs. Other 0.593       0.593
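#ROC curves are usually built from predicted probabilities rather
#than hard class labels. A sketch of the same comparison using
#P(Other) as the score for both models:
prob_nb    = predict(model1, newdata = fruit, type = "raw")[, "Other"]
prob_logit = predict(model2, newdata = fruit, type = "response")
colAUC(cbind(Logit = prob_logit, Naive_Bayes = prob_nb),
       fruit$Type, plotROC = TRUE)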
#PART II
# 3-CLASS CLASSIFICATION
library(nnet)
# Reload fruit so it has the 3 Types
fruit = expand.table(Fruit)
head(fruit,3)
## Var2 Var1
## 1 Banana Long
## 2 Banana Long
## 3 Banana Long
colnames(fruit) = c("Type", "Attribute")
#Multinomial Logistic Regression
model22 <- multinom(Type ~ Attribute, data = fruit)
## # weights: 12 (6 variable)
## initial value 59.325064
## iter 10 value 52.620445
## final value 52.620443
## converged
summary(model22)
## Call:
## multinom(formula = Type ~ Attribute, data = fruit)
##
## Coefficients:
## (Intercept) AttributeSweet AttributeYellow
## Orange -1.25275705 -0.357 0.7419322
## Other -0.00000178 -0.762 0.0000051
##
## Std. Errors:
## (Intercept) AttributeSweet AttributeYellow
## Orange 0.802 1.021 1.085
## Other 0.535 0.704 0.828
##
## Residual Deviance: 105
## AIC: 117
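#The coefficients are log-odds of each class relative to the
#baseline level "Banana"; exponentiating gives relative risk
#ratios (a quick look, not part of the assignment):
exp(coef(model22))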
model22_predict = predict(model22, newdata = fruit, type = "class")
table(model22_predict)
## model22_predict
## Banana Orange Other
## 38 0 16
confusionMatrix(model22_predict, fruit$Type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Banana Orange Other
## Banana 22 5 11
## Orange 0 0 0
## Other 5 3 8
##
## Overall Statistics
##
## Accuracy : 0.556
## 95% CI : (0.414, 0.691)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.2483
##
## Kappa : 0.183
## Mcnemar's Test P-Value : 0.0166
##
## Statistics by Class:
##
## Class: Banana Class: Orange Class: Other
## Sensitivity 0.815 0.000 0.421
## Specificity 0.407 1.000 0.771
## Pos Pred Value 0.579 NaN 0.500
## Neg Pred Value 0.688 0.852 0.711
## Prevalence 0.500 0.148 0.352
## Detection Rate 0.407 0.000 0.148
## Detection Prevalence 0.704 0.000 0.296
## Balanced Accuracy 0.611 0.500 0.596
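#Why is Orange never predicted? With a single 3-level predictor
#the model can produce only three distinct fitted probability
#rows (one per attribute), and Orange never has the largest
#probability in any of them (a quick check):
unique(predict(model22, newdata = fruit, type = "probs"))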
#Naive Bayes
model12 = naiveBayes(Type ~ Attribute, data = fruit)
model12_predict = predict(model12, newdata = fruit, type = "class")
table(model12_predict)
## model12_predict
## Banana Orange Other
## 54 0 0
confusionMatrix(model12_predict, fruit$Type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Banana Orange Other
## Banana 27 8 19
## Orange 0 0 0
## Other 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.5
## 95% CI : (0.361, 0.639)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.554
##
## Kappa : 0
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Banana Class: Orange Class: Other
## Sensitivity 1.0 0.000 0.000
## Specificity 0.0 1.000 1.000
## Pos Pred Value 0.5 NaN NaN
## Neg Pred Value NaN 0.852 0.648
## Prevalence 0.5 0.148 0.352
## Detection Rate 0.5 0.000 0.000
## Detection Prevalence 1.0 0.000 0.000
## Balanced Accuracy 0.5 0.500 0.500
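#Why does Naive Bayes label everything Banana? Its fitted priors
#and conditional probability tables show that Banana's posterior
#is never strictly beaten for any attribute (a quick check):
model12$apriori  # class distribution (priors proportional to 27, 8, 19)
model12$tables   # P(Attribute | Type) for each class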
#Create the ROC curves for the two 3-class models
#(pairwise AUCs from the hard class labels)
preds2 = cbind(Logit = model22_predict, Naive_Bayes = model12_predict)
colAUC(preds2, fruit$Type, plotROC = TRUE)

##                   Logit Naive_Bayes
## Banana vs. Orange 0.595         0.5
## Banana vs. Other  0.618         0.5
## Orange vs. Other  0.523         0.5
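#Brief interpretation: the multinomial logit separates the classes
#only slightly better than chance (pairwise AUCs of roughly
#0.52-0.62), while Naive Bayes collapses to the majority class and
#is uninformative here (AUC 0.5 for every pair). With a single
#3-level attribute there is very little signal for telling three
#fruit types apart.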
#========================================================================#