#NAIVE BAYES PROJECT

  # The Fruit Detector
  # Below are data on several types of fruit parsed by 3 attributes:
  # Long, Sweet, Yellow

  #PART I                             #############################
  # Fit a Naive Bayes and Logistic Regression classification model
  # And display ROC curves to establish which model performs better;
  # Also, calculate the respective confusion matrices

  # Combine or conflate Fruit Other and Fruit Orange into one
  # factor called "Other." This reduces the exercise to fitting
  # a two-class model, as we did in class.

  # Use the entire data set for training and testing.
  # Provide graphical displays of Type of Fruit by its Attributes

  #PART II                           #############################
  #Fit a Naive Bayes model and a MULTINOMIAL Logistic Regression 
  #classification model on the fruit data to classify fruit.
  #This time - however, do not conflate the factors.
  #In other words, fit a 3-class model using Naive Bayes and
  # Multinomial Logistic Regression.  For this use the command
  # multinom() in the package "nnet."
  # Calculate the ROC graph and the confusion matrices.
  # Interpret the results (briefly).

  #################################################################
  # Session setup: working directory, display options, clean workspace,
  # and package loading.
  # NOTE(review): setwd() with an absolute user-specific path and
  # remove(list = ls()) make the script non-portable and destructive to
  # an interactive session; an RStudio project or here::here() would be
  # preferable -- left as-is to preserve the recorded run.
  setwd("C:/Users/arodriguez/Dropbox/classes/DataMining/NaiveBayes")
  # digits = 3 keeps printed tables compact; large scipen suppresses
  # scientific notation in the printed coefficients/proportions.
  options(digits = 3, scipen = 99999)    
  remove(list = ls())
  
          # tidyverse: dplyr/tidyr/forcats used below for grouping,
          # spreading, and factor collapsing.
          library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0     v purrr   0.2.5
## v tibble  2.0.1     v dplyr   0.7.8
## v tidyr   0.8.2     v stringr 1.3.1
## v readr   1.3.1     v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
          library(e1071)       # naiveBayes()
          library(epitools)    # expand.table()
          library(DescTools)
          library(sjPlot)      # sjp.grpfrq(), sjp.frq(), plot_model()
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
          library(caret)       # confusionMatrix()
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
## The following object is masked from 'package:purrr':
## 
##     lift
          library(caTools)     # colAUC() for ROC curves
          #########################################################
  
        # Build the 3x3 count matrix: rows = fruit type, columns = how
        # many of that type are Long / Sweet / Yellow (names attached
        # below via dimnames).
        Fruit = matrix( 
            c(7, 15, 5, 
              2, 3, 3, 
              7, 7, 5), # the data elements 
            nrow=3,              # number of rows 
            ncol=3,              # number of columns 
            byrow = TRUE)        # fill matrix by rows 
          
          Fruit
##      [,1] [,2] [,3]
## [1,]    7   15    5
## [2,]    2    3    3
## [3,]    7    7    5
              dimnames(Fruit ) = list( 
                c("Banana", "Orange", "Other"),         # row names 
                c("Long", "Sweet", "Yellow")) # column names 

    #Note that the prop.table command calculates
    #proportion of TOTAL fruit; NOT as a proportion of the
    # marginal totals
    # Counts with row/column sums appended.
    addmargins(Fruit)
##        Long Sweet Yellow Sum
## Banana    7    15      5  27
## Orange    2     3      3   8
## Other     7     7      5  19
## Sum      16    25     13  54
    # Each cell as a fraction of the grand total (54), not of its row.
    prop.table(Fruit)
##         Long  Sweet Yellow
## Banana 0.130 0.2778 0.0926
## Orange 0.037 0.0556 0.0556
## Other  0.130 0.1296 0.0926
    addmargins(prop.table(Fruit))
##         Long  Sweet Yellow   Sum
## Banana 0.130 0.2778 0.0926 0.500
## Orange 0.037 0.0556 0.0556 0.148
## Other  0.130 0.1296 0.0926 0.352
## Sum    0.296 0.4630 0.2407 1.000
#Use expand.table function in the 
#package epitools to expand the data
# expand.table() converts the contingency table into one row per
# observation (54 rows), suitable for model fitting.
fruit = expand.table(Fruit)
head(fruit,3)     
##     Var2 Var1
## 1 Banana Long
## 2 Banana Long
## 3 Banana Long
#Rename the variables
# Var2/Var1 from expand.table become the response (Type) and the
# single categorical predictor (Attribute).
colnames(fruit) = c("Type", "Attribute") 

      #Collapse Orange and Other into one Factor (called "Other");
      #Hint: use the command: fct_collapse()
      # Class counts before collapsing (3 levels).
      table(fruit$Type)
## 
## Banana Orange  Other 
##     27      8     19
      # forcats::fct_collapse merges Orange + Other into a single
      # "Other" level, giving the two-class problem for Part I.
      fruit$Type <- fct_collapse(fruit$Type,
                                 Banana = c("Banana"),
                                 Other = c("Orange", "Other")
      )

        # After collapsing: a balanced 27 / 27 two-class problem.
        table(fruit$Type, fruit$Attribute)
##         
##          Long Sweet Yellow
##   Banana    7    15      5
##   Other     9    10      8
        head(fruit,3)
##     Type Attribute
## 1 Banana      Long
## 2 Banana      Long
## 3 Banana      Long
                      #Grouping Exercises
                      # Counts by Type via dplyr.
                      fruit %>%
                        group_by(Type) %>%
                        summarize(n = n())
## # A tibble: 2 x 2
##   Type       n
##   <fct>  <int>
## 1 Banana    27
## 2 Other     27
                      # Counts by Attribute.
                      fruit %>%
                        group_by(Attribute) %>%
                        summarize(n = n())
## # A tibble: 3 x 2
##   Attribute     n
##   <fct>     <int>
## 1 Long         16
## 2 Sweet        25
## 3 Yellow       13
            #Use this construction to obtain the marginals
            # Row-conditional proportions P(Attribute | Type), spread to
            # a wide Type-by-Attribute table.
            fruit %>%
              count(Type,Attribute) %>%
              group_by(Type) %>%
              mutate(prop = n / sum(n)) %>%
              select(-n) %>%
              spread(Attribute, prop, fill = 0)
## # A tibble: 2 x 4
## # Groups:   Type [2]
##   Type    Long Sweet Yellow
##   <fct>  <dbl> <dbl>  <dbl>
## 1 Banana 0.259 0.556  0.185
## 2 Other  0.333 0.370  0.296
                    #Again, note that the proportions are a function of the total
                    # not the row sums
                    fruit_t = table(fruit$Type,fruit$Attribute)
                    addmargins(fruit_t)
##         
##          Long Sweet Yellow Sum
##   Banana    7    15      5  27
##   Other     9    10      8  27
##   Sum      16    25     13  54
                    prop.table(fruit_t)
##         
##            Long  Sweet Yellow
##   Banana 0.1296 0.2778 0.0926
##   Other  0.1667 0.1852 0.1481
                    addmargins(prop.table(fruit_t))
##         
##            Long  Sweet Yellow    Sum
##   Banana 0.1296 0.2778 0.0926 0.5000
##   Other  0.1667 0.1852 0.1481 0.5000
##   Sum    0.2963 0.4630 0.2407 1.0000
        #Explore the commands in the package sjPlot for creating graphs of
        #the cross-tables
        # Grouped frequency plots of the cross-table, both orientations.
        sjp.grpfrq(fruit$Type,fruit$Attribute)

        sjp.grpfrq(fruit$Attribute, fruit$Type )

        # Simple frequency plot of the (collapsed) Type factor.
        sjp.frq(fruit$Type)

      #Fit Naive Bayes  
      # Two-class Naive Bayes of Type on the single categorical
      # predictor Attribute; per the assignment, the full data set is
      # used for both training and prediction.
      model1 = naiveBayes(Type ~ Attribute, data = fruit)
        # FIX: predict.naiveBayes takes `type = c("class", "raw")`; the
        # original call used `data = "class"`, which is not an argument
        # of predict() and was silently swallowed by `...`. It only
        # appeared to work because "class" is the default type.
        model1_predict = predict(model1, newdata = fruit, type = "class")
          # Distribution of the predicted labels.
          table(model1_predict)
## model1_predict
## Banana  Other 
##     25     29
            # Confusion matrix: predicted vs. reference classes.
            confusionMatrix(model1_predict, fruit$Type)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Banana Other
##     Banana     15    10
##     Other      12    17
##                                        
##                Accuracy : 0.593        
##                  95% CI : (0.45, 0.724)
##     No Information Rate : 0.5          
##     P-Value [Acc > NIR] : 0.110        
##                                        
##                   Kappa : 0.185        
##  Mcnemar's Test P-Value : 0.831        
##                                        
##             Sensitivity : 0.556        
##             Specificity : 0.630        
##          Pos Pred Value : 0.600        
##          Neg Pred Value : 0.586        
##              Prevalence : 0.500        
##          Detection Rate : 0.278        
##    Detection Prevalence : 0.463        
##       Balanced Accuracy : 0.593        
##                                        
##        'Positive' Class : Banana       
## 
            #Fit Logistic Model
            # Binomial GLM: with factor levels (Banana, Other), glm()
            # models P(Type = "Other" | Attribute).
            model2 = glm(Type ~ Attribute, family = "binomial", data = fruit)
          # FIX: spell out type = "response" instead of relying on the
          # partial match of "resp" (fitted probabilities on the
          # response scale).
          model2_predict = predict(model2, newdata = fruit, type = "response")
        # Threshold the fitted probabilities at 0.5 to get hard labels.
        model2_predict = ifelse(model2_predict > 0.5, "Other", "Banana")
      # FIX: declare both levels explicitly (same levels as before in
      # the normal case) so confusionMatrix() still works even if every
      # prediction falls into a single class.
      model2_predict = factor(model2_predict, levels = c("Banana", "Other"))

          table(model2_predict)
## model2_predict
## Banana  Other 
##     25     29
          # Identical confusion matrix to the Naive Bayes model -- with
          # one categorical predictor the two classifiers induce the
          # same decision rule here.
          confusionMatrix(model2_predict, fruit$Type)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Banana Other
##     Banana     15    10
##     Other      12    17
##                                        
##                Accuracy : 0.593        
##                  95% CI : (0.45, 0.724)
##     No Information Rate : 0.5          
##     P-Value [Acc > NIR] : 0.110        
##                                        
##                   Kappa : 0.185        
##  Mcnemar's Test P-Value : 0.831        
##                                        
##             Sensitivity : 0.556        
##             Specificity : 0.630        
##          Pos Pred Value : 0.600        
##          Neg Pred Value : 0.586        
##              Prevalence : 0.500        
##          Detection Rate : 0.278        
##    Detection Prevalence : 0.463        
##       Balanced Accuracy : 0.593        
##                                        
##        'Positive' Class : Banana       
## 
          # Coefficient (odds-ratio) plot and marginal-effect plot.
          plot_model(model2)

          plot_model(model2, type = "pred")
## $Attribute
## $Attribute

#Create the ROC curve for the two models
# NOTE(review): cbind() coerces the factor predictions to their integer
# codes, so colAUC() is scoring hard class labels (1/2), not predicted
# probabilities -- the "ROC" is effectively a single operating point.
# A proper curve would use predict(model1, fruit, type = "raw")[, 2]
# and predict(model2, type = "response") instead; confirm before
# interpreting the AUCs. The third (unnamed) column is fruit$Type
# itself, which is why its AUC column is a constant 1.
preds = cbind(Logit = model2_predict, Naive_Bayes = model1_predict, fruit$Type)
colAUC(preds, fruit$Type, plotROC = TRUE)

##                  Logit Naive_Bayes  
## Banana vs. Other 0.593       0.593 1
    #PART II
    # 3-CLASS CLASSIFICATION

          library(nnet)   # multinom() for multinomial logistic regression
          
          # Reload fruit so it has the 3 Types
          # (Part I collapsed Orange into Other; re-expand the original
          # 3-row count matrix to restore all three classes.)
          fruit = expand.table(Fruit)
          head(fruit,3)     
##     Var2 Var1
## 1 Banana Long
## 2 Banana Long
## 3 Banana Long
          colnames(fruit) = c("Type", "Attribute")
          
          #Multinomial Logistic Regression
          # Baseline category is the first factor level (Banana);
          # coefficients below are log-odds of Orange/Other vs. Banana.
          model22 <- multinom(Type ~ Attribute, data = fruit)
## # weights:  12 (6 variable)
## initial  value 59.325064 
## iter  10 value 52.620445
## final  value 52.620443 
## converged
          summary(model22)
## Call:
## multinom(formula = Type ~ Attribute, data = fruit)
## 
## Coefficients:
##        (Intercept) AttributeSweet AttributeYellow
## Orange -1.25275705         -0.357       0.7419322
## Other  -0.00000178         -0.762       0.0000051
## 
## Std. Errors:
##        (Intercept) AttributeSweet AttributeYellow
## Orange       0.802          1.021           1.085
## Other        0.535          0.704           0.828
## 
## Residual Deviance: 105 
## AIC: 117
          # Hard class predictions on the training data.
          model22_predict = predict(model22, newdata = fruit, type = "class")
          table(model22_predict)
## model22_predict
## Banana Orange  Other 
##     38      0     16
          # Note: Orange (the smallest class, n = 8) is never predicted.
          confusionMatrix(model22_predict, fruit$Type)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Banana Orange Other
##     Banana     22      5    11
##     Orange      0      0     0
##     Other       5      3     8
## 
## Overall Statistics
##                                         
##                Accuracy : 0.556         
##                  95% CI : (0.414, 0.691)
##     No Information Rate : 0.5           
##     P-Value [Acc > NIR] : 0.2483        
##                                         
##                   Kappa : 0.183         
##  Mcnemar's Test P-Value : 0.0166        
## 
## Statistics by Class:
## 
##                      Class: Banana Class: Orange Class: Other
## Sensitivity                  0.815         0.000        0.421
## Specificity                  0.407         1.000        0.771
## Pos Pred Value               0.579           NaN        0.500
## Neg Pred Value               0.688         0.852        0.711
## Prevalence                   0.500         0.148        0.352
## Detection Rate               0.407         0.000        0.148
## Detection Prevalence         0.704         0.000        0.296
## Balanced Accuracy            0.611         0.500        0.596
          #Naive Bayes
          # 3-class Naive Bayes on the unconflated fruit data.
          model12 = naiveBayes(Type ~ Attribute, data = fruit)
          # FIX: request class labels with `type = "class"`; the
          # original `data = "class"` is not a predict() argument and
          # was silently swallowed by `...` (it only worked because
          # "class" is the default type).
          model12_predict = predict(model12, newdata = fruit, type = "class")
          table(model12_predict)
## model12_predict
## Banana Orange  Other 
##     54      0      0
          # The model predicts the majority class (Banana) everywhere:
          # accuracy equals the no-information rate and Kappa is 0.
          confusionMatrix(model12_predict, fruit$Type)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Banana Orange Other
##     Banana     27      8    19
##     Orange      0      0     0
##     Other       0      0     0
## 
## Overall Statistics
##                                         
##                Accuracy : 0.5           
##                  95% CI : (0.361, 0.639)
##     No Information Rate : 0.5           
##     P-Value [Acc > NIR] : 0.554         
##                                         
##                   Kappa : 0             
##  Mcnemar's Test P-Value : NA            
## 
## Statistics by Class:
## 
##                      Class: Banana Class: Orange Class: Other
## Sensitivity                    1.0         0.000        0.000
## Specificity                    0.0         1.000        1.000
## Pos Pred Value                 0.5           NaN          NaN
## Neg Pred Value                 NaN         0.852        0.648
## Prevalence                     0.5         0.148        0.352
## Detection Rate                 0.5         0.000        0.000
## Detection Prevalence           1.0         0.000        0.000
## Balanced Accuracy              0.5         0.500        0.500
          #Create the ROC curve for the two 3-Class models
          # NOTE(review): as in Part I, cbind() turns the factor
          # predictions into integer codes, so colAUC() scores hard
          # labels rather than class probabilities; the pairwise AUCs
          # below each reflect a single operating point. The third
          # (unnamed) column is the truth itself, hence the constant 1.
          preds2 = cbind(Logit = model22_predict, Naive_Bayes = model12_predict, fruit$Type)
          colAUC(preds2, fruit$Type, plotROC = TRUE)

##                   Logit Naive_Bayes  
## Banana vs. Orange 0.595         0.5 1
## Banana vs. Other  0.618         0.5 1
## Orange vs. Other  0.523         0.5 1
#========================================================================#