Prob. classifiers

rm(list=ls())
library(ggplot2)
library(learningmachine)
library(caret)
library(palmerpenguins)
library(mlbench)
library(skimr)
library(reshape2)
library(pROC)

1 - Using Classifier object

set.seed(43)
X <- as.matrix(iris[, 1:4])
# y <- factor(as.numeric(iris$Species))
y <- iris$Species

index_train <- base::sample.int(n = nrow(X),
                                 size = floor(0.8*nrow(X)),
                                 replace = FALSE)
X_train <- X[index_train, ]
y_train <- y[index_train]
X_test <- X[-index_train, ]
y_test <- y[-index_train]
dim(X_train)
## [1] 120   4
dim(X_test)
## [1] 30  4
obj <- learningmachine::Classifier$new(method = "ranger")
obj$get_type()
## [1] "classification"
obj$get_name()
## [1] "Classifier"
obj$set_B(10)
obj$set_level(95)

t0 <- proc.time()[3]
obj$fit(X_train, y_train, pi_method="kdesplitconformal")
cat("Elapsed: ", proc.time()[3] - t0, "s \n")
## Elapsed:  0.048 s
probs <- obj$predict_proba(X_test)
df <- reshape2::melt(probs$sims$setosa[1:3, ])
df$Var2 <- NULL 
colnames(df) <- c("individual", "prob_setosa")
df$individual <- as.factor(df$individual)
ggplot2::ggplot(df, aes(x=individual, y=prob_setosa)) + geom_boxplot() + coord_flip()

ggplot2::ggplot(df, aes(x=prob_setosa, fill=individual)) + geom_density(alpha=.3)

obj$summary(X_test, y=y_test, 
            class_name = "setosa",
            show_progress=FALSE)
## $Coverage_rate
## [1] 100
## 
## $ttests
##                   estimate       lower        upper      p-value signif
## Sepal.Length -0.0393312907 -0.06040518 -0.018257404 0.0006558023    ***
## Sepal.Width   0.0608232094  0.02515088  0.096495535 0.0015764747     **
## Petal.Length -0.0237062332 -0.04523012 -0.002182351 0.0320229769      *
## Petal.Width   0.0008608256 -0.04423245  0.045954101 0.9691234874       
## 
## $effects
## ── Data Summary ────────────────────────
##                            Values 
## Name                       effects
## Number of rows             30     
## Number of columns          4      
## _______________________           
## Column type frequency:            
##   numeric                  4      
## ________________________          
## Group variables            None   
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##   skim_variable      mean     sd      p0       p25      p50     p75   p100 hist 
## 1 Sepal.Length  -0.0393   0.0564 -0.214  -0.0727   -0.00980 0.00220 0.0139 ▁▁▃▃▇
## 2 Sepal.Width    0.0608   0.0955 -0.0584  0.000668  0.0424  0.0667  0.387  ▆▇▁▁▁
## 3 Petal.Length  -0.0237   0.0576 -0.164   0         0       0       0.0205 ▁▁▁▁▇
## 4 Petal.Width    0.000861 0.121  -0.249  -0.0198    0       0       0.423  ▁▇▁▁▁

2 - ranger classification

obj <- learningmachine::Classifier$new(method = "ranger")
obj$set_level(95)
obj$set_pi_method("bootsplitconformal")
t0 <- proc.time()[3]
obj$fit(X_train, y_train)
cat("Elapsed: ", proc.time()[3] - t0, "s \n")
## Elapsed:  0.036 s
probs <- obj$predict_proba(X_test)
df <- reshape2::melt(probs$sims$setosa[1:3, ])
df$Var2 <- NULL 
colnames(df) <- c("individual", "prob_setosa")
df$individual <- as.factor(df$individual)
ggplot2::ggplot(df, aes(x=individual, y=prob_setosa)) + geom_boxplot() + coord_flip()

ggplot2::ggplot(df, aes(x=prob_setosa, fill=individual)) + geom_density(alpha=.3)

obj$summary(X_test, y=y_test, 
            class_name = "setosa",
            show_progress=FALSE)
## $Coverage_rate
## [1] 100
## 
## $ttests
##                   estimate       lower        upper      p-value signif
## Sepal.Length -0.0393312907 -0.06040518 -0.018257404 0.0006558023    ***
## Sepal.Width   0.0608232094  0.02515088  0.096495535 0.0015764747     **
## Petal.Length -0.0237062332 -0.04523012 -0.002182351 0.0320229769      *
## Petal.Width   0.0008608256 -0.04423245  0.045954101 0.9691234874       
## 
## $effects
## ── Data Summary ────────────────────────
##                            Values 
## Name                       effects
## Number of rows             30     
## Number of columns          4      
## _______________________           
## Column type frequency:            
##   numeric                  4      
## ________________________          
## Group variables            None   
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##   skim_variable      mean     sd      p0       p25      p50     p75   p100 hist 
## 1 Sepal.Length  -0.0393   0.0564 -0.214  -0.0727   -0.00980 0.00220 0.0139 ▁▁▃▃▇
## 2 Sepal.Width    0.0608   0.0955 -0.0584  0.000668  0.0424  0.0667  0.387  ▆▇▁▁▁
## 3 Petal.Length  -0.0237   0.0576 -0.164   0         0       0       0.0205 ▁▁▁▁▇
## 4 Petal.Width    0.000861 0.121  -0.249  -0.0198    0       0       0.423  ▁▇▁▁▁

3 - extratrees classification

obj <- learningmachine::Classifier$new(method = "extratrees")
obj$set_level(95)
t0 <- proc.time()[3]
obj$fit(X_train, y_train, 
        pi_method = "bootsplitconformal")
cat("Elapsed: ", proc.time()[3] - t0, "s \n")
## Elapsed:  0.046 s
probs <- obj$predict_proba(X_test)
df <- reshape2::melt(probs$sims$virginica[1:3, ])
df$Var2 <- NULL 
colnames(df) <- c("individual", "prob_virginica")
df$individual <- as.factor(df$individual)
ggplot2::ggplot(df, aes(x=individual, y=prob_virginica)) + geom_boxplot() + coord_flip()

ggplot2::ggplot(df, aes(x=prob_virginica, fill=individual)) + geom_density(alpha=.3)

obj$summary(X_test, y=y_test, 
            class_name = "virginica",
            show_progress=FALSE)
## $Coverage_rate
## [1] 100
## 
## $ttests
##                  estimate       lower      upper      p-value signif
## Sepal.Length  0.036541255 -0.01605960 0.08914210 0.1660389146       
## Sepal.Width  -0.008665406 -0.08293934 0.06560852 0.8130836281       
## Petal.Length  0.367528646  0.17051921 0.56453808 0.0006587430    ***
## Petal.Width   0.657891051  0.31550619 1.00027591 0.0004837876    ***
## 
## $effects
## ── Data Summary ────────────────────────
##                            Values 
## Name                       effects
## Number of rows             30     
## Number of columns          4      
## _______________________           
## Column type frequency:            
##   numeric                  4      
## ________________________          
## Group variables            None   
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##   skim_variable     mean    sd       p0      p25      p50    p75  p100 hist 
## 1 Sepal.Length   0.0365  0.141 -0.247   -0.0246   0.00605 0.0720 0.571 ▁▇▂▁▁
## 2 Sepal.Width   -0.00867 0.199 -0.378   -0.0921  -0.0117  0.0154 0.629 ▂▇▁▁▁
## 3 Petal.Length   0.368   0.528 -0.00377  0.00117  0.0920  0.525  1.63  ▇▁▁▁▁
## 4 Petal.Width    0.658   0.917  0        0.0426   0.388   0.678  4.17  ▇▁▁▁▁

4 - Penguins dataset

library(palmerpenguins)
data(penguins)
penguins_ <- as.data.frame(palmerpenguins::penguins)

replacement <- median(penguins$bill_length_mm, na.rm = TRUE)
penguins_$bill_length_mm[is.na(penguins$bill_length_mm)] <- replacement

replacement <- median(penguins$bill_depth_mm, na.rm = TRUE)
penguins_$bill_depth_mm[is.na(penguins$bill_depth_mm)] <- replacement

replacement <- median(penguins$flipper_length_mm, na.rm = TRUE)
penguins_$flipper_length_mm[is.na(penguins$flipper_length_mm)] <- replacement

replacement <- median(penguins$body_mass_g, na.rm = TRUE)
penguins_$body_mass_g[is.na(penguins$body_mass_g)] <- replacement

# replacing NA's by the most frequent occurence
penguins_$sex[is.na(penguins$sex)] <- "male" # most frequent

# one-hot encoding for covariates
penguins_mat <- model.matrix(species ~., data=penguins_)[,-1]
penguins_mat <- cbind.data.frame(penguins_$species, penguins_mat)
penguins_mat <- as.data.frame(penguins_mat)
colnames(penguins_mat)[1] <- "species"

y <- penguins_mat$species
X <- as.matrix(penguins_mat[,2:ncol(penguins_mat)])

n <- nrow(X)
p <- ncol(X)

set.seed(1234)
index_train <- sample(1:n, size=floor(0.8*n))
X_train <- X[index_train, ]
y_train <- factor(y[index_train])
X_test <- X[-index_train, ][1:5, ]
y_test <- factor(y[-index_train][1:5])
obj <- learningmachine::Classifier$new(method = "extratrees")
obj$set_pi_method("bootsplitconformal")
obj$set_level(95)
obj$set_B(10L)
t0 <- proc.time()[3]
obj$fit(X_train, y_train)
cat("Elapsed: ", proc.time()[3] - t0, "s \n")
## Elapsed:  0.067 s
probs <- obj$predict_proba(X_test)
df <- reshape2::melt(probs$sims[[1]][1:3, ])
df$Var2 <- NULL 
colnames(df) <- c("individual", "prob_Adelie")
df$individual <- as.factor(df$individual)
ggplot2::ggplot(df, aes(x=individual, y=prob_Adelie)) + geom_boxplot() + coord_flip()

ggplot2::ggplot(df, aes(x=prob_Adelie, fill=individual)) + geom_density(alpha=.3)

obj$summary(X_test, y=y_test, 
            class_name = "Adelie",
            show_progress=FALSE)
## $Coverage_rate
## [1] 100
## 
## $ttests
##                        estimate         lower         upper    p-value signif
## islandDream        0.000000e+00           NaN           NaN        NaN       
## islandTorgersen   -1.777179e-02 -3.460341e-02 -9.401734e-04 0.04274849      *
## bill_length_mm     5.783764e-03 -5.548285e-03  1.711581e-02 0.22942360       
## bill_depth_mm      3.417161e-03 -1.936433e-03  8.770756e-03 0.15105416       
## flipper_length_mm -4.789082e-04 -1.784421e-03  8.266042e-04 0.36603238       
## body_mass_g       -2.321786e-05 -5.784108e-05  1.140536e-05 0.13610928       
## sexmale           -8.000160e-03 -2.218600e-02  6.185680e-03 0.19245625       
## year              -3.342752e-05 -8.060785e-05  1.375281e-05 0.12056579       
## 
## $effects
## ── Data Summary ────────────────────────
##                            Values 
## Name                       effects
## Number of rows             5      
## Number of columns          8      
## _______________________           
## Column type frequency:            
##   numeric                  8      
## ________________________          
## Group variables            None   
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##   skim_variable           mean        sd         p0        p25         p50
## 1 islandDream        0         0          0          0          0         
## 2 islandTorgersen   -0.0178    0.0136    -0.0344    -0.0270    -0.0189    
## 3 bill_length_mm     0.00578   0.00913    0.000433   0.00101    0.00263   
## 4 bill_depth_mm      0.00342   0.00431   -0.00114    0.00175    0.00276   
## 5 flipper_length_mm -0.000479  0.00105   -0.00233   -0.000239  -0.0000588 
## 6 body_mass_g       -0.0000232 0.0000279 -0.0000604 -0.0000441 -0.00000946
## 7 sexmale           -0.00800   0.0114    -0.0224    -0.0185     0         
## 8 year              -0.0000334 0.0000380 -0.0000969 -0.0000349 -0.0000212 
##           p75        p100 hist 
## 1  0           0          ▁▁▇▁▁
## 2 -0.00589    -0.00261    ▃▃▃▁▇
## 3  0.00285     0.0220     ▇▁▁▁▂
## 4  0.00320     0.0105     ▂▇▁▁▂
## 5 -0.0000351   0.000270   ▂▁▁▁▇
## 6 -0.00000863  0.00000656 ▃▃▁▇▃
## 7  0           0.000837   ▅▁▁▁▇
## 8 -0.0000174   0.00000325 ▂▁▁▇▂