1 - Using Classifier
object
set.seed(43)
X <- as.matrix(iris[, 1:4])
# y <- factor(as.numeric(iris$Species))
y <- iris$Species
index_train <- base::sample.int(n = nrow(X),
size = floor(0.8*nrow(X)),
replace = FALSE)
X_train <- X[index_train, ]
y_train <- y[index_train]
X_test <- X[-index_train, ]
y_test <- y[-index_train]
dim(X_train)
## [1] 120 4
## [1] 30 4
obj <- learningmachine::Classifier$new(method = "ranger")
obj$get_type()
## [1] "classification"
## [1] "Classifier"
obj$set_B(10)
obj$set_level(95)
t0 <- proc.time()[3]
obj$fit(X_train, y_train, pi_method="kdesplitconformal")
cat("Elapsed: ", proc.time()[3] - t0, "s \n")
## Elapsed: 0.048 s
probs <- obj$predict_proba(X_test)
df <- reshape2::melt(probs$sims$setosa[1:3, ])
df$Var2 <- NULL
colnames(df) <- c("individual", "prob_setosa")
df$individual <- as.factor(df$individual)
ggplot2::ggplot(df, aes(x=individual, y=prob_setosa)) + geom_boxplot() + coord_flip()
ggplot2::ggplot(df, aes(x=prob_setosa, fill=individual)) + geom_density(alpha=.3)
obj$summary(X_test, y=y_test,
class_name = "setosa",
show_progress=FALSE)
## $Coverage_rate
## [1] 100
##
## $ttests
## estimate lower upper p-value signif
## Sepal.Length -0.0393312907 -0.06040518 -0.018257404 0.0006558023 ***
## Sepal.Width 0.0608232094 0.02515088 0.096495535 0.0015764747 **
## Petal.Length -0.0237062332 -0.04523012 -0.002182351 0.0320229769 *
## Petal.Width 0.0008608256 -0.04423245 0.045954101 0.9691234874
##
## $effects
## ── Data Summary ────────────────────────
## Values
## Name effects
## Number of rows 30
## Number of columns 4
## _______________________
## Column type frequency:
## numeric 4
## ________________________
## Group variables None
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable mean sd p0 p25 p50 p75 p100 hist
## 1 Sepal.Length -0.0393 0.0564 -0.214 -0.0727 -0.00980 0.00220 0.0139 ▁▁▃▃▇
## 2 Sepal.Width 0.0608 0.0955 -0.0584 0.000668 0.0424 0.0667 0.387 ▆▇▁▁▁
## 3 Petal.Length -0.0237 0.0576 -0.164 0 0 0 0.0205 ▁▁▁▁▇
## 4 Petal.Width 0.000861 0.121 -0.249 -0.0198 0 0 0.423 ▁▇▁▁▁
2 - ranger
classification
obj <- learningmachine::Classifier$new(method = "ranger")
obj$set_level(95)
obj$set_pi_method("bootsplitconformal")
t0 <- proc.time()[3]
obj$fit(X_train, y_train)
cat("Elapsed: ", proc.time()[3] - t0, "s \n")
## Elapsed: 0.036 s
probs <- obj$predict_proba(X_test)
df <- reshape2::melt(probs$sims$setosa[1:3, ])
df$Var2 <- NULL
colnames(df) <- c("individual", "prob_setosa")
df$individual <- as.factor(df$individual)
ggplot2::ggplot(df, aes(x=individual, y=prob_setosa)) + geom_boxplot() + coord_flip()
ggplot2::ggplot(df, aes(x=prob_setosa, fill=individual)) + geom_density(alpha=.3)
obj$summary(X_test, y=y_test,
class_name = "setosa",
show_progress=FALSE)
## $Coverage_rate
## [1] 100
##
## $ttests
## estimate lower upper p-value signif
## Sepal.Length -0.0393312907 -0.06040518 -0.018257404 0.0006558023 ***
## Sepal.Width 0.0608232094 0.02515088 0.096495535 0.0015764747 **
## Petal.Length -0.0237062332 -0.04523012 -0.002182351 0.0320229769 *
## Petal.Width 0.0008608256 -0.04423245 0.045954101 0.9691234874
##
## $effects
## ── Data Summary ────────────────────────
## Values
## Name effects
## Number of rows 30
## Number of columns 4
## _______________________
## Column type frequency:
## numeric 4
## ________________________
## Group variables None
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable mean sd p0 p25 p50 p75 p100 hist
## 1 Sepal.Length -0.0393 0.0564 -0.214 -0.0727 -0.00980 0.00220 0.0139 ▁▁▃▃▇
## 2 Sepal.Width 0.0608 0.0955 -0.0584 0.000668 0.0424 0.0667 0.387 ▆▇▁▁▁
## 3 Petal.Length -0.0237 0.0576 -0.164 0 0 0 0.0205 ▁▁▁▁▇
## 4 Petal.Width 0.000861 0.121 -0.249 -0.0198 0 0 0.423 ▁▇▁▁▁
4 - Penguins dataset
library(palmerpenguins)
data(penguins)
penguins_ <- as.data.frame(palmerpenguins::penguins)
replacement <- median(penguins$bill_length_mm, na.rm = TRUE)
penguins_$bill_length_mm[is.na(penguins$bill_length_mm)] <- replacement
replacement <- median(penguins$bill_depth_mm, na.rm = TRUE)
penguins_$bill_depth_mm[is.na(penguins$bill_depth_mm)] <- replacement
replacement <- median(penguins$flipper_length_mm, na.rm = TRUE)
penguins_$flipper_length_mm[is.na(penguins$flipper_length_mm)] <- replacement
replacement <- median(penguins$body_mass_g, na.rm = TRUE)
penguins_$body_mass_g[is.na(penguins$body_mass_g)] <- replacement
# replacing NA's by the most frequent occurence
penguins_$sex[is.na(penguins$sex)] <- "male" # most frequent
# one-hot encoding for covariates
penguins_mat <- model.matrix(species ~., data=penguins_)[,-1]
penguins_mat <- cbind.data.frame(penguins_$species, penguins_mat)
penguins_mat <- as.data.frame(penguins_mat)
colnames(penguins_mat)[1] <- "species"
y <- penguins_mat$species
X <- as.matrix(penguins_mat[,2:ncol(penguins_mat)])
n <- nrow(X)
p <- ncol(X)
set.seed(1234)
index_train <- sample(1:n, size=floor(0.8*n))
X_train <- X[index_train, ]
y_train <- factor(y[index_train])
X_test <- X[-index_train, ][1:5, ]
y_test <- factor(y[-index_train][1:5])
obj <- learningmachine::Classifier$new(method = "extratrees")
obj$set_pi_method("bootsplitconformal")
obj$set_level(95)
obj$set_B(10L)
t0 <- proc.time()[3]
obj$fit(X_train, y_train)
cat("Elapsed: ", proc.time()[3] - t0, "s \n")
## Elapsed: 0.067 s
probs <- obj$predict_proba(X_test)
df <- reshape2::melt(probs$sims[[1]][1:3, ])
df$Var2 <- NULL
colnames(df) <- c("individual", "prob_Adelie")
df$individual <- as.factor(df$individual)
ggplot2::ggplot(df, aes(x=individual, y=prob_Adelie)) + geom_boxplot() + coord_flip()
ggplot2::ggplot(df, aes(x=prob_Adelie, fill=individual)) + geom_density(alpha=.3)
obj$summary(X_test, y=y_test,
class_name = "Adelie",
show_progress=FALSE)
## $Coverage_rate
## [1] 100
##
## $ttests
## estimate lower upper p-value signif
## islandDream 0.000000e+00 NaN NaN NaN
## islandTorgersen -1.777179e-02 -3.460341e-02 -9.401734e-04 0.04274849 *
## bill_length_mm 5.783764e-03 -5.548285e-03 1.711581e-02 0.22942360
## bill_depth_mm 3.417161e-03 -1.936433e-03 8.770756e-03 0.15105416
## flipper_length_mm -4.789082e-04 -1.784421e-03 8.266042e-04 0.36603238
## body_mass_g -2.321786e-05 -5.784108e-05 1.140536e-05 0.13610928
## sexmale -8.000160e-03 -2.218600e-02 6.185680e-03 0.19245625
## year -3.342752e-05 -8.060785e-05 1.375281e-05 0.12056579
##
## $effects
## ── Data Summary ────────────────────────
## Values
## Name effects
## Number of rows 5
## Number of columns 8
## _______________________
## Column type frequency:
## numeric 8
## ________________________
## Group variables None
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable mean sd p0 p25 p50
## 1 islandDream 0 0 0 0 0
## 2 islandTorgersen -0.0178 0.0136 -0.0344 -0.0270 -0.0189
## 3 bill_length_mm 0.00578 0.00913 0.000433 0.00101 0.00263
## 4 bill_depth_mm 0.00342 0.00431 -0.00114 0.00175 0.00276
## 5 flipper_length_mm -0.000479 0.00105 -0.00233 -0.000239 -0.0000588
## 6 body_mass_g -0.0000232 0.0000279 -0.0000604 -0.0000441 -0.00000946
## 7 sexmale -0.00800 0.0114 -0.0224 -0.0185 0
## 8 year -0.0000334 0.0000380 -0.0000969 -0.0000349 -0.0000212
## p75 p100 hist
## 1 0 0 ▁▁▇▁▁
## 2 -0.00589 -0.00261 ▃▃▃▁▇
## 3 0.00285 0.0220 ▇▁▁▁▂
## 4 0.00320 0.0105 ▂▇▁▁▂
## 5 -0.0000351 0.000270 ▂▁▁▁▇
## 6 -0.00000863 0.00000656 ▃▃▁▇▃
## 7 0 0.000837 ▅▁▁▁▇
## 8 -0.0000174 0.00000325 ▂▁▁▇▂