--- title: "BCN with observations clustering" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{bcn-intro} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- **BCN with observations clustering** ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup} library(bcn) library(randomForest) library(pROC) ``` **Content** - [0 - Installing/Loading Packages](#installing-and-loading-packages) - [1 - iris dataset](#iris-dataset) - [2 - wine dataset](#wine-dataset) - [3 - Palmer Penguins dataset](#penguins-dataset) # 0 - Installing and loading packages Installing `bcn` From Github: ```R devtools::install_github("Techtonique/bcn") # Browse the bcn manual pages help(package = 'bcn') ``` Installing `bcn` from R universe: ```R # Enable repository from techtonique options(repos = c( techtonique = 'https://techtonique.r-universe.dev', CRAN = 'https://cloud.r-project.org')) # Download and install bcn in R install.packages('bcn') # Browse the bcn manual pages help(package = 'bcn') ```` Loading packages: ```{r} library(bcn) library(randomForest) library(pROC) ``` # 1 - iris dataset ```{r} data("iris") ``` ```{r} head(iris) dim(iris) set.seed(1234) train_idx <- sample(nrow(iris), 0.8 * nrow(iris)) X_train <- as.matrix(iris[train_idx, -ncol(iris)]) X_test <- as.matrix(iris[-train_idx, -ncol(iris)]) y_train <- iris$Species[train_idx] y_test <- iris$Species[-train_idx] ``` ```{r fit_iris} ptm <- proc.time() fit_obj <- bcn::bcn(x = X_train, y = y_train, B = 10L, nu = 0.335855, lam = 10**0.7837525, r = 1 - 10**(-5.470031), tol = 10**-7, n_clusters = 3L, activation = "tanh", type_optim = "nlminb", show_progress = FALSE) cat("Elapsed: ", (proc.time() - ptm)[3]) ``` ```{r fig.width=7.2} plot(fit_obj$errors_norm, type='l') ``` ```{r} preds <- predict(fit_obj, newx = X_test) # accuracy mean(preds == y_test) # confusion matrix table(y_test, preds) ``` ```{r} rf <- randomForest::randomForest(x = X_train, y = y_train) mean(predict(rf, newdata=as.matrix(X_test)) == y_test) ``` ```{r} print(head(predict(fit_obj, newx = X_test, type='probs'))) print(head(predict(rf, newdata=as.matrix(X_test), type='prob'))) ``` # 2- wine dataset ```{r} data(wine) ``` ```{r} head(wine) dim(wine) set.seed(1234) train_idx <- sample(nrow(wine), 0.8 * nrow(wine)) X_train <- as.matrix(wine[train_idx, -ncol(wine)]) X_test <- as.matrix(wine[-train_idx, -ncol(wine)]) y_train <- as.factor(wine$target[train_idx]) y_test <- as.factor(wine$target[-train_idx]) ``` ```{r fit_wine} ptm <- proc.time() fit_obj <- bcn::bcn(x = X_train, y = y_train, B = 6L, nu = 0.8715725, lam = 10**0.2143678, r = 1 - 10**(-6.1072786), tol = 10**-4.9605713, n_clusters = 3L, show_progress = FALSE) cat("Elapsed: ", (proc.time() - ptm)[3]) ``` ```{r fig.width=7.2} plot(fit_obj$errors_norm, type='l') ``` ```{r} preds <- predict(fit_obj, newx = X_test) # accuracy mean(preds == y_test) # confusion matrix table(y_test, preds) ``` ```{r} rf <- randomForest::randomForest(x = X_train, y = y_train) mean(predict(rf, newdata=as.matrix(X_test)) == y_test) ``` ```{r} print(head(predict(fit_obj, newx = X_test, type='probs'))) print(head(predict(rf, newdata=as.matrix(X_test), type='prob'))) ``` # 3 - Penguins dataset ```{r} data("penguins") ``` ```{r} penguins_ <- as.data.frame(penguins) replacement <- median(penguins$bill_length_mm, na.rm = TRUE) penguins_$bill_length_mm[is.na(penguins$bill_length_mm)] <- replacement replacement <- median(penguins$bill_depth_mm, na.rm = TRUE) penguins_$bill_depth_mm[is.na(penguins$bill_depth_mm)] <- replacement replacement <- median(penguins$flipper_length_mm, na.rm = TRUE) penguins_$flipper_length_mm[is.na(penguins$flipper_length_mm)] <- replacement replacement <- median(penguins$body_mass_g, na.rm = TRUE) penguins_$body_mass_g[is.na(penguins$body_mass_g)] <- replacement # replacing NA's by the most frequent occurence penguins_$sex[is.na(penguins$sex)] <- "male" # most frequent print(summary(penguins_)) print(sum(is.na(penguins_))) # one-hot encoding for covariates penguins_mat <- model.matrix(species ~., data=penguins_)[,-1] penguins_mat <- cbind(penguins_$species, penguins_mat) penguins_mat <- as.data.frame(penguins_mat) colnames(penguins_mat)[1] <- "species" print(head(penguins_mat)) print(tail(penguins_mat)) y <- as.integer(penguins_mat$species) X <- as.matrix(penguins_mat[,2:ncol(penguins_mat)]) n <- nrow(X) p <- ncol(X) set.seed(1234) index_train <- sample(1:n, size=floor(0.8*n)) X_train <- X[index_train, ] y_train <- factor(y[index_train]) X_test <- X[-index_train, ] y_test <- factor(y[-index_train]) ``` ```{r fit_penguins} ptm <- proc.time() fit_obj <- bcn::bcn(x = X_train, y = y_train, B = 23, nu = 0.470043, lam = 10**-0.05766029, r = 1 - 10**(-7.905866), tol = 10**-7, n_clusters = 3L, show_progress = FALSE) cat("Elapsed: ", (proc.time() - ptm)[3]) ``` ```{r fig.width=7.2} plot(fit_obj$errors_norm, type='l') ``` ```{r} preds <- predict(fit_obj, newx = X_test) # accuracy mean(preds == y_test) # confusion matrix table(y_test, preds) ``` ```{r} rf <- randomForest::randomForest(x = X_train, y = y_train) mean(predict(rf, newdata=as.matrix(X_test)) == y_test) ``` ```{r} print(head(predict(fit_obj, newx = X_test, type='probs'))) print(head(predict(rf, newdata=as.matrix(X_test), type='prob'))) ```