BCN with observations clustering
library(bcn)
library(randomForest)
#> randomForest 4.7-1.2
#> Type rfNews() to see new features/changes/bug fixes.
library(pROC)
#> Type 'citation("pROC")' for a citation.
#>
#> Attaching package: 'pROC'
#> The following objects are masked from 'package:stats':
#>
#> cov, smooth, var
Content
Installing bcn
From Github:
Installing bcn
from R universe:
# Enable repository from techtonique
options(repos = c(
techtonique = 'https://techtonique.r-universe.dev',
CRAN = 'https://cloud.r-project.org'))
# Download and install bcn in R
install.packages('bcn')
# Browse the bcn manual pages
help(package = 'bcn')
Loading packages:
head(iris)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3.0 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 4 4.6 3.1 1.5 0.2 setosa
#> 5 5.0 3.6 1.4 0.2 setosa
#> 6 5.4 3.9 1.7 0.4 setosa
dim(iris)
#> [1] 150 5
set.seed(1234)
train_idx <- sample(nrow(iris), 0.8 * nrow(iris))
X_train <- as.matrix(iris[train_idx, -ncol(iris)])
X_test <- as.matrix(iris[-train_idx, -ncol(iris)])
y_train <- iris$Species[train_idx]
y_test <- iris$Species[-train_idx]
ptm <- proc.time()
fit_obj <- bcn::bcn(x = X_train, y = y_train, B = 10L, nu = 0.335855,
lam = 10**0.7837525, r = 1 - 10**(-5.470031), tol = 10**-7,
n_clusters = 3L,
activation = "tanh", type_optim = "nlminb", show_progress = FALSE)
cat("Elapsed: ", (proc.time() - ptm)[3])
#> Elapsed: 0.128
preds <- predict(fit_obj, newx = X_test)
# accuracy
mean(preds == y_test)
#> [1] 0.9333333
# confusion matrix
table(y_test, preds)
#> preds
#> y_test setosa versicolor virginica
#> setosa 13 0 0
#> versicolor 0 8 0
#> virginica 0 2 7
rf <- randomForest::randomForest(x = X_train, y = y_train)
mean(predict(rf, newdata=as.matrix(X_test)) == y_test)
#> [1] 0.9333333
print(head(predict(fit_obj, newx = X_test, type='probs')))
#> setosa versicolor virginica
#> [1,] 0.4087696 0.3023203 0.2889101
#> [2,] 0.4098305 0.2994149 0.2907546
#> [3,] 0.4098215 0.2994409 0.2907376
#> [4,] 0.3990702 0.3286958 0.2722340
#> [5,] 0.4095235 0.3002482 0.2902283
#> [6,] 0.4046861 0.3135704 0.2817435
print(head(predict(rf, newdata=as.matrix(X_test), type='prob')))
#> setosa versicolor virginica
#> 1 1.000 0.000 0
#> 7 1.000 0.000 0
#> 12 1.000 0.000 0
#> 15 0.978 0.022 0
#> 18 1.000 0.000 0
#> 23 1.000 0.000 0
head(wine)
#> alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids
#> 1 14.23 1.71 2.43 15.6 127 2.80 3.06
#> 2 13.20 1.78 2.14 11.2 100 2.65 2.76
#> 3 13.16 2.36 2.67 18.6 101 2.80 3.24
#> 4 14.37 1.95 2.50 16.8 113 3.85 3.49
#> 5 13.24 2.59 2.87 21.0 118 2.80 2.69
#> 6 14.20 1.76 2.45 15.2 112 3.27 3.39
#> nonflavanoid_phenols proanthocyanins color_intensity hue
#> 1 0.28 2.29 5.64 1.04
#> 2 0.26 1.28 4.38 1.05
#> 3 0.30 2.81 5.68 1.03
#> 4 0.24 2.18 7.80 0.86
#> 5 0.39 1.82 4.32 1.04
#> 6 0.34 1.97 6.75 1.05
#> od280.od315_of_diluted_wines proline target
#> 1 3.92 1065 1
#> 2 3.40 1050 1
#> 3 3.17 1185 1
#> 4 3.45 1480 1
#> 5 2.93 735 1
#> 6 2.85 1450 1
dim(wine)
#> [1] 178 14
set.seed(1234)
train_idx <- sample(nrow(wine), 0.8 * nrow(wine))
X_train <- as.matrix(wine[train_idx, -ncol(wine)])
X_test <- as.matrix(wine[-train_idx, -ncol(wine)])
y_train <- as.factor(wine$target[train_idx])
y_test <- as.factor(wine$target[-train_idx])
ptm <- proc.time()
fit_obj <- bcn::bcn(x = X_train, y = y_train, B = 6L, nu = 0.8715725,
lam = 10**0.2143678, r = 1 - 10**(-6.1072786),
tol = 10**-4.9605713,
n_clusters = 3L,
show_progress = FALSE)
cat("Elapsed: ", (proc.time() - ptm)[3])
#> Elapsed: 0.24
preds <- predict(fit_obj, newx = X_test)
# accuracy
mean(preds == y_test)
#> [1] 0.8888889
# confusion matrix
table(y_test, preds)
#> preds
#> y_test 1 2 3
#> 1 14 0 0
#> 2 3 12 1
#> 3 0 0 6
rf <- randomForest::randomForest(x = X_train, y = y_train)
mean(predict(rf, newdata=as.matrix(X_test)) == y_test)
#> [1] 0.9722222
print(head(predict(fit_obj, newx = X_test, type='probs')))
#> 1 2 3
#> [1,] 0.4221447 0.2909523 0.2869030
#> [2,] 0.4183810 0.2897152 0.2919038
#> [3,] 0.4219856 0.2903089 0.2877055
#> [4,] 0.4220841 0.2904210 0.2874949
#> [5,] 0.4212313 0.2895900 0.2891787
#> [6,] 0.4197144 0.2871924 0.2930932
print(head(predict(rf, newdata=as.matrix(X_test), type='prob')))
#> 1 2 3
#> 1 0.998 0.002 0.000
#> 5 0.558 0.384 0.058
#> 12 0.802 0.172 0.026
#> 13 0.906 0.084 0.010
#> 18 0.914 0.066 0.020
#> 31 0.908 0.088 0.004
penguins_ <- as.data.frame(penguins)
replacement <- median(penguins$bill_length_mm, na.rm = TRUE)
penguins_$bill_length_mm[is.na(penguins$bill_length_mm)] <- replacement
replacement <- median(penguins$bill_depth_mm, na.rm = TRUE)
penguins_$bill_depth_mm[is.na(penguins$bill_depth_mm)] <- replacement
replacement <- median(penguins$flipper_length_mm, na.rm = TRUE)
penguins_$flipper_length_mm[is.na(penguins$flipper_length_mm)] <- replacement
replacement <- median(penguins$body_mass_g, na.rm = TRUE)
penguins_$body_mass_g[is.na(penguins$body_mass_g)] <- replacement
# replacing NA's by the most frequent occurence
penguins_$sex[is.na(penguins$sex)] <- "male" # most frequent
print(summary(penguins_))
#> species island bill_length_mm bill_depth_mm
#> Adelie :152 Biscoe :168 Min. :32.10 Min. :13.10
#> Chinstrap: 68 Dream :124 1st Qu.:39.27 1st Qu.:15.60
#> Gentoo :124 Torgersen: 52 Median :44.45 Median :17.30
#> Mean :43.92 Mean :17.15
#> 3rd Qu.:48.50 3rd Qu.:18.70
#> Max. :59.60 Max. :21.50
#> flipper_length_mm body_mass_g sex year
#> Min. :172.0 Min. :2700 female:165 Min. :2007
#> 1st Qu.:190.0 1st Qu.:3550 male :179 1st Qu.:2007
#> Median :197.0 Median :4050 Median :2008
#> Mean :200.9 Mean :4201 Mean :2008
#> 3rd Qu.:213.0 3rd Qu.:4750 3rd Qu.:2009
#> Max. :231.0 Max. :6300 Max. :2009
print(sum(is.na(penguins_)))
#> [1] 0
# one-hot encoding for covariates
penguins_mat <- model.matrix(species ~., data=penguins_)[,-1]
penguins_mat <- cbind(penguins_$species, penguins_mat)
penguins_mat <- as.data.frame(penguins_mat)
colnames(penguins_mat)[1] <- "species"
print(head(penguins_mat))
#> species islandDream islandTorgersen bill_length_mm bill_depth_mm
#> 1 1 0 1 39.10 18.7
#> 2 1 0 1 39.50 17.4
#> 3 1 0 1 40.30 18.0
#> 4 1 0 1 44.45 17.3
#> 5 1 0 1 36.70 19.3
#> 6 1 0 1 39.30 20.6
#> flipper_length_mm body_mass_g sexmale year
#> 1 181 3750 1 2007
#> 2 186 3800 0 2007
#> 3 195 3250 0 2007
#> 4 197 4050 1 2007
#> 5 193 3450 0 2007
#> 6 190 3650 1 2007
print(tail(penguins_mat))
#> species islandDream islandTorgersen bill_length_mm bill_depth_mm
#> 339 2 1 0 45.7 17.0
#> 340 2 1 0 55.8 19.8
#> 341 2 1 0 43.5 18.1
#> 342 2 1 0 49.6 18.2
#> 343 2 1 0 50.8 19.0
#> 344 2 1 0 50.2 18.7
#> flipper_length_mm body_mass_g sexmale year
#> 339 195 3650 0 2009
#> 340 207 4000 1 2009
#> 341 202 3400 0 2009
#> 342 193 3775 1 2009
#> 343 210 4100 1 2009
#> 344 198 3775 0 2009
y <- as.integer(penguins_mat$species)
X <- as.matrix(penguins_mat[,2:ncol(penguins_mat)])
n <- nrow(X)
p <- ncol(X)
set.seed(1234)
index_train <- sample(1:n, size=floor(0.8*n))
X_train <- X[index_train, ]
y_train <- factor(y[index_train])
X_test <- X[-index_train, ]
y_test <- factor(y[-index_train])
ptm <- proc.time()
fit_obj <- bcn::bcn(x = X_train, y = y_train, B = 23, nu = 0.470043,
lam = 10**-0.05766029, r = 1 - 10**(-7.905866), tol = 10**-7,
n_clusters = 3L,
show_progress = FALSE)
cat("Elapsed: ", (proc.time() - ptm)[3])
#> Elapsed: 0.34
preds <- predict(fit_obj, newx = X_test)
# accuracy
mean(preds == y_test)
#> [1] 1
# confusion matrix
table(y_test, preds)
#> preds
#> y_test 1 2 3
#> 1 24 0 0
#> 2 0 13 0
#> 3 0 0 32
rf <- randomForest::randomForest(x = X_train, y = y_train)
mean(predict(rf, newdata=as.matrix(X_test)) == y_test)
#> [1] 1
print(head(predict(fit_obj, newx = X_test, type='probs')))
#> 1 2 3
#> [1,] 0.4331285 0.2797813 0.2870902
#> [2,] 0.4002902 0.3222456 0.2774642
#> [3,] 0.4304068 0.2809500 0.2886432
#> [4,] 0.4274011 0.2855262 0.2870728
#> [5,] 0.4276808 0.2864688 0.2858505
#> [6,] 0.4197694 0.2955128 0.2847178
print(head(predict(rf, newdata=as.matrix(X_test), type='prob')))
#> 1 2 3
#> 1 0.998 0.002 0.000
#> 3 0.992 0.006 0.002
#> 8 0.974 0.006 0.020
#> 11 0.996 0.004 0.000
#> 16 0.998 0.002 0.000
#> 18 0.950 0.032 0.018