--- title: "conformalize" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{conformalize} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` ```{r, message=FALSE} library(misc) ``` ## Example: Conformal Prediction with Out-of-Sample Coverage In this example, we demonstrate how to use the `conformalize` function to perform conformal prediction and calculate the out-of-sample coverage rate. ### Simulated Data We will generate a simple dataset for demonstration purposes. ```{r simulate-data, eval=TRUE} set.seed(123) n <- 200 x <- matrix(runif(n * 2), ncol = 2) y <- 3 * x[, 1] + 2 * x[, 2] + rnorm(n, sd = 0.5) data <- data.frame(x1 = x[, 1], x2 = x[, 2], y = y) ``` ### Fit Conformal Model We will use a linear model (`lm`) as the `fit_func` and its corresponding `predict` function as the `predict_func`. ```{r fit-conformal-model} library(stats) # Define fit and predict functions fit_func <- function(formula, data, ...) lm(formula, data = data, ...) predict_func <- function(fit, newdata, ...) predict(fit, newdata = newdata, ...) # Apply conformalize conformal_model <- misc::conformalize( formula = y ~ x1 + x2, data = data, fit_func = fit_func, predict_func = predict_func, split_ratio = 0.8, seed = 123 ) ``` ### Generate Predictions and Prediction Intervals We will use the `predict.conformalize` method to generate predictions and calculate prediction intervals. ```{r predict-intervals} # New data for prediction new_data <- data.frame(x1 = runif(50), x2 = runif(50)) # Predict with split conformal method predictions <- predict( conformal_model, newdata = new_data, level = 0.95, method = "split" ) head(predictions) ``` ### Calculate Out-of-Sample Coverage Rate The coverage rate is the proportion of true values that fall within the prediction intervals. ```{r calculate-coverage, eval=TRUE} # Simulate true values for the new data true_y <- 3 * new_data$x1 + 2 * new_data$x2 + rnorm(50, sd = 0.5) # Check if true values fall within the prediction intervals coverage <- mean(true_y >= predictions[, "lwr"] & true_y <= predictions[, "upr"]) cat("Out-of-sample coverage rate:", coverage) ``` ### Results - The prediction intervals are calculated using the split conformal method. - The out-of-sample coverage rate is displayed, which should be close to the specified confidence level (e.g., 0.95). ## Example: Conformal Prediction with the `MASS::Boston` Dataset In this example, we use the `MASS::Boston` dataset to demonstrate conformal prediction. ### Load the Data We will use the `MASS` package to access the `Boston` dataset. ```{r load-boston-data} library(MASS) # Load the Boston dataset data(Boston) # Inspect the dataset head(Boston) ``` ### Split the Data We will split the data into training and test sets to ensure they are disjoint. ```{r split-data-boston} set.seed(123) n <- nrow(Boston) train_indices <- sample(seq_len(n), size = floor(0.8 * n)) train_data <- Boston[train_indices, ] test_data <- Boston[-train_indices, ] ``` ### Fit Conformal Model 1 ```{r fit-conformal-boston} # Define fit and predict functions fit_func <- function(formula, data, ...) MASS::rlm(formula, data = data, ...) predict_func <- function(fit, newdata, ...) predict(fit, newdata, ...) # Apply conformalize using the training data conformal_model_boston <- misc::conformalize( formula = medv ~ ., data = train_data, fit_func = fit_func, predict_func = predict_func, seed = 123 ) ``` ### Generate Predictions and Prediction Intervals 1 We will use the `predict.conformalize` method to generate predictions and calculate prediction intervals for the test set. ```{r predict-intervals-boston} # Predict with split conformal method on the test data predictions_boston <- predict( conformal_model_boston, newdata = test_data, level = 0.95, method = "split" ) head(predictions_boston) ``` ### Calculate Out-of-Sample Coverage Rate 1 The coverage rate is the proportion of true values in the test set that fall within the prediction intervals. ```{r calculate-coverage-boston} # True values for the test set true_y_boston <- test_data$medv # Check if true values fall within the prediction intervals coverage_boston <- mean(true_y_boston >= predictions_boston[, "lwr"] & true_y_boston <= predictions_boston[, "upr"]) cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston) ``` ### Fit Conformal Model 2 ```{r fit-conformal-boston2} # Define fit and predict functions fit_func <- function(formula, data, ...) stats::glm(formula, data = data, ...) predict_func <- function(fit, newdata, ...) predict(fit, newdata, ...) # Apply conformalize using the training data conformal_model_boston <- misc::conformalize( formula = medv ~ ., data = train_data, fit_func = fit_func, predict_func = predict_func, seed = 123 ) ``` ### Generate Predictions and Prediction Intervals 2 We will use the `predict.conformalize` method to generate predictions and calculate prediction intervals for the test set. ```{r predict-intervals-boston2} # Predict with split conformal method on the test data predictions_boston <- predict( conformal_model_boston, newdata = test_data, level = 0.95, method = "split" ) head(predictions_boston) ``` ```{r predict-intervals-boston3} # Predict with split conformal method on the test data predictions_boston2 <- predict( conformal_model_boston, newdata = test_data, level = 0.95, method = "kde" ) head(predictions_boston2) ``` ```{r predict-intervals-boston4} # Predict with split conformal method on the test data predictions_boston3 <- predict( conformal_model_boston, newdata = test_data, level = 0.95, method = "surrogate" ) head(predictions_boston3) ``` ```{r predict-intervals-boston5} # Predict with split conformal method on the test data predictions_boston4 <- predict( conformal_model_boston, newdata = test_data, level = 0.95, method = "bootstrap" ) head(predictions_boston4) ``` ### Fit Conformal Model 2 ```{r fit-conformal-boston3} # Define fit and predict functions fit_func <- function(formula, data, ...) ranger::ranger(formula, data = data) predict_func <- function(fit, newdata, ...) predict(fit, newdata)$predictions # Apply conformalize using the training data conformal_model_boston_rf <- misc::conformalize( formula = medv ~ ., data = train_data, fit_func = fit_func, predict_func = predict_func, seed = 123 ) # Predict with split conformal method on the test data predictions_boston_rf <- predict( conformal_model_boston_rf, newdata = test_data, predict_func = predict_func, level = 0.95, method = "kde" ) head(predictions_boston_rf) ``` ```{r plot-prediction-intervals, echo=FALSE, message=FALSE} # Create a data frame for plotting plot_data <- data.frame( Observation = seq_len(nrow(test_data)), TrueValue = test_data$medv, LowerBound = predictions_boston_rf[, "lwr"], UpperBound = predictions_boston_rf[, "upr"] ) ``` ```{r plot-prediction-intervals-base, echo=FALSE, message=FALSE, fig.width=7.5} # Sort data by observation for proper plotting plot_data <- plot_data[order(plot_data$Observation), ] # Plot the true values plot( plot_data$Observation, plot_data$TrueValue, pch = 16, col = "blue", cex = 0.7, xlab = "Observation", ylab = "Value", main = "Prediction Intervals vs True Values" ) # Add the prediction intervals using polygon polygon( c(plot_data$Observation, rev(plot_data$Observation)), c(plot_data$LowerBound, rev(plot_data$UpperBound)), col = rgb(1, 0, 0, 0.2), border = NA ) # Add points for true values again to overlay on the polygon points( plot_data$Observation, plot_data$TrueValue, pch = 16, col = "blue", cex = 0.7 ) ``` ### Calculate Out-of-Sample Coverage Rate 2 The coverage rate is the proportion of true values in the test set that fall within the prediction intervals. ```{r calculate-coverage-boston2} # True values for the test set true_y_boston <- test_data$medv # Check if true values fall within the prediction intervals coverage_boston <- mean(true_y_boston >= predictions_boston[, "lwr"] & true_y_boston <= predictions_boston[, "upr"]) cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston) ``` ```{r calculate-coverage-boston3} # True values for the test set true_y_boston <- test_data$medv # Check if true values fall within the prediction intervals coverage_boston <- mean(true_y_boston >= predictions_boston2[, "lwr"] & true_y_boston <= predictions_boston2[, "upr"]) cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston) ``` ```{r calculate-coverage-boston4} # True values for the test set true_y_boston <- test_data$medv # Check if true values fall within the prediction intervals coverage_boston <- mean(true_y_boston >= predictions_boston3[, "lwr"] & true_y_boston <= predictions_boston3[, "upr"]) cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston) ``` ```{r calculate-coverage-boston5} # True values for the test set true_y_boston <- test_data$medv # Check if true values fall within the prediction intervals coverage_boston <- mean(true_y_boston >= predictions_boston4[, "lwr"] & true_y_boston <= predictions_boston4[, "upr"]) cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston) ``` ```{r calculate-coverage-boston6} # True values for the test set true_y_boston <- test_data$medv # Check if true values fall within the prediction intervals coverage_boston <- mean(true_y_boston >= predictions_boston_rf[, "lwr"] & true_y_boston <= predictions_boston_rf[, "upr"]) cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston) ``` ### Results - The prediction intervals are calculated using the split conformal method. - The out-of-sample coverage rate is displayed, which should be close to the specified confidence level (e.g., 0.95).