---
title: "conformalize"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{conformalize}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r, message=FALSE}
library(misc)
```
## Example: Conformal Prediction with Out-of-Sample Coverage
In this example, we demonstrate how to use the `conformalize` function to perform conformal prediction and calculate the out-of-sample coverage rate.
### Simulated Data
We will generate a simple dataset for demonstration purposes.
```{r simulate-data, eval=TRUE}
set.seed(123)
n <- 200
x <- matrix(runif(n * 2), ncol = 2)
y <- 3 * x[, 1] + 2 * x[, 2] + rnorm(n, sd = 0.5)
data <- data.frame(x1 = x[, 1], x2 = x[, 2], y = y)
```
### Fit Conformal Model
We will use a linear model (`lm`) as the `fit_func` and its corresponding `predict` function as the `predict_func`.
```{r fit-conformal-model}
library(stats)
# Define fit and predict functions
fit_func <- function(formula, data, ...) lm(formula, data = data, ...)
predict_func <- function(fit, newdata, ...) predict(fit, newdata = newdata, ...)
# Apply conformalize
conformal_model <- misc::conformalize(
formula = y ~ x1 + x2,
data = data,
fit_func = fit_func,
predict_func = predict_func,
split_ratio = 0.8,
seed = 123
)
```
### Generate Predictions and Prediction Intervals
We will use the `predict.conformalize` method to generate predictions and calculate prediction intervals.
```{r predict-intervals}
# New data for prediction
new_data <- data.frame(x1 = runif(50), x2 = runif(50))
# Predict with split conformal method
predictions <- predict(
conformal_model,
newdata = new_data,
level = 0.95,
method = "split"
)
head(predictions)
```
### Calculate Out-of-Sample Coverage Rate
The coverage rate is the proportion of true values that fall within the prediction intervals.
```{r calculate-coverage, eval=TRUE}
# Simulate true values for the new data
true_y <- 3 * new_data$x1 + 2 * new_data$x2 + rnorm(50, sd = 0.5)
# Check if true values fall within the prediction intervals
coverage <- mean(true_y >= predictions[, "lwr"] & true_y <= predictions[, "upr"])
cat("Out-of-sample coverage rate:", coverage)
```
### Results
- The prediction intervals are calculated using the split conformal method.
- The out-of-sample coverage rate is displayed, which should be close to the specified confidence level (e.g., 0.95).
## Example: Conformal Prediction with the `MASS::Boston` Dataset
In this example, we use the `MASS::Boston` dataset to demonstrate conformal prediction.
### Load the Data
We will use the `MASS` package to access the `Boston` dataset.
```{r load-boston-data}
library(MASS)
# Load the Boston dataset
data(Boston)
# Inspect the dataset
head(Boston)
```
### Split the Data
We will split the data into training and test sets to ensure they are disjoint.
```{r split-data-boston}
set.seed(123)
n <- nrow(Boston)
train_indices <- sample(seq_len(n), size = floor(0.8 * n))
train_data <- Boston[train_indices, ]
test_data <- Boston[-train_indices, ]
```
### Fit Conformal Model 1
```{r fit-conformal-boston}
# Define fit and predict functions
fit_func <- function(formula, data, ...) MASS::rlm(formula, data = data, ...)
predict_func <- function(fit, newdata, ...) predict(fit, newdata, ...)
# Apply conformalize using the training data
conformal_model_boston <- misc::conformalize(
formula = medv ~ .,
data = train_data,
fit_func = fit_func,
predict_func = predict_func,
seed = 123
)
```
### Generate Predictions and Prediction Intervals 1
We will use the `predict.conformalize` method to generate predictions and calculate prediction intervals for the test set.
```{r predict-intervals-boston}
# Predict with split conformal method on the test data
predictions_boston <- predict(
conformal_model_boston,
newdata = test_data,
level = 0.95,
method = "split"
)
head(predictions_boston)
```
### Calculate Out-of-Sample Coverage Rate 1
The coverage rate is the proportion of true values in the test set that fall within the prediction intervals.
```{r calculate-coverage-boston}
# True values for the test set
true_y_boston <- test_data$medv
# Check if true values fall within the prediction intervals
coverage_boston <- mean(true_y_boston >= predictions_boston[, "lwr"] & true_y_boston <= predictions_boston[, "upr"])
cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston)
```
### Fit Conformal Model 2
```{r fit-conformal-boston2}
# Define fit and predict functions
fit_func <- function(formula, data, ...) stats::glm(formula, data = data, ...)
predict_func <- function(fit, newdata, ...) predict(fit, newdata, ...)
# Apply conformalize using the training data
conformal_model_boston <- misc::conformalize(
formula = medv ~ .,
data = train_data,
fit_func = fit_func,
predict_func = predict_func,
seed = 123
)
```
### Generate Predictions and Prediction Intervals 2
We will use the `predict.conformalize` method to generate predictions and calculate prediction intervals for the test set.
```{r predict-intervals-boston2}
# Predict with split conformal method on the test data
predictions_boston <- predict(
conformal_model_boston,
newdata = test_data,
level = 0.95,
method = "split"
)
head(predictions_boston)
```
```{r predict-intervals-boston3}
# Predict with split conformal method on the test data
predictions_boston2 <- predict(
conformal_model_boston,
newdata = test_data,
level = 0.95,
method = "kde"
)
head(predictions_boston2)
```
```{r predict-intervals-boston4}
# Predict with split conformal method on the test data
predictions_boston3 <- predict(
conformal_model_boston,
newdata = test_data,
level = 0.95,
method = "surrogate"
)
head(predictions_boston3)
```
```{r predict-intervals-boston5}
# Predict with split conformal method on the test data
predictions_boston4 <- predict(
conformal_model_boston,
newdata = test_data,
level = 0.95,
method = "bootstrap"
)
head(predictions_boston4)
```
### Fit Conformal Model 2
```{r fit-conformal-boston3}
# Define fit and predict functions
fit_func <- function(formula, data, ...) ranger::ranger(formula, data = data)
predict_func <- function(fit, newdata, ...) predict(fit, newdata)$predictions
# Apply conformalize using the training data
conformal_model_boston_rf <- misc::conformalize(
formula = medv ~ .,
data = train_data,
fit_func = fit_func,
predict_func = predict_func,
seed = 123
)
# Predict with split conformal method on the test data
predictions_boston_rf <- predict(
conformal_model_boston_rf,
newdata = test_data,
predict_func = predict_func,
level = 0.95,
method = "kde"
)
head(predictions_boston_rf)
```
```{r plot-prediction-intervals, echo=FALSE, message=FALSE}
# Create a data frame for plotting
plot_data <- data.frame(
Observation = seq_len(nrow(test_data)),
TrueValue = test_data$medv,
LowerBound = predictions_boston_rf[, "lwr"],
UpperBound = predictions_boston_rf[, "upr"]
)
```
```{r plot-prediction-intervals-base, echo=FALSE, message=FALSE, fig.width=7.5}
# Sort data by observation for proper plotting
plot_data <- plot_data[order(plot_data$Observation), ]
# Plot the true values
plot(
plot_data$Observation, plot_data$TrueValue,
pch = 16, col = "blue", cex = 0.7,
xlab = "Observation", ylab = "Value",
main = "Prediction Intervals vs True Values"
)
# Add the prediction intervals using polygon
polygon(
c(plot_data$Observation, rev(plot_data$Observation)),
c(plot_data$LowerBound, rev(plot_data$UpperBound)),
col = rgb(1, 0, 0, 0.2), border = NA
)
# Add points for true values again to overlay on the polygon
points(
plot_data$Observation, plot_data$TrueValue,
pch = 16, col = "blue", cex = 0.7
)
```
### Calculate Out-of-Sample Coverage Rate 2
The coverage rate is the proportion of true values in the test set that fall within the prediction intervals.
```{r calculate-coverage-boston2}
# True values for the test set
true_y_boston <- test_data$medv
# Check if true values fall within the prediction intervals
coverage_boston <- mean(true_y_boston >= predictions_boston[, "lwr"] & true_y_boston <= predictions_boston[, "upr"])
cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston)
```
```{r calculate-coverage-boston3}
# True values for the test set
true_y_boston <- test_data$medv
# Check if true values fall within the prediction intervals
coverage_boston <- mean(true_y_boston >= predictions_boston2[, "lwr"] & true_y_boston <= predictions_boston2[, "upr"])
cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston)
```
```{r calculate-coverage-boston4}
# True values for the test set
true_y_boston <- test_data$medv
# Check if true values fall within the prediction intervals
coverage_boston <- mean(true_y_boston >= predictions_boston3[, "lwr"] & true_y_boston <= predictions_boston3[, "upr"])
cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston)
```
```{r calculate-coverage-boston5}
# True values for the test set
true_y_boston <- test_data$medv
# Check if true values fall within the prediction intervals
coverage_boston <- mean(true_y_boston >= predictions_boston4[, "lwr"] & true_y_boston <= predictions_boston4[, "upr"])
cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston)
```
```{r calculate-coverage-boston6}
# True values for the test set
true_y_boston <- test_data$medv
# Check if true values fall within the prediction intervals
coverage_boston <- mean(true_y_boston >= predictions_boston_rf[, "lwr"] & true_y_boston <= predictions_boston_rf[, "upr"])
cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston)
```
### Results
- The prediction intervals are calculated using the split conformal method.
- The out-of-sample coverage rate is displayed, which should be close to the specified confidence level (e.g., 0.95).