---
title: "conformalize"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{conformalize}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r, message=FALSE}
library(misc)
```

## Example: Conformal Prediction with Out-of-Sample Coverage

In this example, we demonstrate how to use the `conformalize` function to perform conformal prediction and calculate the out-of-sample coverage rate.

### Simulated Data

We will generate a simple dataset for demonstration purposes.

```{r simulate-data, eval=TRUE}
set.seed(123)
n <- 200
x <- matrix(runif(n * 2), ncol = 2)
y <- 3 * x[, 1] + 2 * x[, 2] + rnorm(n, sd = 0.5)
data <- data.frame(x1 = x[, 1], x2 = x[, 2], y = y)
```

### Fit Conformal Model

We will use a linear model (`lm`) as the `fit_func` and its corresponding `predict` function as the `predict_func`.

```{r fit-conformal-model}
library(stats)

# Define fit and predict functions
fit_func <- function(formula, data, ...) lm(formula, data = data, ...)
predict_func <- function(fit, newdata, ...) predict(fit, newdata = newdata, ...)

# Apply conformalize
conformal_model <- misc::conformalize(
  formula = y ~ x1 + x2,
  data = data,
  fit_func = fit_func,
  predict_func = predict_func,
  split_ratio = 0.8,
  seed = 123
)
```

### Generate Predictions and Prediction Intervals

We will use the `predict.conformalize` method to generate predictions and calculate prediction intervals.

```{r predict-intervals}
# New data for prediction
new_data <- data.frame(x1 = runif(50), x2 = runif(50))

# Predict with split conformal method
predictions <- predict(
  conformal_model,
  newdata = new_data,
  level = 0.95,
  method = "split"
)

head(predictions)
```

### Calculate Out-of-Sample Coverage Rate

The coverage rate is the proportion of true values that fall within the prediction intervals.

```{r calculate-coverage, eval=TRUE}
# Simulate true values for the new data
true_y <- 3 * new_data$x1 + 2 * new_data$x2 + rnorm(50, sd = 0.5)

# Check if true values fall within the prediction intervals
coverage <- mean(true_y >= predictions[, "lwr"] & true_y <= predictions[, "upr"])

cat("Out-of-sample coverage rate:", coverage)
```

### Results

- The prediction intervals are calculated using the split conformal method.
- The out-of-sample coverage rate is displayed, which should be close to the specified confidence level (e.g., 0.95).

## Example: Conformal Prediction with the `MASS::Boston` Dataset

In this example, we use the `MASS::Boston` dataset to demonstrate conformal prediction.

### Load the Data

We will use the `MASS` package to access the `Boston` dataset.

```{r load-boston-data}
library(MASS)

# Load the Boston dataset
data(Boston)

# Inspect the dataset
head(Boston)
```

### Split the Data

We will split the data into training and test sets to ensure they are disjoint.

```{r split-data-boston}
set.seed(123)
n <- nrow(Boston)
train_indices <- sample(seq_len(n), size = floor(0.8 * n))
train_data <- Boston[train_indices, ]
test_data <- Boston[-train_indices, ]
```

### Fit Conformal Model 1

```{r fit-conformal-boston}
# Define fit and predict functions
fit_func <- function(formula, data, ...) MASS::rlm(formula, data = data, ...)
predict_func <- function(fit, newdata, ...) predict(fit, newdata, ...)

# Apply conformalize using the training data
conformal_model_boston <- misc::conformalize(
  formula = medv ~ .,
  data = train_data,
  fit_func = fit_func,
  predict_func = predict_func,
  seed = 123
)
```

### Generate Predictions and Prediction Intervals 1

We will use the `predict.conformalize` method to generate predictions and calculate prediction intervals for the test set.

```{r predict-intervals-boston}
# Predict with split conformal method on the test data
predictions_boston <- predict(
  conformal_model_boston,
  newdata = test_data,
  level = 0.95,
  method = "split"
)

head(predictions_boston)
```

### Calculate Out-of-Sample Coverage Rate 1

The coverage rate is the proportion of true values in the test set that fall within the prediction intervals.

```{r calculate-coverage-boston}
# True values for the test set
true_y_boston <- test_data$medv

# Check if true values fall within the prediction intervals
coverage_boston <- mean(true_y_boston >= predictions_boston[, "lwr"] & true_y_boston <= predictions_boston[, "upr"])

cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston)
```


### Fit Conformal Model 2


```{r fit-conformal-boston2}
# Define fit and predict functions
fit_func <- function(formula, data, ...) stats::glm(formula, data = data, ...)
predict_func <- function(fit, newdata, ...) predict(fit, newdata, ...)

# Apply conformalize using the training data
conformal_model_boston <- misc::conformalize(
  formula = medv ~ .,
  data = train_data,
  fit_func = fit_func,
  predict_func = predict_func,
  seed = 123
)
```

### Generate Predictions and Prediction Intervals 2

We will use the `predict.conformalize` method to generate predictions and calculate prediction intervals for the test set.

```{r predict-intervals-boston2}
# Predict with split conformal method on the test data
predictions_boston <- predict(
  conformal_model_boston,
  newdata = test_data,
  level = 0.95,
  method = "split"
)

head(predictions_boston)
```

```{r predict-intervals-boston3}
# Predict with split conformal method on the test data
predictions_boston2 <- predict(
  conformal_model_boston,
  newdata = test_data,
  level = 0.95,
  method = "kde"
)

head(predictions_boston2)
```

```{r predict-intervals-boston4}
# Predict with split conformal method on the test data
predictions_boston3 <- predict(
  conformal_model_boston,
  newdata = test_data,
  level = 0.95,
  method = "surrogate"
)

head(predictions_boston3)
```

```{r predict-intervals-boston5}
# Predict with split conformal method on the test data
predictions_boston4 <- predict(
  conformal_model_boston,
  newdata = test_data,
  level = 0.95,
  method = "bootstrap"
)

head(predictions_boston4)
```

### Fit Conformal Model 2


```{r fit-conformal-boston3}
# Define fit and predict functions
fit_func <- function(formula, data, ...) ranger::ranger(formula, data = data)
predict_func <- function(fit, newdata, ...) predict(fit, newdata)$predictions

# Apply conformalize using the training data
conformal_model_boston_rf <- misc::conformalize(
  formula = medv ~ .,
  data = train_data,
  fit_func = fit_func,
  predict_func = predict_func,
  seed = 123
)

# Predict with split conformal method on the test data
predictions_boston_rf <- predict(
  conformal_model_boston_rf,
  newdata = test_data,
  predict_func = predict_func,
  level = 0.95,
  method = "kde"
)

head(predictions_boston_rf)
```

```{r plot-prediction-intervals, echo=FALSE, message=FALSE}
# Create a data frame for plotting
plot_data <- data.frame(
  Observation = seq_len(nrow(test_data)),
  TrueValue = test_data$medv,
  LowerBound = predictions_boston_rf[, "lwr"],
  UpperBound = predictions_boston_rf[, "upr"]
)
```

```{r plot-prediction-intervals-base, echo=FALSE, message=FALSE, fig.width=7.5}
# Sort data by observation for proper plotting
plot_data <- plot_data[order(plot_data$Observation), ]

# Plot the true values
plot(
  plot_data$Observation, plot_data$TrueValue,
  pch = 16, col = "blue", cex = 0.7,
  xlab = "Observation", ylab = "Value",
  main = "Prediction Intervals vs True Values"
)

# Add the prediction intervals using polygon
polygon(
  c(plot_data$Observation, rev(plot_data$Observation)),
  c(plot_data$LowerBound, rev(plot_data$UpperBound)),
  col = rgb(1, 0, 0, 0.2), border = NA
)

# Add points for true values again to overlay on the polygon
points(
  plot_data$Observation, plot_data$TrueValue,
  pch = 16, col = "blue", cex = 0.7
)
```

### Calculate Out-of-Sample Coverage Rate 2

The coverage rate is the proportion of true values in the test set that fall within the prediction intervals.

```{r calculate-coverage-boston2}
# True values for the test set
true_y_boston <- test_data$medv

# Check if true values fall within the prediction intervals
coverage_boston <- mean(true_y_boston >= predictions_boston[, "lwr"] & true_y_boston <= predictions_boston[, "upr"])

cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston)
```

```{r calculate-coverage-boston3}
# True values for the test set
true_y_boston <- test_data$medv

# Check if true values fall within the prediction intervals
coverage_boston <- mean(true_y_boston >= predictions_boston2[, "lwr"] & true_y_boston <= predictions_boston2[, "upr"])

cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston)
```

```{r calculate-coverage-boston4}
# True values for the test set
true_y_boston <- test_data$medv

# Check if true values fall within the prediction intervals
coverage_boston <- mean(true_y_boston >= predictions_boston3[, "lwr"] & true_y_boston <= predictions_boston3[, "upr"])

cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston)
```

```{r calculate-coverage-boston5}
# True values for the test set
true_y_boston <- test_data$medv

# Check if true values fall within the prediction intervals
coverage_boston <- mean(true_y_boston >= predictions_boston4[, "lwr"] & true_y_boston <= predictions_boston4[, "upr"])

cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston)
```

```{r calculate-coverage-boston6}
# True values for the test set
true_y_boston <- test_data$medv

# Check if true values fall within the prediction intervals
coverage_boston <- mean(true_y_boston >= predictions_boston_rf[, "lwr"] & true_y_boston <= predictions_boston_rf[, "upr"])

cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston)
```

### Results

- The prediction intervals are calculated using the split conformal method.
- The out-of-sample coverage rate is displayed, which should be close to the specified confidence level (e.g., 0.95).