Title: | Miscellaneous Useful R Functions |
---|---|
Description: | Miscellaneous Useful R Functions. |
Authors: | T. Moudiki [aut, cre] |
Maintainer: | T. Moudiki <[email protected]> |
License: | MIT |
Version: | 0.4.0 |
Built: | 2024-11-14 05:29:26 UTC |
Source: | https://github.com/thierrymoudiki/misc |
Debug print
debug_print(x)
debug_print(x)
x |
An object to be printed |
misc::debug_print(1:10) misc::debug_print("Hello, world!")
misc::debug_print(1:10) misc::debug_print("Hello, world!")
Fit multiple parametric distributions, compute KL divergence, simulate best fit
fit_param_dist(vector, num_bins = 30, verbose = TRUE)
fit_param_dist(vector, num_bins = 30, verbose = TRUE)
vector |
Numeric vector of data to fit |
num_bins |
Number of bins for the empirical histogram |
verbose |
Logical indicating whether to print results |
Function to simulate data from the best-fitting distribution
set.seed(123) n <- 1000 vector <- rnorm(n) start <- proc.time()[3] simulate_function <- fit_param_dist(vector) end <- proc.time()[3] print(paste("Time taken:", end - start)) simulated_data <- simulate_function(n) # Generate 100 samples from the best-fit distribution par(mfrow = c(1, 2)) hist(vector, main = "Original Data", xlab = "Value", ylab = "Frequency") hist(simulated_data, main = "Simulated Data", xlab = "Value", ylab = "Frequency")
set.seed(123) n <- 1000 vector <- rnorm(n) start <- proc.time()[3] simulate_function <- fit_param_dist(vector) end <- proc.time()[3] print(paste("Time taken:", end - start)) simulated_data <- simulate_function(n) # Generate 100 samples from the best-fit distribution par(mfrow = c(1, 2)) hist(vector, main = "Original Data", xlab = "Value", ylab = "Frequency") hist(simulated_data, main = "Simulated Data", xlab = "Value", ylab = "Frequency")
Check if a package is available
is_package_available(pkg_name)
is_package_available(pkg_name)
pkg_name |
A package name |
A logical value
misc::is_package_available("dplyr")
misc::is_package_available("dplyr")
Check if a number is a whole number
is_wholenumber(x, tol = .Machine$double.eps^0.5)
is_wholenumber(x, tol = .Machine$double.eps^0.5)
x |
A number |
tol |
A tolerance level |
A logical value
is_wholenumber(1) is_wholenumber(1.1) is_wholenumber(1L)
is_wholenumber(1) is_wholenumber(1.1) is_wholenumber(1L)
Function to calculate KL divergence for continuous distributions using histograms
KL_divergence_hist(P, Q)
KL_divergence_hist(P, Q)
P |
Numeric vector representing the empirical distribution |
Q |
Numeric vector representing the theoretical distribution |
KL divergence between P and Q
P <- c(0.2, 0.3, 0.5) Q <- c(0.1, 0.4, 0.5) misc::KL_divergence_hist(P, Q)
P <- c(0.2, 0.3, 0.5) Q <- c(0.1, 0.4, 0.5) misc::KL_divergence_hist(P, Q)
One-hot encoding
one_hot_encode(y)
one_hot_encode(y)
y |
A vector of class labels |
n_classes |
The number of classes |
A matrix of one-hot encoded labels
y <- as.factor(c(1, 2, 1, 1, 2)) misc::one_hot_encode(y)
y <- as.factor(c(1, 2, 1, 1, 2)) misc::one_hot_encode(y)
Sequential or parallel for loop.
parfor( what, args, cl = NULL, combine = c, errorhandling = c("stop", "remove", "pass"), verbose = FALSE, show_progress = TRUE, export = NULL, ... )
parfor( what, args, cl = NULL, combine = c, errorhandling = c("stop", "remove", "pass"), verbose = FALSE, show_progress = TRUE, export = NULL, ... )
what |
A function. |
args |
A list of arguments. |
cl |
Number of cores to use. If |
combine |
A function to combine the results. |
errorhandling |
A character string specifying how to handle errors. Possible values are |
verbose |
A logical indicating whether to print progress. |
show_progress |
A logical indicating whether to show a progress bar. |
export |
A list of objects to export to the workers. |
... |
Additional arguments to pass to |
A list of results.
# Sequential print(misc::parfor(function(x) x^2, 1:10)) # Parallel print(misc::parfor(function(x) x^2, 1:10, cl = 2))
# Sequential print(misc::parfor(function(x) x^2, 1:10)) # Parallel print(misc::parfor(function(x) x^2, 1:10, cl = 2))
Removing columns containing only zeros
rm_zero_cols(X)
rm_zero_cols(X)
X |
A matrix or data frame |
A matrix or data frame
X <- matrix(c(1, 0, 3, 0, 5, 0, 0, 0), nrow = 2) print(misc::rm_zero_cols(X))
X <- matrix(c(1, 0, 3, 0, 5, 0, 0, 0), nrow = 2) print(misc::rm_zero_cols(X))
Scale matrix
scale_matrix(X, X_mean = NULL, X_sd = NULL)
scale_matrix(X, X_mean = NULL, X_sd = NULL)
X |
A matrix |
X_mean |
Mean of each column |
X_sd |
Standard deviation of each column |
A list containing the scaled matrix, mean of each column, and standard deviation of each column
X <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 2) (X_scaled <- misc::scale_matrix(X)) (X_scaled <- misc::scale_matrix(X, X_mean = colMeans(X), X_sd = apply(X, 2, stats::sd))) print(colMeans(X_scaled$X)) print(apply(X_scaled$X, 2, stats::sd))
X <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 2) (X_scaled <- misc::scale_matrix(X)) (X_scaled <- misc::scale_matrix(X, X_mean = colMeans(X), X_sd = apply(X, 2, stats::sd))) print(colMeans(X_scaled$X)) print(apply(X_scaled$X, 2, stats::sd))
Sort data frame
sort_df(df, by, decreasing = FALSE)
sort_df(df, by, decreasing = FALSE)
df |
data frame |
by |
column to sort by |
decreasing |
logical. Should sorting be decreasing? |
A sorted data frame
df <- data.frame(a = c(2, 4, 3), b = c(3, 5, 1)) misc::sort_df(df, "a") misc::sort_df(df, "b", decreasing = TRUE)
df <- data.frame(a = c(2, 4, 3), b = c(3, 5, 1)) misc::sort_df(df, "a") misc::sort_df(df, "b", decreasing = TRUE)
Split a dataset
split_data(y, p = 0.5, seed = 123, type_split = c("stratify", "sequential"))
split_data(y, p = 0.5, seed = 123, type_split = c("stratify", "sequential"))
y |
A vector of labels |
p |
A proportion of the dataset to split |
seed |
An integer to set the seed |
type_split |
A character string specifying the type of split |
A vector of indices
set.seed(123) (y <- rnorm(10)) misc::split_data(y, 0.5) misc::split_data(y, 0.5, type_split = "sequential")
set.seed(123) (y <- rnorm(10)) misc::split_data(y, 0.5) misc::split_data(y, 0.5, type_split = "sequential")
Partition a time series object
splitts(y, split_prob = 0.5, return_indices = FALSE)
splitts(y, split_prob = 0.5, return_indices = FALSE)
y |
A time series object |
split_prob |
Splitting ratio |
return_indices |
if TRUE, returns series' indices, otherwise, time series objects |
misc::splitts(ts(1:10))
misc::splitts(ts(1:10))
Timing an expression
timeit(expr, times = 1, ...)
timeit(expr, times = 1, ...)
expr |
an R expression |
times |
number of repetitions |
... |
additional arguments passed to |
the elapsed time in seconds
timeit(1 + 1) timeit(1 + 1, times = 10)
timeit(1 + 1) timeit(1 + 1, times = 10)
A simple implementation similar to the VLOOKUP function in Excel.
vlookup(this, df, key, value)
vlookup(this, df, key, value)
this |
The value to look up |
df |
A data frame |
key |
The column to look up |
value |
The column to return |
The value in the value
column corresponding to the key
column
df <- data.frame(key = c("a", "b", "c"), value = c(1, 2, 3)) print(misc::vlookup("b", df, "key", "value"))
df <- data.frame(key = c("a", "b", "c"), value = c(1, 2, 3)) print(misc::vlookup("b", df, "key", "value"))
Winkler score for probabilistic forecasts
winkler_score(actual, lower, upper, level = 95, scale = FALSE)
winkler_score(actual, lower, upper, level = 95, scale = FALSE)
actual |
numeric vector of actual values |
lower |
numeric vector of lower bounds |
upper |
numeric vector of upper bounds |
level |
numeric level of confidence |
scale |
logical, if TRUE, the score is scaled by the range of the bounds |
numeric score
actual <- c(1, 2, 3, 4, 5) lower <- c(0, 1, 2, 3, 4) upper <- c(2, 3, 4, 5, 6) winkler_score(actual, lower, upper) winkler_score(actual, lower, upper, scale = TRUE) winkler_score(actual, lower, upper, level = 99) winkler_score(actual, lower, upper, level = 99, scale = TRUE)
actual <- c(1, 2, 3, 4, 5) lower <- c(0, 1, 2, 3, 4) upper <- c(2, 3, 4, 5, 6) winkler_score(actual, lower, upper) winkler_score(actual, lower, upper, scale = TRUE) winkler_score(actual, lower, upper, level = 99) winkler_score(actual, lower, upper, level = 99, scale = TRUE)