The Test-Train Split Won't Save You
library(stats)
library(leaps)
library(dplyr)
library(caret)
library(MASS)
create_synthetic_data <- function(n = 100, p = 10,
                                  non_random_params = 2,
                                  signal_variance = 0.8,
                                  seed = 123) {
  # Seed for reproducibility
  set.seed(seed)
  # Generate p random predictors for n observations
  X <- matrix(rnorm(n * p), nrow = n, ncol = p)
  colnames(X) <- paste0("X", 1:p)
  # Guard against asking for more signal predictors than exist
  if (non_random_params > p) {
    stop("'non_random_params' cannot be greater than the number of predictors (p).")
  }
  # Split the response variance between signal and noise
  noise <- 1 - signal_variance
  if (non_random_params > 0) {
    # Coefficients for the non-random (signal) predictors
    signal_coefficients <- rep(0.7, non_random_params)
    # Construct y as a weighted sum of the signal predictors plus noise
    y <- rowSums(sweep(X[, 1:non_random_params, drop = FALSE], 2,
                       signal_coefficients, "*")) +
      rnorm(n, sd = sqrt(noise))
  } else {
    # If there are no signal predictors, y is pure noise
    y <- rnorm(n, sd = sqrt(1))
  }
  # Combine response and predictors into a data frame
  data <- data.frame(y, X)
  # Standardise every column (response and predictors) to mean 0, sd 1
  data_scaled <- as.data.frame(scale(data))
  return(data_scaled)
}
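As a quick, illustrative sanity check (not part of the pipeline; the sample size, p, and seed below are arbitrary), only the first non_random_params columns should show a substantial correlation with y:
check_data <- create_synthetic_data(n = 1000, p = 5,
                                    non_random_params = 2,
                                    signal_variance = 0.5,
                                    seed = 1)
# X1 and X2 carry the signal; the remaining correlations should sit near zero
round(cor(check_data$y, check_data[, -1]), 2)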
synthetic_data_test <- create_synthetic_data(n = 10, p = 5,
                                             non_random_params = 2,
                                             signal_variance = 0.5,
                                             seed = 456)
# Drop predictors with pairwise correlation above 0.3, then run forward
# stepwise selection, evaluated with leave-group-out CV (3 random 70/30 splits)
sim_caret_control <- train(
  y ~ .,
  data = synthetic_data_test %>%
    dplyr::select(-c(findCorrelation(
      cor(synthetic_data_test %>% dplyr::select(-y)),
      cutoff = 0.3, names = TRUE))),
  method = "leapForward",
  tuneGrid = data.frame(nvmax = 5),
  trControl = trainControl(method = "LGOCV",
                           p = 0.7,
                           number = 3))
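For a one-off fit like this, the train object can be inspected directly (illustrative only, not part of the simulation loop): the resampled metrics live in $results and the underlying leaps::regsubsets fit in $finalModel.
# Held-out (LGOCV) performance for the single nvmax value in the grid
sim_caret_control$results
# Predictors retained at each step of the forward search
summary(sim_caret_control$finalModel)$which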
generate_all_datasets <- function(n_values, n_p_ratios,
                                  non_random_params_values,
                                  r2_signal_values,
                                  num_datasets) {
  # Initialize a list to store per-iteration results
  all_datasets <- list()
  # Iteration counter (also used as the seed for each dataset)
  it <- 1
  # Loop over all parameter combinations
  for (n in n_values) {
    for (ratio in n_p_ratios) {
      p <- round(n / ratio)
      for (non_random_params in non_random_params_values) {
        for (r2_signal in r2_signal_values) {
          for (i in 1:num_datasets) {
            # Generate the synthetic data
            sim_data <- create_synthetic_data(n, p,
                                              non_random_params, r2_signal,
                                              seed = it)
            # Drop highly correlated predictors, then run forward selection
            # with leave-group-out CV (3 random 70/30 splits)
            step_reg <- train(
              y ~ .,
              data = sim_data %>%
                dplyr::select(-c(findCorrelation(
                  cor(sim_data %>% dplyr::select(-y)),
                  cutoff = 0.3, names = TRUE))),
              method = "leapForward",
              tuneGrid = data.frame(nvmax = n / 5),
              trControl = trainControl(method = "LGOCV",
                                       p = 0.7,
                                       number = 3))
            # Collect metadata and results for this iteration
            output_df <- data.frame(
              iter = it,
              N = n, p = p,
              N_to_p = ratio,
              gt_param = non_random_params,
              gt_r2 = r2_signal,
              sel_mod_r2 = as.numeric(step_reg$results$Rsquared),
              no_pred_mod = as.numeric(step_reg$bestTune))
            # Append the result to the list
            all_datasets[[length(all_datasets) + 1]] <- output_df
            # Increment the iteration counter
            it <- it + 1
          }
        }
      }
    }
  }
  # Combine all results into one large dataframe
  combined_df <- do.call(rbind, all_datasets)
  return(combined_df)
}
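Because the full grid below amounts to thousands of model fits, a small smoke test on a reduced grid (values here chosen arbitrarily from within the full grid) is a cheap way to confirm the loop runs end to end:
smoke_test_df <- generate_all_datasets(n_values = c(20, 40),
                                       n_p_ratios = c(1, 2),
                                       non_random_params_values = 0:1,
                                       r2_signal_values = 0.5,
                                       num_datasets = 2)
str(smoke_test_df)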
sim_dat_df <- generate_all_datasets(n_values = seq(20, 100, by = 5),
                                    n_p_ratios = c(0.5, 1, 2),
                                    non_random_params_values = 0:2,
                                    r2_signal_values = 0.5,
                                    num_datasets = 100)
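The call above fits 15,300 models (17 sample sizes × 3 N/p ratios × 3 counts of true predictors × 100 datasets), so it may be worth caching the output; saveRDS()/readRDS() is one option (the file name is arbitrary):
# Optional: cache the simulation output so the long run is not repeated
saveRDS(sim_dat_df, "sim_dat_df.rds")
# sim_dat_df <- readRDS("sim_dat_df.rds")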
library(ggplot2)

R2_plot <- ggplot(data = sim_dat_df,
                  aes(x = as.factor(N_to_p),
                      y = sel_mod_r2,
                      fill = as.factor(gt_param))) +
  geom_hline(yintercept = 0.7, alpha = 0.3) +
  geom_boxplot() +
  labs(fill = "True Predictors",
       x = "N/p Ratio",
       y = expression(paste("Model ", R^2))) +
  ylim(0, 1) +
  theme_classic() +
  theme(legend.position = "bottom")
print(R2_plot)
The resulting plot of held-out model R² by N/p ratio, split by the number of true predictors: