Model Selection

The Test-Train Split Won't Save You

library(stats)
library(leaps)
library(dplyr)
library(caret)
library(MASS)

create_synthetic_data <- function(n = 100, p = 10,
                                  non_random_params = 2, 
                                  signal_variance = 0.8, 
                                  seed = 123) { 
  # Seed for reproducibility
  set.seed(seed)
  
  # Generate random data (n predictors)
  X <- matrix(rnorm(n * p), 
              nrow = n, ncol = p)  
  colnames(X) <- paste0("X", 1:p)               
  
  # Error handling for pnr > p mistake
  if (non_random_params > p) {
    stop("'non_random_params' 
         cannot be greater than 
         the number of predictors (p).")}
  
  # Generate non-random signal
  noise <- 1 - signal_variance
  
  if (non_random_params > 0) {
    # Coefficients for the non-random predictors
    signal_coefficients <- rep(0.7, 
                               non_random_params)
    
    # Construct y
    y <- rowSums(sweep(X[, 1:non_random_params, 
                         drop = F], 2, 
                       signal_coefficients, "*")) + 
      rnorm(n, 
            sd = sqrt(noise))}
  
  else {
    # If no non-random predictors, y is pure noise
    y <- rnorm(n, 
               sd = sqrt(1))}
  
  # Convert to df  
  data <- data.frame(y, X)
  
  # Scale predictors
  data_scaled <- as.data.frame(scale(data))
  
  return(data_scaled)}
synthetic_data_test <- create_synthetic_data(n = 10,p = 5,
  non_random_params = 2,signal_variance = 0.5,seed = 456)

sim_caret_control <- train(y ~., 
                           data = synthetic_data_test %>%
                             dplyr::select(-c(findCorrelation(
                               cor(synthetic_data_test %>%
                                     dplyr::select(-y)),
                               cutoff = 0.3, names = T))),
                           method = "leapForward", 
                           tuneGrid = data.frame(nvmax = 5),
                           trControl = 
                             trainControl(
                               method = "LGOCV",
                               p = 0.7,
                               number = 3))
generate_all_datasets <- function(n_values, n_p_ratios, 
                                  non_random_params_values, 
                                  r2_signal_values, 
                                  num_datasets){
  
  # Initialize a list to store all datasets
  all_datasets <- list()
  
  # Initialize an iteration counter
  it <- 1
  
  # Loop over all parameter combinations
  for (n in n_values) {
    for (ratio in n_p_ratios) {
      p <- round(n / ratio)
      for (non_random_params in non_random_params_values) {
        for (r2_signal in r2_signal_values) {
          for (i in 1:num_datasets) {
            # Generate the synthetic data
            sim_data <- create_synthetic_data(n,p, 
              non_random_params, r2_signal,
              seed = it)
            
            step_reg <- train(y ~., 
                              data = sim_data %>%
                                dplyr::select(-c(findCorrelation(
                                  cor(sim_data %>%
                                        dplyr::select(-y)),
                                  cutoff = 0.3, names = T))),
                              method = "leapForward", 
                              tuneGrid = data.frame(
                                nvmax = n/5),
                              trControl = 
                                trainControl(
                                  method = "LGOCV",
                                  p = 0.7,
                                  number = 3))
            
            # Add metadata columns to a df
            output_df <- data.frame(
              iter = it,
              N = n, p = p,
              N_to_p = ratio,
              gt_param = non_random_params,
              gt_r2 = r2_signal,
              sel_mod_r2 = as.numeric(
                step_reg$results$Rsquared),
              no_pred_mod = as.numeric(
                step_reg$bestTune))
            
            # Append the dataset to the list
            all_datasets[[length(all_datasets) + 1]] <- 
              output_df
            
            # Increment the iteration counter
            it <- it + 1
          }
        }
      }
    }
  }
  
  # Combine all datasets into one large dataframe
  combined_df <- do.call(rbind, all_datasets) 
  
  return(combined_df)
}

sim_dat_df <- generate_all_datasets(seq(20, 100, by = 5),
                                 c(0.5,1,2),
                                 0:2,0.5,100)
library(ggplot2)

R2_plot <- ggplot(data = sim_dat_df,
                  aes(x = as.factor(N_to_p),
                      y = sel_mod_r2,
                      fill = as.factor(gt_param))) +
  geom_hline(yintercept = 0.7,
             alpha = 0.3) +
  geom_boxplot() +
  labs(fill = "True Predictors",
       x = "N/p Ratio",
       y = expression(paste("Model ",
                            R^2))) +
  ylim(0,1) +
  theme_classic() +
  theme(legend.position = "bottom")

print(R2_plot)

This is the R2 picture: R2_plot