26  Exercises for ‘Hypothesis Testing’ chapter

# dependencies
library(tidyr)
library(dplyr)
library(furrr) 
library(parameters)
library(stringr)
library(forcats)
library(ggplot2)
library(scales)
library(ggstance)
library(patchwork)
library(janitor)
library(effectsize)
library(ggstance)
library(knitr)
library(kableExtra)

# set up parallelization
plan(multisession)

26.1 Reading

Read Lakens et al. (2018) “Equivalence Testing for Psychological Research: A Tutorial”.

26.2 Basis simulation

# functions for simulation
generate_data <- function(n_per_condition,
                          mean_control,
                          mean_intervention,
                          sd) {
  
  data_control <- 
    tibble(condition = "control",
           score = rnorm(n = n_per_condition, mean = mean_control, sd = sd))
  
  data_intervention <- 
    tibble(condition = "intervention",
           score = rnorm(n = n_per_condition, mean = mean_intervention, sd = sd))
  
  data_combined <- 
    bind_rows(data_control,
              data_intervention) |>
    mutate(condition = factor(condition, level = c("intervention", "control")))
  
  return(data_combined)
}

analyse <- function(data, direction_h1 = "two.sided", alpha = 0.05, mean_difference_h0 = 0){
  fit <- t.test(
    formula = score ~ condition,
    mu = mean_difference_h0,
    alternative = direction_h1,
    conf.level = 1-alpha,
    var.equal = TRUE, # students t test: assumes equal variances
    data = data
  )
  
  results <- fit %>%
    model_parameters() %>%
    as_tibble() %>% 
    janitor::clean_names() %>% 
    # create estimand
    mutate(significant_p = p < .05, # statistically significant in the sense that p < .05
           signficiant_ci = !dplyr::between(0, ci_low, ci_high)) |> # statistically significant in the sense that 95% CI excludes a mean difference of 0
    # select columns of interest
    select(mean_diff = difference,
           ci_low,
           ci_high,
           signficiant_ci,
           p,
           significant_p)
  
  return(results)
}


# simulation parameters
experiment_parameters <- expand_grid(
  n_per_condition = seq(from = 25, to = 100, by = 25),
  mean_control = 0,
  mean_intervention = c(0, 0.5),
  sd = 1,
  direction_h1 = "two.sided", 
  alpha = 0.05,
  mean_difference_h0 = 0,
  iteration = 1:1000
) |>
  mutate(pop_mean_diff = mean_intervention - mean_control)

# set seed
set.seed(42)

# run simulation
simulation <- experiment_parameters |>
  mutate(generated_data = future_pmap(.l = list(n_per_condition = n_per_condition, 
                                                mean_control = mean_control,
                                                mean_intervention = mean_intervention,
                                                sd = sd),
                                      .f = generate_data,
                                      .progress = TRUE,
                                      .options = furrr_options(seed = TRUE))) |>
  mutate(results = future_pmap(.l = list(data = generated_data),
                               .f = analyse,
                               .progress = TRUE,
                               .options = furrr_options(seed = TRUE))) |>
  unnest(results)

# calculate summary
results_summary <- simulation |>
  group_by(pop_mean_diff,
           n_per_condition) |>
  summarize(mean_difference_in_means = mean(mean_diff),
            mean_ci_low = mean(ci_low),
            mean_ci_high = mean(ci_high),
            proportion_significant_p = mean(significant_p),
            .groups = "drop")

results_summary |>
  mutate_if(is.numeric, round, digits = 2)
pop_mean_diff n_per_condition mean_difference_in_means mean_ci_low mean_ci_high proportion_significant_p
0.0 25 -0.01 -0.57 0.56 0.05
0.0 50 0.00 -0.39 0.40 0.05
0.0 75 0.00 -0.32 0.32 0.04
0.0 100 0.00 -0.28 0.28 0.05
0.5 25 0.50 -0.07 1.07 0.42
0.5 50 0.49 0.10 0.89 0.69
0.5 75 0.50 0.18 0.83 0.87
0.5 100 0.50 0.22 0.78 0.94

26.3 Exercises

For each of the exercises below, create a copy of the basis simulation above. Modify it appropriately to answer the exercise.

26.3.1 Statistical power for a classic Null Hypothesis Significance Test (Two-Sided)

When using a classic Null Hypothesis Significance Test (independent t-test testing a population difference in means (mu) of 0), what is the statistical power to detect a population mean difference of 1, 2, or 3 across different sample sizes (n per condition = 25, 50, 75, 100)?

Notes:

  • We’re only interested in statistical power here, so set the population difference in means to zero. Check you understand why.
# functions for simulation
generate_data <- function(n_per_condition,
                          mean_control,
                          mean_intervention,
                          sd) {
  
  data_control <- 
    tibble(condition = "control",
           score = rnorm(n = n_per_condition, mean = mean_control, sd = sd))
  
  data_intervention <- 
    tibble(condition = "intervention",
           score = rnorm(n = n_per_condition, mean = mean_intervention, sd = sd))
  
  data_combined <- 
    bind_rows(data_control,
              data_intervention) |>
    mutate(condition = factor(condition, level = c("intervention", "control")))
  
  return(data_combined)
}

analyse <- function(data, direction_h1 = "two.sided", alpha = 0.05, mean_difference_h0 = 0){
  fit <- t.test(
    formula = score ~ condition,
    mu = mean_difference_h0,
    alternative = direction_h1,
    conf.level = 1-alpha,
    var.equal = TRUE, # students t test: assumes equal variances
    data = data
  )
  
  results <- fit %>%
    model_parameters() %>%
    as_tibble() %>% 
    janitor::clean_names() %>% 
    # create estimand
    mutate(significant_p = p < .05, # statistically significant in the sense that p < .05
           signficiant_ci = !dplyr::between(0, ci_low, ci_high)) |> # statistically significant in the sense that 95% CI excludes a mean difference of 0
    # select columns of interest
    select(mean_diff = difference,
           ci_low,
           ci_high,
           signficiant_ci,
           p,
           significant_p)
  
  return(results)
}


# simulation parameters
experiment_parameters <- expand_grid(
  n_per_condition = seq(from = 25, to = 100, by = 25),
  mean_control = 0,
  mean_intervention = c(1, 2, 3),
  sd = 1,
  direction_h1 = "two.sided", 
  alpha = 0.05,
  mean_difference_h0 = 0,
  iteration = 1:1000
) |>
  mutate(pop_mean_diff = mean_intervention - mean_control)

# set seed
set.seed(42)

# run simulation
simulation <- experiment_parameters |>
  mutate(generated_data = future_pmap(.l = list(n_per_condition = n_per_condition, 
                                                mean_control = mean_control,
                                                mean_intervention = mean_intervention,
                                                sd = sd),
                                      .f = generate_data,
                                      .progress = TRUE,
                                      .options = furrr_options(seed = TRUE))) |>
  mutate(results = future_pmap(.l = list(data = generated_data),
                               .f = analyse,
                               .progress = TRUE,
                               .options = furrr_options(seed = TRUE))) |>
  unnest(results)

# calculate summary
results_summary <- simulation |>
  group_by(pop_mean_diff,
           n_per_condition) |>
  summarize(mean_difference_in_means = mean(mean_diff),
            mean_ci_low = mean(ci_low),
            mean_ci_high = mean(ci_high),
            proportion_significant_p = mean(significant_p),
            .groups = "drop")

results_summary |>
  mutate_if(is.numeric, round, digits = 2)
pop_mean_diff n_per_condition mean_difference_in_means mean_ci_low mean_ci_high proportion_significant_p
1 25 0.99 0.43 1.56 0.94
1 50 0.99 0.60 1.39 1.00
1 75 1.00 0.67 1.32 1.00
1 100 1.00 0.72 1.28 1.00
2 25 2.00 1.43 2.57 1.00
2 50 2.01 1.61 2.40 1.00
2 75 2.00 1.68 2.32 1.00
2 100 2.00 1.72 2.28 1.00
3 25 2.99 2.42 3.56 1.00
3 50 3.01 2.61 3.41 1.00
3 75 3.01 2.68 3.33 1.00
3 100 3.00 2.72 3.28 1.00

26.3.2 Statistical power for a Minimal-Effects Test (One-Sided)

When using a Minimal-Effects Test (One-Sided) (independent t-test testing a population difference in means (mu) of 1), what is the statistical power to detect a population mean difference of 1, 2, or 3 across different sample sizes (n per condition = 25, 50, 75, 100)? What is the false positive rate across these sample sizes?

# functions for simulation
generate_data <- function(n_per_condition,
                          mean_control,
                          mean_intervention,
                          sd) {
  
  data_control <- 
    tibble(condition = "control",
           score = rnorm(n = n_per_condition, mean = mean_control, sd = sd))
  
  data_intervention <- 
    tibble(condition = "intervention",
           score = rnorm(n = n_per_condition, mean = mean_intervention, sd = sd))
  
  data_combined <- 
    bind_rows(data_control,
              data_intervention) |>
    mutate(condition = factor(condition, level = c("intervention", "control")))
  
  return(data_combined)
}

analyse <- function(data, direction_h1 = "two.sided", alpha = 0.05, mean_difference_h0 = 0){
  fit <- t.test(
    formula = score ~ condition,
    mu = mean_difference_h0,
    alternative = direction_h1,
    conf.level = 1-alpha,
    var.equal = TRUE, # students t test: assumes equal variances
    data = data
  )
  
  results <- fit %>%
    model_parameters() %>%
    as_tibble() %>% 
    janitor::clean_names() %>% 
    # create estimand
    mutate(significant_p = p < .05, # statistically significant in the sense that p < .05
           signficiant_ci = !dplyr::between(0, ci_low, ci_high)) |> # statistically significant in the sense that 95% CI excludes a mean difference of 0
    # select columns of interest
    select(mean_diff = difference,
           ci_low,
           ci_high,
           signficiant_ci,
           p,
           significant_p)
  
  return(results)
}


# simulation parameters
experiment_parameters <- expand_grid(
  n_per_condition = seq(from = 25, to = 100, by = 25),
  mean_control = 0,
  mean_intervention = c(1, 2, 3),
  sd = 1,
  direction_h1 = "greater", 
  alpha = 0.05,
  mean_difference_h0 = 1,
  iteration = 1:1000
) |>
  mutate(pop_mean_diff = mean_intervention - mean_control)

# set seed
set.seed(42)

# run simulation
simulation <- experiment_parameters |>
  mutate(generated_data = future_pmap(.l = list(n_per_condition = n_per_condition, 
                                                mean_control = mean_control,
                                                mean_intervention = mean_intervention,
                                                sd = sd),
                                      .f = generate_data,
                                      .progress = TRUE,
                                      .options = furrr_options(seed = TRUE))) |>
  mutate(results = future_pmap(.l = list(data = generated_data),
                               .f = analyse,
                               .progress = TRUE,
                               .options = furrr_options(seed = TRUE))) |>
  unnest(results)

# calculate summary
results_summary <- simulation |>
  group_by(pop_mean_diff,
           n_per_condition) |>
  summarize(mean_difference_in_means = mean(mean_diff),
            mean_ci_low = mean(ci_low),
            mean_ci_high = mean(ci_high),
            proportion_significant_p = mean(significant_p),
            .groups = "drop")

results_summary |>
  mutate_if(is.numeric, round, digits = 2)
pop_mean_diff n_per_condition mean_difference_in_means mean_ci_low mean_ci_high proportion_significant_p
1 25 0.99 0.43 1.56 0.94
1 50 0.99 0.60 1.39 1.00
1 75 1.00 0.67 1.32 1.00
1 100 1.00 0.72 1.28 1.00
2 25 2.00 1.43 2.57 1.00
2 50 2.01 1.61 2.40 1.00
2 75 2.00 1.68 2.32 1.00
2 100 2.00 1.72 2.28 1.00
3 25 2.99 2.42 3.56 1.00
3 50 3.01 2.61 3.41 1.00
3 75 3.01 2.68 3.33 1.00
3 100 3.00 2.72 3.28 1.00

26.3.3 Statistical power for equivalence test

When using a Two One-Sided Test (TOST), with a Smallest Effect Size of Interest (SESOI) of difference-in-means = ±0.2, what is the statistical power across different sample sizes (n per condition = 25, 50, 75, 100)? What N is needed for 80% power to detect a true null effect size as equivalent to zero?

Notes:

  • We’re only interested in statistical power here, so set the population difference in means to zero. Why?
  • Because reasons (see Lakens et al., 2018), use the 90% Confidence Interval instead of 95% CI.