26 Exercises for ‘Hypothesis Testing’ chapter

# dependencies
library(tidyr)
library(dplyr)
library(furrr) 
library(parameters)
library(stringr)
library(forcats)
library(ggplot2)
library(scales)
library(ggstance)
library(patchwork)
library(janitor)
library(effectsize)
library(ggstance)
library(knitr)
library(kableExtra)

# set up parallelization
plan(multisession)

26.1 Reading

Read Lakens et al. (2018) “Equivalence Testing for Psychological Research: A Tutorial”.

26.2 Basis simulation

# functions for simulation
generate_data <- function(n_per_condition,
                          mean_control,
                          mean_intervention,
                          sd) {
  
  data_control <- 
    tibble(condition = "control",
           score = rnorm(n = n_per_condition, mean = mean_control, sd = sd))
  
  data_intervention <- 
    tibble(condition = "intervention",
           score = rnorm(n = n_per_condition, mean = mean_intervention, sd = sd))
  
  data_combined <- 
    bind_rows(data_control,
              data_intervention) |>
    mutate(condition = factor(condition, level = c("intervention", "control")))
  
  return(data_combined)
}

analyse <- function(data, direction_h1 = "two.sided", alpha = 0.05, mean_difference_h0 = 0){
  fit <- t.test(
    formula = score ~ condition,
    mu = mean_difference_h0,
    alternative = direction_h1,
    conf.level = 1-alpha,
    var.equal = TRUE, # students t test: assumes equal variances
    data = data
  )
  
  results <- fit %>%
    model_parameters() %>%
    as_tibble() %>% 
    janitor::clean_names() %>% 
    # create estimand
    mutate(significant_p = p < .05, # statistically significant in the sense that p < .05
           signficiant_ci = !dplyr::between(0, ci_low, ci_high)) |> # statistically significant in the sense that 95% CI excludes a mean difference of 0
    # select columns of interest
    select(mean_diff = difference,
           ci_low,
           ci_high,
           signficiant_ci,
           p,
           significant_p)
  
  return(results)
}


# simulation parameters
experiment_parameters <- expand_grid(
  n_per_condition = seq(from = 25, to = 100, by = 25),
  mean_control = 0,
  mean_intervention = c(0, 0.5),
  sd = 1,
  direction_h1 = "two.sided", 
  alpha = 0.05,
  mean_difference_h0 = 0,
  iteration = 1:1000
) |>
  mutate(pop_mean_diff = mean_intervention - mean_control)

# set seed
set.seed(42)

# run simulation
simulation <- experiment_parameters |>
  mutate(generated_data = future_pmap(.l = list(n_per_condition = n_per_condition, 
                                                mean_control = mean_control,
                                                mean_intervention = mean_intervention,
                                                sd = sd),
                                      .f = generate_data,
                                      .progress = TRUE,
                                      .options = furrr_options(seed = TRUE))) |>
  mutate(results = future_pmap(.l = list(data = generated_data),
                               .f = analyse,
                               .progress = TRUE,
                               .options = furrr_options(seed = TRUE))) |>
  unnest(results)

# calculate summary
results_summary <- simulation |>
  group_by(pop_mean_diff,
           n_per_condition) |>
  summarize(mean_difference_in_means = mean(mean_diff),
            mean_ci_low = mean(ci_low),
            mean_ci_high = mean(ci_high),
            proportion_significant_p = mean(significant_p),
            .groups = "drop")

results_summary |>
  mutate_if(is.numeric, round, digits = 2)

pop_mean_diff	n_per_condition	mean_difference_in_means	mean_ci_low	mean_ci_high	proportion_significant_p
0.0	25	-0.01	-0.57	0.56	0.05
0.0	50	0.00	-0.39	0.40	0.05
0.0	75	0.00	-0.32	0.32	0.04
0.0	100	0.00	-0.28	0.28	0.05
0.5	25	0.50	-0.07	1.07	0.42
0.5	50	0.49	0.10	0.89	0.69
0.5	75	0.50	0.18	0.83	0.87
0.5	100	0.50	0.22	0.78	0.94

26.3 Exercises

For each of the exercises below, create a copy of the basis simulation above. Modify it appropriately to answer the exercise.

26.3.1 Statistical power for a classic Null Hypothesis Significance Test (Two-Sided)

When using a classic Null Hypothesis Significance Test (independent t-test testing a population difference in means (mu) of 0), what is the statistical power to detect a population mean difference of 1, 2, or 3 across different sample sizes (n per condition = 25, 50, 75, 100)?

Notes:

We’re only interested in statistical power here, so set the population difference in means to zero. Check you understand why.

# functions for simulation
generate_data <- function(n_per_condition,
                          mean_control,
                          mean_intervention,
                          sd) {
  
  data_control <- 
    tibble(condition = "control",
           score = rnorm(n = n_per_condition, mean = mean_control, sd = sd))
  
  data_intervention <- 
    tibble(condition = "intervention",
           score = rnorm(n = n_per_condition, mean = mean_intervention, sd = sd))
  
  data_combined <- 
    bind_rows(data_control,
              data_intervention) |>
    mutate(condition = factor(condition, level = c("intervention", "control")))
  
  return(data_combined)
}

analyse <- function(data, direction_h1 = "two.sided", alpha = 0.05, mean_difference_h0 = 0){
  fit <- t.test(
    formula = score ~ condition,
    mu = mean_difference_h0,
    alternative = direction_h1,
    conf.level = 1-alpha,
    var.equal = TRUE, # students t test: assumes equal variances
    data = data
  )
  
  results <- fit %>%
    model_parameters() %>%
    as_tibble() %>% 
    janitor::clean_names() %>% 
    # create estimand
    mutate(significant_p = p < .05, # statistically significant in the sense that p < .05
           signficiant_ci = !dplyr::between(0, ci_low, ci_high)) |> # statistically significant in the sense that 95% CI excludes a mean difference of 0
    # select columns of interest
    select(mean_diff = difference,
           ci_low,
           ci_high,
           signficiant_ci,
           p,
           significant_p)
  
  return(results)
}


# simulation parameters
experiment_parameters <- expand_grid(
  n_per_condition = seq(from = 25, to = 100, by = 25),
  mean_control = 0,
  mean_intervention = c(1, 2, 3),
  sd = 1,
  direction_h1 = "two.sided", 
  alpha = 0.05,
  mean_difference_h0 = 0,
  iteration = 1:1000
) |>
  mutate(pop_mean_diff = mean_intervention - mean_control)

# set seed
set.seed(42)

# run simulation
simulation <- experiment_parameters |>
  mutate(generated_data = future_pmap(.l = list(n_per_condition = n_per_condition, 
                                                mean_control = mean_control,
                                                mean_intervention = mean_intervention,
                                                sd = sd),
                                      .f = generate_data,
                                      .progress = TRUE,
                                      .options = furrr_options(seed = TRUE))) |>
  mutate(results = future_pmap(.l = list(data = generated_data),
                               .f = analyse,
                               .progress = TRUE,
                               .options = furrr_options(seed = TRUE))) |>
  unnest(results)

# calculate summary
results_summary <- simulation |>
  group_by(pop_mean_diff,
           n_per_condition) |>
  summarize(mean_difference_in_means = mean(mean_diff),
            mean_ci_low = mean(ci_low),
            mean_ci_high = mean(ci_high),
            proportion_significant_p = mean(significant_p),
            .groups = "drop")

results_summary |>
  mutate_if(is.numeric, round, digits = 2)

pop_mean_diff	n_per_condition	mean_difference_in_means	mean_ci_low	mean_ci_high	proportion_significant_p
1	25	0.99	0.43	1.56	0.94
1	50	0.99	0.60	1.39	1.00
1	75	1.00	0.67	1.32	1.00
1	100	1.00	0.72	1.28	1.00
2	25	2.00	1.43	2.57	1.00
2	50	2.01	1.61	2.40	1.00
2	75	2.00	1.68	2.32	1.00
2	100	2.00	1.72	2.28	1.00
3	25	2.99	2.42	3.56	1.00
3	50	3.01	2.61	3.41	1.00
3	75	3.01	2.68	3.33	1.00
3	100	3.00	2.72	3.28	1.00

26.3.2 Statistical power for a Minimal-Effects Test (One-Sided)

When using a Minimal-Effects Test (One-Sided) (independent t-test testing a population difference in means (mu) of 1), what is the statistical power to detect a population mean difference of 1, 2, or 3 across different sample sizes (n per condition = 25, 50, 75, 100)? What is the false positive rate across these sample sizes?

# functions for simulation
generate_data <- function(n_per_condition,
                          mean_control,
                          mean_intervention,
                          sd) {
  
  data_control <- 
    tibble(condition = "control",
           score = rnorm(n = n_per_condition, mean = mean_control, sd = sd))
  
  data_intervention <- 
    tibble(condition = "intervention",
           score = rnorm(n = n_per_condition, mean = mean_intervention, sd = sd))
  
  data_combined <- 
    bind_rows(data_control,
              data_intervention) |>
    mutate(condition = factor(condition, level = c("intervention", "control")))
  
  return(data_combined)
}

analyse <- function(data, direction_h1 = "two.sided", alpha = 0.05, mean_difference_h0 = 0){
  fit <- t.test(
    formula = score ~ condition,
    mu = mean_difference_h0,
    alternative = direction_h1,
    conf.level = 1-alpha,
    var.equal = TRUE, # students t test: assumes equal variances
    data = data
  )
  
  results <- fit %>%
    model_parameters() %>%
    as_tibble() %>% 
    janitor::clean_names() %>% 
    # create estimand
    mutate(significant_p = p < .05, # statistically significant in the sense that p < .05
           signficiant_ci = !dplyr::between(0, ci_low, ci_high)) |> # statistically significant in the sense that 95% CI excludes a mean difference of 0
    # select columns of interest
    select(mean_diff = difference,
           ci_low,
           ci_high,
           signficiant_ci,
           p,
           significant_p)
  
  return(results)
}


# simulation parameters
experiment_parameters <- expand_grid(
  n_per_condition = seq(from = 25, to = 100, by = 25),
  mean_control = 0,
  mean_intervention = c(1, 2, 3),
  sd = 1,
  direction_h1 = "greater", 
  alpha = 0.05,
  mean_difference_h0 = 1,
  iteration = 1:1000
) |>
  mutate(pop_mean_diff = mean_intervention - mean_control)

# set seed
set.seed(42)

# run simulation
simulation <- experiment_parameters |>
  mutate(generated_data = future_pmap(.l = list(n_per_condition = n_per_condition, 
                                                mean_control = mean_control,
                                                mean_intervention = mean_intervention,
                                                sd = sd),
                                      .f = generate_data,
                                      .progress = TRUE,
                                      .options = furrr_options(seed = TRUE))) |>
  mutate(results = future_pmap(.l = list(data = generated_data),
                               .f = analyse,
                               .progress = TRUE,
                               .options = furrr_options(seed = TRUE))) |>
  unnest(results)

# calculate summary
results_summary <- simulation |>
  group_by(pop_mean_diff,
           n_per_condition) |>
  summarize(mean_difference_in_means = mean(mean_diff),
            mean_ci_low = mean(ci_low),
            mean_ci_high = mean(ci_high),
            proportion_significant_p = mean(significant_p),
            .groups = "drop")

results_summary |>
  mutate_if(is.numeric, round, digits = 2)

pop_mean_diff	n_per_condition	mean_difference_in_means	mean_ci_low	mean_ci_high	proportion_significant_p
1	25	0.99	0.43	1.56	0.94
1	50	0.99	0.60	1.39	1.00
1	75	1.00	0.67	1.32	1.00
1	100	1.00	0.72	1.28	1.00
2	25	2.00	1.43	2.57	1.00
2	50	2.01	1.61	2.40	1.00
2	75	2.00	1.68	2.32	1.00
2	100	2.00	1.72	2.28	1.00
3	25	2.99	2.42	3.56	1.00
3	50	3.01	2.61	3.41	1.00
3	75	3.01	2.68	3.33	1.00
3	100	3.00	2.72	3.28	1.00

26.3.3 Statistical power for equivalence test

When using a Two One-Sided Test (TOST), with a Smallest Effect Size of Interest (SESOI) of difference-in-means = ±0.2, what is the statistical power across different sample sizes (n per condition = 25, 50, 75, 100)? What N is needed for 80% power to detect a true null effect size as equivalent to zero?

Notes:

We’re only interested in statistical power here, so set the population difference in means to zero. Why?
Because reasons (see Lakens et al., 2018), use the 90% Confidence Interval instead of 95% CI.

# Exercises for 'Hypothesis Testing' chapter ```{r} #| include: false # if it is available, run the setup script that tells quarto to round all df/tibble outputs to three decimal places if(file.exists("../_setup.R")){source("../_setup.R")} ``` ```{r} # dependencies library(tidyr) library(dplyr) library(furrr) library(parameters) library(stringr) library(forcats) library(ggplot2) library(scales) library(ggstance) library(patchwork) library(janitor) library(effectsize) library(ggstance) library(knitr) library(kableExtra) # set up parallelization plan(multisession) ``` ## Reading Read [Lakens et al. (2018) "Equivalence Testing for Psychological Research: A Tutorial"](https://doi.org/10.1177/2515245918770963). ## Basis simulation ```{r} # functions for simulation generate_data <- function(n_per_condition, mean_control, mean_intervention, sd) { data_control <- tibble(condition = "control", score = rnorm(n = n_per_condition, mean = mean_control, sd = sd)) data_intervention <- tibble(condition = "intervention", score = rnorm(n = n_per_condition, mean = mean_intervention, sd = sd)) data_combined <- bind_rows(data_control, data_intervention) |> mutate(condition = factor(condition, level = c("intervention", "control"))) return(data_combined) } analyse <- function(data, direction_h1 = "two.sided", alpha = 0.05, mean_difference_h0 = 0){ fit <- t.test( formula = score ~ condition, mu = mean_difference_h0, alternative = direction_h1, conf.level = 1-alpha, var.equal = TRUE, # students t test: assumes equal variances data = data ) results <- fit %>% model_parameters() %>% as_tibble() %>% janitor::clean_names() %>% # create estimand mutate(significant_p = p < .05, # statistically significant in the sense that p < .05 signficiant_ci = !dplyr::between(0, ci_low, ci_high)) |> # statistically significant in the sense that 95% CI excludes a mean difference of 0 # select columns of interest select(mean_diff = difference, ci_low, ci_high, signficiant_ci, p, significant_p) return(results) } # simulation parameters experiment_parameters <- expand_grid( n_per_condition = seq(from = 25, to = 100, by = 25), mean_control = 0, mean_intervention = c(0, 0.5), sd = 1, direction_h1 = "two.sided", alpha = 0.05, mean_difference_h0 = 0, iteration = 1:1000 ) |> mutate(pop_mean_diff = mean_intervention - mean_control) # set seed set.seed(42) # run simulation simulation <- experiment_parameters |> mutate(generated_data = future_pmap(.l = list(n_per_condition = n_per_condition, mean_control = mean_control, mean_intervention = mean_intervention, sd = sd), .f = generate_data, .progress = TRUE, .options = furrr_options(seed = TRUE))) |> mutate(results = future_pmap(.l = list(data = generated_data), .f = analyse, .progress = TRUE, .options = furrr_options(seed = TRUE))) |> unnest(results) # calculate summary results_summary <- simulation |> group_by(pop_mean_diff, n_per_condition) |> summarize(mean_difference_in_means = mean(mean_diff), mean_ci_low = mean(ci_low), mean_ci_high = mean(ci_high), proportion_significant_p = mean(significant_p), .groups = "drop") results_summary |> mutate_if(is.numeric, round, digits = 2) ``` ## Exercises For each of the exercises below, create a copy of the basis simulation above. Modify it appropriately to answer the exercise. ### Statistical power for a classic Null Hypothesis Significance Test (Two-Sided) When using a classic Null Hypothesis Significance Test (independent t-test testing a population difference in means (mu) of 0), what is the statistical power to detect a population mean difference of 1, 2, or 3 across different sample sizes (n per condition = 25, 50, 75, 100)? Notes: - We're only interested in statistical power here, so set the population difference in means to zero. Check you understand why. ```{r} # functions for simulation generate_data <- function(n_per_condition, mean_control, mean_intervention, sd) { data_control <- tibble(condition = "control", score = rnorm(n = n_per_condition, mean = mean_control, sd = sd)) data_intervention <- tibble(condition = "intervention", score = rnorm(n = n_per_condition, mean = mean_intervention, sd = sd)) data_combined <- bind_rows(data_control, data_intervention) |> mutate(condition = factor(condition, level = c("intervention", "control"))) return(data_combined) } analyse <- function(data, direction_h1 = "two.sided", alpha = 0.05, mean_difference_h0 = 0){ fit <- t.test( formula = score ~ condition, mu = mean_difference_h0, alternative = direction_h1, conf.level = 1-alpha, var.equal = TRUE, # students t test: assumes equal variances data = data ) results <- fit %>% model_parameters() %>% as_tibble() %>% janitor::clean_names() %>% # create estimand mutate(significant_p = p < .05, # statistically significant in the sense that p < .05 signficiant_ci = !dplyr::between(0, ci_low, ci_high)) |> # statistically significant in the sense that 95% CI excludes a mean difference of 0 # select columns of interest select(mean_diff = difference, ci_low, ci_high, signficiant_ci, p, significant_p) return(results) } # simulation parameters experiment_parameters <- expand_grid( n_per_condition = seq(from = 25, to = 100, by = 25), mean_control = 0, mean_intervention = c(1, 2, 3), sd = 1, direction_h1 = "two.sided", alpha = 0.05, mean_difference_h0 = 0, iteration = 1:1000 ) |> mutate(pop_mean_diff = mean_intervention - mean_control) # set seed set.seed(42) # run simulation simulation <- experiment_parameters |> mutate(generated_data = future_pmap(.l = list(n_per_condition = n_per_condition, mean_control = mean_control, mean_intervention = mean_intervention, sd = sd), .f = generate_data, .progress = TRUE, .options = furrr_options(seed = TRUE))) |> mutate(results = future_pmap(.l = list(data = generated_data), .f = analyse, .progress = TRUE, .options = furrr_options(seed = TRUE))) |> unnest(results) # calculate summary results_summary <- simulation |> group_by(pop_mean_diff, n_per_condition) |> summarize(mean_difference_in_means = mean(mean_diff), mean_ci_low = mean(ci_low), mean_ci_high = mean(ci_high), proportion_significant_p = mean(significant_p), .groups = "drop") results_summary |> mutate_if(is.numeric, round, digits = 2) ``` ### Statistical power for a Minimal-Effects Test (One-Sided) When using a Minimal-Effects Test (One-Sided) (independent t-test testing a population difference in means (mu) of 1), what is the statistical power to detect a population mean difference of 1, 2, or 3 across different sample sizes (n per condition = 25, 50, 75, 100)? What is the false positive rate across these sample sizes? ```{r} # functions for simulation generate_data <- function(n_per_condition, mean_control, mean_intervention, sd) { data_control <- tibble(condition = "control", score = rnorm(n = n_per_condition, mean = mean_control, sd = sd)) data_intervention <- tibble(condition = "intervention", score = rnorm(n = n_per_condition, mean = mean_intervention, sd = sd)) data_combined <- bind_rows(data_control, data_intervention) |> mutate(condition = factor(condition, level = c("intervention", "control"))) return(data_combined) } analyse <- function(data, direction_h1 = "two.sided", alpha = 0.05, mean_difference_h0 = 0){ fit <- t.test( formula = score ~ condition, mu = mean_difference_h0, alternative = direction_h1, conf.level = 1-alpha, var.equal = TRUE, # students t test: assumes equal variances data = data ) results <- fit %>% model_parameters() %>% as_tibble() %>% janitor::clean_names() %>% # create estimand mutate(significant_p = p < .05, # statistically significant in the sense that p < .05 signficiant_ci = !dplyr::between(0, ci_low, ci_high)) |> # statistically significant in the sense that 95% CI excludes a mean difference of 0 # select columns of interest select(mean_diff = difference, ci_low, ci_high, signficiant_ci, p, significant_p) return(results) } # simulation parameters experiment_parameters <- expand_grid( n_per_condition = seq(from = 25, to = 100, by = 25), mean_control = 0, mean_intervention = c(1, 2, 3), sd = 1, direction_h1 = "greater", alpha = 0.05, mean_difference_h0 = 1, iteration = 1:1000 ) |> mutate(pop_mean_diff = mean_intervention - mean_control) # set seed set.seed(42) # run simulation simulation <- experiment_parameters |> mutate(generated_data = future_pmap(.l = list(n_per_condition = n_per_condition, mean_control = mean_control, mean_intervention = mean_intervention, sd = sd), .f = generate_data, .progress = TRUE, .options = furrr_options(seed = TRUE))) |> mutate(results = future_pmap(.l = list(data = generated_data), .f = analyse, .progress = TRUE, .options = furrr_options(seed = TRUE))) |> unnest(results) # calculate summary results_summary <- simulation |> group_by(pop_mean_diff, n_per_condition) |> summarize(mean_difference_in_means = mean(mean_diff), mean_ci_low = mean(ci_low), mean_ci_high = mean(ci_high), proportion_significant_p = mean(significant_p), .groups = "drop") results_summary |> mutate_if(is.numeric, round, digits = 2) ``` ### Statistical power for equivalence test When using a Two One-Sided Test (TOST), with a Smallest Effect Size of Interest (SESOI) of difference-in-means = ±0.2, what is the statistical power across different sample sizes (n per condition = 25, 50, 75, 100)? What N is needed for 80% power to detect a true null effect size as equivalent to zero? Notes: - We're only interested in statistical power here, so set the population difference in means to zero. Why? - Because reasons (see Lakens et al., 2018), use the 90% Confidence Interval instead of 95% CI.