# dependencies
library(tidyr)
library(dplyr)
library(furrr)
library(parameters)
library(stringr)
library(forcats)
library(ggplot2)
library(scales)
library(ggstance)
library(patchwork)
library(janitor)
library(effectsize)
library(ggstance)
library(knitr)
library(kableExtra)
# set up parallelization
plan(multisession)26 Exercises for ‘Hypothesis Testing’ chapter
26.1 Reading
Read Lakens et al. (2018) “Equivalence Testing for Psychological Research: A Tutorial”.
26.2 Basis simulation
# functions for simulation
generate_data <- function(n_per_condition,
mean_control,
mean_intervention,
sd) {
data_control <-
tibble(condition = "control",
score = rnorm(n = n_per_condition, mean = mean_control, sd = sd))
data_intervention <-
tibble(condition = "intervention",
score = rnorm(n = n_per_condition, mean = mean_intervention, sd = sd))
data_combined <-
bind_rows(data_control,
data_intervention) |>
mutate(condition = factor(condition, level = c("intervention", "control")))
return(data_combined)
}
analyse <- function(data, direction_h1 = "two.sided", alpha = 0.05, mean_difference_h0 = 0){
fit <- t.test(
formula = score ~ condition,
mu = mean_difference_h0,
alternative = direction_h1,
conf.level = 1-alpha,
var.equal = TRUE, # students t test: assumes equal variances
data = data
)
results <- fit %>%
model_parameters() %>%
as_tibble() %>%
janitor::clean_names() %>%
# create estimand
mutate(significant_p = p < .05, # statistically significant in the sense that p < .05
signficiant_ci = !dplyr::between(0, ci_low, ci_high)) |> # statistically significant in the sense that 95% CI excludes a mean difference of 0
# select columns of interest
select(mean_diff = difference,
ci_low,
ci_high,
signficiant_ci,
p,
significant_p)
return(results)
}
# simulation parameters
experiment_parameters <- expand_grid(
n_per_condition = seq(from = 25, to = 100, by = 25),
mean_control = 0,
mean_intervention = c(0, 0.5),
sd = 1,
direction_h1 = "two.sided",
alpha = 0.05,
mean_difference_h0 = 0,
iteration = 1:1000
) |>
mutate(pop_mean_diff = mean_intervention - mean_control)
# set seed
set.seed(42)
# run simulation
simulation <- experiment_parameters |>
mutate(generated_data = future_pmap(.l = list(n_per_condition = n_per_condition,
mean_control = mean_control,
mean_intervention = mean_intervention,
sd = sd),
.f = generate_data,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
mutate(results = future_pmap(.l = list(data = generated_data),
.f = analyse,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
unnest(results)
# calculate summary
results_summary <- simulation |>
group_by(pop_mean_diff,
n_per_condition) |>
summarize(mean_difference_in_means = mean(mean_diff),
mean_ci_low = mean(ci_low),
mean_ci_high = mean(ci_high),
proportion_significant_p = mean(significant_p),
.groups = "drop")
results_summary |>
mutate_if(is.numeric, round, digits = 2)| pop_mean_diff | n_per_condition | mean_difference_in_means | mean_ci_low | mean_ci_high | proportion_significant_p |
|---|---|---|---|---|---|
| 0.0 | 25 | -0.01 | -0.57 | 0.56 | 0.05 |
| 0.0 | 50 | 0.00 | -0.39 | 0.40 | 0.05 |
| 0.0 | 75 | 0.00 | -0.32 | 0.32 | 0.04 |
| 0.0 | 100 | 0.00 | -0.28 | 0.28 | 0.05 |
| 0.5 | 25 | 0.50 | -0.07 | 1.07 | 0.42 |
| 0.5 | 50 | 0.49 | 0.10 | 0.89 | 0.69 |
| 0.5 | 75 | 0.50 | 0.18 | 0.83 | 0.87 |
| 0.5 | 100 | 0.50 | 0.22 | 0.78 | 0.94 |
26.3 Exercises
For each of the exercises below, create a copy of the basis simulation above. Modify it appropriately to answer the exercise.
26.3.1 Statistical power for a classic Null Hypothesis Significance Test (Two-Sided)
When using a classic Null Hypothesis Significance Test (independent t-test testing a population difference in means (mu) of 0), what is the statistical power to detect a population mean difference of 1, 2, or 3 across different sample sizes (n per condition = 25, 50, 75, 100)?
Notes:
- We’re only interested in statistical power here, so set the population difference in means to zero. Check you understand why.
# functions for simulation
generate_data <- function(n_per_condition,
mean_control,
mean_intervention,
sd) {
data_control <-
tibble(condition = "control",
score = rnorm(n = n_per_condition, mean = mean_control, sd = sd))
data_intervention <-
tibble(condition = "intervention",
score = rnorm(n = n_per_condition, mean = mean_intervention, sd = sd))
data_combined <-
bind_rows(data_control,
data_intervention) |>
mutate(condition = factor(condition, level = c("intervention", "control")))
return(data_combined)
}
analyse <- function(data, direction_h1 = "two.sided", alpha = 0.05, mean_difference_h0 = 0){
fit <- t.test(
formula = score ~ condition,
mu = mean_difference_h0,
alternative = direction_h1,
conf.level = 1-alpha,
var.equal = TRUE, # students t test: assumes equal variances
data = data
)
results <- fit %>%
model_parameters() %>%
as_tibble() %>%
janitor::clean_names() %>%
# create estimand
mutate(significant_p = p < .05, # statistically significant in the sense that p < .05
signficiant_ci = !dplyr::between(0, ci_low, ci_high)) |> # statistically significant in the sense that 95% CI excludes a mean difference of 0
# select columns of interest
select(mean_diff = difference,
ci_low,
ci_high,
signficiant_ci,
p,
significant_p)
return(results)
}
# simulation parameters
experiment_parameters <- expand_grid(
n_per_condition = seq(from = 25, to = 100, by = 25),
mean_control = 0,
mean_intervention = c(1, 2, 3),
sd = 1,
direction_h1 = "two.sided",
alpha = 0.05,
mean_difference_h0 = 0,
iteration = 1:1000
) |>
mutate(pop_mean_diff = mean_intervention - mean_control)
# set seed
set.seed(42)
# run simulation
simulation <- experiment_parameters |>
mutate(generated_data = future_pmap(.l = list(n_per_condition = n_per_condition,
mean_control = mean_control,
mean_intervention = mean_intervention,
sd = sd),
.f = generate_data,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
mutate(results = future_pmap(.l = list(data = generated_data),
.f = analyse,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
unnest(results)
# calculate summary
results_summary <- simulation |>
group_by(pop_mean_diff,
n_per_condition) |>
summarize(mean_difference_in_means = mean(mean_diff),
mean_ci_low = mean(ci_low),
mean_ci_high = mean(ci_high),
proportion_significant_p = mean(significant_p),
.groups = "drop")
results_summary |>
mutate_if(is.numeric, round, digits = 2)| pop_mean_diff | n_per_condition | mean_difference_in_means | mean_ci_low | mean_ci_high | proportion_significant_p |
|---|---|---|---|---|---|
| 1 | 25 | 0.99 | 0.43 | 1.56 | 0.94 |
| 1 | 50 | 0.99 | 0.60 | 1.39 | 1.00 |
| 1 | 75 | 1.00 | 0.67 | 1.32 | 1.00 |
| 1 | 100 | 1.00 | 0.72 | 1.28 | 1.00 |
| 2 | 25 | 2.00 | 1.43 | 2.57 | 1.00 |
| 2 | 50 | 2.01 | 1.61 | 2.40 | 1.00 |
| 2 | 75 | 2.00 | 1.68 | 2.32 | 1.00 |
| 2 | 100 | 2.00 | 1.72 | 2.28 | 1.00 |
| 3 | 25 | 2.99 | 2.42 | 3.56 | 1.00 |
| 3 | 50 | 3.01 | 2.61 | 3.41 | 1.00 |
| 3 | 75 | 3.01 | 2.68 | 3.33 | 1.00 |
| 3 | 100 | 3.00 | 2.72 | 3.28 | 1.00 |
26.3.2 Statistical power for a Minimal-Effects Test (One-Sided)
When using a Minimal-Effects Test (One-Sided) (independent t-test testing a population difference in means (mu) of 1), what is the statistical power to detect a population mean difference of 1, 2, or 3 across different sample sizes (n per condition = 25, 50, 75, 100)? What is the false positive rate across these sample sizes?
# functions for simulation
generate_data <- function(n_per_condition,
mean_control,
mean_intervention,
sd) {
data_control <-
tibble(condition = "control",
score = rnorm(n = n_per_condition, mean = mean_control, sd = sd))
data_intervention <-
tibble(condition = "intervention",
score = rnorm(n = n_per_condition, mean = mean_intervention, sd = sd))
data_combined <-
bind_rows(data_control,
data_intervention) |>
mutate(condition = factor(condition, level = c("intervention", "control")))
return(data_combined)
}
analyse <- function(data, direction_h1 = "two.sided", alpha = 0.05, mean_difference_h0 = 0){
fit <- t.test(
formula = score ~ condition,
mu = mean_difference_h0,
alternative = direction_h1,
conf.level = 1-alpha,
var.equal = TRUE, # students t test: assumes equal variances
data = data
)
results <- fit %>%
model_parameters() %>%
as_tibble() %>%
janitor::clean_names() %>%
# create estimand
mutate(significant_p = p < .05, # statistically significant in the sense that p < .05
signficiant_ci = !dplyr::between(0, ci_low, ci_high)) |> # statistically significant in the sense that 95% CI excludes a mean difference of 0
# select columns of interest
select(mean_diff = difference,
ci_low,
ci_high,
signficiant_ci,
p,
significant_p)
return(results)
}
# simulation parameters
experiment_parameters <- expand_grid(
n_per_condition = seq(from = 25, to = 100, by = 25),
mean_control = 0,
mean_intervention = c(1, 2, 3),
sd = 1,
direction_h1 = "greater",
alpha = 0.05,
mean_difference_h0 = 1,
iteration = 1:1000
) |>
mutate(pop_mean_diff = mean_intervention - mean_control)
# set seed
set.seed(42)
# run simulation
simulation <- experiment_parameters |>
mutate(generated_data = future_pmap(.l = list(n_per_condition = n_per_condition,
mean_control = mean_control,
mean_intervention = mean_intervention,
sd = sd),
.f = generate_data,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
mutate(results = future_pmap(.l = list(data = generated_data),
.f = analyse,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
unnest(results)
# calculate summary
results_summary <- simulation |>
group_by(pop_mean_diff,
n_per_condition) |>
summarize(mean_difference_in_means = mean(mean_diff),
mean_ci_low = mean(ci_low),
mean_ci_high = mean(ci_high),
proportion_significant_p = mean(significant_p),
.groups = "drop")
results_summary |>
mutate_if(is.numeric, round, digits = 2)| pop_mean_diff | n_per_condition | mean_difference_in_means | mean_ci_low | mean_ci_high | proportion_significant_p |
|---|---|---|---|---|---|
| 1 | 25 | 0.99 | 0.43 | 1.56 | 0.94 |
| 1 | 50 | 0.99 | 0.60 | 1.39 | 1.00 |
| 1 | 75 | 1.00 | 0.67 | 1.32 | 1.00 |
| 1 | 100 | 1.00 | 0.72 | 1.28 | 1.00 |
| 2 | 25 | 2.00 | 1.43 | 2.57 | 1.00 |
| 2 | 50 | 2.01 | 1.61 | 2.40 | 1.00 |
| 2 | 75 | 2.00 | 1.68 | 2.32 | 1.00 |
| 2 | 100 | 2.00 | 1.72 | 2.28 | 1.00 |
| 3 | 25 | 2.99 | 2.42 | 3.56 | 1.00 |
| 3 | 50 | 3.01 | 2.61 | 3.41 | 1.00 |
| 3 | 75 | 3.01 | 2.68 | 3.33 | 1.00 |
| 3 | 100 | 3.00 | 2.72 | 3.28 | 1.00 |
26.3.3 Statistical power for equivalence test
When using a Two One-Sided Test (TOST), with a Smallest Effect Size of Interest (SESOI) of difference-in-means = ±0.2, what is the statistical power across different sample sizes (n per condition = 25, 50, 75, 100)? What N is needed for 80% power to detect a true null effect size as equivalent to zero?
Notes:
- We’re only interested in statistical power here, so set the population difference in means to zero. Why?
- Because reasons (see Lakens et al., 2018), use the 90% Confidence Interval instead of 95% CI.