# dependencies
library(tidyr)
library(dplyr)
library(furrr)
library(parameters)
library(stringr)
library(forcats)
library(ggplot2)
library(scales)
library(ggstance)
library(patchwork)
library(janitor)
library(effectsize)
library(ggstance)
library(knitr)
library(kableExtra)
# set up parallelization
plan(multisession)28 Exercises for ‘Estimation’ chapter
28.1 Basis simulation
Same as in the chapter.
# functions for simulation
generate_data <- function(n_per_condition,
mean_control,
mean_intervention,
sd) {
data_control <-
tibble(condition = "control",
score = rnorm(n = n_per_condition, mean = mean_control, sd = sd))
data_intervention <-
tibble(condition = "intervention",
score = rnorm(n = n_per_condition, mean = mean_intervention, sd = sd))
data_combined <-
bind_rows(data_control,
data_intervention) |>
mutate(condition = factor(condition, level = c("intervention", "control")))
return(data_combined)
}
analyse <- function(data){
fit <- t.test(
formula = score ~ condition,
var.equal = TRUE, # students t test: assumes equal variances
data = data
)
results <- fit %>%
model_parameters() %>%
as_tibble() %>%
janitor::clean_names() %>%
# create estimand
mutate(confidence_interval_width = ci_high - ci_low) |>
# select columns of interest
select(mean_diff = difference, # for parameter recovery
ci_low, # plotting
ci_high, # plotting
confidence_interval_width) # estimand
return(results)
}
# simulation parameters
experiment_parameters <- expand_grid(
n_per_condition = seq(from = 10, to = 100, by = 10),
mean_control = 4,
mean_intervention = 5,
sd = 1.5,
iteration = 1:1000
)
# set seed
set.seed(43)
# run simulation
simulation <- experiment_parameters |>
mutate(generated_data = future_pmap(.l = list(n_per_condition = n_per_condition,
mean_control = mean_control,
mean_intervention = mean_intervention,
sd = sd),
.f = generate_data,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
mutate(results = future_pmap(.l = list(data = generated_data),
.f = analyse,
.progress = TRUE,
.options = furrr_options(seed = TRUE)))
# summarise results
results_summary <- simulation |>
unnest(results) |>
group_by(n_per_condition) |>
summarize(mean_difference_in_means = mean(mean_diff),
mean_ci_low = mean(ci_low),
mean_ci_high = mean(ci_high),
mean_confidence_interval_width = mean(confidence_interval_width))
## table
## remember: only round results as you're about to print a table
results_summary |>
mutate_if(is.numeric, round, digits = 1)| n_per_condition | mean_difference_in_means | mean_ci_low | mean_ci_high | mean_confidence_interval_width |
|---|---|---|---|---|
| 10 | 1 | -0.4 | 2.4 | 2.8 |
| 20 | 1 | 0.0 | 1.9 | 1.9 |
| 30 | 1 | 0.2 | 1.8 | 1.5 |
| 40 | 1 | 0.3 | 1.7 | 1.3 |
| 50 | 1 | 0.4 | 1.6 | 1.2 |
| 60 | 1 | 0.5 | 1.5 | 1.1 |
| 70 | 1 | 0.5 | 1.5 | 1.0 |
| 80 | 1 | 0.5 | 1.5 | 0.9 |
| 90 | 1 | 0.6 | 1.4 | 0.9 |
| 100 | 1 | 0.6 | 1.4 | 0.8 |
## Difference-in-means
ggplot(results_summary, aes(n_per_condition*2, mean_difference_in_means)) +
geom_point() +
geom_linerange(aes(ymin = mean_ci_low, ymax = mean_ci_high)) +
geom_hline(yintercept = 0, linetype = "dotted") +
scale_x_continuous(breaks = breaks_pretty(n = 10),
name = "Total sample size per study") +
scale_y_continuous(breaks = breaks_pretty(n = 5),
#limits = c(0,1),
name = "Mean sample difference-in-means\n(and mean 95% CIs)") +
theme_linedraw() +
ggtitle("Population difference-in-means = 1")
## Difference-in-means' 95% Confidence Interval width
ggplot(results_summary, aes(n_per_condition*2, mean_confidence_interval_width)) +
geom_hline(yintercept = 0, linetype = "dotted") +
#geom_linerange(aes(ymin = min_ci_width, ymax = max_ci_width)) +
geom_point() +
#scale_y_continuous(name = "95% CI width\n(mean [min, max])", limits = c(0, NA)) +
scale_y_continuous(name = "Mean sample 95% CI width",
limits = c(0, NA)) +
scale_x_continuous(breaks = breaks_pretty(n = 9),
name = "Total sample size per study") +
theme_linedraw() +
scale_color_viridis_d(begin = 0.3, end = 0.7) +
ggtitle("Population Mean difference = 1")
28.2 Exercises
For each of the exercises below, create a copy of the simulation above. Modify it appropriately to answer the exercise.
28.2.1 For each sample size, what proportion of the 95% Confidence Intervals exclude 0?
Use between().
28.2.2 When the population difference in means is 1, what proportion of the p values are statistically significant for each sample size?
Also have the analysis extract a p-value for the difference in means.
Think about it: what statistical property does this represent?
28.2.3 What is the relationship between the confidence interval excluding 0 and the significance of the p value?
For each iteration, calculate if (95% CI!=0) == (p<.05)
28.2.4 When the population difference in means is 0, what proportion of the p values are statistically significant for each sample size?
Change the population effect difference-in-means to zero.
Think about it: what statistical property does this represent?