library(tidyr)
library(dplyr)
library(forcats)
library(stringr)
library(ggplot2)
library(scales)
library(patchwork)
library(tibble)
library(purrr)
library(furrr)
library(faux)
library(janitor)
#library(afex)
# set up parallelization
plan(multisession)16 p-hacking via different forms of selecting reporting ✎ Rough draft
16.1 Overview
p-hacking is increasing the false-positive rate through analytic choices. These choices are often a) flexible, in that many different options might be tried until significance is found (Gelman called this the “Garden of Forking Paths”), and b) undisclosed. However, it need not be either in a given instance: perhaps the very first analytic strategy tried produces the significant result (but if it hadn’t, other strategies would have been tried), or perhaps the author fully discloses the analytic choices but does not make clear to the reader how this undermines the severity of the test (i.e., it takes a very close reading to understand that the results present weak evidence, or the verbal conclusions oversell this).
p-hacking can occur because of the uniform distribution of p-values under the null hypothesis. Because all values are equally likely, you just have to keep rolling the dice to eventually find p < .05.
This lesson illustrates a few different examples of one broad form of p-hacking called selective reporting.
16.2 Dependencies
16.3 Selective reporting of studies
If I run two experiments, and only report the one that “works”, I will increase the false positive rate across studies. Perhaps it is because one really is better designed etc than the other. Or perhaps its just a false positive.
What would you guess the false positive rate is for two studies?
Let’s simulate it.
16.3.1 Run simulation
generate_data <- function(n_per_condition,
mean_control,
mean_intervention,
sd) {
data_control <-
tibble(condition = "control",
score = rnorm(n = n_per_condition, mean = mean_control, sd = sd))
data_intervention <-
tibble(condition = "intervention",
score = rnorm(n = n_per_condition, mean = mean_intervention, sd = sd))
data_combined <- bind_rows(data_control,
data_intervention)
return(data_combined)
}
# define data analysis function ----
analyze <- function(data) {
students_ttest <- t.test(formula = score ~ condition,
data = data,
var.equal = TRUE,
alternative = "two.sided")
res <- tibble(p = students_ttest$p.value)
return(res)
}
# define experiment parameters ----
experiment_parameters <- expand_grid(
n_per_condition = 100,
mean_control = 0,
mean_intervention = 0,
sd = 1,
iteration = 1:1000
)
# run simulation ----
set.seed(42)
simulation <-
# using the experiment parameters
experiment_parameters |>
# generate data using the data generating function and the parameters relevant to data generation
mutate(generated_data_study1 = future_pmap(.l = list(n_per_condition,
mean_control,
mean_intervention,
sd),
.f = generate_data,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
mutate(generated_data_study2 = future_pmap(.l = list(n_per_condition,
mean_control,
mean_intervention,
sd),
.f = generate_data,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
# apply the analysis function to the generated data using the parameters relevant to analysis
mutate(results_study1 = future_pmap(.l = list(data = generated_data_study1),
.f = analyze,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
mutate(results_study2 = future_pmap(.l = list(data = generated_data_study2),
.f = analyze,
.progress = TRUE,
.options = furrr_options(seed = TRUE)))
# summarise simulation results over the iterations ----
simulation_summary <- simulation |>
# unnest and rename
unnest(results_study1) |>
rename(p_study1 = p) |>
unnest(results_study2) |>
rename(p_study2 = p) |>
# simulate flexible reporting
mutate(p_hacked = ifelse(p_study1 < .05, p_study1, p_study2)) |>
# summarize
summarize(prop_sig_study1 = mean(p_study1 < .05),
prop_sig_study2 = mean(p_study2 < .05),
prop_sig_hacked = mean(p_hacked < .05),
.groups = "drop") |>
pivot_longer(cols = everything(),
names_to = "Source",
values_to = "proportion_significant") |>
mutate(Source = str_remove(Source, "prop_sig_"))16.3.2 Results
simulation_summary |>
mutate_if(is.numeric, round_half_up, digits = 2) | Source | proportion_significant |
|---|---|
| study1 | 0.04 |
| study2 | 0.05 |
| hacked | 0.09 |
16.3.3 Conclusions
The false positive rate for two studies is ~10%, because each study has a 5% false positive rate.
16.3.4 Exercise: extend the simulation
Extend the simulation so that four studies are run. What do you expect the false positive rate to be? What do you find?
16.3.5 Solution
generate_data <- function(n_per_condition,
mean_control,
mean_intervention,
sd) {
data_control <-
tibble(condition = "control",
score = rnorm(n = n_per_condition, mean = mean_control, sd = sd))
data_intervention <-
tibble(condition = "intervention",
score = rnorm(n = n_per_condition, mean = mean_intervention, sd = sd))
data_combined <- bind_rows(data_control,
data_intervention)
return(data_combined)
}
# define data analysis function ----
analyze <- function(data) {
students_ttest <- t.test(formula = score ~ condition,
data = data,
var.equal = TRUE,
alternative = "two.sided")
res <- tibble(p = students_ttest$p.value)
return(res)
}
# define experiment parameters ----
experiment_parameters <- expand_grid(
n_per_condition = 100,
mean_control = 0,
mean_intervention = 0,
sd = 1,
iteration = 1:1000
)
# run simulation ----
set.seed(42)
simulation <-
# using the experiment parameters
experiment_parameters |>
# generate data using the data generating function and the parameters relevant to data generation
mutate(generated_data_study1 = future_pmap(.l = list(n_per_condition = n_per_condition,
mean_control = mean_control,
mean_intervention = mean_intervention,
sd = sd),
.f = generate_data,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
mutate(generated_data_study2 = future_pmap(.l = list(n_per_condition = n_per_condition,
mean_control = mean_control,
mean_intervention = mean_intervention,
sd = sd),
.f = generate_data,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
mutate(generated_data_study3 = future_pmap(.l = list(n_per_condition = n_per_condition,
mean_control = mean_control,
mean_intervention = mean_intervention,
sd = sd),
.f = generate_data,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
mutate(generated_data_study4 = future_pmap(.l = list(n_per_condition = n_per_condition,
mean_control = mean_control,
mean_intervention = mean_intervention,
sd = sd),
.f = generate_data,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
# apply the analysis function to the generated data using the parameters relevant to analysis
mutate(results_study1 = future_pmap(.l = list(data = generated_data_study1),
analyze,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
mutate(results_study2 = future_pmap(.l = list(data = generated_data_study2),
analyze,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
mutate(results_study3 = future_pmap(.l = list(data = generated_data_study3),
analyze,
.progress = TRUE,
.options = furrr_options(seed = TRUE))) |>
mutate(results_study4 = future_pmap(.l = list(data = generated_data_study4),
analyze,
.progress = TRUE,
.options = furrr_options(seed = TRUE)))
# summarise simulation results over the iterations ----
simulation_summary <- simulation |>
# unnest and rename
unnest(results_study1) |>
rename(p_study1 = p) |>
unnest(results_study2) |>
rename(p_study2 = p) |>
unnest(results_study3) |>
rename(p_study3 = p) |>
unnest(results_study4) |>
rename(p_study4 = p) |>
# simulate flexible reporting
mutate(p_hacked = case_when(p_study1 < .05 ~ p_study1,
p_study2 < .05 ~ p_study2,
p_study3 < .05 ~ p_study3,
TRUE ~ p_study4)) |>
# summarize
summarize(prop_sig_study1 = mean(p_study1 < .05),
prop_sig_study2 = mean(p_study2 < .05),
prop_sig_study3 = mean(p_study3 < .05),
prop_sig_study4 = mean(p_study4 < .05),
prop_sig_hacked = mean(p_hacked < .05),
.groups = "drop") |>
pivot_longer(cols = everything(),
names_to = "Source",
values_to = "proportion_significant") |>
mutate(Source = str_remove(Source, "prop_sig_"))
simulation_summary |>
mutate_if(is.numeric, round_half_up, digits = 2)| Source | proportion_significant |
|---|---|
| study1 | 0.04 |
| study2 | 0.05 |
| study3 | 0.04 |
| study4 | 0.05 |
| hacked | 0.17 |
16.6 Check your learning
If authors reported the results of all outcome variables (and not just the ones that ‘worked’):
- Would this problem be improved?
- Would this problem be completely solved?
16.7 In-class exercises
Without writing any code, design and describe the necessary components of simulations that quantify the the effect of the following p-hacking strategies on the false positive rate (i.e., empirical detection rate when true difference in means is 0)
- Flexibility in whether the DV is log transformed or not.
- Flexibility in whether to exclude DV outliers or not (e.g., >±1SD from mean).
- Flexibility calculating a median split for the DV or not.
16.8 Readings
- Stefan, A. M. and Schönbrodt F. D. (2023) Big little lies: a compendium and simulation of p-hacking strategies. Royal Society Open Science. 10220346 http://doi.org/10.1098/rsos.220346




