# Install packages and functions
ipak <- function(pkg) {
new.pkg <- pkg[!(pkg %in% installed.packages()[,"Package"])]
if (length(new.pkg)) {
install.packages(new.pkg, dependencies = TRUE)
cat("Not Installed:", paste(new.pkg, collapse = ", "), "\n")
} else {
cat("All Packages Installed :) \n")
}
invisible(sapply(pkg, require, character.only = TRUE))
}
packages <- c("tidyverse","here", "simDAG","brms","cmdstanr","dagitty","ggdag","ggdist","ggtext","scales")
bayesplot::color_scheme_set("blue") # Set color scheme
options(mc.cores = parallel::detectCores()) # Parallel Chains
cmdstanr::set_cmdstan_path(path = "C:/Program Files/R/cmdstan-2.36.0") # R-Stan
ipak(packages) # Install packages
## All Packages Installed :)
Figure 1: Directed acyclic graph of the hypothesised causal
relationships related to salary: G = gender; S = salary; R = role type;
L = level; T = time at the organisation; and P = percentage of FTE
employed. The relationship of interest is between the exposure (red) and
outcome (blue). The latent variable (green) is unmeasured. Arrows
indicate the direction of hypothesised causal relationships.
# Simulate data from DAG
dag <- empty_dag() +
node("u", type = "rnorm", mean = 0, sd = 1) + # unmeasured confounder
node("years_since_start_date", type = "rnbinom", size = 10, prob = 0.5) + # years since start date: a negative binomial distribution is used to ensure values are integers and positive
node("percentage", type="rbeta", shape1 = 5, shape2 = 0.5) + # percentage FTE employed: a beta distribution is used to ensure values are between 0 and 1
node("gender",
type = "binomial", # A binomial distribution is used for gender (2-levels) for this minimal reproducible example.
formula = ~ u,
betas = 0.5, # Effect of unmeasured factor (positive = men advantaged)
intercept = 0) + # Men:woman ratio (0 is 50:50 - positive increases men in organisation)
node("level",
type = "negative_binomial", # A negative binomial distribution is used for level to ensure values are integers and positive.
formula = ~ gender + years_since_start_date + u,
betas = c(0.005, # Effect of level (i.e., if positive men are advantaged)
0.01, # Effect of years since start date
0.01), # Effect of unmeasured factor
theta = 3,
intercept = 0) +
node("role_type",
type = "binomial", # Role type is either one of 2-levels - support or research
formula = ~ gender + level + u,
betas = c(0.1, # Effect of gender (i.e, if positive men are advantaged in support roles) # Interaction
0.1, # Effect of level (i.e, if positive then higher levels more likely to be in support roles)
0.5), # Effect of unmeasured factor on level
intercept = -0.9) + # research:support ratio (0 is 50:50 - negative indicates less people in support)
node("salary",
type = "gaussian", # Salary is continuous and normally distributed (although in practice this will probably be skewed)
formula = ~ gender + role_type + level + sqrt(years_since_start_date) + percentage + u, # Note the square root of years since start date is used to produce a non-linear relationship.
betas = c(4000,2000,5000,3000,2000,5000), # Betas are listed in order: e.g., $4000 is the organisational pay gap; $2000 is the difference between support and research roles.
intercept = 80000,
error = 2000)
# Set seed and simulate from DAG
set.seed(42)
sim_dat <-sim_from_dag(dag=dag, n_sim = 500)
# Add interaction between gender and role type
sim_dat <- sim_dat %>%
mutate(salary = case_when(
gender == "TRUE" & role_type == "TRUE" ~ salary + 2000,
TRUE ~ salary))
# Rename and tidy variables
sim_dat <- sim_dat %>%
mutate(gender = ifelse(gender == "TRUE","man","woman")) %>%
mutate(role_type = ifelse(role_type == "TRUE","support","research")) %>%
mutate(percentage = round(percentage,2)) %>%
mutate(level = level + 1)
# Reassign levels of gender
sim_dat$gender <- factor(sim_dat$gender, levels = c("woman", "man"))
# Define salary bands
s <- c(50000, 60000, 70000, 80000, 90000, 100000, 120000, 140000, 160000, 200000)
# Create labels for salary band bins
labels <- c("below 50000", paste(head(s, -1), tail(s, -1), sep = "-"), "above 200000")
# Function to find min and max of salary band
find_bracket <- function(salary, breaks) {
bin <- cut(salary, breaks = c(-Inf, breaks, Inf), labels = labels, right = FALSE)
if (is.na(bin)) {
return(c(NA, NA))
} else {
bounds <- strsplit(as.character(bin), "-")[[1]]
return(as.numeric(bounds))
}
}
sim_dat <- sim_dat %>%
rowwise() %>%
mutate(
min = find_bracket(salary, s)[1],
max = find_bracket(salary, s)[2]) %>%
# Promote all women in research in level 1 to the next band
# mutate(
# min = ifelse(gender == "woman" & role_type == "research" & level == 1,
# s[which(s > min)[1]],
# min),
# max = ifelse(gender == "woman" & role_type == "research" & level == 1,
# s[which(s > min)[1] + 1],
# max)) %>%
ungroup()
# Calculate salary estimate and standard deviation for error measurement model
sim_dat <- sim_dat %>%
mutate(mid_est = (max + min) / 2) %>% # Calculate range mean
mutate(sd_val = (max - min) / 4) %>% # Calculate approximate standard deviation
mutate(percentage_std = scale(percentage)) %>% # Scale percentage to aid model fitting
mutate(s_lab = paste0(gender, role_type)) %>% # Define labels for spline
dplyr::select(c("u","gender","role_type","level","years_since_start_date","percentage_std","salary","min","max","mid_est","sd_val","s_lab")) # Select variables
head(sim_dat)
# Model formula
mod_formula <- bf(
mid_est | se(sd_val, sigma = TRUE) ~ 0 + Intercept + gender * role_type + mo(level) + percentage_std + # u +
s(years_since_start_date, by = s_lab) # Spline as non-linear
)
# Priors
priors <- c(
prior(normal(100000, 50000), class = b, coef = Intercept),
prior(normal(0, 10000), class = b)
)
mod <- brm(
mod_formula,
family = gaussian(link = "identity"),
prior = priors,
data = sim_dat,
threads = threading(threads = NULL, grainsize = 1250, static = FALSE),
backend = "cmdstanr",
silent = 0,
refresh = 2000,
chains = 4,
cores = 4,
warmup = 1000,
iter = 2000,
file = paste0(here("Analysis/Output/Models/Sims/A4.mod.error.measurement.s42.n500"))
)
# The 0 + Intercept syntax is used since variables are not centered or scaled - this ensures priors are implemented correctly.
# u is the confounding variable: if included the estimates are not confounded since the model has information on how the confounding occurs
Bürkner, P.C., (2017). brms: An R Package for Bayesian Multilevel Models Using Stan. Journal of Statistical Software, 80(1), 1-28. https://econpapers.repec.org/article/jssjstsof/v_3a080_3ai01.htm
McElreath, R. (2020). Statistical rethinking: A Bayesian course with examples in R and Stan. Chapman and Hall/CRC.
R Core Team (2022). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. https://www.R-project.org/
## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Pacific/Auckland
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] scales_1.3.0 ggtext_0.1.2 ggdist_3.3.2 ggdag_0.2.13
## [5] dagitty_0.3-4 cmdstanr_0.8.0 brms_2.22.0 Rcpp_1.0.13
## [9] simDAG_0.2.0 here_1.0.1 lubridate_1.9.3 forcats_1.0.0
## [13] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
## [17] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gridExtra_2.3 sandwich_3.1-1 rlang_1.1.4
## [4] magrittr_2.0.3 multcomp_1.4-26 matrixStats_1.4.1
## [7] compiler_4.4.2 loo_2.8.0 systemfonts_1.1.0
## [10] vctrs_0.6.5 pkgconfig_2.0.3 fastmap_1.2.0
## [13] backports_1.5.0 labeling_0.4.3 ggraph_2.2.1
## [16] utf8_1.2.4 rmarkdown_2.28 markdown_1.13
## [19] tzdb_0.4.0 ps_1.8.0 ragg_1.3.3
## [22] xfun_0.47 cachem_1.1.0 jsonlite_1.8.9
## [25] highr_0.11 tweenr_2.0.3 parallel_4.4.2
## [28] R6_2.5.1 bslib_0.8.0 stringi_1.8.4
## [31] boot_1.3-31 jquerylib_0.1.4 estimability_1.5.1
## [34] knitr_1.48 zoo_1.8-12 bayesplot_1.11.1
## [37] Matrix_1.7-1 splines_4.4.2 igraph_2.0.3
## [40] timechange_0.3.0 tidyselect_1.2.1 viridis_0.6.5
## [43] rstudioapi_0.16.0 abind_1.4-8 yaml_2.3.10
## [46] codetools_0.2-20 curl_5.2.3 processx_3.8.4
## [49] lattice_0.22-6 withr_3.0.1 bridgesampling_1.1-2
## [52] posterior_1.6.0 coda_0.19-4.1 evaluate_1.0.0
## [55] survival_3.7-0 RcppParallel_5.1.9 polyclip_1.10-7
## [58] xml2_1.3.6 pillar_1.9.0 tensorA_0.36.2.1
## [61] checkmate_2.3.2 distributional_0.5.0 generics_0.1.3
## [64] rprojroot_2.0.4 hms_1.1.3 commonmark_1.9.1
## [67] rstantools_2.4.0 munsell_0.5.1 xtable_1.8-4
## [70] glue_1.8.0 emmeans_1.10.4 tools_4.4.2
## [73] data.table_1.16.0 mvtnorm_1.3-2 graphlayouts_1.2.0
## [76] tidygraph_1.3.1 grid_4.4.2 colorspace_2.1-1
## [79] nlme_3.1-166 ggforce_0.4.2 cli_3.6.3
## [82] textshaping_0.4.0 fansi_1.0.6 viridisLite_0.4.2
## [85] Brobdingnag_1.2-9 V8_5.0.1 gtable_0.3.5
## [88] sass_0.4.9 digest_0.6.37 ggrepel_0.9.6
## [91] TH.data_1.1-2 farver_2.1.2 memoise_2.0.1
## [94] htmltools_0.5.8.1 lifecycle_1.0.4 gridtext_0.1.5
## [97] MASS_7.3-61