Setup

# Install packages and functions
ipak <- function(pkg) {
  new.pkg <- pkg[!(pkg %in% installed.packages()[,"Package"])]
  if (length(new.pkg)) {
    install.packages(new.pkg, dependencies = TRUE)
    cat("Not Installed:", paste(new.pkg, collapse = ", "), "\n")
  } else {
    cat("All Packages Installed :) \n")
  }
  invisible(sapply(pkg, require, character.only = TRUE))
}

packages <- c("tidyverse","here", "simDAG","brms","cmdstanr","dagitty","ggdag","ggdist","ggtext","scales")

bayesplot::color_scheme_set("blue")                                     # Set color scheme
options(mc.cores = parallel::detectCores())                             # Parallel Chains
cmdstanr::set_cmdstan_path(path = "C:/Program Files/R/cmdstan-2.36.0")  # R-Stan
ipak(packages)                                                          # Install packages

## All Packages Installed :)

DAG

Figure 1: Directed acyclic graph of the hypothesised causal relationships related to salary: G = gender; S = salary; R = role type; L = level; T = time at the organisation; and P = percentage of FTE employed. The relationship of interest is between the exposure (red) and outcome (blue). The latent variable (green) is unmeasured. Arrows indicate the direction of hypothesised causal relationships.

Sim DAG

# Simulate data from DAG
dag <- empty_dag() +
  
  node("u", type = "rnorm", mean = 0, sd = 1) +                              # unmeasured confounder
  node("years_since_start_date", type = "rnbinom", size = 10, prob = 0.5) +  # years since start date: a negative binomial distribution is used to ensure values are integers and positive
  node("percentage", type="rbeta", shape1 = 5, shape2 = 0.5) +               # percentage FTE employed: a beta distribution is used to ensure values are between 0 and 1
  
  node("gender", 
       type = "binomial", # A binomial distribution is used for gender (2-levels) for this minimal reproducible example.                                    
       formula = ~ u,
       betas = 0.5,       # Effect of unmeasured factor (positive = men advantaged)
       intercept = 0) +   # Men:woman ratio (0 is 50:50 - positive increases men in organisation)  
  
  node("level",
       type = "negative_binomial", # A negative binomial distribution is used for level to ensure values are integers and positive.
       formula = ~ gender + years_since_start_date + u,
       betas = c(0.005,  # Effect of level (i.e., if positive men are advantaged)
                 0.01,   # Effect of years since start date
                 0.01),  # Effect of unmeasured factor
       theta = 3,
       intercept = 0) +
  
    node("role_type", 
       type = "binomial", # Role type is either one of 2-levels - support or research                                     
       formula = ~ gender + level + u,
       betas = c(0.1,      # Effect of gender (i.e, if positive men are advantaged in support roles) # Interaction
                 0.1,      # Effect of level  (i.e, if positive then higher levels more likely to be in support roles)
                 0.5),     # Effect of unmeasured factor on level
       intercept = -0.9) + # research:support ratio (0 is 50:50 - negative indicates less people in support)

  
  node("salary", 
       type = "gaussian", # Salary is continuous and normally distributed (although in practice this will probably be skewed)
       formula = ~ gender + role_type + level + sqrt(years_since_start_date) + percentage + u, # Note the square root of years since start date is used to produce a non-linear relationship.
       betas = c(4000,2000,5000,3000,2000,5000), # Betas are listed in order: e.g., $4000 is the organisational pay gap; $2000 is the difference between support and research roles.
       intercept = 80000, 
       error = 2000)

# Set seed and simulate from DAG
set.seed(42)
sim_dat <-sim_from_dag(dag=dag, n_sim = 500)

# Add interaction between gender and role type
sim_dat <- sim_dat %>%
  mutate(salary = case_when(
    gender == "TRUE" & role_type == "TRUE"  ~ salary + 2000,
    TRUE ~ salary))

# Rename and tidy variables
sim_dat <- sim_dat %>%
  mutate(gender = ifelse(gender == "TRUE","man","woman")) %>%
  mutate(role_type = ifelse(role_type == "TRUE","support","research")) %>%
  mutate(percentage = round(percentage,2)) %>%
  mutate(level = level + 1)

# Reassign levels of gender
sim_dat$gender <- factor(sim_dat$gender, levels = c("woman", "man"))

Intervention 1

# sim_dat %>% group_by(gender) %>% summarise(median = median(salary))
# 101950.66 - 94763.87  = 7186.79

# sim_dat <- sim_dat %>%
#   mutate(salary = ifelse(gender == "woman",
#                          salary + 7187, salary))

Salary error measurement calculation

# Define salary bands
s <- c(50000, 60000, 70000, 80000, 90000, 100000, 120000, 140000, 160000, 200000)

# Create labels for salary band bins
labels <- c("below 50000", paste(head(s, -1), tail(s, -1), sep = "-"), "above 200000")

# Function to find min and max of salary band
find_bracket <- function(salary, breaks) {
  bin <- cut(salary, breaks = c(-Inf, breaks, Inf), labels = labels, right = FALSE)
  if (is.na(bin)) {
    return(c(NA, NA))
  } else {
    bounds <- strsplit(as.character(bin), "-")[[1]]
    return(as.numeric(bounds))
  }
}

Intervention 2

sim_dat <- sim_dat %>%
 rowwise() %>%
 mutate(
   min = find_bracket(salary, s)[1],
   max = find_bracket(salary, s)[2]) %>%
  
# Promote all women in research in level 1 to the next band 
  
# mutate(
#   min = ifelse(gender == "woman" & role_type == "research" & level == 1,
#                s[which(s > min)[1]],
#                min),
#   max = ifelse(gender == "woman" & role_type == "research" & level == 1, 
#                s[which(s > min)[1] + 1],
#                max)) %>%
  
 ungroup()

Simulated data

# Calculate salary estimate and standard deviation for error measurement model
sim_dat <- sim_dat %>%
  mutate(mid_est = (max + min) / 2) %>% # Calculate range mean
  mutate(sd_val  = (max - min) / 4) %>% # Calculate approximate standard deviation
  mutate(percentage_std = scale(percentage)) %>% # Scale percentage to aid model fitting
  mutate(s_lab   = paste0(gender, role_type)) %>% # Define labels for spline
  dplyr::select(c("u","gender","role_type","level","years_since_start_date","percentage_std","salary","min","max","mid_est","sd_val","s_lab")) # Select variables
  
head(sim_dat)

Plot simulated data

Error measurement model

# Model formula
mod_formula <- bf(
   mid_est | se(sd_val, sigma = TRUE) ~ 0 + Intercept + gender * role_type + mo(level) + percentage_std + # u +
    s(years_since_start_date, by = s_lab) # Spline as non-linear
  )

# Priors
priors <- c(
  prior(normal(100000, 50000), class = b, coef = Intercept),
  prior(normal(0, 10000), class = b)
)

mod <- brm(
  mod_formula,  
  family = gaussian(link = "identity"),
  prior = priors,
  data = sim_dat,  
  threads = threading(threads = NULL, grainsize = 1250, static = FALSE),
  backend = "cmdstanr", 
  silent = 0,
  refresh = 2000,
  chains = 4, 
  cores = 4, 
  warmup = 1000, 
  iter = 2000,
  file = paste0(here("Analysis/Output/Models/Sims/A4.mod.error.measurement.s42.n500"))
)

# The 0 + Intercept syntax is used since variables are not centered or scaled - this ensures priors are implemented correctly.
# u is the confounding variable: if included the estimates are not confounded since the model has information on how the confounding occurs

References

Bürkner, P.C., (2017). brms: An R Package for Bayesian Multilevel Models Using Stan. Journal of Statistical Software, 80(1), 1-28. https://econpapers.repec.org/article/jssjstsof/v_3a080_3ai01.htm

McElreath, R. (2020). Statistical rethinking: A Bayesian course with examples in R and Stan. Chapman and Hall/CRC.

R Core Team (2022). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. https://www.R-project.org/

Session Information

utils::sessionInfo()

## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Pacific/Auckland
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] scales_1.3.0    ggtext_0.1.2    ggdist_3.3.2    ggdag_0.2.13   
##  [5] dagitty_0.3-4   cmdstanr_0.8.0  brms_2.22.0     Rcpp_1.0.13    
##  [9] simDAG_0.2.0    here_1.0.1      lubridate_1.9.3 forcats_1.0.0  
## [13] stringr_1.5.1   dplyr_1.1.4     purrr_1.0.2     readr_2.1.5    
## [17] tidyr_1.3.1     tibble_3.2.1    ggplot2_3.5.1   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##  [1] gridExtra_2.3        sandwich_3.1-1       rlang_1.1.4         
##  [4] magrittr_2.0.3       multcomp_1.4-26      matrixStats_1.4.1   
##  [7] compiler_4.4.2       loo_2.8.0            systemfonts_1.1.0   
## [10] vctrs_0.6.5          pkgconfig_2.0.3      fastmap_1.2.0       
## [13] backports_1.5.0      labeling_0.4.3       ggraph_2.2.1        
## [16] utf8_1.2.4           rmarkdown_2.28       markdown_1.13       
## [19] tzdb_0.4.0           ps_1.8.0             ragg_1.3.3          
## [22] xfun_0.47            cachem_1.1.0         jsonlite_1.8.9      
## [25] highr_0.11           tweenr_2.0.3         parallel_4.4.2      
## [28] R6_2.5.1             bslib_0.8.0          stringi_1.8.4       
## [31] boot_1.3-31          jquerylib_0.1.4      estimability_1.5.1  
## [34] knitr_1.48           zoo_1.8-12           bayesplot_1.11.1    
## [37] Matrix_1.7-1         splines_4.4.2        igraph_2.0.3        
## [40] timechange_0.3.0     tidyselect_1.2.1     viridis_0.6.5       
## [43] rstudioapi_0.16.0    abind_1.4-8          yaml_2.3.10         
## [46] codetools_0.2-20     curl_5.2.3           processx_3.8.4      
## [49] lattice_0.22-6       withr_3.0.1          bridgesampling_1.1-2
## [52] posterior_1.6.0      coda_0.19-4.1        evaluate_1.0.0      
## [55] survival_3.7-0       RcppParallel_5.1.9   polyclip_1.10-7     
## [58] xml2_1.3.6           pillar_1.9.0         tensorA_0.36.2.1    
## [61] checkmate_2.3.2      distributional_0.5.0 generics_0.1.3      
## [64] rprojroot_2.0.4      hms_1.1.3            commonmark_1.9.1    
## [67] rstantools_2.4.0     munsell_0.5.1        xtable_1.8-4        
## [70] glue_1.8.0           emmeans_1.10.4       tools_4.4.2         
## [73] data.table_1.16.0    mvtnorm_1.3-2        graphlayouts_1.2.0  
## [76] tidygraph_1.3.1      grid_4.4.2           colorspace_2.1-1    
## [79] nlme_3.1-166         ggforce_0.4.2        cli_3.6.3           
## [82] textshaping_0.4.0    fansi_1.0.6          viridisLite_0.4.2   
## [85] Brobdingnag_1.2-9    V8_5.0.1             gtable_0.3.5        
## [88] sass_0.4.9           digest_0.6.37        ggrepel_0.9.6       
## [91] TH.data_1.1-2        farver_2.1.2         memoise_2.0.1       
## [94] htmltools_0.5.8.1    lifecycle_1.0.4      gridtext_0.1.5      
## [97] MASS_7.3-61

Hierarchical organisation simulation

Dr Tom Moore - Statistical Scientist

2025-01-31

DAG