Setup

# Install packages and functions
ipak <- function(pkg) {
  new.pkg <- pkg[!(pkg %in% installed.packages()[,"Package"])]
  if (length(new.pkg)) {
    install.packages(new.pkg, dependencies = TRUE)
    cat("Not Installed:", paste(new.pkg, collapse = ", "), "\n")
  } else {
    cat("All Packages Installed :) \n")
  }
  invisible(sapply(pkg, require, character.only = TRUE))
}

packages <- c("tidyverse","here", "simDAG","brms","cmdstanr","dagitty","ggdag","ggdist","ggtext","scales")

bayesplot::color_scheme_set("blue")                                     # Set color scheme
options(mc.cores = parallel::detectCores())                             # Parallel Chains
cmdstanr::set_cmdstan_path(path = "C:/Program Files/R/cmdstan-2.36.0")  # R-Stan
ipak(packages)                                                          # Install packages

## All Packages Installed :)

Introduction

The aim of this analysis is to report the gender pay gap across an entire workforce and identify areas where this gap is most pronounced. To estimate the gender pay gap other variables known to influence salary (i.e., role type (support/research), time since working at an organisation, level in the organisation, and percentage of FTE employed) were measured and accounted for statistically using a causal inference framework.

Methods:

For modeling, salary was treated as the mean of the corresponding range (i.e., (range maximum + range minimum) / 2) with an approximated standard deviation to represent measurement error (i.e., range maximum - range minimum / 4).

Measurement error models (one for each year) predicted salary with measurement error based on gender, role type, and level (monotonic predictor) (interactions specified), as well as percentage employed and days since employment start date (fitted as splines). The random effect was position nested within team nested within site location, with random intercepts for gender and role type (correlated). A skew normal distribution was selected to account for positively skewed salary data. Weakly informative priors were used to aid model convergence for the intercept and slope parameters (see models). This model was performed in R (R Core Team 2022) using the brms package (Bürkner, 2017) with ideas inspired from (McElreath, 2020).

Analysis

Measurement error model

Models

# Read in all simulations
mods <- lapply(list.files(
  path = here("Analysis/Output/Models/Sims"), 
  pattern = "\\.rds$", 
  full.names = TRUE),
  readRDS)

# Select model by index
mod <- mods[[4]] # Select A4 model [[4]]

Validation

## Loading required package: rstan

## Loading required package: StanHeaders

## 
## rstan version 2.32.6 (Stan version 2.32.2)

## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
## For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
## change `threads_per_chain` option:
## rstan_options(threads_per_chain = 1)

## 
## Attaching package: 'rstan'

## The following object is masked from 'package:tidyr':
## 
##     extract

## Using 10 posterior draws for ppc type 'dens_overlay' by default.

Output

summary(mod)   # Summary of model

## Warning: There were 33 divergent transitions after warmup. Increasing
## adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup

##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: mid_est | se(sd_val, sigma = TRUE) ~ 0 + Intercept + gender * role_type + mo(level) + percentage_std + u + s(years_since_start_date, by = s_lab) 
##    Data: sim_dat (Number of observations: 500) 
##   Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 4000
## 
## Smoothing Spline Hyperparameters:
##                                                  Estimate Est.Error l-95% CI
## sds(syears_since_start_dates_labmanresearch_1)    6449.08   5664.08   192.15
## sds(syears_since_start_dates_labmansupport_1)     9702.96   5679.28  1861.18
## sds(syears_since_start_dates_labwomanresearch_1)  3782.41   3728.08    99.10
## sds(syears_since_start_dates_labwomansupport_1)   6557.43   4776.54   402.62
##                                                  u-95% CI Rhat Bulk_ESS
## sds(syears_since_start_dates_labmanresearch_1)   21780.59 1.00     1022
## sds(syears_since_start_dates_labmansupport_1)    23492.84 1.00     1471
## sds(syears_since_start_dates_labwomanresearch_1) 13808.24 1.00     1150
## sds(syears_since_start_dates_labwomansupport_1)  18434.30 1.00     1524
##                                                  Tail_ESS
## sds(syears_since_start_dates_labmanresearch_1)       1543
## sds(syears_since_start_dates_labmansupport_1)        1808
## sds(syears_since_start_dates_labwomanresearch_1)     1843
## sds(syears_since_start_dates_labwomansupport_1)      1588
## 
## Regression Coefficients:
##                                              Estimate Est.Error  l-95% CI
## Intercept                                    90340.19    366.93  89634.70
## genderman                                     4873.54    444.27   4021.13
## role_typesupport                              2718.73    557.98   1655.81
## percentage_std                                 316.18    186.42    -53.50
## u                                             5067.13    207.97   4669.00
## genderman:role_typesupport                    1539.62    871.01   -151.98
## syears_since_start_date:s_labmanresearch_1   10151.77   8497.00  -8309.33
## syears_since_start_date:s_labmansupport_1     4981.68   8894.76 -13124.45
## syears_since_start_date:s_labwomanresearch_1 10921.01   7147.72  -6645.83
## syears_since_start_date:s_labwomansupport_1   8068.23   9665.53 -12220.00
## molevel                                       5012.11    507.06   4083.03
##                                              u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept                                    91071.44 1.00     3323     2511
## genderman                                     5719.10 1.00     3696     3101
## role_typesupport                              3792.08 1.00     3234     2875
## percentage_std                                 680.83 1.00     4771     1949
## u                                             5469.53 1.00     5635     2627
## genderman:role_typesupport                    3289.48 1.00     3493     2622
## syears_since_start_date:s_labmanresearch_1   23455.83 1.00     1938     2305
## syears_since_start_date:s_labmansupport_1    21688.73 1.00     3776     2680
## syears_since_start_date:s_labwomanresearch_1 21948.11 1.00     1752     1422
## syears_since_start_date:s_labwomansupport_1  24298.50 1.00     2103     2086
## molevel                                       6061.14 1.00     2088     2294
## 
## Monotonic Simplex Parameters:
##             Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## molevel1[1]     0.19      0.02     0.15     0.24 1.00     2460     2661
## molevel1[2]     0.19      0.03     0.14     0.26 1.00     3166     2788
## molevel1[3]     0.12      0.04     0.05     0.20 1.00     3772     2323
## molevel1[4]     0.17      0.05     0.08     0.28 1.00     4765     2647
## molevel1[5]     0.12      0.08     0.01     0.30 1.00     5098     1806
## molevel1[6]     0.19      0.10     0.02     0.39 1.00     3093     2500
## 
## Further Distributional Parameters:
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma  2557.13    220.48  2132.07  2995.81 1.00     5082     2644
## 
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Plots

Figure 2: Conditional effect of gender and role type on salary. Esimates are displayed as intervals (mean (black dot) with 95 % credible interval (range)) and labeled as text. Verticle lines display average estimates: women in research (red); women in support (dark red); men in research (blue); men in support (purple).

Figure 3: Conditional effect of gender and role type across levels: women in research (red); women in support (dark red); men in research (blue); men in support (purple). Level 1 employees are 4 steps removed from the CEO and level 5 employees are those who only have the CEO as a line manager.

Figure 4: Conditional effect of gender and role type on salary over time. Esimates are displayed as intervals (mean (dot) with 95 % credible interval (range)): women in research (red); women in support (dark red); men in research (blue); men in support (purple).

Figure 5: Conditional effect of percent FTE on salary over time. Esimates are displayed as intervals (mean (dot) with 95 % credible interval (range)): women in research (red); and women in support (dark red).

Hypotheses

# Overall gender pay gap $
a <- hypothesis(mod, 'genderman > 0') 
# How much more do men in support earn than woman in support?
b <- hypothesis(mod, '(Intercept + role_typesupport + genderman + genderman:role_typesupport) - (Intercept + role_typesupport) > 0') 
# How much more do men in research earn than women in research?
c <- hypothesis(mod, '(Intercept + genderman) - (Intercept) > 0') 
# What is the difference between men in support and men in research?
d <- hypothesis(mod, '(Intercept + role_typesupport + genderman +  genderman:role_typesupport) - (Intercept + genderman) > 0') 
 # What is the difference between women in support and women in research?
e <- hypothesis(mod, '(Intercept + role_typesupport) - (Intercept) > 0')
# What is the difference between men in support and women in research
f <- hypothesis(mod, '(Intercept + role_typesupport + genderman + genderman:role_typesupport) - (Intercept) > 0')

Outputs

a$hypothesis$Estimate

## [1] 4873.539

b$hypothesis$Estimate

## [1] 6413.159

c$hypothesis$Estimate

## [1] 4873.539

d$hypothesis$Estimate

## [1] 4258.353

e$hypothesis$Estimate

## [1] 2718.733

f$hypothesis$Estimate

## [1] 9131.893

References

Blevins, C., & Mullen, L. (2015). Jane, John… Leslie? A Historical Method for Algorithmic Gender Prediction. DHQ: Digital Humanities Quarterly, 9(3). http://www.digitalhumanities.org/dhq/vol/9/3/000223/000223.html

Bürkner, P.C., (2017). brms: An R Package for Bayesian Multilevel Models Using Stan. Journal of Statistical Software, 80(1), 1-28. https://econpapers.repec.org/article/jssjstsof/v_3a080_3ai01.htm

McElreath, R. (2020). Statistical rethinking: A Bayesian course with examples in R and Stan. Chapman and Hall/CRC.

R Core Team (2022). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. https://www.R-project.org/

Session Information

utils::sessionInfo()

## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Pacific/Auckland
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] rstan_2.32.6        StanHeaders_2.32.10 scales_1.3.0       
##  [4] ggtext_0.1.2        ggdist_3.3.2        ggdag_0.2.13       
##  [7] dagitty_0.3-4       cmdstanr_0.8.0      brms_2.22.0        
## [10] Rcpp_1.0.13         simDAG_0.2.0        here_1.0.1         
## [13] lubridate_1.9.3     forcats_1.0.0       stringr_1.5.1      
## [16] dplyr_1.1.4         purrr_1.0.2         readr_2.1.5        
## [19] tidyr_1.3.1         tibble_3.2.1        ggplot2_3.5.1      
## [22] tidyverse_2.0.0    
## 
## loaded via a namespace (and not attached):
##  [1] gridExtra_2.3        inline_0.3.19        sandwich_3.1-1      
##  [4] rlang_1.1.4          magrittr_2.0.3       multcomp_1.4-26     
##  [7] matrixStats_1.4.1    compiler_4.4.2       mgcv_1.9-1          
## [10] loo_2.8.0            systemfonts_1.1.0    reshape2_1.4.4      
## [13] vctrs_0.6.5          pkgconfig_2.0.3      fastmap_1.2.0       
## [16] backports_1.5.0      labeling_0.4.3       utf8_1.2.4          
## [19] rmarkdown_2.28       tzdb_0.4.0           ps_1.8.0            
## [22] ragg_1.3.3           xfun_0.47            cachem_1.1.0        
## [25] jsonlite_1.8.9       highr_0.11           parallel_4.4.2      
## [28] R6_2.5.1             bslib_0.8.0          stringi_1.8.4       
## [31] boot_1.3-31          jquerylib_0.1.4      estimability_1.5.1  
## [34] knitr_1.48           zoo_1.8-12           bayesplot_1.11.1    
## [37] Matrix_1.7-1         splines_4.4.2        igraph_2.0.3        
## [40] timechange_0.3.0     tidyselect_1.2.1     rstudioapi_0.16.0   
## [43] abind_1.4-8          yaml_2.3.10          codetools_0.2-20    
## [46] curl_5.2.3           processx_3.8.4       pkgbuild_1.4.4      
## [49] plyr_1.8.9           lattice_0.22-6       withr_3.0.1         
## [52] bridgesampling_1.1-2 posterior_1.6.0      coda_0.19-4.1       
## [55] evaluate_1.0.0       survival_3.7-0       RcppParallel_5.1.9  
## [58] xml2_1.3.6           pillar_1.9.0         tensorA_0.36.2.1    
## [61] checkmate_2.3.2      stats4_4.4.2         distributional_0.5.0
## [64] generics_0.1.3       rprojroot_2.0.4      hms_1.1.3           
## [67] rstantools_2.4.0     munsell_0.5.1        xtable_1.8-4        
## [70] glue_1.8.0           emmeans_1.10.4       tools_4.4.2         
## [73] data.table_1.16.0    mvtnorm_1.3-2        cowplot_1.1.3       
## [76] tidygraph_1.3.1      grid_4.4.2           QuickJSR_1.3.1      
## [79] colorspace_2.1-1     nlme_3.1-166         cli_3.6.3           
## [82] textshaping_0.4.0    fansi_1.0.6          Brobdingnag_1.2-9   
## [85] V8_5.0.1             gtable_0.3.5         sass_0.4.9          
## [88] digest_0.6.37        TH.data_1.1-2        farver_2.1.2        
## [91] htmltools_0.5.8.1    lifecycle_1.0.4      gridtext_0.1.5      
## [94] MASS_7.3-61

Simulated report - GPG

Dr Tom Moore - Statistical Scientist

2025-01-31