# Install packages and functions
ipak <- function(pkg) {
new.pkg <- pkg[!(pkg %in% installed.packages()[,"Package"])]
if (length(new.pkg)) {
install.packages(new.pkg, dependencies = TRUE)
cat("Not Installed:", paste(new.pkg, collapse = ", "), "\n")
} else {
cat("All Packages Installed :) \n")
}
invisible(sapply(pkg, require, character.only = TRUE))
}
packages <- c("tidyverse","here", "simDAG","brms","cmdstanr","dagitty","ggdag","ggdist","ggtext","scales")
bayesplot::color_scheme_set("blue") # Set color scheme
options(mc.cores = parallel::detectCores()) # Parallel Chains
cmdstanr::set_cmdstan_path(path = "C:/Program Files/R/cmdstan-2.36.0") # R-Stan
ipak(packages) # Install packages
## All Packages Installed :)
The aim of this analysis is to report the gender pay gap across an entire workforce and identify areas where this gap is most pronounced. To estimate the gender pay gap other variables known to influence salary (i.e., role type (support/research), time since working at an organisation, level in the organisation, and percentage of FTE employed) were measured and accounted for statistically using a causal inference framework.
For modeling, salary was treated as the mean of the corresponding range (i.e., (range maximum + range minimum) / 2) with an approximated standard deviation to represent measurement error (i.e., range maximum - range minimum / 4).
Measurement error models (one for each year) predicted salary with measurement error based on gender, role type, and level (monotonic predictor) (interactions specified), as well as percentage employed and days since employment start date (fitted as splines). The random effect was position nested within team nested within site location, with random intercepts for gender and role type (correlated). A skew normal distribution was selected to account for positively skewed salary data. Weakly informative priors were used to aid model convergence for the intercept and slope parameters (see models). This model was performed in R (R Core Team 2022) using the brms package (Bürkner, 2017) with ideas inspired from (McElreath, 2020).
## Loading required package: rstan
## Loading required package: StanHeaders
##
## rstan version 2.32.6 (Stan version 2.32.2)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
## For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
## change `threads_per_chain` option:
## rstan_options(threads_per_chain = 1)
##
## Attaching package: 'rstan'
## The following object is masked from 'package:tidyr':
##
## extract
## Using 10 posterior draws for ppc type 'dens_overlay' by default.
## Warning: There were 33 divergent transitions after warmup. Increasing
## adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Family: gaussian
## Links: mu = identity; sigma = identity
## Formula: mid_est | se(sd_val, sigma = TRUE) ~ 0 + Intercept + gender * role_type + mo(level) + percentage_std + u + s(years_since_start_date, by = s_lab)
## Data: sim_dat (Number of observations: 500)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Smoothing Spline Hyperparameters:
## Estimate Est.Error l-95% CI
## sds(syears_since_start_dates_labmanresearch_1) 6449.08 5664.08 192.15
## sds(syears_since_start_dates_labmansupport_1) 9702.96 5679.28 1861.18
## sds(syears_since_start_dates_labwomanresearch_1) 3782.41 3728.08 99.10
## sds(syears_since_start_dates_labwomansupport_1) 6557.43 4776.54 402.62
## u-95% CI Rhat Bulk_ESS
## sds(syears_since_start_dates_labmanresearch_1) 21780.59 1.00 1022
## sds(syears_since_start_dates_labmansupport_1) 23492.84 1.00 1471
## sds(syears_since_start_dates_labwomanresearch_1) 13808.24 1.00 1150
## sds(syears_since_start_dates_labwomansupport_1) 18434.30 1.00 1524
## Tail_ESS
## sds(syears_since_start_dates_labmanresearch_1) 1543
## sds(syears_since_start_dates_labmansupport_1) 1808
## sds(syears_since_start_dates_labwomanresearch_1) 1843
## sds(syears_since_start_dates_labwomansupport_1) 1588
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI
## Intercept 90340.19 366.93 89634.70
## genderman 4873.54 444.27 4021.13
## role_typesupport 2718.73 557.98 1655.81
## percentage_std 316.18 186.42 -53.50
## u 5067.13 207.97 4669.00
## genderman:role_typesupport 1539.62 871.01 -151.98
## syears_since_start_date:s_labmanresearch_1 10151.77 8497.00 -8309.33
## syears_since_start_date:s_labmansupport_1 4981.68 8894.76 -13124.45
## syears_since_start_date:s_labwomanresearch_1 10921.01 7147.72 -6645.83
## syears_since_start_date:s_labwomansupport_1 8068.23 9665.53 -12220.00
## molevel 5012.11 507.06 4083.03
## u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 91071.44 1.00 3323 2511
## genderman 5719.10 1.00 3696 3101
## role_typesupport 3792.08 1.00 3234 2875
## percentage_std 680.83 1.00 4771 1949
## u 5469.53 1.00 5635 2627
## genderman:role_typesupport 3289.48 1.00 3493 2622
## syears_since_start_date:s_labmanresearch_1 23455.83 1.00 1938 2305
## syears_since_start_date:s_labmansupport_1 21688.73 1.00 3776 2680
## syears_since_start_date:s_labwomanresearch_1 21948.11 1.00 1752 1422
## syears_since_start_date:s_labwomansupport_1 24298.50 1.00 2103 2086
## molevel 6061.14 1.00 2088 2294
##
## Monotonic Simplex Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## molevel1[1] 0.19 0.02 0.15 0.24 1.00 2460 2661
## molevel1[2] 0.19 0.03 0.14 0.26 1.00 3166 2788
## molevel1[3] 0.12 0.04 0.05 0.20 1.00 3772 2323
## molevel1[4] 0.17 0.05 0.08 0.28 1.00 4765 2647
## molevel1[5] 0.12 0.08 0.01 0.30 1.00 5098 1806
## molevel1[6] 0.19 0.10 0.02 0.39 1.00 3093 2500
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 2557.13 220.48 2132.07 2995.81 1.00 5082 2644
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Figure 2: Conditional effect of gender and role type on salary. Esimates are displayed as intervals (mean (black dot) with 95 % credible interval (range)) and labeled as text. Verticle lines display average estimates: women in research (red); women in support (dark red); men in research (blue); men in support (purple).
Figure 3: Conditional effect of gender and role type across levels: women in research (red); women in support (dark red); men in research (blue); men in support (purple). Level 1 employees are 4 steps removed from the CEO and level 5 employees are those who only have the CEO as a line manager.
Figure 4: Conditional effect of gender and role type on salary over time. Esimates are displayed as intervals (mean (dot) with 95 % credible interval (range)): women in research (red); women in support (dark red); men in research (blue); men in support (purple).
Figure 5: Conditional effect of percent FTE on salary over time. Esimates are displayed as intervals (mean (dot) with 95 % credible interval (range)): women in research (red); and women in support (dark red).
# Overall gender pay gap $
a <- hypothesis(mod, 'genderman > 0')
# How much more do men in support earn than woman in support?
b <- hypothesis(mod, '(Intercept + role_typesupport + genderman + genderman:role_typesupport) - (Intercept + role_typesupport) > 0')
# How much more do men in research earn than women in research?
c <- hypothesis(mod, '(Intercept + genderman) - (Intercept) > 0')
# What is the difference between men in support and men in research?
d <- hypothesis(mod, '(Intercept + role_typesupport + genderman + genderman:role_typesupport) - (Intercept + genderman) > 0')
# What is the difference between women in support and women in research?
e <- hypothesis(mod, '(Intercept + role_typesupport) - (Intercept) > 0')
# What is the difference between men in support and women in research
f <- hypothesis(mod, '(Intercept + role_typesupport + genderman + genderman:role_typesupport) - (Intercept) > 0')
## [1] 4873.539
## [1] 6413.159
## [1] 4873.539
## [1] 4258.353
## [1] 2718.733
## [1] 9131.893
Blevins, C., & Mullen, L. (2015). Jane, John… Leslie? A Historical Method for Algorithmic Gender Prediction. DHQ: Digital Humanities Quarterly, 9(3). http://www.digitalhumanities.org/dhq/vol/9/3/000223/000223.html
Bürkner, P.C., (2017). brms: An R Package for Bayesian Multilevel Models Using Stan. Journal of Statistical Software, 80(1), 1-28. https://econpapers.repec.org/article/jssjstsof/v_3a080_3ai01.htm
McElreath, R. (2020). Statistical rethinking: A Bayesian course with examples in R and Stan. Chapman and Hall/CRC.
R Core Team (2022). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. https://www.R-project.org/
## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Pacific/Auckland
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rstan_2.32.6 StanHeaders_2.32.10 scales_1.3.0
## [4] ggtext_0.1.2 ggdist_3.3.2 ggdag_0.2.13
## [7] dagitty_0.3-4 cmdstanr_0.8.0 brms_2.22.0
## [10] Rcpp_1.0.13 simDAG_0.2.0 here_1.0.1
## [13] lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
## [16] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
## [19] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1
## [22] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gridExtra_2.3 inline_0.3.19 sandwich_3.1-1
## [4] rlang_1.1.4 magrittr_2.0.3 multcomp_1.4-26
## [7] matrixStats_1.4.1 compiler_4.4.2 mgcv_1.9-1
## [10] loo_2.8.0 systemfonts_1.1.0 reshape2_1.4.4
## [13] vctrs_0.6.5 pkgconfig_2.0.3 fastmap_1.2.0
## [16] backports_1.5.0 labeling_0.4.3 utf8_1.2.4
## [19] rmarkdown_2.28 tzdb_0.4.0 ps_1.8.0
## [22] ragg_1.3.3 xfun_0.47 cachem_1.1.0
## [25] jsonlite_1.8.9 highr_0.11 parallel_4.4.2
## [28] R6_2.5.1 bslib_0.8.0 stringi_1.8.4
## [31] boot_1.3-31 jquerylib_0.1.4 estimability_1.5.1
## [34] knitr_1.48 zoo_1.8-12 bayesplot_1.11.1
## [37] Matrix_1.7-1 splines_4.4.2 igraph_2.0.3
## [40] timechange_0.3.0 tidyselect_1.2.1 rstudioapi_0.16.0
## [43] abind_1.4-8 yaml_2.3.10 codetools_0.2-20
## [46] curl_5.2.3 processx_3.8.4 pkgbuild_1.4.4
## [49] plyr_1.8.9 lattice_0.22-6 withr_3.0.1
## [52] bridgesampling_1.1-2 posterior_1.6.0 coda_0.19-4.1
## [55] evaluate_1.0.0 survival_3.7-0 RcppParallel_5.1.9
## [58] xml2_1.3.6 pillar_1.9.0 tensorA_0.36.2.1
## [61] checkmate_2.3.2 stats4_4.4.2 distributional_0.5.0
## [64] generics_0.1.3 rprojroot_2.0.4 hms_1.1.3
## [67] rstantools_2.4.0 munsell_0.5.1 xtable_1.8-4
## [70] glue_1.8.0 emmeans_1.10.4 tools_4.4.2
## [73] data.table_1.16.0 mvtnorm_1.3-2 cowplot_1.1.3
## [76] tidygraph_1.3.1 grid_4.4.2 QuickJSR_1.3.1
## [79] colorspace_2.1-1 nlme_3.1-166 cli_3.6.3
## [82] textshaping_0.4.0 fansi_1.0.6 Brobdingnag_1.2-9
## [85] V8_5.0.1 gtable_0.3.5 sass_0.4.9
## [88] digest_0.6.37 TH.data_1.1-2 farver_2.1.2
## [91] htmltools_0.5.8.1 lifecycle_1.0.4 gridtext_0.1.5
## [94] MASS_7.3-61