13 Demo on non-collapsibility

14 Load Libraries

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(marginaleffects)
library(gtsummary)

15 Demonstrating non-collapsibility of odds ratios

Non-collapsibility refers to the fact that the conditional and marginal OR do not match. The conditional and marginal OR estimate agree in identity link regression model, so it is collapsible.

Simulate data first: outcome, treatment, and one continuous covariate

########----##
n <- 5000
# 1) Simulate covariates (examples)
x1 <- rnorm(n, mean = 0, sd = 1)       # continuous
treatment <- rbinom(n, size = 1, prob = 0.5)  # binary indicator, 
# 2) Specify true effects (log-odds scale). Include factor via contrasts.
#    For factors, use model.matrix or include as factor in a data.frame and formula.
dat <- data.frame(x1, treatment)

# Build design matrix with contrasts automatically
X <- model.matrix(~ x1+treatment, data = dat)  # includes intercept and two dummies for x4

# Coefficients (choose your truth)
beta <- c(
  "(Intercept)" = -2.0,   # baseline log-odds
  "x1"          = 0.8,
  "treatment"          = -0.7
)

# 3) Linear predictor and probabilities
linpred <- as.numeric(X %*% beta)
p <- 1 / (1 + exp(-linpred))  # inverse-logit

# 4) Simulate binary outcome
y <- rbinom(n, size = 1, prob = p)

dat|>
  cbind(y)|>
  count(treatment,y)

  treatment y    n
1         0 0 2137
2         0 1  381
3         1 0 2277
4         1 1  205

#(213/2317)-(380/2090)

# Inspect prevalence and fit back a model to check recovery
#mean(y)
mod1<-glm(y ~ x1 + treatment, data = dat, family = binomial())
mod1|>
  tbl_regression(exponentiate = TRUE)

Characteristic	OR	95% CI	p-value
x1	2.34	2.12, 2.59	<0.001
treatment	0.47	0.39, 0.56	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

mod2<-glm(y ~ treatment, data = dat, family = binomial())
mod2|>
  tbl_regression(exponentiate = TRUE)

Characteristic	OR	95% CI	p-value
treatment	0.50	0.42, 0.60	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

The regression coefficients for x3 (binary treatment effect) differ. (The true OR is 0.5)

16 Now let’s use g-computation

# multiple variable logistic regression
avg_comparisons(mod1,newdata=dat,variables="treatment")


 Estimate Std. Error     z Pr(>|z|)    S   2.5 %  97.5 %
  -0.0706    0.00868 -8.14   <0.001 51.2 -0.0877 -0.0536

Term: treatment
Type:  response 
Comparison: 1 - 0

# marginal logistic regression 
avg_comparisons(mod2,newdata=dat,variables="treatment")


 Estimate Std. Error     z Pr(>|z|)    S   2.5 % 97.5 %
  -0.0687    0.00903 -7.61   <0.001 45.1 -0.0864 -0.051

Term: treatment
Type:  response 
Comparison: 1 - 0

17 Let’s simulate more datasets and measure the OR and probability difference (ATE)

n <- 5000

simdatORG<-function(n){
# 1) Simulate covariates (examples)
x1 <- rnorm(n, mean = 0, sd = 1)       # continuous
treatment <- rbinom(n, size = 1, prob = 0.5)  # binary indicator, 
# 2) Specify true effects (log-odds scale). Include factor via contrasts.
#    For factors, use model.matrix or include as factor in a data.frame and formula.
dat <- data.frame(x1, treatment)

# Build design matrix with contrasts automatically
X <- model.matrix(~ x1+treatment, data = dat)  # includes intercept and two dummies for x4

# Coefficients (choose your truth)
beta <- c(
  "(Intercept)" = -2.0,   # baseline log-odds
  "x1"          = 0.8,
  "treatment"          = -0.7
)

# 3) Linear predictor and probabilities
linpred <- as.numeric(X %*% beta)
p <- 1 / (1 + exp(-linpred))  # inverse-logit

# 4) Simulate binary outcome
y <- rbinom(n, size = 1, prob = p)

# Inspect prevalence and fit back a model to check recovery
#mean(y)
mod1<-glm(y ~ x1 + treatment, data = dat, family = binomial())
#mod1$coefficients[3]
mod2<-glm(y ~ treatment, data = dat, family = binomial())
#mod2$coefficients[2]#
#g-comput
g1<-avg_comparisons(mod1,newdata=dat,variables="treatment")
#g1$estimate
g2<-avg_comparisons(mod2,newdata=dat,variables="treatment")
#g2$estimate


outtbl<-tibble(condbiasOR=exp(mod1$coefficients[3]),margbiasOR=exp(mod2$coefficients[2]),cmm=condbiasOR-margbiasOR,condbiasg=g1$estimate,margbiasg=g2$estimate,atecmm=condbiasg-margbiasg)
return(outtbl)
}

#simulation
sim0.1<-tibble(n=1:1000)|>
  group_by(n)|>
  do(out=simdatORG(n=1000))|>
  unnest(out)
#sim0.1

comptbl<-sim0.1|>
  select(-n)|>
  tbl_summary(label = list(condbiasOR~"Conditional OR",margbiasOR~"Marginal OR",condbiasg~"Conditional ATE",margbiasg~"Marginal ATE",cmm~"Condition-Marginal OR",atecmm~"Condition-Marginal ATE"))|>
  as_gt()
comptbl

Characteristic	N = 1,000¹
Conditional OR	0.49 (0.43, 0.57)
Marginal OR	0.51 (0.45, 0.59)
Condition-Marginal OR	-0.021 (-0.040, -0.002)
Conditional ATE	-0.064 (-0.076, -0.051)
Marginal ATE	-0.064 (-0.077, -0.051)
Condition-Marginal ATE	0.000 (-0.003, 0.004)
¹ Median (Q1, Q3)

#saving figures
#gtsave(comptbl,filename="noncollapsibility_of_OR_treatment_covariate_model_vs_marginal_model.png")
#gtsave(comptbl,filename="noncollapsibility_of_OR_treatment_covariate_model_vs_marginal_model.docx")

Across 1,000 simulations, the OR differ between a conditional OR vs marginal OR, but ATE remains the same between models (multivariable vs marginal model).

18 Sessioninfo

sessionInfo()

R version 4.5.0 (2025-04-11 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26100)

Matrix products: default
  LAPACK version 3.12.1

locale:
[1] LC_COLLATE=English_United States.utf8 
[2] LC_CTYPE=English_United States.utf8   
[3] LC_MONETARY=English_United States.utf8
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.utf8    

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] gtsummary_2.2.0        marginaleffects_0.25.1 lubridate_1.9.4       
 [4] forcats_1.0.0          stringr_1.5.1          dplyr_1.1.4           
 [7] purrr_1.0.4            readr_2.1.5            tidyr_1.3.1           
[10] tibble_3.2.1           ggplot2_3.5.2          tidyverse_2.0.0       

loaded via a namespace (and not attached):
 [1] gt_1.0.0             sass_0.4.10          generics_0.1.3      
 [4] xml2_1.3.8           stringi_1.8.7        hms_1.1.3           
 [7] digest_0.6.37        magrittr_2.0.3       evaluate_1.0.3      
[10] grid_4.5.0           timechange_0.3.0     cards_0.6.0         
[13] fastmap_1.2.0        broom.helpers_1.20.0 jsonlite_2.0.0      
[16] backports_1.5.0      scales_1.3.0         cli_3.6.4           
[19] labelled_2.14.0      rlang_1.1.6          litedown_0.7        
[22] commonmark_1.9.5     munsell_0.5.1        base64enc_0.1-3     
[25] withr_3.0.2          yaml_2.3.10          tools_4.5.0         
[28] tzdb_0.5.0           checkmate_2.3.2      colorspace_2.1-1    
[31] broom_1.0.8          vctrs_0.6.5          R6_2.6.1            
[34] lifecycle_1.0.4      htmlwidgets_1.6.4    insight_1.1.0       
[37] pkgconfig_2.0.3      pillar_1.10.2        gtable_0.3.6        
[40] glue_1.8.0           data.table_1.17.0    Rcpp_1.0.14         
[43] haven_2.5.4          xfun_0.52            tidyselect_1.2.1    
[46] knitr_1.50           htmltools_0.5.8.1    rmarkdown_2.29      
[49] compiler_4.5.0       markdown_2.0