7 Bayesian sequential analysis

8 Load library

library(tidyverse)
library(gt)

9 Introduction

In this script, I’m simulating sequential patient monitoring in a single-arm clinical trial. Here are the conditions that we can modify in the simulation:

Number of simulations
True event probability and target probability for decision making
Prior: $\beta(1,1)$, which is a flat prior
Total sample size
Sequential monitoring start at a given sample size
Sequential monitoring step (every 1 patient is default)
Posterior after n patients is: $p|data (1+x, 1+n-x) $

At each look, evaluate:

Futility threshold
Early efficacy threshold
Otherwise continue

10 Simulation set up

# Bayesian beta-binomial 1-arm simulation with configurable thresholds
# Includes posterior bias = (posterior mean at stopping - true probability)
simulate_trial <- function(true_p = 0.30,
                           target_p = 0.30,
                           maxN = 200,
                           start_monitor = 50,
                           monitor_every = 1,
                           prior_a = 1,
                           prior_b = 1,
                           futility_threshold = 0.05,   # stop if Pr(p > target_p) < futility_threshold
                           efficacy_threshold = 0.50,   # stop if Pr(p > target_p) >= efficacy_threshold
                           nsim = 1,
                           return_both = c("data", "summary")) {
  stopifnot(start_monitor >= 1, maxN >= start_monitor,
            monitor_every >= 1, nsim >= 1)
  return_both <- match.arg(return_both)

  # Storage for per-simulation results
  results <- data.frame(
    sim = integer(nsim),
    n = integer(nsim),
    x = integer(nsim),
    decision = character(nsim),
    post_mean = numeric(nsim),
    post_lcl = numeric(nsim),
    post_ucl = numeric(nsim),
    pr_gt_p = numeric(nsim),
    posterior_bias = numeric(nsim),   # NEW: posterior mean - true_p
    stringsAsFactors = FALSE
  )

  for (s in seq_len(nsim)) {
    # Generate Bernoulli outcomes under the true event rate
    outcomes <- rbinom(maxN, size = 1, prob = true_p)

    decision <- "Max Sample"
    n_final <- maxN
    x_final <- sum(outcomes)

    looks <- seq(from = start_monitor, to = maxN, by = monitor_every)

    for (n in looks) {
      x <- sum(outcomes[1:n])

      # Posterior parameters for Beta(prior_a, prior_b)
      post_a <- prior_a + x
      post_b <- prior_b + (n - x)

      # Analytic posterior probability Pr(p > target_p)
      pr_gt <- 1 - pbeta(target_p, shape1 = post_a, shape2 = post_b)

      # Futility rule
      if (pr_gt < futility_threshold) {
        decision <- "Futility"
        n_final <- n
        x_final <- x
        break
      }

      # Early efficacy rule
      if (pr_gt >= efficacy_threshold) {
        decision <- "Early Efficacy"
        n_final <- n
        x_final <- x
        break
      }
      # Else continue
    }

    # Posterior summaries at stopping point (or maxN)
    post_a <- prior_a + x_final
    post_b <- prior_b + (n_final - x_final)

    post_mean <- post_a / (post_a + post_b)
    post_ci <- qbeta(c(0.025, 0.975), shape1 = post_a, shape2 = post_b)
    pr_gt_p <- 1 - pbeta(target_p, shape1 = post_a, shape2 = post_b)
    posterior_bias <- post_mean - true_p   # NEW

    # Save results
    results$sim[s]            <- s
    results$n[s]              <- n_final
    results$x[s]              <- x_final
    results$decision[s]       <- decision
    results$post_mean[s]      <- post_mean
    results$post_lcl[s]       <- post_ci[1]
    results$post_ucl[s]       <- post_ci[2]
    results$pr_gt_p[s]        <- pr_gt_p
    results$posterior_bias[s] <- posterior_bias
  }

  # Summaries across simulations
  decision_table <- table(results$decision)
  ess <- mean(results$n)
  mean_post_mean <- mean(results$post_mean)
  mean_posterior_bias <- mean(results$posterior_bias)  # NEW (same as mean_post_mean - true_p)

  summary <- list(
    settings = list(
      true_p = true_p,
      target_p = target_p,
      maxN = maxN,
      start_monitor = start_monitor,
      monitor_every = monitor_every,
      prior_a = prior_a,
      prior_b = prior_b,
      futility_threshold = futility_threshold,
      efficacy_threshold = efficacy_threshold,
      nsim = nsim
    ),
    decisions = decision_table,
    expected_sample_size = ess,
    mean_post_mean = mean_post_mean,
    mean_posterior_bias = mean_posterior_bias
  )

  if (return_both == "data") {
    return(results)
  } else if (return_both == "summary") {
    return(list(results = results, summary = summary))
  }
}

11 Running simulation

The target and true probability is 0.1 and I want to do 10,000 simulations.Max sample size is 200.

trialsim1<-simulate_trial(true_p = 0.1,target_p=.1,nsim=10000)

#count decisions 
results<-trialsim1|>
  dplyr::count(decision)|>
  dplyr::mutate(proportion=round(n/sum(n),3))
results|>
  gt()

decision	n	proportion
Early Efficacy	8258	0.826
Futility	1261	0.126
Max Sample	481	0.048

#For the ones that go to max sample, did we reach the goal? 
goalmax<-trialsim1|>
  filter(decision=="Max Sample")|>
  dplyr::mutate(goal=if_else(pr_gt_p>=.5,1,0))
mean(goalmax$goal)

[1] 0

#none of the max sample sizes reached their goal

For this simulation, it looks like futility is too stringent. Let’s try reducing futility threshold and increasing sample size

Max sample size = 500
Threshold for futility = 0.025

trialsim2<-simulate_trial(true_p = 0.1,target_p=.1,nsim=10000,maxN = 500,futility_threshold = 0.025)

trialsim2|>
  dplyr::count(decision)|>
  dplyr::mutate(percent=n/sum(n))

        decision    n percent
1 Early Efficacy 8825  0.8825
2       Futility  883  0.0883
3     Max Sample  292  0.0292

#these look like better operating characteristics 
#For the ones that go to max sample, did we reach the goal? 
goalmax2<-trialsim2|>
  filter(decision=="Max Sample")|>
  dplyr::mutate(goal=if_else(pr_gt_p>=.5,1,0))
mean(goalmax2$goal)

[1] 0

#no 

#Let's calculate the average sample size for early efficacy 
trialsim2|>
  filter(decision=="Early Efficacy")|>
  summarize(meann=mean(n))

    meann
1 71.6834

#~71 patients

#Let's see how biased we are for early stopping 
trialsim2|>
  filter(decision=="Early Efficacy")|>
  ggplot(aes(x=post_mean))+geom_histogram()+scale_x_continuous(limits=c(0,.4),breaks=seq(0,.4,.025),labels=seq(0,.4,.025))

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_bar()`).

#Let's see how many are within acceptable level of bias , say 0.8-.15? 
trialsim2|>
  filter(decision=="Early Efficacy")|>
  dplyr::mutate(accept=if_else(post_mean>=0.8 | post_mean<=0.15,1,0))|>
  summarize(`Acceptable bias`=mean(accept),`Not acceptable bias`=1-`Acceptable bias`)|>
  round(2)

  Acceptable bias Not acceptable bias
1            0.75                0.25

12 Ways to limit early futility

Lower futility threshold

trialsim3<-simulate_trial(true_p = 0.1,target_p=.1,nsim=10000,maxN = 1000,futility_threshold = 0.01,start_monitor = 100)

#trial results
trialsim3|>
  dplyr::count(decision)|>
  dplyr::mutate(percent=n/sum(n))

        decision    n percent
1 Early Efficacy 8934  0.8934
2       Futility  516  0.0516
3     Max Sample  550  0.0550

13 Concluding remarks

This simulation code lets you simulate different scenarios for one-arm studies. You can change the true probability, target probability, number of simulations, sample sizes, thresholding, and when you want to start monitoring.

14 Session info

sessionInfo()

R version 4.5.0 (2025-04-11 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26100)

Matrix products: default
  LAPACK version 3.12.1

locale:
[1] LC_COLLATE=English_United States.utf8 
[2] LC_CTYPE=English_United States.utf8   
[3] LC_MONETARY=English_United States.utf8
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.utf8    

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] gt_1.0.0        lubridate_1.9.4 forcats_1.0.0   stringr_1.5.1  
 [5] dplyr_1.1.4     purrr_1.0.4     readr_2.1.5     tidyr_1.3.1    
 [9] tibble_3.2.1    ggplot2_3.5.2   tidyverse_2.0.0

loaded via a namespace (and not attached):
 [1] gtable_0.3.6      jsonlite_2.0.0    compiler_4.5.0    tidyselect_1.2.1 
 [5] xml2_1.3.8        scales_1.3.0      yaml_2.3.10       fastmap_1.2.0    
 [9] R6_2.6.1          labeling_0.4.3    generics_0.1.3    knitr_1.50       
[13] htmlwidgets_1.6.4 munsell_0.5.1     pillar_1.10.2     tzdb_0.5.0       
[17] rlang_1.1.6       stringi_1.8.7     xfun_0.52         sass_0.4.10      
[21] timechange_0.3.0  cli_3.6.4         withr_3.0.2       magrittr_2.0.3   
[25] digest_0.6.37     grid_4.5.0        hms_1.1.3         lifecycle_1.0.4  
[29] vctrs_0.6.5       evaluate_1.0.3    glue_1.8.0        farver_2.1.2     
[33] colorspace_2.1-1  rmarkdown_2.29    tools_4.5.0       pkgconfig_2.0.3  
[37] htmltools_0.5.8.1