Libraries

library(tidyverse)
library(tidybayes)
library(rstan)
library(posterior)
library(patchwork)
set.seed(236)
options(mc.cores = parallelly::availableCores())
rstan_options(auto_write = TRUE)

Data

Data were generated from a stochastic SIR Model using the Gillespie algorithm. R0 was 1.5, the average time spent in the infectious stage (1/\(\nu\)) was 7 days. Cases (\(O\)) were assumed to be noisy realizations of the number of new infectious individuals, modeled as a negative binomial random variable. Let \(C(t)\) be cumulative infections at time \(t\), then for day \(k\): \[\begin{equation*} O_{k} \sim \text{Neg-Binom}(\rho \times (C(t_{k}) - C(t_{k-1})), \phi). \end{equation*}\] For simulations, \(\rho\) was 0.2, and \(\phi\) was 57.55. Cases were simulated on a daily time scale, then aggregated to weeks. Here is what the data looks like:

states <- read_csv("https://raw.githubusercontent.com/vnminin/sismid_mcmc_one/main/2023/code/sim_state.csv")

test_data <- read_csv("https://raw.githubusercontent.com/vnminin/sismid_mcmc_one/main/2023/code/sim_data.csv")

state_plot <- states %>% 
              dplyr::select(t,S,I,R,C) %>% 
              pivot_longer(-t) %>% 
              mutate(Compartment = fct_relevel(name, "S", "I", "R", "C")) %>%
              ggplot(aes(x = t, y = value, color = Compartment)) + 
              geom_point() +
              theme_bw() +
              theme(text = element_text(size = 20)) +
              ylab("Count") +
              xlab("Day") +
              ggtitle("Simulated Compartments")
              
              
data_plot <- test_data %>% 
             ggplot(aes(x = week, y = total_cases)) +
             geom_point() + 
             geom_line() + 
             theme_bw() + 
             xlab("Week") + 
             ylab("Cases") + 
             ggtitle("Simulated Data") +
             theme(text = element_text(size = 20))

combined_plot <- state_plot + data_plot + plot_layout(guides = "collect")

combined_plot

SIR ODE Model

Latent Dynamics

\(N\) is the population size, \(\beta\) is the transmission rate, \(1/\nu\) is the mean of the infectious period. Latent dynamics of the spread of the Epidemic are described by a system of ODEs. \[\begin{align*} \frac{dS}{dt} &= -\beta \times S/N \times I, \\ \frac{dI}{dt} &= \beta \times S/N \times I - \nu \times I, \\ \frac{dR}{dt} &= \nu \times I, \\ \frac{dC}{dt} &= \beta \times S/N \times I. \end{align*}\]

We put priors on \(\nu\) and \(R_{0} = \beta/\nu\), the basic reproduction number.

Initial Counts

We have to specify the initial compartment counts in order to solve the sytem of ODEs. We define fractions of the population in each compartment at time 0 as: \[\begin{align*} S_{SIR} &= \frac{S(0)}{N},\\ I_{IR} &= \frac{I(0)}{N-S(0)}. \end{align*}\] Then \[\begin{align*} S(0) &= S_{SIR} \times N,\\ I(0) &= I_{IR} \times (N - S(0)), \\ R(0) &= N - S(0) - I(0),\\ C(0) &= 1. \end{align*}\] We put priors on the fractions in order to preserve the population size. In practice, these are very tight as the parameters are not identifiable from case data.

Surveillance Model

We use the surveillance model used to generate the data. \[\begin{equation*} O_{k} \sim \text{Neg-Binom}(\rho \times (C(t_{k}) - C(t_{k-1})), \phi). \end{equation*}\]

We put priors on \(\rho\) and \(\phi\).

Full Posterior

Let \(M(t) = (S(t), I(t), R(t), C(t))\) be the solution to this series of ODEs. Then the posterior of interest is: \[\begin{equation*} P(S_{SIR}, I_{IR}, R_{0}, \nu, \rho, \phi|O) \propto P(O|M(t), \rho, \phi)P(S_{SIR}, I_{IR}, R_{0}, \nu, \rho, \phi). \end{equation*}\]

Fitting the model using Rstan

The following code fits the SIR.stan model to this simulated data. The posterior_object contains both the data, as well as parameters specifying the model priors. These are all treated as “data” by stan.

posterior_object <- list(priors_only = FALSE,
                     num_obs = dim(test_data)[1],
                     pop_size = 100000,
                     cases = test_data$total_cases,
                     obs_times = test_data$week,
                     R0_mean = log(1),
                     R0_sd = 0.2,
                     nu_mean = log(1),
                     nu_sd = 0.2,
                     phi_mean = log(50),
                     phi_sd = 0.2,
                     rho_mean = log(0.2),
                     rho_sd = 0.2,
                     frac_S_mean = 6.90755,
                     frac_S_sd = 0.05,
                     frac_I_mean = 0,
                     frac_I_sd = 0.05)

chain = 4
iterations = 1000
seed = 236
thin = 1

posterior_fit <- stan(file = "https://raw.githubusercontent.com/vnminin/sismid_mcmc_one/main/2023/code/SIR.stan",
                  data = posterior_object,
                  seed = seed,
                  iter = iterations,
                  thin = thin,
                  chain = chain)

The following code produces draws from the model priors which we use for visualization purposes

prior_object <- list(priors_only = TRUE,
                     num_obs = dim(test_data)[1],
                     pop_size = 100000,
                     cases = test_data$total_cases,
                     obs_times = test_data$week,
                     R0_mean = log(1),
                     R0_sd = 0.2,
                     nu_mean = log(1),
                     nu_sd = 0.2,
                     phi_mean = log(50),
                     phi_sd = 0.2,
                     rho_mean = log(0.2),
                     rho_sd = 0.2,
                     frac_S_mean = 6.90755,
                     frac_S_sd = 0.05,
                     frac_I_mean = 0,
                     frac_I_sd = 0.05)


chain = 4
iterations = 1000
seed = 236
thin = 1

prior_fit <- stan(file = "https://raw.githubusercontent.com/vnminin/sismid_mcmc_one/main/2023/code/SIR.stan",
                  data = prior_object,
                  seed = seed,
                  iter = iterations,
                  thin = thin,
                  chain = chain)
## [[1]]
## Stan model 'SIR' does not contain samples.

Assessing MCMC Diagnostics

We can assess the traceplot for the scaled likelihood.

traceplot(posterior_fit, pars = "lp__")

We can also summarise Rhat and effective sample size.

summary <- posterior_fit %>% 
           as_draws() %>% 
           summarise_draws() %>% 
           filter(variable != "C_init" & variable != "inits[4]") %>% #the initial value of C is fixed to 1, it naturally has NA values
           summarise(
             min_rhat = min(rhat),
             max_rhat = max(rhat),
             min_ess_bulk = min(ess_bulk),
             max_ess_bulk = max(ess_bulk),
             min_ess_tail = min(ess_tail),
             max_ess_tail = max(ess_tail)
           )

summary
## # A tibble: 1 × 6
##   min_rhat max_rhat min_ess_bulk max_ess_bulk min_ess_tail max_ess_tail
##      <dbl>    <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
## 1    0.999     1.01         539.        2144.         852.        2020.

Visualizing the results

We can visualize the changes in prior versus posterior distributions. Dots are medians, thick lines are 66% credible intervals, thin lines are 95% credible intervals.

fixed_prior_draws <- prior_fit %>% spread_draws(R0, nu, rho, phi, S_init, I_init, R_init) %>%
                     group_by(.chain, .iteration, .draw) %>%
                     pivot_longer(cols = c(R0, nu, rho, phi, S_init, I_init, R_init)) %>%
                     mutate(type = "prior")
fixed_posterior_draws <- posterior_fit %>% spread_draws(R0, nu, rho, phi, S_init, I_init, R_init) %>%
  group_by(.chain, .iteration, .draw) %>%
  pivot_longer(cols = c(R0, nu, rho, phi, S_init, I_init, R_init)) %>%
  mutate(type = "posterior")

fixed_names <- c("R0",
                 "nu",
                 "rho",
                 "phi",
                 "S_init",
                 "I_init",
                 "R_init")

true_vals <- c(1.5,
               1,
               0.2,
               57.55,
               99900,
               50,
               50)

true_frame <- data.frame(name = fixed_names, true_value = true_vals)


priors_and_posteriors <- rbind(fixed_posterior_draws, fixed_prior_draws) %>%
                         left_join(true_frame, by = "name")


param_plot <- priors_and_posteriors %>%
  ggplot(aes(value, type, fill = type)) +
  stat_halfeye(normalize = "xy")  +
  geom_vline(aes(xintercept = true_value), linetype = "dotted", size = 1) +
  facet_wrap(. ~ name, scales = "free_x") +
  theme_bw() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()) +
  ggtitle("Fixed parameter prior and posteriors") +
  theme(text = element_text(size = 20)) +
  ylab("Type") + 
  xlab("Value")

param_plot

We can also visualize the posterior predictive to assess model fit.

true_cases <- test_data %>%
  dplyr::select(week, total_cases)
posterior_predictive <- posterior_fit %>% spread_draws(gen_cases[i]) %>%
  rename(week = i) %>%
  group_by(week) %>%
  median_qi(gen_cases, .width = c(0.5, 0.8, 0.95)) %>%
  left_join(true_cases, by = "week")

post_pred_plot <- posterior_predictive %>%
  ggplot() +
  geom_lineribbon(aes(x = week, y = gen_cases, ymin = .lower, ymax = .upper, fill = fct_rev(ordered(.width)))) +
  geom_point(mapping = aes(x = week, y = total_cases), color = "coral1") +
  scale_fill_brewer(name = "Credible Interval Width") +
  # scale_fill_manual(values=c("skyblue1", "skyblue2", "skyblue3"), name="fill") +
  theme_bw() +
  ggtitle(paste0("Posterior Predictive")) + 
  ylab("Cases") +
  xlab("Week") +
  theme(text = element_text(size = 20))

post_pred_plot