This isn’t really a blog post so much as it is just me dumping some thoughts in case they would be useful to someone.

Our data for observation \(i\) is a vector \(Y_{i} = (\xi_{i},\, y_{2i},\, y_{3i})'\), where \(y_{2i}\) and \(y_{3i}\) are observed variables and \(\xi_{i}\) is latent. In place of \(\xi_{i}\) we observe an indicator \(y_{1i}\), which is 1 when \(\xi_{i} > 0\) and 0 otherwise. Conditional on covariates, \(Y_{i}\) is multivariate normal with mean vector \(\mu_{i} = \alpha + BX_{i}\), where \(\alpha\) is a vector of intercepts, \(X_{i}\) is a vector of \(P\) covariates, and \(B\) is a \(3 \times P\) matrix of regression coefficients. The errors are Gaussian with covariance matrix \(\Sigma\), whose first diagonal element is fixed at 1: since we only ever observe the sign of \(\xi_{i}\), its scale is not identified, just as in a probit model. Let's simulate some fake data to show what we're talking about.
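In symbols:

\[
Y_{i} = \begin{pmatrix} \xi_{i} \\ y_{2i} \\ y_{3i} \end{pmatrix} \sim \mathcal{N}\left(\alpha + B X_{i},\, \Sigma\right), \qquad y_{1i} = 1\{\xi_{i} > 0\}, \qquad \Sigma_{11} = 1.
\]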

# Set a seed so the simulation is reproducible
set.seed(42)

# Number of observations
N <- 5000
# Number of covariates
P <- 50
# Intercept vector
alpha <- rnorm(3)
# coefficient matrix
B <- matrix(rnorm(3*P), 3, P)
# covariate matrix
X <- matrix(rnorm(N*P), N, P)
# Error scale (first element fixed to 1 for identification)
tau <- c(1, 2, 1.2)
# A quick way of generating a random 3x3 correlation matrix
Omega <- cor(matrix(rnorm(3*4), 4, 3))
# Covariance matrix
Sigma <- diag(tau) %*% Omega %*% diag(tau)
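
With tau[1] = 1, the first diagonal element of Sigma is 1, as the identification argument above requires. A quick check:

# Identification: the latent variable's variance must equal 1
stopifnot(abs(Sigma[1, 1] - 1) < 1e-12)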

# Simulate fake data
mu <- matrix(alpha, N, 3, byrow = TRUE) + X %*% t(B)
Y <- matrix(NA, N, 3)
for(i in 1:N) {
  # Draw each observation from its own multivariate normal
  Y[i,] <- MASS::mvrnorm(1, mu[i,], Sigma)
}

# Keep the true latent variable (xi1) alongside the observed data so we
# can check how well we recover it later
Y <- data.frame(y1 = as.numeric(Y[,1] > 0), xi1 = Y[,1], y2 = Y[,2], y3 = Y[,3])

# Pairs plot of the four variables
plot(Y)

# Which positions are negative and how many of each are there?
# This is needed for the Albert and Chib formulation
pos_neg <- which(Y$y1 == 0)
pos_pos <- which(Y$y1 == 1)
N_neg <- sum(Y$y1 == 0)
N_pos <- sum(Y$y1 == 1)
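
As a quick sanity check, the two index vectors should partition the data:

# pos_neg and pos_pos together should cover 1:N exactly once
stopifnot(length(pos_neg) + length(pos_pos) == N,
          all(sort(c(pos_neg, pos_pos)) == 1:N))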

Now that we have the data, let's fit the model in Stan. Following Albert and Chib, we treat each latent \(\xi_{i}\) as a parameter, constrained to be positive when \(y_{1i} = 1\) and negative when \(y_{1i} = 0\); the pos_pos and pos_neg vectors tell Stan which constraint applies to which observation.

data {
  int N;
  int N_neg;
  int N_pos;
  int P;
  matrix[N, P] X;
  matrix[N, 3] Y_raw; // first column is 0s and 1s
  int pos_neg[N_neg]; // indices of observations with y1 == 0
  int pos_pos[N_pos]; // indices of observations with y1 == 1
}
parameters {
  vector[3] alpha;
  vector<lower = 0>[2] tau_raw;
  cholesky_factor_corr[3] L_Omega;
  matrix[3, P] Beta;
  vector<lower = 0>[N_pos] xi_pos;
  vector<upper = 0>[N_neg] xi_neg;
}
transformed parameters {
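  // First scale fixed to 1: we only observe the sign of xi,
  // so its scale is not identified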
  vector[3] tau = append_row(rep_vector(1.0, 1), tau_raw);
  vector[N] xi;
  for(i in 1:N_neg) {
    xi[pos_neg[i]] = xi_neg[i];
  }
  for(i in 1:N_pos) {
    xi[pos_pos[i]] = xi_pos[i];
  }
}
model {
  matrix[N, 3] Y = append_col(xi, Y_raw[, 2:3]);
  matrix[3, 3] L_Sigma = diag_pre_multiply(tau, L_Omega);
  // Whitened residuals: each column of z is L_Sigma^-1 * (Y[i] - mu[i])
  matrix[3, N] z = mdivide_left_tri_low(L_Sigma,
                     (Y - rep_matrix(alpha', N) - X * Beta')');
  
  // priors
  alpha ~ normal(0, 2);
  to_vector(Beta) ~ normal(0, 1);
  tau_raw ~ cauchy(0, 1);
  L_Omega ~ lkj_corr_cholesky(3.0);
  // likelihood: standard normal density of the whitened residuals, plus
  // the log determinant of the Cholesky factor once per observation
  to_vector(z) ~ normal(0, 1);
  target += -N * sum(log(diagonal(L_Sigma)));
}
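
If you want to compare the estimated covariance structure with the values used in the simulation, one option (not part of the model above) is to add a generated quantities block:

generated quantities {
  // Implied correlation and covariance matrices, for comparison with
  // the Omega and Sigma used in the simulation
  matrix[3, 3] Omega = multiply_lower_tri_self_transpose(L_Omega);
  matrix[3, 3] Sigma = quad_form_diag(Omega, tau);
}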
Back in R, compile the model and fit it. This assumes the Stan program above is saved as trivariate_model.stan.

library(rstan)
options(mc.cores = parallel::detectCores())

# Compile the Stan program
trivariate_model <- stan_model("trivariate_model.stan")

data_list <- list(N = N, 
                  N_neg = N_neg, 
                  N_pos = N_pos,
                  P = P,
                  X = X,
                  Y_raw = as.matrix(Y[,c(1,3,4)]), # y1, y2, y3; drop the true xi1
                  pos_neg = pos_neg,
                  pos_pos = pos_pos)

sampling_fit <- sampling(trivariate_model, data = data_list, iter = 500)
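
Before looking at calibration, it's worth a quick glance at the diagnostics, for instance:

# Check Rhat and effective sample sizes for the low-dimensional parameters
print(sampling_fit, pars = c("alpha", "tau", "L_Omega"))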

Now let's check the calibration of the model estimates, starting with the latent variable \(\xi\).

# Posterior means of xi (the last column averages over the default 4 chains)
xi_est <- get_posterior_mean(sampling_fit, pars = "xi")[,5]
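
Since the simulation kept the true latent values as xi1, a quick sketch in base graphics compares them with their posterior means:

# True latent values against posterior means; points should hug the 45-degree line
plot(Y$xi1, xi_est,
     xlab = "True latent value", ylab = "Posterior mean", pch = 19,
     col = adjustcolor(ifelse(Y$y1 == 1, "blue", "red"), alpha.f = 0.3))
abline(0, 1)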

library(tidyverse); library(stringr)
Next, let's compare 90% credible intervals for each element of \(B\) with the true values used in the simulation.

# True coefficients, indexed the same way Stan names Beta[row,col]
actuals <- data.frame(row = rep(1:3, P), col = rep(1:P, each = 3), actual = as.vector(B))

intervals <- as.data.frame(sampling_fit, pars = "Beta") %>% 
  gather(parameter, value) %>% 
  mutate(row = parse_number(str_extract(parameter,"[0-9]{1,2},")),
         col = parse_number(str_extract(parameter,",[0-9]{1,2}"))) %>%
  group_by(parameter) %>% 
  summarise(row = first(row),
            col = first(col),
            mean = mean(value),
            lower = quantile(value, .05),
            upper = quantile(value, .95)) %>% 
  left_join(actuals, by = c("row", "col")) %>% 
  mutate(within = actual > lower & actual < upper)

intervals %>%
  arrange(mean) %>% 
  mutate(rank = 1:n()) %>% 
  ggplot(aes(x = rank)) +
  geom_linerange(aes(ymin = lower, ymax = upper), colour = "green") +
  geom_point(aes(y = mean), colour = "green") +
  geom_point(aes(y = actual, colour = within), alpha = 0.5) +
  theme_bw()

For a fuller picture of calibration, we can loop over a range of nominal coverage levels and, for each level, compute the proportion of true coefficients that fall inside the corresponding central credible interval.

percentiles <- seq(from = 0.3, to = 0.99, by = 0.01)

# Gather the posterior draws for Beta once, with the true values attached
beta_draws <- as.data.frame(sampling_fit, pars = "Beta") %>% 
  gather(parameter, value) %>% 
  mutate(row = parse_number(str_extract(parameter, "[0-9]{1,2},")),
         col = parse_number(str_extract(parameter, ",[0-9]{1,2}"))) %>%
  left_join(actuals, by = c("row", "col"))

# Check calibration of the estimates at each nominal coverage level
out <- data_frame(percentile = percentiles, within = NA_real_)
for(count in seq_along(percentiles)) {
  i <- percentiles[count]
  l <- (1 - i)/2
  h <- 1 - (1 - i)/2
  out$within[count] <- beta_draws %>% 
    group_by(parameter) %>% 
    summarise(within = first(actual) > quantile(value, l) &
                       first(actual) < quantile(value, h)) %>% 
    summarise(p = mean(within)) %>% .$p
}

out %>% 
  ggplot(aes(x = percentile)) +
  geom_point(aes(y = within)) +
  geom_line(aes(y = percentile)) + # 45-degree reference line
  theme_bw() +
  labs(x = "Proportion expected to fall inside credible interval",
       subtitle = "Proportion actually falling inside the interval",
       y = NULL, 
       title = "A coverage/calibration plot")