From 9acf16e38a8dd2fa177c7e7ddae85edaae4b711f Mon Sep 17 00:00:00 2001 From: warrenmcg Date: Wed, 27 Jun 2018 14:23:01 -0500 Subject: [PATCH 01/23] add back in deleted 'gene_from_gene' method to fix #190 --- NAMESPACE | 1 + R/sleuth.R | 46 +++++++++++++++++++++++++++++++++++++++++++ man/gene_from_gene.Rd | 34 ++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+) create mode 100644 man/gene_from_gene.Rd diff --git a/NAMESPACE b/NAMESPACE index 01d495b..bb5f80d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -31,6 +31,7 @@ export(design_matrix) export(enclosed_brush) export(excluded_ids) export(extract_model) +export(gene_from_gene) export(get_bootstrap_summary) export(get_bootstraps) export(get_quantile) diff --git a/R/sleuth.R b/R/sleuth.R index fea2305..1c1e6bd 100644 --- a/R/sleuth.R +++ b/R/sleuth.R @@ -1150,6 +1150,52 @@ transcripts_from_gene <- function(obj, test, test_type, table$target_id[table[, 2] == gene_name] } +#' Get the gene ID using other gene identifiers +#' +#' Get the \code{target_id} of a gene using other gene identifiers. +#' The identifiers found under the \code{obj$gene_column} are often +#' difficult to remember (e.g. ensembl gene ID, ENSG00000111640). +#' This function allows a user to find that difficult-to-remember +#' identifier using more-easily-remembered identifiers, such as +#' gene symbol (e.g. "GAPDH"). +#' +#' @param obj a \code{sleuth} object +#' @param gene_colname the name of the column containing 'gene_name'. +#' This parameter refers to the name of the column that the gene you are searching for appears in. +#' Check the column names using \code{colnames(obj$target_mapping)}. +#' @param gene_name a string containing the name of the gene you are interested in. +#' @return a character vector containing the \code{target_id} of the gene, found under +#' \code{obj$gene_column} within \code{obj$target_mapping}. +#' If the column name provided is the same as \code{obj$gene_column}, and the +#' gene_name used is found, that gene_name will be returned. +#' @examples +#' \dontrun{gene_from_gene(obj, "gene_symbol", "GAPDH")} +#' @export +gene_from_gene <- function(obj, gene_colname, gene_name) { + + if (!obj$gene_mode) { + stop("this sleuth object is in transcript mode. Please use 'transcripts_from_gene' instead.") + } + + table <- as.data.frame(obj$target_mapping) + if (gene_colname == obj$gene_column) { + if (!(gene_name %in% table[, eval(parse(text = obj$gene_column))])) { + stop("Couldn't find gene ", gene_name) + } else { + return(gene_name) + } + } + + table <- unique(dplyr::select_(table, obj$gene_column, gene_colname)) + if (!(gene_name %in% table[, 2])) { + stop("Couldn't find gene ", gene_name) + } + hits <- unique(table[table[,2] == gene_name, 1]) + if (length(hits) > 1) { + warning("there was more than one gene ID that matched this identifier; taking the first one") + } + hits[1] + } #' Change sleuth transform counts function #' diff --git a/man/gene_from_gene.Rd b/man/gene_from_gene.Rd new file mode 100644 index 0000000..0f7077e --- /dev/null +++ b/man/gene_from_gene.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sleuth.R +\name{gene_from_gene} +\alias{gene_from_gene} +\title{Get the gene ID using other gene identifiers} +\usage{ +gene_from_gene(obj, gene_colname, gene_name) +} +\arguments{ +\item{obj}{a \code{sleuth} object} + +\item{gene_colname}{the name of the column containing 'gene_name'. +This parameter refers to the name of the column that the gene you are searching for appears in. 
+Check the column names using \code{colnames(obj$target_mapping)}.}
+
+\item{gene_name}{a string containing the name of the gene you are interested in.}
+}
+\value{
+a character vector containing the \code{target_id} of the gene, found under
+  \code{obj$gene_column} within \code{obj$target_mapping}.
+  If the column name provided is the same as \code{obj$gene_column}, and the
+  gene_name used is found, that gene_name will be returned.
+}
+\description{
+Get the \code{target_id} of a gene using other gene identifiers.
+The identifiers found under the \code{obj$gene_column} are often
+difficult to remember (e.g. ensembl gene ID, ENSG00000111640).
+This function allows a user to find that difficult-to-remember
+identifier using more-easily-remembered identifiers, such as
+gene symbol (e.g. "GAPDH").
+}
+\examples{
+ \dontrun{gene_from_gene(obj, "gene_symbol", "GAPDH")}
+}

From 50b35d0629c7f46fa8303b08b3ce30efce462b31 Mon Sep 17 00:00:00 2001
From: warrenmcg
Date: Wed, 27 Jun 2018 14:23:27 -0500
Subject: [PATCH 02/23] fix outdated documentation for 'sleuth_to_matrix'

---
 R/matrix.R              | 5 +++--
 man/sleuth_to_matrix.Rd | 8 ++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/R/matrix.R b/R/matrix.R
index 4ffecf3..b0fff75 100644
--- a/R/matrix.R
+++ b/R/matrix.R
@@ -24,8 +24,9 @@
 #' @param which_df character vector of length one. Which type of data to use
 #' ("obs_norm" or "obs_raw")
 #' @param which_units character vector of length one. Which units to use ("tpm"
-#' or "est_counts")
-#' @return a matrix which contains a matrix of target_ids and transcript expression in \code{which_units}
+#' or "est_counts" (for transcript-level analyses) or "scaled_reads_per_base" (for gene-level analyses))
+#' @return a matrix of target_ids and transcript (or gene) expression in \code{which_units}.
+#' Note this currently does not support returning raw values for gene-level counts or TPMs.
 #' @examples
 #' sleuth_matrix <- sleuth_to_matrix(sleuth_obj, 'obs_norm', 'tpm')
 #' head(sleuth_matrix) # look at first 5 transcripts, sorted by name
diff --git a/man/sleuth_to_matrix.Rd b/man/sleuth_to_matrix.Rd
index bc4bc30..c5e0f6d 100644
--- a/man/sleuth_to_matrix.Rd
+++ b/man/sleuth_to_matrix.Rd
@@ -13,16 +13,16 @@ sleuth_to_matrix(obj, which_df, which_units)
 ("obs_norm" or "obs_raw")}
 
 \item{which_units}{character vector of length one. Which units to use ("tpm"
-or "est_counts")}
+or "est_counts" (for transcript-level analyses) or "scaled_reads_per_base" (for gene-level analyses))}
 }
 \value{
-a \code{list} with an attribute 'data', which contains a matrix of target_ids
-  and transcript expression in \code{which_units}
+a matrix of target_ids and transcript (or gene) expression in \code{which_units}.
+  Note this currently does not support returning raw values for gene-level counts or TPMs.
 }
 \description{
 Convert a sleuth object to a matrix with the condition names.
 }
 \examples{
 sleuth_matrix <- sleuth_to_matrix(sleuth_obj, 'obs_norm', 'tpm')
-head(sleuth_matrix$data) # look at first 5 transcripts, sorted by name
+head(sleuth_matrix) # look at first 5 transcripts, sorted by name
 }

From 5b4a3308d69254c0fab838b188cd0a6bb75da54c Mon Sep 17 00:00:00 2001
From: warrenmcg
Date: Thu, 28 Jun 2018 16:48:32 -0500
Subject: [PATCH 03/23] fix bug found by @lmigueel in #190; switch condition to
 just check for obj$gene_column so that this function can be used with p-value
 aggregation mode

---
 R/sleuth.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/sleuth.R b/R/sleuth.R
index 1c1e6bd..4025979 100644
--- a/R/sleuth.R
+++ b/R/sleuth.R
@@ -1173,13 +1173,13 @@ transcripts_from_gene <- function(obj, test, test_type,
 #' @export
 gene_from_gene <- function(obj, gene_colname, gene_name) {
 
-  if (!obj$gene_mode) {
+  if (is.null(obj$gene_column)) {
     stop("this sleuth object is in transcript mode. Please use 'transcripts_from_gene' instead.")
   }
 
   table <- as.data.frame(obj$target_mapping)
   if (gene_colname == obj$gene_column) {
-    if (!(gene_name %in% table[, eval(parse(text = obj$gene_column))])) {
+    if (!(gene_name %in% table[, obj$gene_column])) {
       stop("Couldn't find gene ", gene_name)
     } else {
       return(gene_name)

From 4da1c6c567934a98ce738ca8fa3afad76cef19e8 Mon Sep 17 00:00:00 2001
From: warrenmcg
Date: Thu, 21 Jun 2018 17:51:10 -0500
Subject: [PATCH 04/23] speed up measurement model estimation:

+ this is done by using the whole matrix rather than modeling by row
+ an additional small change is to make target_id the first column, which
  means that the final summary table in the sleuth_fit object will have
  target_id first, instead of in the middle. This will ease readability if
  users decide to view the summary table directly for columns not selected
  in sleuth_results
+ add documentation for me_model
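
An illustrative sketch (not part of the patch) of the whole-matrix idea:
lm.fit() accepts a response matrix, so one call fits every feature at once
and replaces the old per-transcript loop. The toy data and "tx" target IDs
below are hypothetical.

    set.seed(42)
    X <- cbind("(Intercept)" = 1, conditionB = rep(0:1, each = 3))
    Y <- matrix(rnorm(6 * 4), nrow = 6, dimnames = list(NULL, paste0("tx", 1:4)))
    fit_all <- lm.fit(X, Y)                       # one fit for all four features
    fit_one <- lm.fit(X, Y[, 1])                  # the old one-feature-at-a-time fit
    all.equal(fit_all$coefficients[, 1], fit_one$coefficients)  # TRUE
    colSums(fit_all$residuals ^ 2)                # per-feature RSS, as in me_model()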
---
 R/measurement_error.R | 113 ++++++++++++++++++++----------------------
 man/me_model.Rd       |  43 ++++++++++++++++
 2 files changed, 98 insertions(+), 58 deletions(-)
 create mode 100644 man/me_model.Rd

diff --git a/R/measurement_error.R b/R/measurement_error.R
index 59cbddc..f0b80bd 100644
--- a/R/measurement_error.R
+++ b/R/measurement_error.R
@@ -151,19 +151,8 @@ sleuth_fit <- function(obj, formula = NULL, fit_name = NULL, ...) {
   A <- solve(t(X) %*% X)
 
   msg("fitting measurement error models")
-  mes <- me_model_by_row(obj, X, obj$bs_summary, which_var)
-  tid <- names(mes)
-
-  mes_df <- dplyr::bind_rows(lapply(mes,
-    function(x) {
-      data.frame(rss = x$rss, sigma_sq = x$sigma_sq, sigma_q_sq = x$sigma_q_sq,
-        mean_obs = x$mean_obs, var_obs = x$var_obs)
-    }))
-
-  mes_df$target_id <- tid
-  rm(tid)
-
-  mes_df <- dplyr::mutate(mes_df, sigma_sq_pmax = pmax(sigma_sq, 0))
+  mes <- me_model(obj, X, obj$bs_summary, which_var)
+  mes_df <- mes$mes_df
 
   # FIXME: sometimes when sigma is negative the shrinkage estimation becomes NA
   # this is for the few set of transcripts, but should be able to just do some
@@ -194,7 +183,7 @@ sleuth_fit <- function(obj, formula = NULL, fit_name = NULL, ...) {
   }
 
   obj$fits[[fit_name]] <- list(
-    models = mes,
+    models = mes$models,
     summary = l_smooth,
     beta_covars = beta_covars,
     formula = formula,
@@ -325,33 +314,6 @@ covar_beta <- function(sigma, X, A) {
   A %*% (t(X) %*% diag(sigma) %*% X) %*% A
 }
 
-# Measurement error model
-#
-# Fit the measurement error model across all samples
-#
-# @param obj a \code{sleuth} object
-# @param design a design matrix
-# @param bs_summary a list from \code{bs_sigma_summary}
-# @return a list with a bunch of objects that are useful for shrinking
-me_model_by_row <- function(obj, design, bs_summary, which_var = 'obs_counts') {
-  which_var <- match.arg(which_var, c('obs_counts', 'obs_tpm'))
-  if (which_var == "obs_counts")
-    sigma_var <- "sigma_q_sq"
-  else
-    sigma_var <- "sigma_q_sq_tpm"
-
-  stopifnot( all.equal(names(bs_summary[[sigma_var]]), rownames(bs_summary[[which_var]])) )
-  stopifnot( length(bs_summary[[sigma_var]]) == nrow(bs_summary[[which_var]]))
-
-  models <- lapply(1:nrow(bs_summary[[which_var]]),
-    function(i) {
-      me_model(design, bs_summary[[which_var]][i, ], bs_summary[[sigma_var]][i])
-    })
-  names(models) <- rownames(bs_summary[[which_var]])
-
-  models
-}
-
 # non-equal var
 #
 # word
@@ -565,24 +527,59 @@ gene_summary <- function(obj, which_column, transform = identity,
   list(obs_counts = obs_counts, sigma_q_sq = bs_sigma)
 }
 
-me_model <- function(X, y, sigma_q_sq) {
+#' Measurement error model
+#'
+#' Fit the measurement error model across all samples. This is an internal function
+#' called by \code{sleuth_fit}, and is not intended to be used directly by users.
+#'
+#' @param obj a \code{sleuth} object
+#' @param design a design matrix
+#' @param bs_summary a list from \code{bs_sigma_summary}
+#' @param which_var "obs_counts" (default) to model estimated counts;
+#'   "obs_tpm" to model TPMs
+#' @return a list with two items:
+#' \itemize{
+#'   \item models the list of values returned by lm.fit. At the end of \code{sleuth_fit},
+#'     this object is found here: obj$fits[[fit_name]]$models
+#'   \item mes_df this data frame contains all of the important variables that are used for
+#'     variance shrinkage estimation and for testing. The final result of this table
+#'     is found here: obj$fits[[fit_name]]$summary. This data.frame contains
+#'     the following columns:
+#'     \itemize{
+#'       \item \code{target_id}: the feature ID (transcript or gene, depending on
+#'         \code{obj$gene_mode})
+#'       \item \code{rss}: the residual sum of squares from the linear model
+#'       \item \code{sigma_sq}: the estimated biological variance from the linear model
+#'       \item \code{mean_obs}: the mean of the transformed observations
+#'       \item \code{var_obs}: the raw variance of the transformed observations
+#'       \item \code{sigma_sq_pmax}: the greater of sigma_sq or 0. This prevents
+#'         negative estimates for the biological variance
+#'     }
+#'   }
+me_model <- function (obj, design, bs_summary, which_var = "obs_counts") {
+  which_var <- match.arg(which_var, c("obs_counts", "obs_tpm"))
+  if (which_var == "obs_counts")
+    sigma_var <- "sigma_q_sq"
+  else sigma_var <- "sigma_q_sq_tpm"
+
+  stopifnot(all.equal(names(bs_summary[[sigma_var]]), rownames(bs_summary[[which_var]])))
+  stopifnot(length(bs_summary[[sigma_var]]) == nrow(bs_summary[[which_var]]))
+
+  sigma_q_sq <- bs_summary[[sigma_var]]
+  y <- t(bs_summary[[which_var]])
+  X <- design
   n <- nrow(X)
   degrees_free <- n - ncol(X)
-
   ols_fit <- lm.fit(X, y)
-  rss <- sum(ols_fit$residuals ^ 2)
-  sigma_sq <- rss / (degrees_free) - sigma_q_sq
-
-  mean_obs <- mean(y)
-  var_obs <- var(y)
-
-  list(
-    ols_fit = ols_fit,
-    b1 = ols_fit$coefficients[2],
-    rss = rss,
-    sigma_sq = sigma_sq,
-    sigma_q_sq = sigma_q_sq,
-    mean_obs = mean_obs,
-    var_obs = var_obs
-  )
+  rss <- colSums(ols_fit$residuals^2)
+  sigma_sq <- rss/(degrees_free) - sigma_q_sq
+  mean_obs <- colMeans(y)
+  var_obs <- matrixStats::colVars(y)
+  mes_df <- data.frame(rss = rss, sigma_sq = sigma_sq, sigma_q_sq = sigma_q_sq,
+    mean_obs = mean_obs, var_obs = var_obs)
+  mes_df <- data.frame(target_id = rownames(mes_df), mes_df)
+  mes_df$target_id <- as.character(mes_df$target_id)
+  rownames(mes_df) <- NULL
+  mes_df <- dplyr::mutate(mes_df, sigma_sq_pmax = pmax(sigma_sq, 0))
+  list(models = ols_fit, mes_df = mes_df)
 }
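
A toy numeric check (not part of the patch) of the variance decomposition used
above: the total residual variance is split into inferential variance from the
bootstraps (sigma_q_sq) and biological variance (sigma_sq), floored at zero.

    rss <- c(8.0, 1.2)          # per-feature residual sums of squares
    degrees_free <- 4           # n samples minus p coefficients
    sigma_q_sq <- c(0.5, 0.9)   # per-feature bootstrap (inferential) variance
    sigma_sq <- rss / degrees_free - sigma_q_sq   # 1.5 and -0.6
    pmax(sigma_sq, 0)                             # 1.5 and 0.0 -> sigma_sq_pmax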
diff --git a/man/me_model.Rd b/man/me_model.Rd
new file mode 100644
index 0000000..70989f9
--- /dev/null
+++ b/man/me_model.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/measurement_error.R
+\name{me_model}
+\alias{me_model}
+\title{Measurement error model}
+\usage{
+me_model(obj, design, bs_summary, which_var = "obs_counts")
+}
+\arguments{
+\item{obj}{a \code{sleuth} object}
+
+\item{design}{a design matrix}
+
+\item{bs_summary}{a list from \code{bs_sigma_summary}}
+
+\item{which_var}{"obs_counts" (default) to model estimated counts;
+"obs_tpm" to model TPMs}
+}
+\value{
+a list with two items:
+  \itemize{
+  \item models the list of values returned by lm.fit. At the end of \code{sleuth_fit},
+    this object is found here: obj$fits[[fit_name]]$models
+  \item mes_df this data frame contains all of the important variables that are used for
+    variance shrinkage estimation and for testing. The final result of this table
+    is found here: obj$fits[[fit_name]]$summary. This data.frame contains
+    the following columns:
+    \itemize{
+      \item \code{target_id}: the feature ID (transcript or gene, depending on
+        \code{obj$gene_mode})
+      \item \code{rss}: the residual sum of squares from the linear model
+      \item \code{sigma_sq}: the estimated biological variance from the linear model
+      \item \code{mean_obs}: the mean of the transformed observations
+      \item \code{var_obs}: the raw variance of the transformed observations
+      \item \code{sigma_sq_pmax}: the greater of sigma_sq or 0. This prevents
+        negative estimates for the biological variance
+    }
+  }
+}
+\description{
+Fit the measurement error model across all samples. This is an internal function
+called by \code{sleuth_fit}, and is not intended to be used directly by users.
+} From a289b4dc0fb6feea32dbe1d018f3d568f9ea75ee Mon Sep 17 00:00:00 2001 From: warrenmcg Date: Thu, 21 Jun 2018 18:17:47 -0500 Subject: [PATCH 05/23] remove redundant code; this is taken care of in sleuth_fit --- R/likelihood.R | 5 ----- 1 file changed, 5 deletions(-) diff --git a/R/likelihood.R b/R/likelihood.R index 675eb63..61ddff3 100644 --- a/R/likelihood.R +++ b/R/likelihood.R @@ -12,11 +12,6 @@ compute_likelihood <- function(obj, which_model) { # the observations can be recovered by: # obj$fits$full$models[[1]]$ols_fit$residuals + fitted values - # TODO: move this elsewhere - obj$fits[[which_model]]$summary <- obj$fits[[which_model]]$summary[ - match(names(obj$fits[[which_model]]$models), - obj$fits[[which_model]]$summary$target_id), ] - all_likelihood <- sapply(seq_along(obj$fits[[which_model]]$models), function( i ) { cur_model <- obj$fits[[which_model]]$models[[ i ]] From b7cdbc8686031072a93284f4ed2d9a433296231b Mon Sep 17 00:00:00 2001 From: warrenmcg Date: Thu, 21 Jun 2018 18:31:11 -0500 Subject: [PATCH 06/23] change sleuth_lrt, sleuth_wt, and extract_model to handle the new sleuth_fit API: + the models slot for the sleuth_model object is now the one model with the full matrix + these functions have retained the old code to remain backwards compatible with older versions of sleuth --- R/likelihood.R | 48 ++++++++++++++++++++++++++----------------- R/measurement_error.R | 12 ++++++++--- R/model.R | 22 +++++++++++++++----- 3 files changed, 55 insertions(+), 27 deletions(-) diff --git a/R/likelihood.R b/R/likelihood.R index 61ddff3..63f751e 100644 --- a/R/likelihood.R +++ b/R/likelihood.R @@ -4,29 +4,39 @@ compute_likelihood <- function(obj, which_model) { stopifnot(is(obj, "sleuth")) model_exists(obj, which_model) - # we basically do lapply on all of the models - # # the fitted values are here: - # obj$fits[[which_model]]$models$ols_fit$fitted.values + # obj$fits[[which_model]]$models$fitted.values # # the observations can be recovered by: - # obj$fits$full$models[[1]]$ols_fit$residuals + fitted values - - all_likelihood <- sapply(seq_along(obj$fits[[which_model]]$models), - function( i ) { - cur_model <- obj$fits[[which_model]]$models[[ i ]] - cur_mu <- cur_model$ols_fit$fitted.values - obs <- cur_model$ols_fit$residuals + cur_mu - - cur_summary <- obj$fits[[which_model]]$summary - - cur_var <- cur_summary[i, "smooth_sigma_sq_pmax"] + - cur_summary[i, "sigma_q_sq"] - - sum(dnorm(obs, mean = cur_mu, sd = sqrt(cur_var), log = TRUE)) + # obj$fits[[which_model]]$models$residuals + fitted values + + models <- obj$fits[[which_model]]$models + if (names(models)[1] == "coefficients") { + cur_model <- models + cur_mu <- cur_model$fitted.values + obs <- cur_model$residuals + cur_mu + cur_summary <- obj$fits[[which_model]]$summary + cur_var <- cur_summary$smooth_sigma_sq_pmax + cur_summary$sigma_q_sq + cur_var <- matrix(rep(cur_var, nrow(obs)), nrow = nrow(obs), byrow = TRUE) + + all_likelihood <- colSums(dnorm(obs, mean = cur_mu, sd = sqrt(cur_var), log = TRUE)) + } else { + # This is retained for backward compatibility with older versions of sleuth + all_likelihood <- sapply(seq_along(models), + function(i) { + cur_model <- models[[i]] + cur_mu <- cur_model$ols_fit$fitted.values + obs <- cur_model$ols_fit$residuals + cur_mu + + cur_summary <- obj$fits[[which_model]]$summary + + cur_var <- cur_summary[i, "smooth_sigma_sq_pmax"] + + cur_summary[i, "sigma_q_sq"] + + sum(dnorm(obs, mean = cur_mu, sd = sqrt(cur_var), log = TRUE)) }) - - names(all_likelihood) <- 
names(obj$fits[[which_model]]$models) + names(all_likelihood) <- names(models) + } obj$fits[[which_model]]$likelihood <- all_likelihood diff --git a/R/measurement_error.R b/R/measurement_error.R index f0b80bd..96f1e4b 100644 --- a/R/measurement_error.R +++ b/R/measurement_error.R @@ -263,11 +263,17 @@ sleuth_wt <- function(obj, which_beta, which_model = 'full') { paste(colnames(d_matrix[beta_i]), collapse = ' '))) } - b <- sapply(obj$fits[[ which_model ]]$models, - function(x) { + fit <- obj$fits[[which_model]]$models + if (names(fit)[1] == "coefficients") { + b <- fit$coefficients[beta_i, ] + names(b) <- colnames(fit$coefficients) + } else { + # This is retained for backward compatibility with older versions of sleuth + b <- sapply(fit, function(x) { x$ols_fit$coefficients[ beta_i ] }) - names(b) <- names(obj$fits[[ which_model ]]$models) + names(b) <- names(obj$fits[[ which_model ]]$models) + } res <- obj$fits[[ which_model ]]$summary res$target_id <- as.character(res$target_id) diff --git a/R/model.R b/R/model.R index 3cd4bd4..e8f7abe 100644 --- a/R/model.R +++ b/R/model.R @@ -537,14 +537,26 @@ extract_model <- function(obj, which_model) { ". Please check models(", substitute(obj), ") for fitted models.") } - res <- lapply(seq_along(obj$fits[[which_model]]$models), + fit <- obj$fits[[which_model]] + if (names(fit$models)[1] == "coefficients") { + coefficients <- fit$models$coefficients + target_ids <- rep(colnames(coefficients), each = nrow(coefficients)) + terms <- rownames(coefficients) + sq_std_err <- as.numeric(sapply(fit$beta_covars, function(x) diag(x))) + res <- data.frame(target_id = as.character(target_ids), term = as.character(terms), + estimate = as.numeric(coefficients), std_error = sqrt(sq_std_err)) + res + } else { + # This is retained for backward compatibility with older versions of sleuth + res <- lapply(seq_along(fit$models), function(i) { - x <- obj$fits[[which_model]]$models[[i]] + x <- fit$models[[i]] coefficients <- coef(x$ols_fit) list( - target_id = rep_len(names(obj$fits[[which_model]]$models)[i], length(coefficients)), + target_id = rep_len(names(fit$models)[i], length(coefficients)), term = names(coefficients), estimate = coefficients, - std_error = sqrt(diag(obj$fits[[which_model]]$beta_covars[[i]]))) + std_error = sqrt(diag(fit$beta_covars[[i]]))) }) - dplyr::bind_rows(res) + as.data.frame(dplyr::bind_rows(res)) + } } From 0a8034c293585166e94585cd42b01fd27250e5b9 Mon Sep 17 00:00:00 2001 From: warrenmcg Date: Tue, 24 Jul 2018 13:58:51 -0500 Subject: [PATCH 07/23] update how beta_covars is calculated and retrieved for sleuth_model objects: + speed up the calculation using matrix multiplication rather than lapply + calculate beta covariances for later use + bonus that this reduces memory footprint of the sleuth_model object further + update 'extract_model' and 'sleuth_wt' to use the new matrix format --- R/measurement_error.R | 45 ++++++++++++++++++++++++++++--------------- R/model.R | 2 +- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/R/measurement_error.R b/R/measurement_error.R index 96f1e4b..0218f3d 100644 --- a/R/measurement_error.R +++ b/R/measurement_error.R @@ -169,14 +169,28 @@ sleuth_fit <- function(obj, formula = NULL, fit_name = NULL, ...) 
{ smooth_sigma_sq_pmax = pmax(smooth_sigma_sq, sigma_sq)) msg('computing variance of betas') - beta_covars <- lapply(1:nrow(l_smooth), + sigma <- rowSums(l_smooth[, c("smooth_sigma_sq_pmax", "sigma_q_sq")]) + beta_covars <- sigma %*% t(diag(A)) + # the full beta covariance matrix is symmetric + # so we just need one set of off-diagonals + off_diag_covars <- sigma %*% t(A[lower.tri(A)]) + # we set the column names of the off-diagonal covariances + # as (beta #1):(beta #2) + # for example, if the formula is ~condition and + # the experiment has two conditions -- "A" and "B" -- + # there would be one off-diagonal column with the label + # "(Intercept):conditionB" + colnames(off_diag_covars) <- unlist(lapply(1:(nrow(A)-1), function(i) { - row <- l_smooth[i, ] - with(row, - covar_beta(smooth_sigma_sq_pmax + sigma_q_sq, X, A) - ) + c_names <- colnames(A)[i:ncol(A)] + r_name <- rownames(A)[i] + if (r_name == "(Intercept)") r_name <- "Intercept" + paste0("(", r_name, "):(", c_names, ")") }) - names(beta_covars) <- l_smooth$target_id + ) + + beta_covars <- cbind(beta_covars, off_diag_covars) + rownames(beta_covars) <- l_smooth$target_id if ( is.null(obj$fits) ) { obj$fits <- list() @@ -267,29 +281,30 @@ sleuth_wt <- function(obj, which_beta, which_model = 'full') { if (names(fit)[1] == "coefficients") { b <- fit$coefficients[beta_i, ] names(b) <- colnames(fit$coefficients) + se <- obj$fits[[which_model]]$beta_covars[, beta_i] } else { # This is retained for backward compatibility with older versions of sleuth b <- sapply(fit, function(x) { x$ols_fit$coefficients[ beta_i ] }) names(b) <- names(obj$fits[[ which_model ]]$models) + se <- sapply(obj$fits[[ which_model ]]$beta_covars, + function(x) { + x[beta_i, beta_i] + }) } + se <- sqrt( se ) + se <- se[ names(b) ] + + stopifnot( all.equal(names(b), names(se)) ) + res <- obj$fits[[ which_model ]]$summary res$target_id <- as.character(res$target_id) res <- res[match(names(b), res$target_id), ] stopifnot( all.equal(res$target_id, names(b)) ) - se <- sapply(obj$fits[[ which_model ]]$beta_covars, - function(x) { - x[beta_i, beta_i] - }) - se <- sqrt( se ) - se <- se[ names(b) ] - - stopifnot( all.equal(names(b), names(se)) ) - res <- dplyr::mutate(res, b = b, se_b = se, diff --git a/R/model.R b/R/model.R index e8f7abe..c91ba82 100644 --- a/R/model.R +++ b/R/model.R @@ -542,7 +542,7 @@ extract_model <- function(obj, which_model) { coefficients <- fit$models$coefficients target_ids <- rep(colnames(coefficients), each = nrow(coefficients)) terms <- rownames(coefficients) - sq_std_err <- as.numeric(sapply(fit$beta_covars, function(x) diag(x))) + sq_std_err <- as.numeric(t(fit$beta_covars[, terms])) res <- data.frame(target_id = as.character(target_ids), term = as.character(terms), estimate = as.numeric(coefficients), std_error = sqrt(sq_std_err)) res From 4176b7a692bb18d251912717ddadff6c48e55d11 Mon Sep 17 00:00:00 2001 From: warrenmcg Date: Wed, 27 Jun 2018 11:17:35 -0500 Subject: [PATCH 08/23] move sleuth shrinkage procedure to a separate function: + this will allow the option to supply custom shrinkage procedures + the default is now 'basic_shrink_fun', which contains the old sleuth shrinkage procedure done by 'sliding_window_grouping' and then 'shrink_df' + any custom function must take the list produced by me_model, and then output the mes_df within that list modified with at least one new column: "smooth_sigma_sq", the smooth variances + add updated documentation to 'sleuth_fit' + add documentation for 'basic_shrink_fun' + shrink_fun is now an 
additional slot on the sleuth_model object, to provide provenance for how
  the variance estimation was done

---
 NAMESPACE               |   1 +
 R/measurement_error.R   | 124 ++++++++++++++++++++++++++++++----------
 man/basic_shrink_fun.Rd |  51 +++++++++++++++++
 man/sleuth_fit.Rd       |  16 +++++-
 4 files changed, 159 insertions(+), 33 deletions(-)
 create mode 100644 man/basic_shrink_fun.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 01d495b..1e04e93 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -24,6 +24,7 @@ S3method(transform_status,sleuth_model)
 export("transform_fun_counts<-")
 export("transform_fun_tpm<-")
 export(basic_filter)
+export(basic_shrink_fun)
 export(bias_table)
 export(counts_to_fpkm)
 export(counts_to_tpm)
diff --git a/R/measurement_error.R b/R/measurement_error.R
index 0218f3d..01a3c53 100644
--- a/R/measurement_error.R
+++ b/R/measurement_error.R
@@ -51,8 +51,19 @@
 #' the \code{'scaled_reads_per_base'} summary statistic.
 #' }
 #'
-#' Advanced options for the sliding window shrinkage procedure (these options are passed to
-#' \code{\link{sliding_window_grouping}}):
+#' Advanced options for the shrinkage procedure:
+#'
+#' \itemize{
+#'   \item \code{shrink_fun}: this function does the shrinkage procedure. It must take the list produced
+#'     by \code{\link{me_model}}, with the full model and the data.frame with means (the \code{'mean_obs'}
+#'     column) and variances (the \code{'sigma_sq_pmax'} column), as input, and must output a modified
+#'     data.frame from list$mes_df, with the required column \code{'smooth_sigma_sq'} containing the
+#'     smoothed variances. Additional columns are allowed, but will not be directly processed by
+#'     \code{\link{sleuth_results}}. The default for this option is \code{\link{basic_shrink_fun}},
+#'     which implements the procedure described in the original sleuth paper.
+#' }
+#'
+#' Advanced options for the default sleuth shrinkage procedure (see \code{\link{basic_shrink_fun}}):
 #'
 #' \itemize{
 #'   \item \code{n_bins}: the number of bins that the data should be split for the sliding window shrinkage
@@ -70,6 +81,7 @@
 #' so <- sleuth_fit(so, ~1, 'reduced')
 #' @seealso \code{\link{models}} for seeing which models have been fit,
 #'   \code{\link{sleuth_prep}} for creating a sleuth object,
+#'   \code{\link{basic_shrink_fun}} for the sleuth variance shrinkage procedure,
 #'   \code{\link{sleuth_wt}} to test whether a coefficient is zero,
 #'   \code{\link{sleuth_lrt}} to test nested models.
 #' @export
@@ -80,26 +92,10 @@ sleuth_fit <- function(obj, formula = NULL, fit_name = NULL, ...) {
   extra_opts <- list(...)
   if ('which_var' %in% names(extra_opts)) {
     which_var <- extra_opts$which_var
+    which_var <- match.arg(which_var, c('obs_counts', 'obs_tpm'))
   } else {
     which_var <- 'obs_counts'
   }
-  if ('n_bins' %in% names(extra_opts)) {
-    n_bins <- extra_opts$n_bins
-  } else {
-    n_bins <- 100
-  }
-  if ('lwr' %in% names(extra_opts)) {
-    lwr <- extra_opts$lwr
-  } else {
-    lwr <- 0.25
-  }
-  if ('upr' %in% names(extra_opts)) {
-    upr <- extra_opts$lwr
-  } else {
-    upr <- 0.75
-  }
-
-  which_var <- match.arg(which_var, c('obs_counts', 'obs_tpm'))
 
   if (!is.null(obj$fits) && any(sapply(obj$fits, function(x) !x$transform_synced))) {
     stop("Your sleuth has fits which are not based on the current transform function. ",
@@ -152,19 +148,9 @@ sleuth_fit <- function(obj, formula = NULL, fit_name = NULL, ...) {
   msg("fitting measurement error models")
   mes <- me_model(obj, X, obj$bs_summary, which_var)
-  mes_df <- mes$mes_df
 
-  # FIXME: sometimes when sigma is negative the shrinkage estimation becomes NA
-  # this is for the few set of transcripts, but should be able to just do some
-  # simple fix
   msg('shrinkage estimation')
-  swg <- sliding_window_grouping(mes_df, 'mean_obs', 'sigma_sq_pmax',
-    n_bins = n_bins, lwr = lwr, upr = upr, ignore_zeroes = TRUE)
-  l_smooth <- shrink_df(swg, sqrt(sqrt(sigma_sq_pmax)) ~ mean_obs, 'iqr')
-  l_smooth <- dplyr::select(
-    dplyr::mutate(l_smooth, smooth_sigma_sq = shrink ^ 4),
-    -shrink)
-
+  l_smooth <- shrink_fun(mes, ...)
   l_smooth <- dplyr::mutate(l_smooth,
     smooth_sigma_sq_pmax = pmax(smooth_sigma_sq, sigma_sq))
@@ -203,6 +189,7 @@ sleuth_fit <- function(obj, formula = NULL, fit_name = NULL, ...) {
     formula = formula,
     design_matrix = X,
     transform_synced = TRUE,
+    shrink_fun = shrink_fun,
     which_var = which_var)
 
   class(obj$fits[[fit_name]]) <- 'sleuth_model'
@@ -585,7 +572,6 @@ me_model <- function (obj, design, bs_summary, which_var = "obs_counts") {
   stopifnot(all.equal(names(bs_summary[[sigma_var]]), rownames(bs_summary[[which_var]])))
   stopifnot(length(bs_summary[[sigma_var]]) == nrow(bs_summary[[which_var]]))
-
   sigma_q_sq <- bs_summary[[sigma_var]]
   y <- t(bs_summary[[which_var]])
   X <- design
@@ -604,3 +590,79 @@ me_model <- function (obj, design, bs_summary, which_var = "obs_counts") {
   mes_df <- dplyr::mutate(mes_df, sigma_sq_pmax = pmax(sigma_sq, 0))
   list(models = ols_fit, mes_df = mes_df)
 }
+
+#' Default Sleuth Variance Shrinkage Estimation
+#'
+#' This function performs the default procedure to shrink variance estimation.
+#'
+#' @param me_list the list produced by \code{\link{me_model}}.
+#' @param ... advanced options to tweak the shrinkage procedure. See "details"
+#'   for more information.
+#'
+#' @return a modified data.frame with the following additional columns:
+#'
+#' \itemize{
+#'   \item \code{'iqr'}: boolean for whether this feature was within the interquartile
+#'     range and used for shrinkage estimation
+#'   \item \code{'failed_ise'}: boolean for whether this feature failed the initial
+#'     round of shrinkage estimation using interpolation from the LOESS curve. This feature's
+#'     shrunken variance was estimated using the 'exact' option for LOESS.
+#'   \item \code{'smooth_sigma_sq'}: the estimated biological variance after the shrinkage
+#'     procedure
+#' }
+#'
+#' @details The default procedure does the following steps: it first bins the
+#' features by the mean of each feature's observations. It then takes the
+#' interquartile range (25th-75th percentile) of variances within each bin.
+#' Those means and variances are then input into a LOESS to calculate
+#' a curve using the formula (variance)^(1/4) ~ mean. The LOESS fit is then used
+#' to interpolate the shrunken variance for all features. The larger of the raw
+#' variance estimate or the shrunken variance from this procedure is then used.
+#'
+#' Advanced options for the shrinkage procedure (these options are passed to
+#' \code{\link{sliding_window_grouping}}):
+#'
+#' \itemize{
+#'   \item \code{n_bins}: the number of bins that the data should be split for the sliding window shrinkage
+#'     using the mean-variance curve. The default is 100.
+#'   \item \code{lwr}: the lower range of variances within each bin that should be included for the shrinkage
+#'     procedure. The default is 0.25 (meaning the 25th percentile).
+#'   \item \code{upr}: the upper range of variances within each bin that should be included for the shrinkage
+#'     procedure. The default is 0.75 (meaning the 75th percentile).
+#' }
+#' @export
+basic_shrink_fun <- function(me_list, ...) {
+  extra_opts <- list(...)
+  mes_df <- me_list$mes_df
+  good_opts <- c('n_bins', 'lwr', 'upr')
+  if (any(!(names(extra_opts) %in% good_opts))) {
+    bad_opts <- names(extra_opts)[which(!(names(extra_opts) %in% good_opts))]
+    warning("Some unrecognized options were passed to 'basic_shrink_fun': ", paste(bad_opts, collapse = ", "),
+            ". These are being ignored.")
+  }
+
+  if ('n_bins' %in% names(extra_opts)) {
+    n_bins <- extra_opts$n_bins
+  } else {
+    n_bins <- 100
+  }
+  if ('lwr' %in% names(extra_opts)) {
+    lwr <- extra_opts$lwr
+  } else {
+    lwr <- 0.25
+  }
+  if ('upr' %in% names(extra_opts)) {
+    upr <- extra_opts$upr
+  } else {
+    upr <- 0.75
+  }
+
+  swg <- sliding_window_grouping(mes_df, 'mean_obs', 'sigma_sq_pmax',
+    n_bins = n_bins, lwr = lwr, upr = upr, ignore_zeroes = TRUE)
+  l_smooth <- shrink_df(swg, sqrt(sqrt(sigma_sq_pmax)) ~ mean_obs, 'iqr')
+  l_smooth <- dplyr::select(
+    dplyr::mutate(l_smooth, smooth_sigma_sq = shrink ^ 4),
+    -shrink)
+
+  l_smooth
+}
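
A minimal sketch (not part of the patch) of a custom 'shrink_fun' obeying the
contract above: take the list from me_model() and return its mes_df with a
'smooth_sigma_sq' column added. The toy rule below (shrink every raw variance
halfway toward the global median) and the function name are hypothetical.

    median_shrink_fun <- function(me_list, ...) {
      mes_df <- me_list$mes_df
      # shrink each feature's raw variance toward the median raw variance
      mes_df$smooth_sigma_sq <- (mes_df$sigma_sq_pmax + median(mes_df$sigma_sq_pmax)) / 2
      mes_df
    }
    # hypothetical usage, assuming a sleuth object 'so':
    # so <- sleuth_fit(so, ~condition, 'full', shrink_fun = median_shrink_fun)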
diff --git a/man/basic_shrink_fun.Rd b/man/basic_shrink_fun.Rd
new file mode 100644
index 0000000..bd15d77
--- /dev/null
+++ b/man/basic_shrink_fun.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/measurement_error.R
+\name{basic_shrink_fun}
+\alias{basic_shrink_fun}
+\title{Default Sleuth Variance Shrinkage Estimation}
+\usage{
+basic_shrink_fun(me_list, ...)
+}
+\arguments{
+\item{me_list}{the list produced by \code{\link{me_model}}.}
+
+\item{...}{advanced options to tweak the shrinkage procedure. See "details"
+for more information.}
+}
+\value{
+a modified data.frame with the following additional columns:
+
+\itemize{
+  \item \code{'iqr'}: boolean for whether this feature was within the interquartile
+    range and used for shrinkage estimation
+  \item \code{'failed_ise'}: boolean for whether this feature failed the initial
+    round of shrinkage estimation using interpolation from the LOESS curve. This feature's
+    shrunken variance was estimated using the 'exact' option for LOESS.
+  \item \code{'smooth_sigma_sq'}: the estimated biological variance after the shrinkage
+    procedure
+}
+}
+\description{
+This function performs the default procedure to shrink variance estimation.
+}
+\details{
+The default procedure does the following steps: it first bins the
+features by the mean of each feature's observations. It then takes the
+interquartile range (25th-75th percentile) of variances within each bin.
+Those means and variances are then input into a LOESS to calculate
+a curve using the formula (variance)^(1/4) ~ mean. The LOESS fit is then used
+to interpolate the shrunken variance for all features. The larger of the raw
+variance estimate or the shrunken variance from this procedure is then used.
+
+Advanced options for the shrinkage procedure (these options are passed to
+\code{\link{sliding_window_grouping}}):
+
+\itemize{
+  \item \code{n_bins}: the number of bins that the data should be split for the sliding window shrinkage
+    using the mean-variance curve. The default is 100.
+  \item \code{lwr}: the lower range of variances within each bin that should be included for the shrinkage
+    procedure. The default is 0.25 (meaning the 25th percentile).
+  \item \code{upr}: the upper range of variances within each bin that should be included for the shrinkage
+    procedure. The default is 0.75 (meaning the 75th percentile).
+}
+}
diff --git a/man/sleuth_fit.Rd b/man/sleuth_fit.Rd
index 7aa2f26..0f5eac8 100644
--- a/man/sleuth_fit.Rd
+++ b/man/sleuth_fit.Rd
@@ -48,8 +48,19 @@ Advanced options for modeling choice:
     the \code{'scaled_reads_per_base'} summary statistic.
 }
 
-Advanced options for the sliding window shrinkage procedure (these options are passed to
-\code{\link{sliding_window_grouping}}):
+Advanced options for the shrinkage procedure:
+
+\itemize{
+  \item \code{shrink_fun}: this function does the shrinkage procedure. It must take the list produced
+    by \code{\link{me_model}}, with the full model and the data.frame with means (the \code{'mean_obs'}
+    column) and variances (the \code{'sigma_sq_pmax'} column), as input, and must output a modified
+    data.frame from list$mes_df, with the required column \code{'smooth_sigma_sq'} containing the
+    smoothed variances. Additional columns are allowed, but will not be directly processed by
+    \code{\link{sleuth_results}}. The default for this option is \code{\link{basic_shrink_fun}},
+    which implements the procedure described in the original sleuth paper.
+}
+
+Advanced options for the default sleuth shrinkage procedure (see \code{\link{basic_shrink_fun}}):
 
 \itemize{
   \item \code{n_bins}: the number of bins that the data should be split for the sliding window shrinkage
@@ -69,6 +80,7 @@ so <- sleuth_fit(so, ~1, 'reduced')
 \seealso{
 \code{\link{models}} for seeing which models have been fit,
 \code{\link{sleuth_prep}} for creating a sleuth object,
+\code{\link{basic_shrink_fun}} for the sleuth variance shrinkage procedure,
 \code{\link{sleuth_wt}} to test whether a coefficient is zero,
 \code{\link{sleuth_lrt}} to test nested models.
 }

From a90ac317ae059a556e67bbabdc5df1281088c5c3 Mon Sep 17 00:00:00 2001
From: warrenmcg
Date: Fri, 29 Jun 2018 14:26:16 -0500
Subject: [PATCH 09/23] add sanity checks for shrink fun

---
 R/measurement_error.R | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/R/measurement_error.R b/R/measurement_error.R
index 01a3c53..82177d7 100644
--- a/R/measurement_error.R
+++ b/R/measurement_error.R
@@ -97,6 +97,16 @@ sleuth_fit <- function(obj, formula = NULL, fit_name = NULL, ...) {
     which_var <- 'obs_counts'
   }
 
+  if ('shrink_fun' %in% names(extra_opts)) {
+    shrink_fun <- extra_opts$shrink_fun
+  } else {
+    shrink_fun <- basic_shrink_fun
+  }
+
+  if (!is(shrink_fun, 'function')) {
+    stop("The advanced 'shrink_fun' argument must be a function.")
+  }
+
   if (!is.null(obj$fits) && any(sapply(obj$fits, function(x) !x$transform_synced))) {
     stop("Your sleuth has fits which are not based on the current transform function. ",
", "Please rerun sleuth_prep using the current or new function before running sleuth_fit.") From 73c61ff230c1e784032b959445469ebb6fb338e6 Mon Sep 17 00:00:00 2001 From: warrenmcg Date: Tue, 17 Jul 2018 18:04:36 -0500 Subject: [PATCH 10/23] modify how the shrinkage function is called to prevent warnings: + remove specified extra options needed in sleuth_fit call + use 'do.call' instead of directly calling shrink_fun + this solution was discussed [here](https://stackoverflow.com/a/7028630) --- R/measurement_error.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/measurement_error.R b/R/measurement_error.R index 82177d7..b0fbf39 100644 --- a/R/measurement_error.R +++ b/R/measurement_error.R @@ -93,12 +93,14 @@ sleuth_fit <- function(obj, formula = NULL, fit_name = NULL, ...) { if ('which_var' %in% names(extra_opts)) { which_var <- extra_opts$which_var which_var <- match.arg(which_var, c('obs_counts', 'obs_tpm')) + extra_opts$which_var <- NULL } else { which_var <- 'obs_counts' } if ('shrink_fun' %in% names(extra_opts)) { shrink_fun <- extra_opts$shrink_fun + extra_opts$shrink_fun <- NULL } else { shrink_fun <- basic_shrink_fun } @@ -160,7 +162,9 @@ sleuth_fit <- function(obj, formula = NULL, fit_name = NULL, ...) { mes <- me_model(obj, X, obj$bs_summary, which_var) msg('shrinkage estimation') - l_smooth <- shrink_fun(mes, ...) + shrink_opts <- list(mes) + shrink_opts <- append(shrink_opts, extra_opts) + l_smooth <- do.call(shrink_fun, shrink_opts) l_smooth <- dplyr::mutate(l_smooth, smooth_sigma_sq_pmax = pmax(smooth_sigma_sq, sigma_sq)) From a02423d38dc087e1a0ec63e1416bd26a2d321839 Mon Sep 17 00:00:00 2001 From: warrenmcg Date: Thu, 28 Jun 2018 15:06:28 -0500 Subject: [PATCH 11/23] add a variance shrinkage function to use limma's procedure --- NAMESPACE | 1 + R/measurement_error.R | 63 +++++++++++++++++++++++++++++++++++++++++ man/limma_shrink_fun.Rd | 42 +++++++++++++++++++++++++++ 3 files changed, 106 insertions(+) create mode 100644 man/limma_shrink_fun.Rd diff --git a/NAMESPACE b/NAMESPACE index 1e04e93..03022b8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -37,6 +37,7 @@ export(get_bootstraps) export(get_quantile) export(is_kallisto_subset) export(kallisto_table) +export(limma_shrink_fun) export(log_transform) export(melt_bootstrap_sleuth) export(models) diff --git a/R/measurement_error.R b/R/measurement_error.R index b0fbf39..d57c616 100644 --- a/R/measurement_error.R +++ b/R/measurement_error.R @@ -680,3 +680,66 @@ basic_shrink_fun <- function(me_list, ...) { l_smooth } + +#' Limma-style variance shrinkage estimation +#' +#' This function uses limma's procedure for variance shrinkage estimation +#' See ?limma::squeezeVar for more details for how it works and advanced options +#' available. +#' +#' @param me_list the list produced by \code{link{me_model}} +#' @param ... advanced options that will be passed to the squeezeVar function from limma. +#' the two advanced options available are: +#' \itemize{ +#' \item \code{robust}: boolean for whether the estimation should be made robust to +#' outlier variances. The default is \code{FALSE}. +#' \item \code{winsor.tail.p}: if robust is \code{TRUE}, this is a numeric vector of +#' length 1 or 2 to give the left and right tail proportions of 'x' to Winsorize. +#' } +#' please see ?limma::squeezeVar for more information. 
diff --git a/man/limma_shrink_fun.Rd b/man/limma_shrink_fun.Rd
new file mode 100644
index 0000000..54799e8
--- /dev/null
+++ b/man/limma_shrink_fun.Rd
@@ -0,0 +1,42 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/measurement_error.R
+\name{limma_shrink_fun}
+\alias{limma_shrink_fun}
+\title{Limma-style variance shrinkage estimation}
+\usage{
+limma_shrink_fun(me_list, ...)
+}
+\arguments{
+\item{me_list}{the list produced by \code{\link{me_model}}}
+
+\item{...}{advanced options that will be passed to the squeezeVar function from limma.
+The two advanced options available are:
+\itemize{
+  \item \code{robust}: boolean for whether the estimation should be made robust to
+    outlier variances. The default is \code{FALSE}.
+  \item \code{winsor.tail.p}: if robust is \code{TRUE}, this is a numeric vector of
+    length 1 or 2 to give the left and right tail proportions of 'x' to Winsorize.
+}
+Please see ?limma::squeezeVar for more information.}
+}
+\value{
+a data.frame taken from me_list$mes_df and modified to have the additional
+columns:
+  \itemize{
+    \item \code{'limma_s2_prior'}: contains the location of the prior distribution; this
+      corresponds to the 's2.prior' output by 'limma::eBayes'.
+    \item \code{'limma_df_prior'}: contains the degrees of freedom for the
+      prior distribution. This will be the same if robust is \code{FALSE} and
+      different for each feature if robust is \code{TRUE}.
+    \item \code{'smooth_sigma_sq'}: contains limma's estimated shrunken variances.
+      This corresponds to the 's2.post' output by 'limma::eBayes'.
+  }
+}
+\description{
+This function uses limma's procedure for variance shrinkage estimation.
+See ?limma::squeezeVar for more details on how it works and the advanced options
+available.
+} +\seealso{ +\code{\link{me_model}} for the measurement error model function +} From 55e061dacfd193193f55559fe6ab905e371555c5 Mon Sep 17 00:00:00 2001 From: warrenmcg Date: Fri, 29 Jun 2018 13:49:02 -0500 Subject: [PATCH 12/23] add limma to imports list to reflect adding the squeezeVar function --- DESCRIPTION | 3 ++- NAMESPACE | 1 + R/measurement_error.R | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2a567bc..baea0c0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,7 +26,8 @@ Imports: matrixStats, pheatmap, shiny, - aggregation + aggregation, + limma Suggests: MASS, lintr, diff --git a/NAMESPACE b/NAMESPACE index 03022b8..e88b10c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -86,5 +86,6 @@ importFrom(data.table,fread) importFrom(dplyr,"%>%") importFrom(lazyeval,interp) importFrom(lazyeval,lazy) +importFrom(limma,squeezeVar) importFrom(rhdf5,h5write) importFrom(rhdf5,h5write.default) diff --git a/R/measurement_error.R b/R/measurement_error.R index d57c616..7e3c116 100644 --- a/R/measurement_error.R +++ b/R/measurement_error.R @@ -710,6 +710,7 @@ basic_shrink_fun <- function(me_list, ...) { #' This corresponds to the 's2.post' output by 'limma::eBayes'. #' } #' @seealso \code{\link{me_model}} for the measurement error model function +#' @importFrom limma squeezeVar #' @export limma_shrink_fun <- function(me_list, ...) { extra_opts <- list(...) From f211f2fabb938edac3904050305fd7f50ae5fa25 Mon Sep 17 00:00:00 2001 From: warrenmcg Date: Fri, 29 Jun 2018 14:26:51 -0500 Subject: [PATCH 13/23] fix bug that occurs when calculating degrees of freedom between two models in sleuth_lrt --- R/likelihood.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/R/likelihood.R b/R/likelihood.R index 63f751e..a305872 100644 --- a/R/likelihood.R +++ b/R/likelihood.R @@ -96,8 +96,13 @@ sleuth_lrt <- function(obj, null_model, alt_model) { test_statistic <- 2 * (a_ll - n_ll) - degrees_free <- obj$fits[[null_model]]$models[[1]]$ols_fit$df.residual - - obj$fits[[alt_model]]$models[[1]]$ols_fit$df.residual + if (names(obj$fits[[alt_model]]$models)[1] == "coefficients") { + degrees_free <- obj$fits[[null_model]]$models$df.residual - + obj$fits[[alt_model]]$models$df.residual + } else { + degrees_free <- obj$fits[[null_model]]$models[[1]]$ols_fit$df.residual - + obj$fits[[alt_model]]$models[[1]]$ols_fit$df.residual + } # P(chisq > test_statistic) p_value <- pchisq(test_statistic, degrees_free, lower.tail = FALSE) From d33e6205c19bf907cba3901f1ab1cf243271a4cb Mon Sep 17 00:00:00 2001 From: warrenmcg Date: Fri, 29 Jun 2018 14:28:02 -0500 Subject: [PATCH 14/23] remove a line referencing a basic_shrink_fun specific column --- R/likelihood.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/likelihood.R b/R/likelihood.R index a305872..3bd998c 100644 --- a/R/likelihood.R +++ b/R/likelihood.R @@ -110,7 +110,6 @@ sleuth_lrt <- function(obj, null_model, alt_model) { test_stat = test_statistic, pval = p_value) result <- dplyr::mutate(result, qval = p.adjust(pval, method = "BH")) model_info <- data.table::data.table(obj$fits[[null_model]]$summary) - model_info <- dplyr::select(model_info, -c(iqr)) result <- dplyr::left_join( data.table::data.table(result), model_info, From 243716bb1a9ccfd45fb7f015243478d8bf29ca1b Mon Sep 17 00:00:00 2001 From: warrenmcg Date: Tue, 24 Jul 2018 13:34:32 -0500 Subject: [PATCH 15/23] fix memory leak from formulas that have captured external environments + in testing, this was capturing the full sleuth object, essentially 
doubling the size of the final sleuth object --- R/measurement_error.R | 1 + R/sleuth.R | 1 + 2 files changed, 2 insertions(+) diff --git a/R/measurement_error.R b/R/measurement_error.R index 7e3c116..145fba1 100644 --- a/R/measurement_error.R +++ b/R/measurement_error.R @@ -148,6 +148,7 @@ sleuth_fit <- function(obj, formula = NULL, fit_name = NULL, ...) { # TODO: check if model matrix is full rank X <- NULL if ( is(formula, 'formula') ) { + environment(formula) <- new.env() X <- model.matrix(formula, obj$sample_to_covariates) } else { if ( is.null(colnames(formula)) ) { diff --git a/R/sleuth.R b/R/sleuth.R index d0b4e5f..6770151 100644 --- a/R/sleuth.R +++ b/R/sleuth.R @@ -385,6 +385,7 @@ sleuth_prep <- function( design_matrix <- NULL if (is(full_model, 'formula')) { + environment(full_model) <- new.env() design_matrix <- model.matrix(full_model, sample_to_covariates) } else if (is(full_model, 'matrix')) { if (is.null(colnames(full_model))) { From f12f6ad5d934756d5b67885b77ef6108d638c32d Mon Sep 17 00:00:00 2001 From: warrenmcg Date: Mon, 13 Aug 2018 15:02:24 -0500 Subject: [PATCH 16/23] replace outdated aperm with t --- R/bootstrap.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/bootstrap.R b/R/bootstrap.R index 6e1d11f..72459f9 100644 --- a/R/bootstrap.R +++ b/R/bootstrap.R @@ -406,8 +406,8 @@ process_bootstrap <- function(i, samp_name, kal_path, est_count_sf = est_count_sf) if (read_bootstrap_tpm) { - bs_tpm <- aperm(apply(bs_mat, 1, counts_to_tpm, - eff_len)) + bs_tpm <- t(apply(bs_mat, 1, counts_to_tpm, + eff_len)) colnames(bs_tpm) <- colnames(bs_mat) # gene level code is analogous here to below code @@ -439,8 +439,8 @@ process_bootstrap <- function(i, samp_name, kal_path, rm(tidy_tpm) # these tables are very large } bs_tpm <- transform_fun_tpm(bs_tpm[, which_ids]) - bs_quant_tpm <- aperm(apply(bs_tpm, 2, - quantile)) + bs_quant_tpm <- t(apply(bs_tpm, 2, + quantile)) colnames(bs_quant_tpm) <- c("min", "lower", "mid", "upper", "max") bs_quants$tpm <- bs_quant_tpm @@ -495,8 +495,8 @@ process_bootstrap <- function(i, samp_name, kal_path, bs_mat <- transform_fun_counts(bs_mat[, which_ids]) if (extra_bootstrap_summary) { - bs_quant_est_counts <- aperm(apply(bs_mat, 2, - quantile)) + bs_quant_est_counts <- t(apply(bs_mat, 2, + quantile)) colnames(bs_quant_est_counts) <- c("min", "lower", "mid", "upper", "max") bs_quants$est_counts <- bs_quant_est_counts From 17242ba7a57a6986690703a584336dce07280899 Mon Sep 17 00:00:00 2001 From: warrenmcg Date: Mon, 13 Aug 2018 15:03:12 -0500 Subject: [PATCH 17/23] add sanity check for adding off diagonal beta covariances + now only happens if there is more than one row, indicating that there is a beta beyond the intercept-only + this prevents an error occurring if the intercept-only model is used --- R/measurement_error.R | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/R/measurement_error.R b/R/measurement_error.R index 145fba1..ffd2770 100644 --- a/R/measurement_error.R +++ b/R/measurement_error.R @@ -174,23 +174,24 @@ sleuth_fit <- function(obj, formula = NULL, fit_name = NULL, ...) 
{ beta_covars <- sigma %*% t(diag(A)) # the full beta covariance matrix is symmetric # so we just need one set of off-diagonals - off_diag_covars <- sigma %*% t(A[lower.tri(A)]) - # we set the column names of the off-diagonal covariances - # as (beta #1):(beta #2) - # for example, if the formula is ~condition and - # the experiment has two conditions -- "A" and "B" -- - # there would be one off-diagonal column with the label - # "(Intercept):conditionB" - colnames(off_diag_covars) <- unlist(lapply(1:(nrow(A)-1), - function(i) { - c_names <- colnames(A)[i:ncol(A)] - r_name <- rownames(A)[i] - if (r_name == "(Intercept)") r_name <- "Intercept" - paste0("(", r_name, "):(", c_names, ")") - }) - ) - - beta_covars <- cbind(beta_covars, off_diag_covars) + if (nrow(A) > 1) { + off_diag_covars <- sigma %*% t(A[lower.tri(A)]) + # we set the column names of the off-diagonal covariances + # as (beta #1):(beta #2) + # for example, if the formula is ~condition and + # the experiment has two conditions -- "A" and "B" -- + # there would be one off-diagonal column with the label + # "(Intercept):conditionB" + colnames(off_diag_covars) <- unlist(lapply(1:(nrow(A)-1), + function(i) { + c_names <- colnames(A)[(i+1):ncol(A)] + r_name <- rownames(A)[i] + if (r_name == "(Intercept)") r_name <- "Intercept" + paste0("(", r_name, "):(", c_names, ")") + }) + ) + beta_covars <- cbind(beta_covars, off_diag_covars) + } rownames(beta_covars) <- l_smooth$target_id if ( is.null(obj$fits) ) { From 0d7b5cb9405c7b32475f64f745748c3acb6b6a08 Mon Sep 17 00:00:00 2001 From: warrenmcg Date: Mon, 13 Aug 2018 15:42:50 -0500 Subject: [PATCH 18/23] change API for transform functions to explicitly normalize within the function + process bootstrap now normalizes TPMs by TPM size factors instead of indirectly by count size factor + add 'sf' argument to the default log_transform function, taking a single size factor when normalizing bootstraps, or a vector of size factors equal in length to the number of samples when normalizing the observed counts + add 'tpm_transform' as an equivalent default function for TPMs. 
+ this will facilitate an alternative approach for sleuth-ALR that decouples normalization and transformation steps --- R/bootstrap.R | 9 +++---- R/read_write.R | 5 ++-- R/sleuth.R | 66 +++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 58 insertions(+), 22 deletions(-) diff --git a/R/bootstrap.R b/R/bootstrap.R index 72459f9..11f014e 100644 --- a/R/bootstrap.R +++ b/R/bootstrap.R @@ -380,7 +380,7 @@ dcast_bootstrap.kallisto <- function(obj, units, nsamples = NULL) { # Function to process bootstraps for parallelization process_bootstrap <- function(i, samp_name, kal_path, - num_transcripts, est_count_sf, + num_transcripts, est_count_sf, est_tpm_sf, read_bootstrap_tpm, gene_mode, extra_bootstrap_summary, target_id, mappings, which_ids, @@ -402,8 +402,7 @@ process_bootstrap <- function(i, samp_name, kal_path, eff_len <- rhdf5::h5read(kal_path$path, "aux/eff_lengths") bs_mat <- read_bootstrap_mat(fname = kal_path$path, num_bootstraps = num_bootstrap, - num_transcripts = num_transcripts, - est_count_sf = est_count_sf) + num_transcripts = num_transcripts) if (read_bootstrap_tpm) { bs_tpm <- t(apply(bs_mat, 1, counts_to_tpm, @@ -438,7 +437,7 @@ process_bootstrap <- function(i, samp_name, kal_path, bs_tpm <- as.matrix(bs_tpm[, -1]) rm(tidy_tpm) # these tables are very large } - bs_tpm <- transform_fun_tpm(bs_tpm[, which_ids]) + bs_tpm <- transform_fun_tpm(bs_tpm[, which_ids], sf = est_tpm_sf) bs_quant_tpm <- t(apply(bs_tpm, 2, quantile)) colnames(bs_quant_tpm) <- c("min", "lower", "mid", @@ -493,7 +492,7 @@ process_bootstrap <- function(i, samp_name, kal_path, rm(tidy_bs, scaled_bs) } - bs_mat <- transform_fun_counts(bs_mat[, which_ids]) + bs_mat <- transform_fun_counts(bs_mat[, which_ids], sf = est_count_sf) if (extra_bootstrap_summary) { bs_quant_est_counts <- t(apply(bs_mat, 2, quantile)) diff --git a/R/read_write.R b/R/read_write.R index 26c23c3..97bc100 100644 --- a/R/read_write.R +++ b/R/read_write.R @@ -171,11 +171,10 @@ h5check <- function(fname, group, name) { # @return a matrix with each row being a bootstrap sample read_bootstrap_mat <- function(fname, num_bootstraps, - num_transcripts, - est_count_sf) { + num_transcripts) { bs_mat <- matrix(ncol = num_bootstraps, nrow = num_transcripts) for (i in 1:ncol(bs_mat)) { - bs_mat[, i] <- rhdf5::h5read(fname, paste0("bootstrap/bs", i - 1)) / est_count_sf + bs_mat[, i] <- rhdf5::h5read(fname, paste0("bootstrap/bs", i - 1)) } bs_mat <- t(bs_mat) target_id <- as.character(rhdf5::h5read(fname, "aux/ids")) diff --git a/R/sleuth.R b/R/sleuth.R index 6770151..988b362 100644 --- a/R/sleuth.R +++ b/R/sleuth.R @@ -35,10 +35,40 @@ basic_filter <- function(row, min_reads = 5, min_prop = 0.47) { #' #' @param x numeric that must be >=0. represents an individual observed count (one transcript in one sample). #' @param offset numeric offset to prevent taking the log of 0. -#' @return log(x + offset) +#' @param sf the size factor to normalize the data. Must either be a single number or a numeric vector +#' equal in length to the number of samples in x. 
diff --git a/R/sleuth.R b/R/sleuth.R
index 6770151..988b362 100644
--- a/R/sleuth.R
+++ b/R/sleuth.R
@@ -35,10 +35,40 @@ basic_filter <- function(row, min_reads = 5, min_prop = 0.47) {
 #'
 #' @param x numeric that must be >=0. represents an individual observed count (one transcript in one sample).
 #' @param offset numeric offset to prevent taking the log of 0.
-#' @return log(x + offset)
+#' @param sf the size factor to normalize the data. Must either be a single number or a numeric vector
+#' equal in length to the number of samples in x.
+#' @return log(x / sf + offset); with the default \code{sf = 1}, this is log(x + offset)
 #' @export
-log_transform <- function(x, offset=0.5) {
-  log(x + offset)
+log_transform <- function(x, offset = 0.5, sf = 1) {
+  if (length(sf) == 1) {
+    log(x / sf + offset)
+  } else if (length(sf) == ncol(x)) {
+    log(t(t(x) / sf) + offset)
+  } else {
+    stop("please provide a single size factor or a size factor vector equal ",
+         "in length to the number of samples")
+  }
+}
+
+#' Default TPM transformation
+#'
+#' The default transformation for the TPMs: normalize by the size factor(s), with no log taken.
+#'
+#' @param x numeric that must be >=0. represents an individual observed TPM value (one transcript in one sample).
+#' @param sf the size factor to normalize the data. Must either be a
+#' single number or a numeric vector equal in length to the number of
+#' samples in x.
+#' @return x / sf, i.e. the normalized TPMs
+#' @export
+tpm_transform <- function(x, sf = 1) {
+  if (length(sf) == 1) {
+    x / sf
+  } else if (length(sf) == ncol(x)) {
+    t(t(x) / sf)
+  } else {
+    stop("please provide a single size factor or a size factor vector equal ",
+         "in length to the number of samples")
+  }
 }
 
 # currently defunct
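
(A toy illustration of the t(t(x) / sf) idiom used in the vector branch
above -- plain x / sf would recycle sf down the rows and mix up samples:)

    x <- matrix(1:6, nrow = 2) # 2 transcripts (rows) x 3 samples (columns)
    sf <- c(1, 2, 4)           # one size factor per sample
    t(t(x) / sf)               # divides each column by its own size factor
    x / sf                     # wrong here: recycles sf along rows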
@@ -145,10 +175,17 @@ filter_df_by_groups <- function(df, fun, group_df, ...) {
 #' Advanced Options for the Transformation Step:
 #' (NOTE: Be sure you know what you're doing before you use these options)
 #' \itemize{
-#' \item \code{transform_fun_counts}: the transformation that should be applied
-#' to the normalized counts. Default is \code{'log(x+0.5)'} (i.e. natural log with 0.5 offset).
-#' \item \code{transform_fun_tpm}: the transformation that should be applied
-#' to the TPM values. Default is \code{'x'} (i.e. the identity function / no transformation)
+#' \item \code{transform_fun_counts}: the transformation that should be
+#' applied to the normalized counts. Any custom function must take the
+#' matrix of raw counts and either a single size factor when normalizing
+#' bootstrap counts or a vector of size factors when normalizing the
+#' observed counts. Default is \code{'log(x/sf+0.5)'} (i.e. natural log of
+#' the normalized counts with 0.5 offset).
+#' \item \code{transform_fun_tpm}: the transformation that should be
+#' applied to the TPM values. Any custom function must take the matrix of
+#' raw TPMs and either a single size factor when normalizing bootstrap TPMs
+#' or a vector of size factors when normalizing the observed TPMs. Default
+#' is \code{'x/sf'} (i.e. the normalized TPMs with no further transformation)
 #' }
 #'
 #' Advanced Options for Gene Aggregation:
@@ -228,7 +265,7 @@ sleuth_prep <- function(
   if ("transform_fun_tpm" %in% names(extra_opts)) {
     transform_fun_tpm <- extra_opts$transform_fun_tpm
   } else {
-    transform_fun_tpm <- identity
+    transform_fun_tpm <- tpm_transform
   }
   if ("gene_mode" %in% names(extra_opts)) {
     gene_mode <- extra_opts$gene_mode
@@ -620,7 +657,7 @@ sleuth_prep <- function(
       samp_name <- sample_to_covariates$sample[i]
       kal_path <- get_kallisto_path(kal_dirs[i])
       process_bootstrap(i, samp_name, kal_path,
-                        num_transcripts, est_counts_sf[[i]],
+                        num_transcripts, est_counts_sf[[i]], tpm_sf[[i]],
                         read_bootstrap_tpm, ret$gene_mode,
                         extra_bootstrap_summary,
                         target_id, mappings, which_ids, ret$gene_column,
@@ -678,9 +715,9 @@ sleuth_prep <- function(
   }
   sigma_q_sq <- sigma_q_sq[order(names(sigma_q_sq))]
 
-  obs_counts <- ret$transform_fun_counts(obs_counts)
+  obs_counts <- ret$transform_fun_counts(obs_counts, sf = est_counts_sf)
   obs_counts <- obs_counts[order(rownames(obs_counts)),]
-  obs_tpm <- ret$transform_fun_tpm(obs_tpm)
+  obs_tpm <- ret$transform_fun_tpm(obs_tpm, sf = tpm_sf)
   obs_tpm <- obs_tpm[order(rownames(obs_tpm)),]
 
   ret$bs_summary <- list(obs_counts = obs_counts, sigma_q_sq = sigma_q_sq)
@@ -1010,10 +1047,11 @@ kallisto_table <- function(obj,
 #
 # @param obj is a sleuth object
 # @param value_name either "est_counts" or "tpm"
+# @param which_df "obs_raw" or "obs_norm"
 # @return a matrix with the appropriate names
-obs_to_matrix <- function(obj, value_name) {
-
-  obs_counts <- reshape2::dcast(obj$obs_norm, target_id ~ sample,
+obs_to_matrix <- function(obj, value_name, which_df = "obs_raw") {
+  which_df <- match.arg(which_df, c("obs_raw", "obs_norm"))
+  obs_counts <- reshape2::dcast(obj[[which_df]], target_id ~ sample,
     value.var = value_name)
 
   obs_counts <- as.data.frame(obs_counts)

From 79e99cf535be3bf784e2d2fa5a76d5308fc3b393 Mon Sep 17 00:00:00 2001
From: warrenmcg
Date: Mon, 20 Aug 2018 10:49:58 -0500
Subject: [PATCH 19/23] fix bug where sleuth_model summary table was in a
 different order than the target IDs in the fit

+ this now guarantees that the fit object, the summary table, and the
  beta_covars table are all in the same order
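
A minimal sketch of the new guarantee (toy IDs; the real code matches
l_smooth against the fit's coefficient column names):

    tids <- c("tx2", "tx1", "tx3") # order used in the fit
    l_smooth <- data.frame(target_id = c("tx1", "tx2", "tx3"),
                           smooth_sigma_sq = c(0.1, 0.2, 0.3),
                           stringsAsFactors = FALSE)
    l_smooth <- l_smooth[match(tids, l_smooth$target_id), ]
    stopifnot(identical(l_smooth$target_id, tids)) # now always TRUE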
", + "The fit summary target IDs don't match the observed data target IDs") + } + msg('computing variance of betas') sigma <- rowSums(l_smooth[, c("smooth_sigma_sq_pmax", "sigma_q_sq")]) beta_covars <- sigma %*% t(diag(A)) From d733061eb943cf73874416f169ebbf17812d0f43 Mon Sep 17 00:00:00 2001 From: warrenmcg Date: Tue, 21 Aug 2018 15:11:02 -0500 Subject: [PATCH 20/23] fix eval statements for scatter plot on shiny: + switch to rlang::eval_tidy + add rlang to NAMESPACE and import list + fixes issue #194 --- DESCRIPTION | 3 ++- NAMESPACE | 1 + R/shiny.R | 5 +++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2a567bc..2ba3070 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,6 +21,7 @@ Imports: tidyr, reshape2, rhdf5, + rlang, parallel, lazyeval, matrixStats, @@ -33,4 +34,4 @@ Suggests: testthat, knitr VignetteBuilder: knitr -RoxygenNote: 6.0.1 +RoxygenNote: 6.1.0 diff --git a/NAMESPACE b/NAMESPACE index bb5f80d..48c983a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -87,3 +87,4 @@ importFrom(lazyeval,interp) importFrom(lazyeval,lazy) importFrom(rhdf5,h5write) importFrom(rhdf5,h5write.default) +importFrom(rlang,eval_tidy) diff --git a/R/shiny.R b/R/shiny.R index bd37f6e..faae5d7 100644 --- a/R/shiny.R +++ b/R/shiny.R @@ -26,6 +26,7 @@ #' @param options additional options which are sent to shiny #' @param ... additional parameters sent to plotting functions #' @return a \code{\link{shinyApp}} result +#' @importFrom rlang eval_tidy #' @export #' @seealso \code{\link{sleuth_fit}}, \code{\link{sleuth_live_settings}}, \code{\link{sleuth_deploy}} sleuth_live <- function(obj, settings = sleuth_live_settings(), @@ -1205,8 +1206,8 @@ sleuth_live <- function(obj, settings = sleuth_live_settings(), use_filtered = input$scatter_filt, offset = as.numeric(input$scatter_offset)) # get the data in the form that it is displayed in the plot - x <- eval(p$mapping$x, envir = p$data) - y <- eval(p$mapping$y, envir = p$data) + x <- rlang::eval_tidy(p$mapping$x, data = p$data) + y <- rlang::eval_tidy(p$mapping$y, data = p$data) rv_scatter$data <- data.frame(target_id = p$data$target_id, x = x, y = y, stringsAsFactors = FALSE) saved_plots_and_tables$scatter_plt <- p From 653908d9d7b0dc608fc12894864ee389c79b8746 Mon Sep 17 00:00:00 2001 From: warrenmcg Date: Mon, 5 Nov 2018 13:38:41 -0600 Subject: [PATCH 21/23] update sanity check for gene_mode and pval_aggregate in sleuth_results --- R/model.R | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/R/model.R b/R/model.R index 3cd4bd4..eabb415 100644 --- a/R/model.R +++ b/R/model.R @@ -394,7 +394,17 @@ sleuth_results <- function(obj, test, test_type = 'wt', # } if (obj$gene_mode && pval_aggregate) { - stop("This shouldn't happen. Please report this issue.") + if (obj$pval_aggregate) { + stop("Both 'gene_mode' and 'pval_aggregate' are TRUE in this sleuth ", + "object. This shouldn't happen. Please report this issue.") + } else { + stop("'pval_aggregate' is set to TRUE, but the sleuth object has ", + "already gone through count aggregation when 'gene_mode' was set ", + "to TRUE in 'sleuth_prep'. If you intend count aggregation, omit ", + "the 'pval_aggregate' argument here. 
---
 DESCRIPTION | 3 ++-
 NAMESPACE   | 1 +
 R/shiny.R   | 5 +++--
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 2a567bc..2ba3070 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -21,6 +21,7 @@ Imports:
     tidyr,
     reshape2,
     rhdf5,
+    rlang,
     parallel,
     lazyeval,
     matrixStats,
@@ -33,4 +34,4 @@ Suggests:
     testthat,
     knitr
 VignetteBuilder: knitr
-RoxygenNote: 6.0.1
+RoxygenNote: 6.1.0
diff --git a/NAMESPACE b/NAMESPACE
index bb5f80d..48c983a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -87,3 +87,4 @@ importFrom(lazyeval,interp)
 importFrom(lazyeval,lazy)
 importFrom(rhdf5,h5write)
 importFrom(rhdf5,h5write.default)
+importFrom(rlang,eval_tidy)
diff --git a/R/shiny.R b/R/shiny.R
index bd37f6e..faae5d7 100644
--- a/R/shiny.R
+++ b/R/shiny.R
@@ -26,6 +26,7 @@
 #' @param options additional options which are sent to shiny
 #' @param ... additional parameters sent to plotting functions
 #' @return a \code{\link{shinyApp}} result
+#' @importFrom rlang eval_tidy
 #' @export
 #' @seealso \code{\link{sleuth_fit}}, \code{\link{sleuth_live_settings}}, \code{\link{sleuth_deploy}}
 sleuth_live <- function(obj, settings = sleuth_live_settings(),
@@ -1205,8 +1206,8 @@ sleuth_live <- function(obj, settings = sleuth_live_settings(),
         use_filtered = input$scatter_filt,
         offset = as.numeric(input$scatter_offset))
       # get the data in the form that it is displayed in the plot
-      x <- eval(p$mapping$x, envir = p$data)
-      y <- eval(p$mapping$y, envir = p$data)
+      x <- rlang::eval_tidy(p$mapping$x, data = p$data)
+      y <- rlang::eval_tidy(p$mapping$y, data = p$data)
       rv_scatter$data <- data.frame(target_id = p$data$target_id,
         x = x, y = y, stringsAsFactors = FALSE)
       saved_plots_and_tables$scatter_plt <- p

From 653908d9d7b0dc608fc12894864ee389c79b8746 Mon Sep 17 00:00:00 2001
From: warrenmcg
Date: Mon, 5 Nov 2018 13:38:41 -0600
Subject: [PATCH 21/23] update sanity check for gene_mode and pval_aggregate
 in sleuth_results

---
 R/model.R | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/R/model.R b/R/model.R
index 3cd4bd4..eabb415 100644
--- a/R/model.R
+++ b/R/model.R
@@ -394,7 +394,17 @@ sleuth_results <- function(obj, test, test_type = 'wt',
   # }
 
   if (obj$gene_mode && pval_aggregate) {
-    stop("This shouldn't happen. Please report this issue.")
+    if (obj$pval_aggregate) {
+      stop("Both 'gene_mode' and 'pval_aggregate' are TRUE in this sleuth ",
+           "object. This shouldn't happen. Please report this issue.")
+    } else {
+      stop("'pval_aggregate' is set to TRUE, but the sleuth object has ",
+           "already gone through count aggregation when 'gene_mode' was set ",
+           "to TRUE in 'sleuth_prep'. If you intend count aggregation, omit ",
+           "the 'pval_aggregate' argument here. If you intend p-value ",
+           "aggregation, repeat 'sleuth_prep' without the 'gene_mode' ",
+           "argument.")
+    }
   }
 
   if (pval_aggregate && is.null(obj$gene_column)) {

From 4dc43552626fd2649972bc87b5b412a8e390680e Mon Sep 17 00:00:00 2001
From: warrenmcg
Date: Sat, 16 Mar 2019 11:29:21 -0500
Subject: [PATCH 22/23] fix bug where obs_to_matrix was using obs_raw rather
 than obs_norm

---
 R/sleuth.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/sleuth.R b/R/sleuth.R
index 0fd1a1e..fe117d1 100644
--- a/R/sleuth.R
+++ b/R/sleuth.R
@@ -1049,7 +1049,7 @@ kallisto_table <- function(obj,
 # @param value_name either "est_counts" or "tpm"
 # @param which_df "obs_raw" or "obs_norm"
 # @return a matrix with the appropriate names
-obs_to_matrix <- function(obj, value_name, which_df = "obs_raw") {
+obs_to_matrix <- function(obj, value_name, which_df = "obs_norm") {
   which_df <- match.arg(which_df, c("obs_raw", "obs_norm"))
   obs_counts <- reshape2::dcast(obj[[which_df]], target_id ~ sample,
     value.var = value_name)

From b3b1c58c79d78a00b1bcf48566df1e6c762c2417 Mon Sep 17 00:00:00 2001
From: warrenmcg
Date: Mon, 18 Mar 2019 13:59:37 -0500
Subject: [PATCH 23/23] small version bump

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 87004fb..0701590 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: sleuth
 Title: Tools for investigating RNA-Seq
-Version: 0.30.0
+Version: 0.30.0-1
 Authors@R: c(
     person("Harold", "Pimentel", , "haroldpimentel@gmail.com", role = c("aut", "cre")),
     person("Warren", "McGee", , "warren-mcgee@fsm.northwestern.edu", role = "aut"))