diff --git a/DESCRIPTION b/DESCRIPTION index 2054c14b..a072bbff 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,7 +39,8 @@ Suggests: pander, xtable, rrelaxiv, - magrittr + magrittr, + MASS Enhances: CBPS, haven diff --git a/NAMESPACE b/NAMESPACE index 05ebf3c9..16ba496a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -164,6 +164,7 @@ importFrom(stats,terms) importFrom(stats,terms.formula) importFrom(stats,update) importFrom(stats,update.formula) +importFrom(stats,var) importFrom(tibble,as_tibble) importFrom(tibble,enframe) importFrom(tibble,tibble) diff --git a/NEWS.md b/NEWS.md index ab8d00b9..3ecca2aa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,11 @@ - `optmatch:::scoreCaliper()` gains an optional `within=` argument (#245) +- `match_on(z~x, method="rank_mahalanobis", within=foo)` makes + better use of restrictions in foo to contain time/memory costs. +- For rank Mahalanobis with underlying covariances pooled across + treatment and control groups, you can now use + `match_on(z~x, method="pooled_cov_rank_mahalanobis")`. - Updates to internal C++ code ## Changes in **optmatch** Version 0.10.7 diff --git a/R/match_on.R b/R/match_on.R index 7064f0f8..64e89df9 100644 --- a/R/match_on.R +++ b/R/match_on.R @@ -274,8 +274,14 @@ match_on.bigglm <- function(x, #' redundancies among the variables by scaling down variables contributions in #' proportion to their correlations with other included variables.) #' -#' Euclidean distance is also available, via \code{method="euclidean"}, and -#' ranked, Mahalanobis distance, via \code{method="rank_mahalanobis"}. +#' Euclidean distance is also available, via \code{method="euclidean"}, as +#' are two flavors of ranked-based Mahalanobis distance, via +#' \code{method="rank_mahalanobis"} or \code{method="pooled_cov_rank_mahalanobis"}. +#' Either rank-transforms the covariates first; they differ in whether +#' subsequent covariance of thus-transformed covariates is calculated +#' on all subjects or by pooling of with-group covariances across +#' treatment and control. The \code{method=} argument can be abbreviated +#' in the usual way (via [base::pmatch()]). #' #' The treatment indicator \code{Z} as noted above must either be numeric #' (1 representing treated units and 0 control units) or logical @@ -404,15 +410,19 @@ match_on.formula <- function(x, methodname <- as.character(class(method)) } - which.method <- pmatch(methodname, c("mahalanobis", "euclidean", "rank_mahalanobis", "function"), 4) + which.method <- pmatch(methodname, + c("mahalanobis", "euclidean", + "rank_mahalanobis", "pooled_cov_rank_mahalanobis", + "function"), 5) tmp <- switch(which.method, - makedist(z, data, compute_mahalanobis, within), - makedist(z, data, compute_euclidean, within), - makedist(z, data, compute_rank_mahalanobis, within), - { - warning("Passing a user-defined `method` to `match_on.formula` is not supported and results are not guaranteed. User-defined distances should use `match_on.function` instead.") - makedist(z, data, match.fun(method), within) - } + makedist(z, data, compute_mahalanobis, within), + makedist(z, data, compute_euclidean, within), + makedist(z, data, compute_rank_mahalanobis, within), + makedist(z, data, compute_pooled_cov_rank_mahalanobis, within), + { + warning("Passing a user-defined `method` to `match_on.formula` is not supported and results are not guaranteed. User-defined distances should use `match_on.function` instead.") + makedist(z, data, match.fun(method), within) + } ) rm(mf) @@ -509,19 +519,7 @@ compute_mahalanobis <- function(index, data, z) { cv <- mt + mc rm(mt, mc) - inv.scale.matrix <- try(solve(cv), silent = TRUE) - - if (inherits(inv.scale.matrix,"try-error")) { - dnx <- dimnames(cv) - s <- svd(cv) - nz <- (s$d > sqrt(.Machine$double.eps) * s$d[1]) - if (!any(nz)) stop("covariance has rank zero") - - inv.scale.matrix <- s$v[, nz] %*% (t(s$u[, nz])/s$d[nz]) - dimnames(inv.scale.matrix) <- dnx[2:1] - rm(dnx, s, nz) - } - + inv.scale.matrix <- safe_invert(cv) rm(cv) return(mahalanobisHelper(data, index, inv.scale.matrix)) @@ -548,19 +546,50 @@ compute_rank_mahalanobis <- function(index, data, z) { if (is.null(rownames(data)) | !all(index %in% rownames(data))) stop("data must have row names matching index") - # begin workaround solution to #128 - all_treated <- rownames(data)[as.logical(z)] - all_control <- rownames(data)[!z] - all_indices <- expand.grid(all_treated, all_control, - KEEP.OUT.ATTRS = FALSE, stringsAsFactors = FALSE) - all_indices <- paste(all_indices[[1]], all_indices[[2]], sep="%@%") - short_indices <- paste(index[,1], index[,2], sep="%@%") - indices <- match(short_indices, all_indices) - if (any(is.na(indices))) stop("Unanticipated problem. (Make sure row names of data don't use the string '%@%'.)") - # Now, since `r_smahal` is ignoring its `index` argument anyway: - rankdists <- sqrt(r_smahal(NULL, data, z)) - rankdists <- rankdists[indices] - return(rankdists) + data <- apply(data, 2, rank) + n <- nrow(data) + m <- cov(data) + cv <- scale_addressing_ties(nrow(data), cov(data)) + inv.scale.matrix <- safe_invert(cv) + rm(cv) + + return(mahalanobisHelper(data, index, inv.scale.matrix)) +} + +compute_pooled_cov_rank_mahalanobis <- function(index, data, z) { + if (!all(is.finite(data))) + stop("Infinite or NA values detected in data for Mahalanobis computations.") + + if (is.null(rownames(data)) | !all(index %in% rownames(data))) + stop("data must have row names matching index") + + data <- apply(data, 2, rank) + + if (sum(z) == 1) { + mt <- 0 # Addressing #168 + } else { + treated <- data[z, ,drop = FALSE] + nt <- nrow(treated) + mt <- scale_addressing_ties(nt, cov(treated)) + mt <- mt * (sum(z) - 1) / (length(z) - 2) + } + + if (sum(!z) == 1) { + mc <- 0 # Addressing #168 + } else { + control <- data[!z, ,drop = FALSE] + nc <- nrow(control) + mc <- scale_addressing_ties(nc, cov(control)) + mc <- mc * (sum(!z) - 1) / (length(!z) - 2) + } + + cv <- mt + mc + rm(mt, mc) + + inv.scale.matrix <- safe_invert(cv) + rm(cv) + + return(mahalanobisHelper(data, index, inv.scale.matrix)) } #' @details \bold{First argument (\code{x}): \code{function}.} The passed function diff --git a/R/utilities.R b/R/utilities.R index ed3aa2e0..79bf2d7c 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -138,3 +138,30 @@ missing_x_msg <- function(x_str, data_str, ...) { paste(data_str, "$", x_str, sep=""), msg_tail) } + +#' @importFrom stats var +scale_addressing_ties <- function(n, cv) { + vuntied <- var(1:n) + rat <- sqrt(vuntied/diag(cv)) + if (length(rat) > 1) { + diag_rat <- diag(rat) + } else { + diag_rat <- as.matrix(rat) + } + return(diag_rat %*% cv %*% diag_rat) +} + +safe_invert <- function(x) { + inv.scale.matrix <- try(solve(x), silent = TRUE) + + if (inherits(inv.scale.matrix,"try-error")) { + dnx <- dimnames(x) + s <- svd(x) + nz <- (s$d > sqrt(.Machine$double.eps) * s$d[1]) + if (!any(nz)) stop("covariance has rank zero") + + inv.scale.matrix <- s$v[, nz] %*% (t(s$u[, nz])/s$d[nz]) + dimnames(inv.scale.matrix) <- dnx[2:1] + } + return(inv.scale.matrix) +} diff --git a/man/match_on-methods.Rd b/man/match_on-methods.Rd index 3c5a349f..7a4a340b 100644 --- a/man/match_on-methods.Rd +++ b/man/match_on-methods.Rd @@ -208,8 +208,14 @@ Details for each particular first type of argument follow: redundancies among the variables by scaling down variables contributions in proportion to their correlations with other included variables.) - Euclidean distance is also available, via \code{method="euclidean"}, and - ranked, Mahalanobis distance, via \code{method="rank_mahalanobis"}. + Euclidean distance is also available, via \code{method="euclidean"}, as + are two flavors of ranked-based Mahalanobis distance, via + \code{method="rank_mahalanobis"} or \code{method="pooled_cov_rank_mahalanobis"}. + Either rank-transforms the covariates first; they differ in whether + subsequent covariance of thus-transformed covariates is calculated + on all subjects or by pooling of with-group covariances across + treatment and control. The \code{method=} argument can be abbreviated + in the usual way (via [base::pmatch()]). The treatment indicator \code{Z} as noted above must either be numeric (1 representing treated units and 0 control units) or logical diff --git a/tests/testthat/test.match_on.R b/tests/testthat/test.match_on.R index 12c71393..4247ccec 100644 --- a/tests/testthat/test.match_on.R +++ b/tests/testthat/test.match_on.R @@ -174,6 +174,7 @@ test_that("Issue 87: NA's in data => unmatchable, but retained, units in distanc expect_equivalent(f("mahalanobis"), expectedM) expect_equivalent(f("euclid"), expectedM) expect_equivalent(f("rank_mahal"), expectedM) + expect_equivalent(f("pooled_cov"), expectedM) cal1 <- caliper(match_on(z~x1, data=d), width=1e3) expect_equivalent(g(as.matrix(match_on(z ~ x1 + x2, data = d, diff --git a/tests/testthat/test.rank.mahal.R b/tests/testthat/test.rank.mahal.R index 4ff46fd7..05905816 100644 --- a/tests/testthat/test.rank.mahal.R +++ b/tests/testthat/test.rank.mahal.R @@ -111,7 +111,7 @@ test_that("Fix for #128 (`compute_rank_mahalanobis` ignores index argument) hold reference_rankmahal <- compute_smahal(z, X) - indices <- expand.grid(rownames(reference_rankmahal), colnames(reference_rankmahal)) + indices <- expand.grid(rownames(reference_rankmahal), colnames(reference_rankmahal)) indices <- as.matrix(indices) expect_equivalent(optmatch:::compute_rank_mahalanobis(indices, X, as.logical(z)), reference_rankmahal[1L:numdists]) @@ -125,3 +125,42 @@ test_that("Fix for #128 (`compute_rank_mahalanobis` ignores index argument) hold }) + +is_scalar_multiple <- function(A, B) { + # Ensure dimensions match + if (!all(dim(A) == dim(B))) { + return(FALSE) + } + + # Find positions where B is non-zero to avoid division by zero + non_zero_positions <- B != 0 + + # Compute element-wise ratio where B != 0 + ratios <- A[non_zero_positions] / B[non_zero_positions] + + # Check if all ratios are (approximately) equal + return(all(abs(ratios - ratios[1]) < 1e-8)) +} + +test_that("compute_pooled_cov_rank_mahalanobis results match ordinary Mahalanobis's", { + ## nr number of samples + nr <- 10L + z <- integer(nr) + ## two outcomes: 0 (from initialization), and 1 (assigned below randomly) + z[sample(1:nr, nr / 2L)] <- 1L + + ## Goal: two groups with the same within-group variance and no rank ties + df <- data.frame(z = z, X = integer(nr)) + df[df$z == 0, 'X'] <- seq(1, by=2, len = nr / 2) # odds + df[df$z == 1, 'X'] <- seq(2, by=2, len = nr / 2) # evens + + A <- match_on(z~., data=df, method="pooled_cov") + B <- match_on(z~., data=df, method="mahalanobis") + # Check if all ratios are (approximately) equal + expect_true(is_scalar_multiple(A, B)) + + ez <- exactMatch(z~., data=df) + A <- match_on(z~., data=df, method="pooled_cov", within=ez) + B <- match_on(z~., data=df, method="mahalanobis", within=ez) + expect_true(is_scalar_multiple(A, B)) +}) diff --git a/tests/testthat/test.utilities.R b/tests/testthat/test.utilities.R index b7dcb127..8b4bf131 100644 --- a/tests/testthat/test.utilities.R +++ b/tests/testthat/test.utilities.R @@ -2,6 +2,8 @@ # Tests for utility functions ################################################################################ +library(MASS) + context("Utility Functions") test_that("toZ", { @@ -60,3 +62,25 @@ test_that("#159 - toZ for labelled", { expect_true(TRUE) # avoiding empty test warning } }) + +test_that("scale_addressing_ties", { + x <- cbind(sample(1:5, 10, replace=TRUE), + sample(1:5, 10, replace=TRUE)) + y <- scale_addressing_ties(nrow(x), cov(x)) + dy <- diag(y) + expect_equal(dy, rep(var(1:nrow(x)), length(dy))) +}) + +test_that("safe_invert", { + ## full rank symmetric square matrix + A <- matrix(runif(25), 5, 5) + symmetric_matrix <- A %*% t(A) + inv_A <- safe_invert(A) + expect_equal(inv_A, solve(A)) + + ## rank deficient symmetric square matrix + B <- matrix(runif(15), 5, 3) + symmetric_matrix <- B %*% t(B) + inv_B <- safe_invert(B) + expect_equal(inv_B, ginv(B)) +})