Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
9185877
add a covariance filter function
mmyrte Nov 27, 2025
1c7a1f5
update docs
mmyrte Nov 27, 2025
6809db2
better docs
mmyrte Nov 27, 2025
6db820e
drop id_trans; will be set by DB
mmyrte Nov 27, 2025
4d98d62
parquet_duckdb: move content-agnostic evoland_db logic
mmyrte Nov 27, 2025
0aa5a46
initial trans-preds implementation
mmyrte Nov 27, 2025
f394e85
binomial -> quasibinomial
mmyrte Nov 27, 2025
5625308
move neighbors logic into codebase + refactor: use $set to add method…
mmyrte Nov 28, 2025
e83d119
cast_dt_col - check before coercing
mmyrte Nov 28, 2025
46ee96a
error if table is missing
mmyrte Nov 28, 2025
176da53
weights makes no sense at this level
mmyrte Nov 28, 2025
e8f2ef7
compute_neighbors -> create_neighbors_t
mmyrte Nov 28, 2025
a0a6256
make transition results bool
mmyrte Nov 28, 2025
3cc787e
more informative / reflective printing method
mmyrte Nov 29, 2025
51efad0
set_neighbors: more idiomatic
mmyrte Nov 29, 2025
f3223df
changed print method
mmyrte Nov 30, 2025
ea5ed53
cast for safety
mmyrte Nov 30, 2025
0feb2d7
enable committing from existing in-memory tables
mmyrte Nov 30, 2025
a9fea62
corrected implementation of neighbour predictors
mmyrte Nov 30, 2025
6bd2277
add progress bar to neighbourhood calc
mmyrte Dec 1, 2025
7d6c453
commit: again a single public method; needed to manage table attachment
mmyrte Dec 1, 2025
8525327
enable filling trans_pred_data_v with 0s for NAs / simpler passing of…
mmyrte Dec 1, 2025
be909cc
trans_pred_data_v: enable filtering on id_pred
mmyrte Dec 1, 2025
1e8d2ec
make the predictor selection a pruning of a potentially existing tran…
mmyrte Dec 1, 2025
bd16b40
shut up distance progress
mmyrte Dec 2, 2025
d7e6303
add grrf filter
mmyrte Dec 2, 2025
e6c8e5c
make prune robust / normalize importance to [0,1]
mmyrte Dec 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 29 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,35 @@ Imports:
terra
Suggests:
tinytest,
quarto
quarto,
ranger
VignetteBuilder: quarto
Config/testthat/edition: 3
LinkingTo:
LinkingTo:
Rcpp
Collate:
'RcppExports.R'
'alloc_params_t.R'
'coords_t.R'
'covariance_filter.R'
'parquet_duckdb.R'
'evoland_db.R'
'evoland_db_neighbors.R'
'evoland_db_tables.R'
'evoland_db_views.R'
'grrf_filter.r'
'init.R'
'intrv_masks_t.R'
'intrv_meta_t.R'
'lulc_data_t.R'
'lulc_meta_t.R'
'neighbors_t.R'
'periods_t.R'
'pred_data_t.R'
'pred_meta_t.R'
'trans_meta_t.R'
'trans_models_t.R'
'trans_preds_t.R'
'util.R'
'util_download.R'
'util_terra.R'
9 changes: 7 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ S3method(print,intrv_masks_t)
S3method(print,intrv_meta_t)
S3method(print,lulc_data_t)
S3method(print,lulc_meta_t)
S3method(print,neighbors_t)
S3method(print,periods_t)
S3method(print,pred_data_t)
S3method(print,pred_meta_t)
Expand All @@ -22,6 +23,7 @@ S3method(validate,intrv_masks_t)
S3method(validate,intrv_meta_t)
S3method(validate,lulc_data_t)
S3method(validate,lulc_meta_t)
S3method(validate,neighbors_t)
S3method(validate,periods_t)
S3method(validate,pred_data_t)
S3method(validate,pred_data_t_bool)
Expand All @@ -37,24 +39,27 @@ export(as_intrv_masks_t)
export(as_intrv_meta_t)
export(as_lulc_data_t)
export(as_lulc_meta_t)
export(as_neighbors_t)
export(as_periods_t)
export(as_pred_data_t)
export(as_pred_meta_t)
export(as_trans_meta_t)
export(as_trans_models_t)
export(as_trans_preds_t)
export(compute_neighbors)
export(covariance_filter)
export(create_coords_t_square)
export(create_intrv_meta_t)
export(create_intrv_meta_t_row)
export(create_lulc_meta_t)
export(create_neighbors_t)
export(create_periods_t)
export(create_pred_meta_t)
export(create_trans_meta_t)
export(create_trans_preds_t)
export(download_and_verify)
export(evoland_db)
export(extract_using_coords_t)
export(grrf_filter)
export(parquet_duckdb)
export(print_rowwise_yaml)
export(validate)
importFrom(Rcpp,sourceCpp)
Expand Down
4 changes: 2 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

distance_neighbors_cpp <- function(coords_t, max_distance, resolution = 100.0) {
.Call(`_evoland_distance_neighbors_cpp`, coords_t, max_distance, resolution)
distance_neighbors_cpp <- function(coords_t, max_distance, resolution = 100.0, quiet = FALSE) {
.Call(`_evoland_distance_neighbors_cpp`, coords_t, max_distance, resolution, quiet)
}

4 changes: 2 additions & 2 deletions R/coords_t.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ as_coords_t <- function(x) {
geom_polygon = list()
)
}
cast_dt_col(x, "id_coord", as.integer)
cast_dt_col(x, "id_coord", "int")
if (!is.null(x[["region"]])) {
cast_dt_col(x, "region", as.factor)
cast_dt_col(x, "region", "factor")
}
new_evoland_table(
x,
Expand Down
168 changes: 168 additions & 0 deletions R/covariance_filter.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
#' Two stage covariate filtering
#'
#' The `covariance_filter` returns a set of covariates for land use land cover change
#' (LULCC) models based on a two-stage variable selection: a first statistical fit
#' estimates a covariate's quality for a given prediction task. A second step selects
#' all variables below a given correlation threshold: We iterate over a correlation
#' matrix ordered in the first step. Starting within the leftmost column, all rows (i.e.
#' candidates) greater than the given threshold are dropped from the full set of
#' candidates. This candidate selection is retained and used to select the next column,
#' until no further columns are left to investigate. The columns that were iterated over
#' are those returned as a character vector of selected variable names.
#'
#' @param data A data.table of target variable and candidate covariates to be filtered;
#' wide format with one predictor per column.
#' @param result_col Name of the column representing the transition results (0: no
#' trans, 1: trans)
#' @param rank_fun Optional function to compute ranking scores for each covariate.
#' Should take arguments (x, y, weights, ...) and return a single numeric value
#' (lower = better). Defaults to polynomial GLM p-value ranking.
#' @param weights Optional vector of weights to be used in the ranking function. Defaults to
#' class-balanced weights
#' @param corcut Numeric threshold (0-1) for correlation filtering. Covariates whose
#' absolute correlation with an already selected covariate exceeds this threshold will
#' be filtered out. Default is 0.7; a value of 1 disables correlation filtering.
#' @param ... Additional arguments passed to rank_fun.
#'
#' @return A set of column names (covariates) to retain
#'
#' @details
#' The function first ranks covariates using the provided ranking function (default:
#' quasibinomial polynomial GLM). Then, it iteratively removes highly (Pearson)
#' correlated variables based on the correlation cutoff threshold, preserving variables
#' in order of their ranking. See
#' <https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.covfilter.r> for
#' where the concept came from. The original author was Antoine Adde, with edits by
#' Benjamin Black. A similar mechanism is found in <https://github.com/antadde/covsel/>.
#'
#' @name covariance_filter
#'
#' @export

covariance_filter <- function(
  data,
  result_col = "result",
  rank_fun = rank_poly_glm,
  weights = compute_balanced_weights(data[[result_col]]),
  corcut = 0.7,
  ...
) {
  data.table::setDT(data)

  stopifnot(
    "result_col must be a column of data" = result_col %in% names(data),
    "corcut must be between 0 and 1" = corcut >= 0 && corcut <= 1
  )

  # With zero or one candidate covariate there is nothing to rank or filter.
  # Return the candidate name(s) directly, keeping the documented
  # character-vector return type (the previous early return handed back
  # `data` itself, which broke the contract).
  if (ncol(data) <= 2) {
    return(setdiff(names(data), result_col))
  }

  # Score every candidate covariate against the outcome (lower = better).
  scores <- vapply(
    data[, -..result_col],
    rank_fun,
    FUN.VALUE = numeric(1),
    y = data[[result_col]],
    weights = weights,
    ...
  )

  # Best-ranked first; this order drives the greedy correlation pruning.
  ranked_order <- names(sort(scores))

  # corcut == 1 keeps everything, so skip the correlation step entirely.
  if (corcut == 1) {
    return(ranked_order)
  }

  # Absolute pairwise Pearson correlations, computed once on the ranked set.
  cor_mat <- abs(cor(data[, ..ranked_order], use = "pairwise.complete.obs"))

  # Greedily drop candidates too correlated with better-ranked survivors.
  select_by_correlation(cor_mat, corcut)
}


#' @describeIn covariance_filter Default ranking function using polynomial GLM. Returns
#'   the lower p value of the two polynomial terms
#' @param x A numeric vector representing a single covariate
#' @param y A binary outcome vector (0/1)
#' @param weights Optional weights vector
#' @keywords internal
rank_poly_glm <- function(x, y, weights = NULL, ...) {
  # Design matrix: intercept column plus orthogonal linear and quadratic terms.
  design <- cbind(1, poly(x, degree = 2, simple = TRUE))

  fit <- glm.fit(
    x = design,
    y = y,
    family = quasibinomial(),
    weights = weights
  )

  # Coefficient table of the fitted model; rows 2 and 3 hold the linear and
  # quadratic terms, column 4 their p-values.
  coef_tab <- summary.glm(fit)$coefficients
  pvals <- coef_tab[2:3, 4]

  # The smaller p-value scores the covariate (lower = more informative).
  min(pvals, na.rm = TRUE)
}


#' @describeIn covariance_filter Compute class-balanced weights for imbalanced binary
#'   outcomes; returns a numeric vector
#' @param trans_result Binary outcome vector (logical or 0/1)
#' @param legacy Bool, use legacy weighting?
#' @keywords internal
compute_balanced_weights <- function(trans_result, legacy = FALSE) {
  # Coerce to logical up front: a numeric 0/1 vector used directly as a
  # subscript (`weights[trans_result]`) would be interpreted as *positional*
  # indices rather than a logical mask and silently assign wrong elements.
  trans_result <- as.logical(trans_result)

  n_total <- length(trans_result)
  n_trans <- sum(trans_result)
  n_non_trans <- n_total - n_trans

  # Both classes must be present, otherwise the ratios below divide by zero
  # and produce Inf/NaN weights.
  stopifnot(
    "trans_result must contain both classes" = n_trans > 0 && n_non_trans > 0
  )

  weights <- numeric(n_total)

  if (legacy) {
    # I found this weighting in evoland-plus-legacy, but the models wouldn't converge
    # https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.splitforcovselection.r
    # This is actually just setting the underrepresented class to the rounded imbalance ratio
    weights[!trans_result] <- 1
    weights[trans_result] <- round(n_non_trans / n_trans)
    return(weights)
  }

  # This is the heuristic in scikit-learn, n_samples / (n_classes * np.bincount(y))
  # https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html #nolint
  # This weighting maintains the exact imbalance ratio
  weights[trans_result] <- n_total / (2 * n_trans)
  weights[!trans_result] <- n_total / (2 * n_non_trans)

  weights
}


#' @describeIn covariance_filter Implements the iterative selection procedure.
#' @param cor_mat Absolute correlation matrix
#' @param corcut Correlation cutoff threshold
#' @keywords internal
select_by_correlation <- function(cor_mat, corcut) {
  vars <- colnames(cor_mat)

  # Fast path: no off-diagonal correlation reaches the threshold, so every
  # variable survives the pruning untouched.
  if (all(cor_mat[lower.tri(cor_mat)] < corcut)) {
    return(vars)
  }

  kept <- character(0)
  candidates <- seq_along(vars)

  repeat {
    if (length(candidates) == 0) {
      break
    }

    # The leading candidate is the best-ranked survivor: keep it, then
    # discard every remaining variable too strongly correlated with it.
    # The lead itself is discarded too, since its self-correlation of 1
    # always exceeds corcut here.
    lead <- candidates[1]
    kept <- c(kept, vars[lead])
    survivors <- which(cor_mat[candidates, lead] <= corcut)
    candidates <- candidates[survivors]
  }

  kept
}
Loading