Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
9185877
add a covariance filter function
mmyrte Nov 27, 2025
1c7a1f5
update docs
mmyrte Nov 27, 2025
6809db2
better docs
mmyrte Nov 27, 2025
6db820e
drop id_trans; will be set by DB
mmyrte Nov 27, 2025
4d98d62
parquet_duckdb: move content-agnostic evoland_db logic
mmyrte Nov 27, 2025
0aa5a46
initial trans-preds implementation
mmyrte Nov 27, 2025
f394e85
binomial -> quasibinomial
mmyrte Nov 27, 2025
5625308
move neighbors logic into codebase + refactor: use $set to add method…
mmyrte Nov 28, 2025
e83d119
cast_dt_col - check before coercing
mmyrte Nov 28, 2025
46ee96a
error if table is missing
mmyrte Nov 28, 2025
176da53
weights makes no sense at this level
mmyrte Nov 28, 2025
e8f2ef7
compute_neighbors -> create_neighbors_t
mmyrte Nov 28, 2025
a0a6256
make transition results bool
mmyrte Nov 28, 2025
3cc787e
more informative / reflective printing method
mmyrte Nov 29, 2025
51efad0
set_neighbors: more idiomatic
mmyrte Nov 29, 2025
f3223df
changed print method
mmyrte Nov 30, 2025
ea5ed53
cast for safety
mmyrte Nov 30, 2025
0feb2d7
enable committing from existing in-memory tables
mmyrte Nov 30, 2025
a9fea62
corrected implementation of neighbour predictors
mmyrte Nov 30, 2025
6bd2277
add progress bar to neighbourhood calc
mmyrte Dec 1, 2025
7d6c453
commit: again a single public method; needed to manage table attachment
mmyrte Dec 1, 2025
8525327
enable filling trans_pred_data_v with 0s for NAs / simpler passing of…
mmyrte Dec 1, 2025
be909cc
trans_pred_data_v: enable filtering on id_pred
mmyrte Dec 1, 2025
1e8d2ec
make the predictor selection a pruning of a potentially existing tran…
mmyrte Dec 1, 2025
bd16b40
shut up distance progress
mmyrte Dec 2, 2025
d7e6303
add grrf filter
mmyrte Dec 2, 2025
e6c8e5c
make prune robust / normalize importance to [0,1]
mmyrte Dec 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 29 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,35 @@ Imports:
terra
Suggests:
tinytest,
quarto
quarto,
ranger
VignetteBuilder: quarto
Config/testthat/edition: 3
LinkingTo:
LinkingTo:
Rcpp
Collate:
'RcppExports.R'
'alloc_params_t.R'
'coords_t.R'
'covariance_filter.R'
'parquet_duckdb.R'
'evoland_db.R'
'evoland_db_neighbors.R'
'evoland_db_tables.R'
'evoland_db_views.R'
'grrf_filter.r'
'init.R'
'intrv_masks_t.R'
'intrv_meta_t.R'
'lulc_data_t.R'
'lulc_meta_t.R'
'neighbors_t.R'
'periods_t.R'
'pred_data_t.R'
'pred_meta_t.R'
'trans_meta_t.R'
'trans_models_t.R'
'trans_preds_t.R'
'util.R'
'util_download.R'
'util_terra.R'
9 changes: 7 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ S3method(print,intrv_masks_t)
S3method(print,intrv_meta_t)
S3method(print,lulc_data_t)
S3method(print,lulc_meta_t)
S3method(print,neighbors_t)
S3method(print,periods_t)
S3method(print,pred_data_t)
S3method(print,pred_meta_t)
Expand All @@ -22,6 +23,7 @@ S3method(validate,intrv_masks_t)
S3method(validate,intrv_meta_t)
S3method(validate,lulc_data_t)
S3method(validate,lulc_meta_t)
S3method(validate,neighbors_t)
S3method(validate,periods_t)
S3method(validate,pred_data_t)
S3method(validate,pred_data_t_bool)
Expand All @@ -37,24 +39,27 @@ export(as_intrv_masks_t)
export(as_intrv_meta_t)
export(as_lulc_data_t)
export(as_lulc_meta_t)
export(as_neighbors_t)
export(as_periods_t)
export(as_pred_data_t)
export(as_pred_meta_t)
export(as_trans_meta_t)
export(as_trans_models_t)
export(as_trans_preds_t)
export(compute_neighbors)
export(covariance_filter)
export(create_coords_t_square)
export(create_intrv_meta_t)
export(create_intrv_meta_t_row)
export(create_lulc_meta_t)
export(create_neighbors_t)
export(create_periods_t)
export(create_pred_meta_t)
export(create_trans_meta_t)
export(create_trans_preds_t)
export(download_and_verify)
export(evoland_db)
export(extract_using_coords_t)
export(grrf_filter)
export(parquet_duckdb)
export(print_rowwise_yaml)
export(validate)
importFrom(Rcpp,sourceCpp)
Expand Down
4 changes: 2 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

distance_neighbors_cpp <- function(coords_t, max_distance, resolution = 100.0) {
.Call(`_evoland_distance_neighbors_cpp`, coords_t, max_distance, resolution)
distance_neighbors_cpp <- function(coords_t, max_distance, resolution = 100.0, quiet = FALSE) {
.Call(`_evoland_distance_neighbors_cpp`, coords_t, max_distance, resolution, quiet)
}

4 changes: 2 additions & 2 deletions R/coords_t.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ as_coords_t <- function(x) {
geom_polygon = list()
)
}
cast_dt_col(x, "id_coord", as.integer)
cast_dt_col(x, "id_coord", "int")
if (!is.null(x[["region"]])) {
cast_dt_col(x, "region", as.factor)
cast_dt_col(x, "region", "factor")
}
new_evoland_table(
x,
Expand Down
168 changes: 168 additions & 0 deletions R/covariance_filter.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
#' Two stage covariate filtering
#'
#' The `covariance_filter` returns a set of covariates for land use land cover change
#' (LULCC) models based on a two-stage variable selection: a first statistical fit
#' estimates a covariate's quality for a given prediction task. A second step selects
#' all variables below a given correlation threshold: We iterate over a correlation
#' matrix ordered in the first step. Starting within the leftmost column, all rows (i.e.
#' candidates) greater than the given threshold are dropped from the full set of
#' candidates. This candidate selection is retained and used to select the next column,
#' until no further columns are left to investigate. The columns that were iterated over
#' are those returned as a character vector of selected variable names.
#'
#' @param data A data.table of target variable and candidate covariates to be filtered;
#' wide format with one predictor per column.
#' @param result_col Name of the column representing the transition results (0: no
#' trans, 1: trans)
#' @param rank_fun Optional function to compute ranking scores for each covariate.
#' Should take arguments (x, y, weights, ...) and return a single numeric value
#' (lower = better). Defaults to polynomial GLM p-value ranking.
#' @param weights Optional vector of weights to be used in the ranking function. Defaults to
#' class-balanced weights
#' @param corcut Numeric threshold (0-1) for correlation filtering. Covariates whose
#' absolute correlation with an already selected covariate exceeds this threshold will
#' be filtered out. Default is 0.7; a value of 1 disables correlation filtering.
#' @param ... Additional arguments passed to rank_fun.
#'
#' @return A set of column names (covariates) to retain
#'
#' @details
#' The function first ranks covariates using the provided ranking function (default:
#' quasibinomial polynomial GLM). Then, it iteratively removes highly (Pearson)
#' correlated variables based on the correlation cutoff threshold, preserving variables
#' in order of their ranking. See
#' <https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.covfilter.r> for
#' where the concept came from. The original author was Antoine Adde, with edits by
#' Benjamin Black. A similar mechanism is found in <https://github.com/antadde/covsel/>.
#'
#' @name covariance_filter
#'
#' @export

covariance_filter <- function(
  data,
  result_col = "result",
  rank_fun = rank_poly_glm,
  weights = compute_balanced_weights(data[[result_col]]),
  corcut = 0.7,
  ...
) {
  data.table::setDT(data)

  stopifnot(
    "result_col must be a column of data" = result_col %in% names(data),
    "corcut must be between 0 and 1" = corcut >= 0 && corcut <= 1
  )

  # With zero or one candidate covariate there is nothing to rank or filter.
  # Return the candidate name(s) directly, keeping the documented
  # character-vector return type (the previous early return handed back
  # `data` itself, which broke the contract).
  if (ncol(data) <= 2) {
    return(setdiff(names(data), result_col))
  }

  # Score every candidate covariate against the outcome (lower = better).
  scores <- vapply(
    data[, -..result_col],
    rank_fun,
    FUN.VALUE = numeric(1),
    y = data[[result_col]],
    weights = weights,
    ...
  )

  # Best-ranked first; this order drives the greedy correlation pruning.
  ranked_order <- names(sort(scores))

  # corcut == 1 keeps everything, so skip the correlation step entirely.
  if (corcut == 1) {
    return(ranked_order)
  }

  # Absolute pairwise Pearson correlations, computed once on the ranked set.
  cor_mat <- abs(cor(data[, ..ranked_order], use = "pairwise.complete.obs"))

  # Greedily drop candidates too correlated with better-ranked survivors.
  select_by_correlation(cor_mat, corcut)
}


#' @describeIn covariance_filter Default ranking function using polynomial GLM. Returns
#'   the lower p value of the two polynomial terms
#' @param x A numeric vector representing a single covariate
#' @param y A binary outcome vector (0/1)
#' @param weights Optional weights vector
#' @keywords internal
rank_poly_glm <- function(x, y, weights = NULL, ...) {
  # Design matrix: intercept column plus orthogonal linear and quadratic terms.
  design <- cbind(1, poly(x, degree = 2, simple = TRUE))

  fit <- glm.fit(
    x = design,
    y = y,
    family = quasibinomial(),
    weights = weights
  )

  # Coefficient table of the fitted model; rows 2 and 3 hold the linear and
  # quadratic terms, column 4 their p-values.
  coef_tab <- summary.glm(fit)$coefficients
  pvals <- coef_tab[2:3, 4]

  # The smaller p-value scores the covariate (lower = more informative).
  min(pvals, na.rm = TRUE)
}


#' @describeIn covariance_filter Compute class-balanced weights for imbalanced binary
#'   outcomes; returns a numeric vector
#' @param trans_result Binary outcome vector (logical or 0/1)
#' @param legacy Bool, use legacy weighting?
#' @keywords internal
compute_balanced_weights <- function(trans_result, legacy = FALSE) {
  # Coerce to logical up front: a numeric 0/1 vector used directly as a
  # subscript (`weights[trans_result]`) would be interpreted as *positional*
  # indices rather than a logical mask and silently assign wrong elements.
  trans_result <- as.logical(trans_result)

  n_total <- length(trans_result)
  n_trans <- sum(trans_result)
  n_non_trans <- n_total - n_trans

  # Both classes must be present, otherwise the ratios below divide by zero
  # and produce Inf/NaN weights.
  stopifnot(
    "trans_result must contain both classes" = n_trans > 0 && n_non_trans > 0
  )

  weights <- numeric(n_total)

  if (legacy) {
    # I found this weighting in evoland-plus-legacy, but the models wouldn't converge
    # https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.splitforcovselection.r
    # This is actually just setting the underrepresented class to the rounded imbalance ratio
    weights[!trans_result] <- 1
    weights[trans_result] <- round(n_non_trans / n_trans)
    return(weights)
  }

  # This is the heuristic in scikit-learn, n_samples / (n_classes * np.bincount(y))
  # https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html #nolint
  # This weighting maintains the exact imbalance ratio
  weights[trans_result] <- n_total / (2 * n_trans)
  weights[!trans_result] <- n_total / (2 * n_non_trans)

  weights
}


#' @describeIn covariance_filter Implements the iterative selection procedure.
#' @param cor_mat Absolute correlation matrix
#' @param corcut Correlation cutoff threshold
#' @keywords internal
select_by_correlation <- function(cor_mat, corcut) {
  vars <- colnames(cor_mat)

  # Fast path: no off-diagonal correlation reaches the threshold, so every
  # variable survives the pruning untouched.
  if (all(cor_mat[lower.tri(cor_mat)] < corcut)) {
    return(vars)
  }

  kept <- character(0)
  candidates <- seq_along(vars)

  repeat {
    if (length(candidates) == 0) {
      break
    }

    # The leading candidate is the best-ranked survivor: keep it, then
    # discard every remaining variable too strongly correlated with it.
    # The lead itself is discarded too, since its self-correlation of 1
    # always exceeds corcut here.
    lead <- candidates[1]
    kept <- c(kept, vars[lead])
    survivors <- which(cor_mat[candidates, lead] <= corcut)
    candidates <- candidates[survivors]
  }

  kept
}
Loading