From 9185877dd484816b6a46ec9e23e4f9d40e1a9edb Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Thu, 27 Nov 2025 11:45:00 +0100 Subject: [PATCH 01/27] add a covariance filter function this is derived from the original by antoine adde --- NAMESPACE | 1 + R/covariance_filter.R | 169 +++++++++++++++++++++++++++++++++++++++ man/covariance_filter.Rd | 81 +++++++++++++++++++ 3 files changed, 251 insertions(+) create mode 100644 R/covariance_filter.R create mode 100644 man/covariance_filter.Rd diff --git a/NAMESPACE b/NAMESPACE index 77baa3d..888b193 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -44,6 +44,7 @@ export(as_trans_meta_t) export(as_trans_models_t) export(as_trans_preds_t) export(compute_neighbors) +export(covariance_filter) export(create_coords_t_square) export(create_intrv_meta_t) export(create_intrv_meta_t_row) diff --git a/R/covariance_filter.R b/R/covariance_filter.R new file mode 100644 index 0000000..42ed84f --- /dev/null +++ b/R/covariance_filter.R @@ -0,0 +1,169 @@ +#' Filter covariates for land use land cover change (LULCC) models +#' +#' This function filters a set of covariates for land use land cover change (LULCC) +#' models based on various statistical methods and correlation thresholds. +#' +#' @param data A data.table of target variable and candidate covariates to be filtered; +#' wide format with one predictor per column. +#' @param result_col Name of the column representing the transition results (0: no +#' trans, 1: trans) +#' @param rank_fun Optional function to compute ranking scores for each covariate. +#' Should take arguments (x, y, weights, ...) and return a single numeric value +#' (lower = better). Defaults to polynomial GLM p-value ranking. +#' @param weights Optional vector of weights to be used in the ranking function. +#' If NULL and rank_fun uses default, class-balanced weights are computed automatically. +#' @param corcut Numeric threshold (0-1) for correlation filtering. 
Covariates with correlation +#' coefficients above this threshold will be filtered out. Default is 0 (no filtering). +#' @param ... Additional arguments passed to rank_fun. +#' +#' @return A filtered data.table containing only the selected covariates after ranking +#' by the specified method and filtering based on correlation threshold. +#' +#' @details +#' The function first ranks covariates using the provided ranking function (default: +#' quasibinomial polynomial GLM). Then, it iteratively removes highly correlated variables +#' based on the correlation cutoff threshold, preserving variables in order of their +#' ranking. See +#' for +#' where the concept came from. The original author was Antoine Adde, with edits by +#' Benjamin Black. +#' +#' @name covariance_filter +#' +#' @export + +covariance_filter <- function( + data, + result_col = "result", + rank_fun = rank_poly_glm, + weights = compute_balanced_weights(data[[result_col]]), + corcut = 0.7, + ... +) { + # Early return for single covariate + if (ncol(data) == 1) { + return(data) + } + + data.table::setDT(data) + + # Validate binary outcome + stopifnot( + "result_col must be binary (0/1)" = length(unique(data[[result_col]])) == 2, + "corcut must be between 0 and 1" = corcut >= 0 && corcut <= 1 + ) + + # Compute ranking scores for all covariates (vectorized where possible) + scores <- vapply( + data[, -..result_col], + rank_fun, + FUN.VALUE = numeric(1), + y = data[[result_col]], + weights = weights, + ... 
+ ) + + # Sort by scores (lower = better/more significant) + ranked_order <- order(scores) + data_ranked <- data[, ..ranked_order] + + # If no correlation filtering needed, return ranked data + if (corcut == 1) { + return(data_ranked) + } + + # Compute correlation matrix once + cor_mat <- abs(cor(data_ranked, use = "pairwise.complete.obs")) + + # Iteratively select covariates based on correlation threshold + selected <- select_by_correlation(cor_mat, corcut) + + # Return selected covariates + data_ranked[, ..selected, drop = FALSE] +} + + +#' @describeIn covariance_filter Default ranking function using polynomial GLM. Returns +#' the lower p value for each of the polynomial terms +#' @param x A numeric vector representing a single covariate +#' @param y A binary outcome vector (0/1) +#' @param weights Optional weights vector +#' @param ... Additional arguments (ignored) +#' @keywords internal +rank_poly_glm <- function(x, y, weights = NULL, ...) { + fit <- glm.fit( + x = cbind(1, poly(x, degree = 2, simple = TRUE)), + y = y, + family = binomial(), + weights = weights + ) + + # Get p-values for linear and quadratic terms + coef_summary <- summary.glm(fit)$coefficients + + # Return minimum p-value (most significant term) + min(coef_summary[2:3, 4], na.rm = TRUE) +} + + +#' @describeIn covariance_filter Compute class-balanced weights for imbalanced binary +#' outcomes; returns a numeric vector +#' @param trans_result Binary outcome vector (0/1) +#' @param legacy Bool, use the legacy weighting? 
+#' @keywords internal +compute_balanced_weights <- function(trans_result, legacy = FALSE) { + n_total <- length(trans_result) + n_trans <- sum(trans_result == 1) + n_non_trans <- sum(trans_result == 0) + + # Compute inverse frequency weights + weights <- numeric(n_total) + + if (legacy) { + # I found this weighting in evoland-plus-legacy, but the models wouldn't converge + # https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.splitforcovselection.r + # This is actually just setting the underrepresented class to the rounded imbalance ratio + weights[trans_result == 0] <- 1 + weights[trans_result == 1] <- round(n_non_trans / n_trans) + return(weights) + } + + # This is the heuristic in scikit-learn, n_samples / (n_classes * np.bincount(y)) + # https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html #nolint + # This weighting maintains the exact imbalance ratio + weights[trans_result == 1] <- n_total / (2 * n_trans) + weights[trans_result == 0] <- n_total / (2 * n_non_trans) + + weights +} + + +#' @describeIn covariance_filter Select variables iteratively based on correlation +#' threshold; returns a character vector of selected variable names +#' @param cor_mat Absolute correlation matrix +#' @param corcut Correlation cutoff threshold +#' @keywords internal +select_by_correlation <- function(cor_mat, corcut) { + var_names <- colnames(cor_mat) + + # Early return if all correlations are below threshold + if (all(cor_mat[lower.tri(cor_mat)] < corcut)) { + return(var_names) + } + + selected <- character(0) + remaining_idx <- seq_along(var_names) + + while (length(remaining_idx) > 0) { + # Select the first remaining variable (highest ranked) + current_var <- remaining_idx[1] + selected <- c(selected, var_names[current_var]) + + # Find variables with correlation <= corcut with current variable + # (excluding the variable itself) + keep_idx <- which(cor_mat[remaining_idx, current_var] <= corcut) + remaining_idx 
<- remaining_idx[keep_idx] + } + + selected +} diff --git a/man/covariance_filter.Rd b/man/covariance_filter.Rd new file mode 100644 index 0000000..3e47752 --- /dev/null +++ b/man/covariance_filter.Rd @@ -0,0 +1,81 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/covariance_filter.R +\name{covariance_filter} +\alias{covariance_filter} +\alias{rank_poly_glm} +\alias{compute_balanced_weights} +\alias{select_by_correlation} +\title{Filter covariates for land use land cover change (LULCC) models} +\usage{ +covariance_filter( + data, + result_col = "result", + rank_fun = rank_poly_glm, + weights = compute_balanced_weights(data[[result_col]]), + corcut = 0.7, + ... +) + +rank_poly_glm(x, y, weights = NULL, ...) + +compute_balanced_weights(trans_result, legacy = FALSE) + +select_by_correlation(cor_mat, corcut) +} +\arguments{ +\item{data}{A data.table of target variable and candidate covariates to be filtered; +wide format with one predictor per column.} + +\item{result_col}{Name of the column representing the transition results (0: no +trans, 1: trans)} + +\item{rank_fun}{Optional function to compute ranking scores for each covariate. +Should take arguments (x, y, weights, ...) and return a single numeric value +(lower = better). Defaults to polynomial GLM p-value ranking.} + +\item{weights}{Optional weights vector} + +\item{corcut}{Correlation cutoff threshold} + +\item{...}{Additional arguments (ignored)} + +\item{x}{A numeric vector representing a single covariate} + +\item{y}{A binary outcome vector (0/1)} + +\item{trans_result}{Binary outcome vector (0/1)} + +\item{legacy}{Bool, use the legacy weighting?} + +\item{cor_mat}{Absolute correlation matrix} +} +\value{ +A filtered data.table containing only the selected covariates after ranking +by the specified method and filtering based on correlation threshold. 
+} +\description{ +This function filters a set of covariates for land use land cover change (LULCC) +models based on various statistical methods and correlation thresholds. +} +\details{ +The function first ranks covariates using the provided ranking function (default: +quasibinomial polynomial GLM). Then, it iteratively removes highly correlated variables +based on the correlation cutoff threshold, preserving variables in order of their +ranking. See +\url{https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.covfilter.r} for +where the concept came from. The original author was Antoine Adde, with edits by +Benjamin Black. +} +\section{Functions}{ +\itemize{ +\item \code{rank_poly_glm()}: Default ranking function using polynomial GLM. Returns +the lower p value for each of the polynomial terms + +\item \code{compute_balanced_weights()}: Compute class-balanced weights for imbalanced binary +outcomes; returns a numeric vector + +\item \code{select_by_correlation()}: Select variables iteratively based on correlation +threshold; returns a character vector of selected variable names + +}} +\keyword{internal} From 1c7a1f5aec1735414e6c9bcaf9759e66d24a0a6a Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Thu, 27 Nov 2025 11:45:26 +0100 Subject: [PATCH 02/27] update docs --- R/evoland_db.R | 1 + R/evoland_db_views.R | 2 +- man/evoland_db.Rd | 33 +++++++++++++++++++++++++++++---- man/evoland_db_views.Rd | 16 +++++++++++++--- man/trans_meta_t.Rd | 5 +++-- 5 files changed, 47 insertions(+), 10 deletions(-) diff --git a/R/evoland_db.R b/R/evoland_db.R index 5179ffd..ad67db3 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -320,6 +320,7 @@ evoland_db <- R6::R6Class( #' them into R objects. #' @param table_name Character vector. Names of table to attach. #' @param columns Character vector. Optional sql column selection, defaults to "*" + #' @param where A SQL where statement to optionally subset the table being attached. 
attach_table = function(table_name, columns = "*", where = NULL) { # TODO the attach/detach combo should be available in a DB method "with # table"; it should do nothing if the table is already attached diff --git a/R/evoland_db_views.R b/R/evoland_db_views.R index f17c8df..b679911 100644 --- a/R/evoland_db_views.R +++ b/R/evoland_db_views.R @@ -84,7 +84,7 @@ make_extent_db <- function(self, private) { terra::ext() } -#' @describeIn evoland_db_views +#' @describeIn evoland_db_views Returns transitions based on lulc_data_t make_transitions_v <- function(self, private, where = NULL) { self$attach_table("lulc_data_t") on.exit(self$detach_table("lulc_data_t")) diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index e555300..af49fa0 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -39,6 +39,8 @@ object to assign. Assigning is an upsert operation.} \item{\code{lulc_data_t}}{A \code{lulc_data_t} instance; see \code{\link[=as_lulc_data_t]{as_lulc_data_t()}} for the type of object to assign. Assigning is an upsert operation.} +\item{\code{transitions_v}}{Get the transitions from \code{lulc_data_t}.} + \item{\code{pred_data_t_float}}{A \code{pred_data_t_float} instance; see \code{\link[=create_pred_data_t]{create_pred_data_t()}} for the type of object to assign. Assigning is an upsert operation.} @@ -100,6 +102,7 @@ of object to assign. Assigning is an upsert operation.} \item \href{#method-evoland_db-set_coords}{\code{evoland_db$set_coords()}} \item \href{#method-evoland_db-set_periods}{\code{evoland_db$set_periods()}} \item \href{#method-evoland_db-add_predictor}{\code{evoland_db$add_predictor()}} +\item \href{#method-evoland_db-print}{\code{evoland_db$print()}} \item \href{#method-evoland_db-clone}{\code{evoland_db$clone()}} } } @@ -297,11 +300,11 @@ No. 
of rows affected by statement \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-attach_table}{}}} \subsection{Method \code{attach_table()}}{ -Attach one or more tables from the database folder as temporary tables in DuckDB. -This is useful for working with multiple tables in SQL queries without loading -them into R memory. +Attach a single table as a temporary table in DuckDB memory. This is +useful for working with multiple tables in SQL queries instead of loading +them into R objects. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$attach_table(table_name, columns = "*")}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{evoland_db$attach_table(table_name, columns = "*", where = NULL)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -310,6 +313,8 @@ them into R memory. \item{\code{table_name}}{Character vector. Names of table to attach.} \item{\code{columns}}{Character vector. Optional sql column selection, defaults to "*"} + +\item{\code{where}}{A SQL where statement to optionally subset the table being attached.} } \if{html}{\out{}} } @@ -462,6 +467,26 @@ Add a predictor to the database } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-print}{}}} +\subsection{Method \code{print()}}{ +Print method for evoland_db +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{evoland_db$print(...)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{...}}{Not used} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +self (invisibly) +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-clone}{}}} \subsection{Method \code{clone()}}{ diff --git a/man/evoland_db_views.Rd b/man/evoland_db_views.Rd index 68d5b66..d149f48 100644 --- a/man/evoland_db_views.Rd +++ b/man/evoland_db_views.Rd @@ -5,13 +5,19 @@ \alias{make_pred_sources_v} \alias{make_lulc_meta_long_v} \alias{make_coords_minimal} +\alias{make_extent_db} +\alias{make_transitions_v} \title{Views on the evoland-plus data model} \usage{ -make_pred_sources_v(self, private) +make_pred_sources_v(self, private, where = NULL) -make_lulc_meta_long_v(self, private) +make_lulc_meta_long_v(self, private, where = NULL) -make_coords_minimal(self, private) +make_coords_minimal(self, private, where = NULL) + +make_extent_db(self, private) + +make_transitions_v(self, private, where = NULL) } \description{ Functions to generate views on the database @@ -25,4 +31,8 @@ md5sum \item \code{make_coords_minimal()}: Minimal coordinate representation (id_coord, lon, lat) +\item \code{make_extent_db()}: Returns the extent of the coords_t as terra::SpatExtent + +\item \code{make_transitions_v()}: Returns transitions based on lulc_data_t + }} diff --git a/man/trans_meta_t.Rd b/man/trans_meta_t.Rd index 8c1d23b..b65c9e8 100644 --- a/man/trans_meta_t.Rd +++ b/man/trans_meta_t.Rd @@ -10,7 +10,7 @@ as_trans_meta_t(x) create_trans_meta_t( - lulc_data, + transitions, min_cardinality_abs = NULL, min_frequency_rel = NULL, exclude_anterior = NULL, @@ -20,7 +20,8 @@ create_trans_meta_t( \method{print}{trans_meta_t}(x, nrow = 10, ...) 
} \arguments{ -\item{lulc_data}{A \link{lulc_data_t} table with land use observations} +\item{transitions}{A transitions_v table, with columns id_coord, id_lulc_anterior, +id_lulc_posterior, id_period} \item{min_cardinality_abs}{Minimum absolute number of transitions for viability (optional)} From 6809db2f00e1a69d71c4dc37c26cf8235d02ab42 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Thu, 27 Nov 2025 12:36:44 +0100 Subject: [PATCH 03/27] better docs --- R/covariance_filter.R | 27 ++++++++++++++++----------- man/covariance_filter.Rd | 28 +++++++++++++++++----------- 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/R/covariance_filter.R b/R/covariance_filter.R index 42ed84f..b2475f6 100644 --- a/R/covariance_filter.R +++ b/R/covariance_filter.R @@ -1,7 +1,14 @@ -#' Filter covariates for land use land cover change (LULCC) models +#' Two stage covariate filtering #' -#' This function filters a set of covariates for land use land cover change (LULCC) -#' models based on various statistical methods and correlation thresholds. +#' The `covariance_filter` returns a set of covariates for land use land cover change +#' (LULCC) models based on a two-stage variable selection: a first statistical fit +#' estimates a covariate's quality for a given prediction task. A second step selects +#' all variables below a given correlation threshold: We iterate over a correlation +#' matrix ordered in the first step. Starting within the leftmost column, all rows (i.e. +#' candidates) greater than the given threshold are dropped from the full set of +#' candidates. This candidate selection is retained and used to select the next column, +#' until no further columns are left to investigate. The columns that were iterated over +#' are those returned as a character vector of selected variable names. #' #' @param data A data.table of target variable and candidate covariates to be filtered; #' wide format with one predictor per column. 
@@ -21,12 +28,12 @@ #' #' @details #' The function first ranks covariates using the provided ranking function (default: -#' quasibinomial polynomial GLM). Then, it iteratively removes highly correlated variables -#' based on the correlation cutoff threshold, preserving variables in order of their -#' ranking. See +#' quasibinomial polynomial GLM). Then, it iteratively removes highly (Pearson) +#' correlated variables based on the correlation cutoff threshold, preserving variables +#' in order of their ranking. See #' for #' where the concept came from. The original author was Antoine Adde, with edits by -#' Benjamin Black. +#' Benjamin Black. A similar mechanism is found in . #' #' @name covariance_filter #' @@ -88,7 +95,6 @@ covariance_filter <- function( #' @param x A numeric vector representing a single covariate #' @param y A binary outcome vector (0/1) #' @param weights Optional weights vector -#' @param ... Additional arguments (ignored) #' @keywords internal rank_poly_glm <- function(x, y, weights = NULL, ...) { fit <- glm.fit( @@ -109,7 +115,7 @@ rank_poly_glm <- function(x, y, weights = NULL, ...) { #' @describeIn covariance_filter Compute class-balanced weights for imbalanced binary #' outcomes; returns a numeric vector #' @param trans_result Binary outcome vector (0/1) -#' @param legacy Bool, use the legacy weighting? +#' @param legacy Bool, use legacy weighting? #' @keywords internal compute_balanced_weights <- function(trans_result, legacy = FALSE) { n_total <- length(trans_result) @@ -138,8 +144,7 @@ compute_balanced_weights <- function(trans_result, legacy = FALSE) { } -#' @describeIn covariance_filter Select variables iteratively based on correlation -#' threshold; returns a character vector of selected variable names +#' @describeIn covariance_filter Implements the iterative selection procedure. 
#' @param cor_mat Absolute correlation matrix #' @param corcut Correlation cutoff threshold #' @keywords internal diff --git a/man/covariance_filter.Rd b/man/covariance_filter.Rd index 3e47752..ffa87fc 100644 --- a/man/covariance_filter.Rd +++ b/man/covariance_filter.Rd @@ -5,7 +5,7 @@ \alias{rank_poly_glm} \alias{compute_balanced_weights} \alias{select_by_correlation} -\title{Filter covariates for land use land cover change (LULCC) models} +\title{Two stage covariate filtering} \usage{ covariance_filter( data, @@ -37,7 +37,7 @@ Should take arguments (x, y, weights, ...) and return a single numeric value \item{corcut}{Correlation cutoff threshold} -\item{...}{Additional arguments (ignored)} +\item{...}{Additional arguments passed to rank_fun.} \item{x}{A numeric vector representing a single covariate} @@ -45,7 +45,7 @@ Should take arguments (x, y, weights, ...) and return a single numeric value \item{trans_result}{Binary outcome vector (0/1)} -\item{legacy}{Bool, use the legacy weighting?} +\item{legacy}{Bool, use legacy weighting?} \item{cor_mat}{Absolute correlation matrix} } @@ -54,17 +54,24 @@ A filtered data.table containing only the selected covariates after ranking by the specified method and filtering based on correlation threshold. } \description{ -This function filters a set of covariates for land use land cover change (LULCC) -models based on various statistical methods and correlation thresholds. +The \code{covariance_filter} returns a set of covariates for land use land cover change +(LULCC) models based on a two-stage variable selection: a first statistical fit +estimates a covariate's quality for a given prediction task. A second step selects +all variables below a given correlation threshold: We iterate over a correlation +matrix ordered in the first step. Starting within the leftmost column, all rows (i.e. +candidates) greater than the given threshold are dropped from the full set of +candidates. 
This candidate selection is retained and used to select the next column, +until no further columns are left to investigate. The columns that were iterated over +are those returned as a character vector of selected variable names. } \details{ The function first ranks covariates using the provided ranking function (default: -quasibinomial polynomial GLM). Then, it iteratively removes highly correlated variables -based on the correlation cutoff threshold, preserving variables in order of their -ranking. See +quasibinomial polynomial GLM). Then, it iteratively removes highly (Pearson) +correlated variables based on the correlation cutoff threshold, preserving variables +in order of their ranking. See \url{https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.covfilter.r} for where the concept came from. The original author was Antoine Adde, with edits by -Benjamin Black. +Benjamin Black. A similar mechanism is found in \url{https://github.com/antadde/covsel/}. } \section{Functions}{ \itemize{ @@ -74,8 +81,7 @@ the lower p value for each of the polynomial terms \item \code{compute_balanced_weights()}: Compute class-balanced weights for imbalanced binary outcomes; returns a numeric vector -\item \code{select_by_correlation()}: Select variables iteratively based on correlation -threshold; returns a character vector of selected variable names +\item \code{select_by_correlation()}: Implements the iterative selection procedure. 
}} \keyword{internal} From 6db820e8f60ac6480957534075afbe6b4673ffc3 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Thu, 27 Nov 2025 12:36:52 +0100 Subject: [PATCH 04/27] drop id_trans; will be set by DB --- R/trans_meta_t.R | 6 ++---- man/trans_meta_t.Rd | 3 ++- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/R/trans_meta_t.R b/R/trans_meta_t.R index 8613add..db81f8e 100644 --- a/R/trans_meta_t.R +++ b/R/trans_meta_t.R @@ -41,7 +41,8 @@ as_trans_meta_t <- function(x) { ) } -#' @describeIn trans_meta_t Calculate the transition metadata and mark for modelling feasibility +#' @describeIn trans_meta_t Calculate the transition metadata and mark for modelling +#' feasibility. Does not attribute `id_trans`; this only makes sense as part of a DB. #' @export create_trans_meta_t <- function( transitions, @@ -108,9 +109,6 @@ create_trans_meta_t <- function( } } - # Add id_trans - trans_summary[, id_trans := .I] - as_trans_meta_t(trans_summary) } diff --git a/man/trans_meta_t.Rd b/man/trans_meta_t.Rd index b65c9e8..9577213 100644 --- a/man/trans_meta_t.Rd +++ b/man/trans_meta_t.Rd @@ -59,6 +59,7 @@ entries for each viable transition type. }} \section{Functions}{ \itemize{ -\item \code{create_trans_meta_t()}: Calculate the transition metadata and mark for modelling feasibility +\item \code{create_trans_meta_t()}: Calculate the transition metadata and mark for modelling +feasibility. Does not attribute \code{id_trans}; this only makes sense as part of a DB. 
}} From 4d98d621a0a42734012c294ec832e2c34a340a06 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Thu, 27 Nov 2025 16:42:27 +0100 Subject: [PATCH 05/27] parquet_duckdb: move content-agnostic evoland_db logic plus adding a with_tables function --- DESCRIPTION | 22 ++ NAMESPACE | 1 + R/evoland_db.R | 519 ++----------------------- R/evoland_db_views.R | 330 ++++++++++++---- R/parquet_duckdb.R | 576 ++++++++++++++++++++++++++++ inst/tinytest/test_evoland_db.R | 244 +++++------- inst/tinytest/test_parquet_duckdb.R | 349 +++++++++++++++++ man/evoland_db.Rd | 266 +------------ man/evoland_db_views.Rd | 12 + man/parquet_duckdb.Rd | 395 +++++++++++++++++++ 10 files changed, 1748 insertions(+), 966 deletions(-) create mode 100644 R/parquet_duckdb.R create mode 100644 inst/tinytest/test_parquet_duckdb.R create mode 100644 man/parquet_duckdb.Rd diff --git a/DESCRIPTION b/DESCRIPTION index cb31905..fb2d27f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,3 +33,25 @@ VignetteBuilder: quarto Config/testthat/edition: 3 LinkingTo: Rcpp +Collate: + 'RcppExports.R' + 'alloc_params_t.R' + 'coords_t.R' + 'covariance_filter.R' + 'parquet_duckdb.R' + 'evoland_db.R' + 'evoland_db_views.R' + 'init.R' + 'intrv_masks_t.R' + 'intrv_meta_t.R' + 'lulc_data_t.R' + 'lulc_meta_t.R' + 'periods_t.R' + 'pred_data_t.R' + 'pred_meta_t.R' + 'trans_meta_t.R' + 'trans_models_t.R' + 'trans_preds_t.R' + 'util.R' + 'util_download.R' + 'util_terra.R' diff --git a/NAMESPACE b/NAMESPACE index 888b193..50faaf3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,7 @@ export(create_trans_preds_t) export(download_and_verify) export(evoland_db) export(extract_using_coords_t) +export(parquet_duckdb) export(print_rowwise_yaml) export(validate) importFrom(Rcpp,sourceCpp) diff --git a/R/evoland_db.R b/R/evoland_db.R index ad67db3..f40bd82 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -6,25 +6,17 @@ #' This class uses DuckDB for in-memory SQL operations while persisting data 
#' to disk in parquet format for better compression. #' +#' Inherits from [parquet_duckdb] for generic database operations. +#' +#' @include parquet_duckdb.R #' @export evoland_db <- R6::R6Class( classname = "evoland_db", + inherit = parquet_duckdb, ## Public Methods ---- public = list( - #' @field connection DBI connection object to an in-memory DuckDB database - connection = NULL, - - #' @field path Character string path to the data folder - path = NULL, - - #' @field default_format Default file format for new tables - default_format = NULL, - - #' @field writeopts Default write options for DuckDB, see - writeopts = NULL, - #' @description #' Initialize a new evoland_db object #' @param path Character string. Path to the data folder. @@ -38,217 +30,23 @@ evoland_db <- R6::R6Class( default_format = c("parquet", "csv"), ... ) { - # Create folder if it doesn't exist - self$path <- ensure_dir(path) - - # set format / writeopts - self$default_format <- match.arg(default_format) - self$writeopts <- switch( - self$default_format, - parquet = "FORMAT parquet, COMPRESSION zstd", - csv = "FORMAT csv", - stop(glue::glue("Unsupported format: {format}")) + # Initialize parent class with spatial extension + super$initialize( + path = path, + default_format = default_format, + extensions = "spatial" ) - # Create in-memory connection for SQL operations - self$connection <- DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:") - self$execute("INSTALL spatial; LOAD spatial;") - + # Set evoland-specific reporting metadata self$set_report(...) invisible(self) }, - ### DB methods ---- - # TODO the database methods should be spun out into a parent class "parquet_db" that is - # independent of domain specific logic. the domain logic would then live in evoland_db. 
- #' @description - #' Commit data in overwrite mode - #' @param x Data frame to commit - #' @param table_name Character string table name - #' @param autoincrement_cols Character vector of column names to auto-increment - #' @param map_cols Character vector of columns to convert to MAP format - commit_overwrite = function( - x, - table_name, - autoincrement_cols = character(0), - map_cols = character(0) - ) { - file_path <- file.path(self$path, paste0(table_name, ".", self$default_format)) - - private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) - - # if there are any of these, first get existing max values - if (length(intersect(autoincrement_cols, names(x))) > 0) { - warning(glue::glue( - "Overriding existing IDs ({toString(autoincrement_cols)}) with row numbers;\n", - "Assign these IDs manually and do not pass any autoincrement_cols to avoid this warning" - )) - } - - # autoincrement = row number; all other cols treated as ordinary - ordinary_cols <- setdiff(names(x), autoincrement_cols) - select_expr <- glue::glue_collapse( - c(glue::glue("row_number() over () as {autoincrement_cols}"), ordinary_cols), - sep = ",\n " - ) - - self$execute(glue::glue( - r"{ - copy ( - select {select_expr} - from new_data_v - ) to '{file_path}' ({self$writeopts}) - }" - )) - }, - - #' @description - #' Commit data in append mode - #' @param x Data frame to commit - #' @param table_name Character string table name - #' @param autoincrement_cols Character vector of column names to auto-increment - #' @param map_cols Character vector of columns to convert to MAP format - commit_append = function( - x, - table_name, - autoincrement_cols = character(0), - map_cols = character(0) - ) { - file_info <- private$get_file_path(table_name) - - if (!file_info$exists) { - self$commit_overwrite(x, table_name, autoincrement_cols, map_cols) - } - - self$attach_table(table_name) - on.exit(self$detach_table(table_name)) - 
private$set_autoincrement_vars(table_name, autoincrement_cols) - - private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) - - ordinary_cols <- setdiff(names(x), autoincrement_cols) - select_new <- glue::glue_collapse( - c( - glue::glue( - "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" - ), - ordinary_cols - ), - sep = ",\n " - ) - - # concatenation using union all; the "by name" option inserts NULLs if a column is - # missing in one of the queries. also makes it robust against differing col orders. - self$execute(glue::glue( - r"{ - copy ( - select - * - from - {table_name} - union all by name - select - {select_new} - from - new_data_v - ) - to '{file_info$path}' ({self$writeopts}) - }" - )) - }, - - #' @description - #' Commit data in upsert mode - #' @param x Data frame to commit - #' @param table_name Character string table name - #' @param key_cols Identify unique columns - heuristic: if prefixed with - #' id_, the set of all columns designates a uniqueness condition - #' @param autoincrement_cols Character vector of column names to auto-increment - #' @param map_cols Character vector of columns to convert to MAP format - commit_upsert = function( - x, - table_name, - key_cols = grep("^id_", names(x), value = TRUE), - autoincrement_cols = character(0), - map_cols = character(0) - ) { - file_info <- private$get_file_path(table_name) - - if (!file_info$exists) { - return(self$commit_overwrite(x, table_name, autoincrement_cols, map_cols)) - } else if (length(key_cols) == 0) { - return(self$commit_append(x, table_name, autoincrement_cols, map_cols)) - } - - self$attach_table(table_name) - on.exit(self$detach_table(table_name)) - - private$set_autoincrement_vars(table_name, autoincrement_cols) - - private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) - - # Update existing data without touching key_cols or autoincrement_cols - 
ordinary_cols <- setdiff(names(x), union(key_cols, autoincrement_cols)) - update_select_expr <- glue::glue_collapse( - glue::glue("{ordinary_cols} = new_data_v.{ordinary_cols}"), - sep = ",\n " - ) - update_join_condition <- glue::glue_collapse( - glue::glue("{table_name}.{key_cols} = new_data_v.{key_cols}"), - sep = "\nand " - ) - - self$execute(glue::glue( - r"{ - update {table_name} set - {update_select_expr} - from new_data_v - where - {update_join_condition}; - }" - )) - - # Insert new data with incrementing autoincrement_cols - insert_select_expr <- glue::glue_collapse( - c( - glue::glue( - "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" - ), - glue::glue("new_data_v.{setdiff(names(x), autoincrement_cols)}") - ), - sep = ",\n " - ) - null_condition <- glue::glue_collapse( - glue::glue("{table_name}.{key_cols} is null"), - sep = "\nand " - ) - - self$execute(glue::glue( - r"{ - insert into {table_name} - select - {insert_select_expr} - from - new_data_v - left join - {table_name} - on - {update_join_condition} - where - {null_condition} - ; - }" - )) - - self$execute(glue::glue("copy {table_name} to '{file_info$path}' ({self$writeopts})")) - }, + ### Evoland-specific methods ---- #' @description - #' Fetch data from storage + #' Fetch data from storage with evoland-specific view support #' @param table_name Character string. Name of the table to query. #' @param where Character string. Optional WHERE clause for the SQL query. 
#' @param limit integerish, limit the amount of rows to return @@ -274,128 +72,8 @@ evoland_db <- R6::R6Class( return(private$get_empty_table(table_name)) } - # Build SQL query - sql <- glue::glue("SELECT * FROM read_{file_info$format}('{file_info$path}')") - - if (!is.null(where)) { - sql <- glue::glue("{sql} WHERE {where}") - } - if (!is.null(limit)) { - sql <- glue::glue("{sql} LIMIT {limit}") - } - - self$get_query(sql) - }, - - #' @description - #' List all tables (files) in storage - #' @return Character vector of table names - list_tables = function() { - list.files(self$path, pattern = "\\.(parquet|csv)$", full.names = FALSE) |> - tools::file_path_sans_ext() |> - unique() |> - sort() - }, - - #' @description - #' Execute statement - #' @param statement A SQL statement - #' @return No. of rows affected by statement - execute = function(statement) { - DBI::dbExecute(self$connection, statement) - }, - - #' @description - #' Get Query - #' @param statement A SQL statement - #' @return No. of rows affected by statement - get_query = function(statement) { - DBI::dbGetQuery(self$connection, statement) |> - data.table::as.data.table() - }, - - #' @description - #' Attach a single table as a temporary table in DuckDB memory. This is - #' useful for working with multiple tables in SQL queries instead of loading - #' them into R objects. - #' @param table_name Character vector. Names of table to attach. - #' @param columns Character vector. Optional sql column selection, defaults to "*" - #' @param where A SQL where statement to optionally subset the table being attached. 
- attach_table = function(table_name, columns = "*", where = NULL) { - # TODO the attach/detach combo should be available in a DB method "with - # table"; it should do nothing if the table is already attached - file_info <- private$get_file_path(table_name) - - # Build SQL query - sql <- glue::glue( - "CREATE TEMP TABLE {table_name} AS ", - "SELECT {paste(columns, collapse = ', ')} ", - "FROM read_{file_info$format}('{file_info$path}')" - ) - - if (!is.null(where)) { - sql <- glue::glue("{sql} WHERE {where}") - } - - # Execute SQL - self$execute(sql) - }, - - #' @description - #' Detach one or more tables from the database. - #' @param table_name Character. Name of table to drop. - detach_table = function(table_name) { - self$execute(paste0("drop table ", table_name, ";")) - }, - - #' @description - #' Get table row count - #' @param table_name Character string. Name of the table to query. - #' @return No. of rows - row_count = function(table_name) { - file_info <- private$get_file_path(table_name) - - if (!file_info$exists) { - return(0L) - } - - self$get_query( - glue::glue("SELECT COUNT(*) as n FROM read_{file_info$format}('{file_info$path}')") - )[[1]] - }, - - #' @description - #' Delete rows from a table - #' @param table_name Character string. Name of the table to delete from. - #' @param where Character string, defaults to NULL: delete everything in table. - #' @return No. 
of rows affected - delete_from = function(table_name, where = NULL) { - file_info <- private$get_file_path(table_name) - - if (!file_info$exists) { - return(0L) - } - - count_before <- self$row_count(table_name) - - if (is.null(where)) { - file.remove(file_info$path) - return(count_before) - } - - self$execute(glue::glue( - r"{ - COPY ( - SELECT * FROM read_{file_info$format}('{file_info$path}') - WHERE NOT ({where}) - ) - TO '{file_info$path}' ({self$writeopts}) - }" - )) - - count_after <- self$row_count(table_name) - - return(count_before - count_after) + # Call parent method + super$fetch(table_name, where, limit) }, ### Setter methods ---- @@ -406,10 +84,15 @@ evoland_db <- R6::R6Class( set_report = function(...) { params <- list(...) if (self$row_count("reporting_t") == 0L) { - params[["report_name"]] <- "evoland_scenario" - params[["report_name_pretty"]] <- "Default Evoland Scenario" - params[["report_include_date"]] <- "TRUE" - params[["creator_username"]] <- Sys.getenv("USER", unset = "unknown") + # only upsert if these values are missing upon DB init + params[["report_name"]] <- + params[["report_name"]] %||% "evoland_scenario" + params[["report_name_pretty"]] <- + params[["report_name_pretty"]] %||% "Default Evoland Scenario" + params[["report_include_date"]] <- + params[["report_include_date"]] %||% "TRUE" + params[["creator_username"]] <- + params[["creator_username"]] %||% Sys.getenv("USER", unset = "unknown") } params[["last_opened"]] <- format(Sys.time(), "%Y-%m-%dT%H:%M:%SZ", tz = "UTC") params[["last_opened_username"]] <- Sys.getenv("USER", unset = "unknown") @@ -497,10 +180,11 @@ evoland_db <- R6::R6Class( #' @param ... Not used #' @return self (invisibly) print = function(...) 
{ - cat("\n") - cat(sprintf("Database path: %s\n\n", self$path)) + cat(" (extends parquet_duckdb)\n") + cat(sprintf("Database path: %s\n", self$path)) + cat(sprintf("Default format: %s\n\n", self$default_format)) - cat("Database methods:\n") + cat("Database methods (inherited):\n") cat(" Commit: commit_overwrite(x, table_name, autoincrement_cols, map_cols),\n") cat(" commit_append(x, table_name, autoincrement_cols, map_cols),\n") cat(" commit_upsert(x, table_name, key_cols, autoincrement_cols, map_cols)\n") @@ -521,7 +205,9 @@ evoland_db <- R6::R6Class( cat(" intrv_meta_t, intrv_masks_t, trans_models_t, alloc_params_t\n\n") cat("Active bindings (read-only):\n") - cat(" extent, coords_minimal, lulc_meta_long_v, pred_sources_v\n") + cat(" extent, coords_minimal, lulc_meta_long_v, pred_sources_v\n\n") + + cat(sprintf("Tables: %d\n", length(self$list_tables()))) invisible(self) } @@ -721,150 +407,7 @@ evoland_db <- R6::R6Class( ## Private Methods ---- private = list( - # r6 hook called on gc(); Close the database connection - # - # return NULL (called for side effects) - finalize = function() { - if (!is.null(self$connection)) { - DBI::dbDisconnect(self$connection) - self$connection <- NULL - } - }, - - # Get file path and format for a table - # - # param table_name Character string table name - # return List with path, format, and exists flag - get_file_path = function(table_name) { - # Check for parquet first, then csv - parquet_path <- file.path(self$path, paste0(table_name, ".parquet")) - csv_path <- file.path(self$path, paste0(table_name, ".csv")) - - if (file.exists(parquet_path)) { - return(list(path = parquet_path, format = "parquet", exists = TRUE)) - } else if (file.exists(csv_path)) { - return(list(path = csv_path, format = "csv", exists = TRUE)) - } else { - # Return default format for new file - default_path <- file.path( - self$path, - paste0(table_name, ".", self$default_format) - ) - return(list( - path = default_path, - format = self$default_format, - 
exists = FALSE - )) - } - }, - - # Register new_data_v table, optionally converting MAP columns - # - # param x Data to register or character name of existing table/view - # param map_cols Character vector of columns to convert to MAP format - # return NULL (called for side effects) - register_new_data_v = function(x, map_cols = character(0)) { - # If x is a table name, create alias to new_data_v - if (is.character(x) && length(x) == 1) { - self$execute(glue::glue("create view new_data_v as select * from {x}")) - return(invisible(NULL)) - } - - if (length(map_cols) == 0) { - # No MAP conversion needed - register directly - duckdb::duckdb_register(self$connection, "new_data_v", x) - } else { - # Convert list columns to key-value dataframes - x <- - data.table::copy(x) |> - convert_list_cols(map_cols, list_to_kv_df) - - # Register as intermediate table - duckdb::duckdb_register(self$connection, "new_data_raw", x) - - # Build SELECT expression with map_from_entries for MAP columns - map_exprs <- glue::glue("map_from_entries({map_cols}) as {map_cols}") - other_cols <- setdiff(names(x), map_cols) - all_exprs <- c(other_cols, map_exprs) - select_expr <- glue::glue_collapse(all_exprs, sep = ", ") - - # Create new_data_v from new_data_raw - self$execute(glue::glue( - "create temp table new_data_v as select {select_expr} from new_data_raw" - )) - } - - invisible(NULL) - }, - - # Cleanup new_data_v and related tables - # - # param map_cols Character vector indicating if MAP conversion was used - # return NULL (called for side effects) - cleanup_new_data_v = function(map_cols = character(0)) { - if (length(map_cols) == 0) { - duckdb::duckdb_unregister(self$connection, "new_data_v") - } else { - self$execute( - "drop table if exists new_data_v; - drop view if exists new_data_v" - ) - duckdb::duckdb_unregister(self$connection, "new_data_raw") - } - - invisible(NULL) - }, - - # Write data to file - # - # param x Data frame to write - # param table_name Character string table 
name - write_file = function(x, table_name) { - file_path <- file.path(self$path, paste0(table_name, ".", self$default_format)) - - duckdb::duckdb_register(self$connection, "temp_write_table", x) - on.exit(duckdb::duckdb_unregister(self$connection, "temp_write_table")) - - sql <- glue::glue("COPY temp_write_table TO '{file_path}' ({self$writeopts})") - DBI::dbExecute(self$connection, sql) - }, - - # Set one duckdb variable name max_{colname} to the maximum found for each - # autoincrement_col in table_name. if column is missing from table_name, set to 0. - set_autoincrement_vars = function(table_name, autoincrement_cols) { - if (length(autoincrement_cols) == 0L) { - return(NULL) - } - if (!DBI::dbExistsTable(self$connection, table_name)) { - # attach / detach unless it was already there - self$attach_table(table_name) - on.exit(self$detach_table(table_name)) - } - - existing_cols <- - glue::glue("select column_name from (describe {table_name})") |> - self$get_query() |> - (\(x) x[[1]])() |> - intersect(autoincrement_cols) - - missing_cols <- setdiff(autoincrement_cols, existing_cols) - - set_exprs <- glue::glue_collapse( - c( - glue::glue( - "set variable max_{existing_cols} = (select coalesce(max({existing_cols}), 0) from {table_name});" - ), - glue::glue( - "set variable max_{missing_cols} = 0;" - ) - ), - sep = "\n" - ) - - self$execute(set_exprs) - }, - - # Get empty table with proper structure + # Get empty table with proper structure (evoland-specific) # # param table_name Character string table name # return Empty data.table with correct columns diff --git a/R/evoland_db_views.R b/R/evoland_db_views.R index b679911..b0533e9 100644 --- a/R/evoland_db_views.R +++ b/R/evoland_db_views.R @@ -9,103 +9,269 @@ NULL #' @describeIn evoland_db_views Retrieve a table of distinct predictor urls and their #' md5sum make_pred_sources_v <- function(self, private, where = NULL) { - self$attach_table("pred_meta_t") - on.exit(self$detach_table("pred_meta_t")) - - where_clause 
<- if (!is.null(where)) paste("AND", where) else "" - - self$get_query(glue::glue( - r"{ - select distinct - unnest(sources).url as url, - unnest(sources).md5sum as md5sum - from pred_meta_t - where sources is not null {where_clause} - }" - )) + self$with_tables("pred_meta_t", function() { + where_clause <- if (!is.null(where)) paste("AND", where) else "" + + self$get_query(glue::glue( + r"{ + select distinct + unnest(sources).url as url, + unnest(sources).md5sum as md5sum + from pred_meta_t + where sources is not null {where_clause} + }" + )) + }) } #' @describeIn evoland_db_views Return a `lulc_meta_long_v` instance, i.e. unrolled `lulc_meta_t`. make_lulc_meta_long_v <- function(self, private, where = NULL) { - self$attach_table("lulc_meta_t") - on.exit(self$detach_table("lulc_meta_t")) - - where_clause <- if (!is.null(where)) paste("WHERE", where) else "" - - self$get_query(glue::glue( - r"{ - select - id_lulc, - name, - unnest(src_classes) as src_class - from - lulc_meta_t - {where_clause} - }" - )) + self$with_tables("lulc_meta_t", function() { + where_clause <- if (!is.null(where)) paste("WHERE", where) else "" + + self$get_query(glue::glue( + r"{ + select + id_lulc, + name, + unnest(src_classes) as src_class + from + lulc_meta_t + {where_clause} + }" + )) + }) } #' @describeIn evoland_db_views Minimal coordinate representation (id_coord, lon, lat) make_coords_minimal <- function(self, private, where = NULL) { - self$attach_table("coords_t", c("id_coord", "lon", "lat")) - on.exit(self$detach_table("coords_t")) - - where_clause <- if (!is.null(where)) paste("WHERE", where) else "" - - self$get_query(glue::glue( - r"{ - select id_coord, lon, lat - from coords_t - {where_clause} - }" - )) |> - cast_dt_col("id_coord", as.integer) |> - data.table::setkeyv("id_coord") + self$with_tables("coords_t", function() { + where_clause <- if (!is.null(where)) paste("WHERE", where) else "" + + self$get_query(glue::glue( + r"{ + select id_coord, lon, lat + from coords_t + 
{where_clause} + }" + )) |> + cast_dt_col("id_coord", as.integer) |> + data.table::setkeyv("id_coord") + }) } #' @describeIn evoland_db_views Returns the extent of the coords_t as terra::SpatExtent make_extent_db <- function(self, private) { - self$attach_table("coords_t", c("lon", "lat")) - on.exit(self$detach_table("coords_t")) - - self$get_query(glue::glue( - r"{ - SELECT - min(lon) as xmin, - max(lon) as xmax, - min(lat) as ymin, - max(lat) as ymax - FROM - coords_t - }" - )) |> - unlist() |> - terra::ext() + self$with_tables("coords_t", function() { + self$get_query(glue::glue( + r"{ + SELECT + min(lon) as xmin, + max(lon) as xmax, + min(lat) as ymin, + max(lat) as ymax + FROM + coords_t + }" + )) |> + unlist() |> + terra::ext() + }) } #' @describeIn evoland_db_views Returns transitions based on lulc_data_t make_transitions_v <- function(self, private, where = NULL) { - self$attach_table("lulc_data_t") - on.exit(self$detach_table("lulc_data_t")) - - where_clause <- if (!is.null(where)) paste("WHERE", where) else "" - - self$get_query(glue::glue( - r"{ - SELECT - curr.id_period, - prev.id_lulc as id_lulc_anterior, - curr.id_lulc as id_lulc_posterior, - curr.id_coord - FROM - lulc_data_t as curr - INNER JOIN - lulc_data_t as prev - ON - curr.id_coord = prev.id_coord - AND curr.id_period = prev.id_period + 1 - {where_clause} - }" - )) + self$with_tables("lulc_data_t", function() { + where_clause <- if (!is.null(where)) paste("WHERE", where) else "" + + self$get_query(glue::glue( + r"{ + SELECT + curr.id_period, + prev.id_lulc as id_lulc_anterior, + curr.id_lulc as id_lulc_posterior, + curr.id_coord + FROM + lulc_data_t as curr + INNER JOIN + lulc_data_t as prev + ON + curr.id_coord = prev.id_coord + AND curr.id_period = prev.id_period + 1 + {where_clause} + }" + )) + }) +} + +#' @describeIn evoland_db_views Returns wide table of transition results and predictor data +#' for a specific transition. Used as input to covariance filtering. 
+#' +#' @param id_trans Integer, the transition ID to generate data for +#' @return data.table with columns: result (0/1), id_pred_1, id_pred_2, ..., id_pred_N +make_trans_pred_data_v <- function(self, private, id_trans) { + stopifnot( + "id_trans must be a single integer" = length(id_trans) == 1L && is.numeric(id_trans) + ) + + # Determine which predictor tables exist + all_tables <- self$list_tables() + pred_tables <- c("pred_data_t_float", "pred_data_t_int", "pred_data_t_bool") + existing_pred_tables <- intersect(pred_tables, all_tables) + + # Build list of tables to attach + tables_to_attach <- c("trans_meta_t", "lulc_data_t", "pred_meta_t", existing_pred_tables) + + self$with_tables( + tables_to_attach, + function() { + # Get transition metadata + trans_info <- self$get_query(glue::glue( + "SELECT id_lulc_anterior, id_lulc_posterior + FROM trans_meta_t + WHERE id_trans = {id_trans}" + )) + + if (nrow(trans_info) == 0L) { + stop(glue::glue("Transition id_trans = {id_trans} not found in trans_meta_t")) + } + + id_lulc_ant <- trans_info$id_lulc_anterior + id_lulc_post <- trans_info$id_lulc_posterior + + # Build CTEs dynamically based on which predictor tables exist + ctes <- list() + + # Always include trans_result + ctes$trans_result <- glue::glue( + "trans_result AS ( + SELECT + curr.id_coord, + curr.id_period, + CASE + WHEN prev.id_lulc = {id_lulc_ant} AND curr.id_lulc = {id_lulc_post} THEN 1 + WHEN prev.id_lulc = {id_lulc_ant} AND curr.id_lulc != {id_lulc_post} THEN 0 + ELSE NULL + END AS result + FROM lulc_data_t AS curr + INNER JOIN lulc_data_t AS prev + ON curr.id_coord = prev.id_coord + AND curr.id_period = prev.id_period + 1 + WHERE prev.id_lulc = {id_lulc_ant} + )" + ) + + # Add predictor CTEs for each type that exists + # UNION period-specific data (period >= 1) with cross-joined static data (period 0) + if ("pred_data_t_float" %in% existing_pred_tables) { + ctes$pred_float_combined <- "pred_float_combined AS ( + SELECT id_coord, id_period, id_pred, 
value + FROM pred_data_t_float + WHERE id_period >= 1 + UNION ALL + SELECT p0.id_coord, periods.id_period, p0.id_pred, p0.value + FROM pred_data_t_float AS p0 + CROSS JOIN (SELECT DISTINCT id_period FROM trans_result WHERE id_period >= 1) AS periods + WHERE p0.id_period = 0 + )" + + ctes$pred_float_wide <- "pred_float_wide AS ( + PIVOT pred_float_combined ON id_pred USING FIRST(value) GROUP BY id_coord, id_period + )" + } + + if ("pred_data_t_int" %in% existing_pred_tables) { + ctes$pred_int_combined <- "pred_int_combined AS ( + SELECT id_coord, id_period, id_pred, value + FROM pred_data_t_int + WHERE id_period >= 1 + UNION ALL + SELECT p0.id_coord, periods.id_period, p0.id_pred, p0.value + FROM pred_data_t_int AS p0 + CROSS JOIN (SELECT DISTINCT id_period FROM trans_result WHERE id_period >= 1) AS periods + WHERE p0.id_period = 0 + )" + + ctes$pred_int_wide <- "pred_int_wide AS ( + PIVOT pred_int_combined ON id_pred USING FIRST(value) GROUP BY id_coord, id_period + )" + } + + if ("pred_data_t_bool" %in% existing_pred_tables) { + ctes$pred_bool_combined <- "pred_bool_combined AS ( + SELECT id_coord, id_period, id_pred, value + FROM pred_data_t_bool + WHERE id_period >= 1 + UNION ALL + SELECT p0.id_coord, periods.id_period, p0.id_pred, p0.value + FROM pred_data_t_bool AS p0 + CROSS JOIN (SELECT DISTINCT id_period FROM trans_result WHERE id_period >= 1) AS periods + WHERE p0.id_period = 0 + )" + + ctes$pred_bool_wide <- "pred_bool_wide AS ( + PIVOT pred_bool_combined ON id_pred USING FIRST(value) GROUP BY id_coord, id_period + )" + } + + # Build SELECT columns + select_cols <- "tr.result" + if ("pred_data_t_float" %in% existing_pred_tables) { + select_cols <- paste0(select_cols, ", pf.* EXCLUDE (id_coord, id_period)") + } + if ("pred_data_t_int" %in% existing_pred_tables) { + select_cols <- paste0(select_cols, ", pi.* EXCLUDE (id_coord, id_period)") + } + if ("pred_data_t_bool" %in% existing_pred_tables) { + select_cols <- paste0(select_cols, ", pb.* EXCLUDE 
(id_coord, id_period)") + } + + # Build JOINs + joins <- "" + if ("pred_data_t_float" %in% existing_pred_tables) { + joins <- paste0( + joins, + "\n LEFT JOIN pred_float_wide AS pf ON tr.id_coord = pf.id_coord AND tr.id_period = pf.id_period" + ) + } + if ("pred_data_t_int" %in% existing_pred_tables) { + joins <- paste0( + joins, + "\n LEFT JOIN pred_int_wide AS pi ON tr.id_coord = pi.id_coord AND tr.id_period = pi.id_period" + ) + } + if ("pred_data_t_bool" %in% existing_pred_tables) { + joins <- paste0( + joins, + "\n LEFT JOIN pred_bool_wide AS pb ON tr.id_coord = pb.id_coord AND tr.id_period = pb.id_period" + ) + } + + # Combine everything into final query + cte_string <- paste(unlist(ctes), collapse = ",\n\n ") + + query <- glue::glue( + "WITH {cte_string} + + SELECT {select_cols} + FROM trans_result AS tr{joins} + WHERE tr.result IS NOT NULL" + ) + + result <- self$get_query(query) + + # Rename columns to id_pred_{N} format + old_names <- names(result) + new_names <- old_names + for (i in seq_along(old_names)) { + if (old_names[i] != "result" && grepl("^\\d+$", old_names[i])) { + new_names[i] <- paste0("id_pred_", old_names[i]) + } + } + data.table::setnames(result, old_names, new_names) + + result + } + ) } diff --git a/R/parquet_duckdb.R b/R/parquet_duckdb.R new file mode 100644 index 0000000..0fa7d51 --- /dev/null +++ b/R/parquet_duckdb.R @@ -0,0 +1,576 @@ +#' R6 Base Class for Parquet-Backed DuckDB Storage +#' +#' @description +#' A domain-agnostic R6 class that provides an interface to a folder-based data +#' storage system using DuckDB for in-memory SQL operations and parquet files +#' for efficient on-disk persistence. This class can be inherited by +#' domain-specific database classes. 
+#' +#' @export + +parquet_duckdb <- R6::R6Class( + classname = "parquet_duckdb", + + ## Public Methods ---- + public = list( + #' @field connection DBI connection object to an in-memory DuckDB database + connection = NULL, + + #' @field path Character string path to the data folder + path = NULL, + + #' @field default_format Default file format for new tables + default_format = NULL, + + #' @field writeopts Default write options for DuckDB + writeopts = NULL, + + #' @description + #' Initialize a new parquet_duckdb object + #' @param path Character string. Path to the data folder. + #' @param default_format Character. Default file format ("parquet" or "csv"). + #' Default is "parquet". + #' @param extensions Character vector of DuckDB extensions to load (e.g., "spatial") + #' + #' @return A new `parquet_duckdb` object + initialize = function( + path, + default_format = c("parquet", "csv"), + extensions = character(0) + ) { + # Create folder if it doesn't exist + self$path <- ensure_dir(path) + + # Set format / writeopts + self$default_format <- match.arg(default_format) + self$writeopts <- switch( + self$default_format, + parquet = "format parquet, compression zstd", + csv = "format csv", + stop(glue::glue("Unsupported format: {self$default_format}")) + ) + + # Create in-memory connection for SQL operations + self$connection <- DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:") + + # load extensions + for (ext in extensions) { + self$execute(glue::glue("install {ext}; load {ext};")) + } + + invisible(self) + }, + + ### Core Database Methods ---- + + #' @description + #' Execute a SQL statement + #' @param statement A SQL statement + #' @return Number of rows affected by statement + execute = function(statement) { + DBI::dbExecute(self$connection, statement) + }, + + #' @description + #' Execute a SQL query and return results + #' @param statement A SQL query statement + #' @return A data.table with query results + get_query = function(statement) { + 
DBI::dbGetQuery(self$connection, statement) |> + data.table::as.data.table() + }, + + #' @description + #' Attach a table from parquet/CSV file as a temporary table in DuckDB + #' @param table_name Character. Name of table to attach. + #' @param columns Character vector. Optional SQL column selection, defaults to "*" + #' @param where Character. Optional SQL WHERE clause to subset the table. + #' @return Invisible NULL (called for side effects) + attach_table = function(table_name, columns = "*", where = NULL) { + file_info <- private$get_file_path(table_name) + + if (!file_info$exists) { + stop(glue::glue("Table '{table_name}' does not exist at path: {self$path}")) + } + + # Build SQL query + sql <- glue::glue( + "create temp table {table_name} as ", + "select {paste(columns, collapse = ', ')} ", + "from read_{file_info$format}('{file_info$path}')" + ) + + if (!is.null(where)) { + sql <- glue::glue("{sql} where {where}") + } + + # Execute SQL + self$execute(sql) + invisible(NULL) + }, + + #' @description + #' Detach a table from the in-memory database + #' @param table_name Character. Name of table to drop. + #' @return Invisible NULL (called for side effects) + detach_table = function(table_name) { + self$execute(paste0("drop table if exists ", table_name, ";")) + invisible(NULL) + }, + + #' @description + #' Get row count for a table + #' @param table_name Character string. Name of the table to query. 
+ #' @return Integer number of rows + row_count = function(table_name) { + file_info <- private$get_file_path(table_name) + + if (!file_info$exists) { + return(0L) + } + + self$get_query( + glue::glue("select count(*) as n from read_{file_info$format}('{file_info$path}')") + )[[1]] + }, + + #' @description + #' List all tables (files) in storage + #' @return Character vector of table names + list_tables = function() { + list.files(self$path, pattern = "\\.(parquet|csv)$", full.names = FALSE) |> + tools::file_path_sans_ext() |> + unique() |> + sort() + }, + + #' @description + #' Execute a function with specified tables attached, handling attach/detach automatically. + #' If a table is already attached in the DuckDB instance, it won't be re-attached or detached. + #' + #' @param tables Character vector of table names to attach + #' @param func Function to execute with tables attached + #' @param ... Additional arguments passed to func + #' @return Result of func + with_tables = function(tables, func, ...) { + # Track which tables we attach (so we know which to detach) + attached_tables <- character(0) + + # Check which tables are already attached + existing_tables <- DBI::dbListTables(self$connection) + + # Attach tables that aren't already present + for (table in tables) { + if (!table %in% existing_tables) { + self$attach_table(table) + attached_tables <- c(attached_tables, table) + } + } + + # Ensure cleanup on exit + on.exit( + { + for (table in attached_tables) { + self$detach_table(table) + } + }, + add = TRUE + ) + + # Execute the function + func(...) + }, + + #' @description + #' Fetch data from a table + #' @param table_name Character string. Name of the table to query. + #' @param where Character string. Optional WHERE clause for the SQL query. + #' @param limit Integer. Optional limit on number of rows to return. 
+ #' + #' @return A data.table + fetch = function(table_name, where = NULL, limit = NULL) { + file_info <- private$get_file_path(table_name) + + if (!file_info$exists) { + return(data.table::data.table()) + } + + # build sql query + sql <- glue::glue("select * from read_{file_info$format}('{file_info$path}')") + + if (!is.null(where)) { + sql <- glue::glue("{sql} where {where}") + } + if (!is.null(limit)) { + sql <- glue::glue("{sql} limit {limit}") + } + + self$get_query(sql) + }, + + #' @description + #' Delete rows from a table + #' @param table_name Character string. Name of the table to delete from. + #' @param where Character string. Optional WHERE clause; if NULL, deletes all rows. + #' @return Number of rows deleted + delete_from = function(table_name, where = NULL) { + file_info <- private$get_file_path(table_name) + + if (!file_info$exists) { + return(0L) + } + + count_before <- self$row_count(table_name) + + if (is.null(where)) { + file.remove(file_info$path) + return(count_before) + } + + self$execute(glue::glue( + r"{ + copy ( + select * from read_{file_info$format}('{file_info$path}') + where not ({where}) + ) + to '{file_info$path}' ({self$writeopts}) + }" + )) + + count_after <- self$row_count(table_name) + + return(count_before - count_after) + }, + + ### Commit Methods ---- + + #' @description + #' Commit data in overwrite mode + #' @param x Data frame to commit + #' @param table_name Character string table name + #' @param autoincrement_cols Character vector of column names to auto-increment + #' @param map_cols Character vector of columns to convert to MAP format + #' @return Invisible NULL (called for side effects) + commit_overwrite = function( + x, + table_name, + autoincrement_cols = character(0), + map_cols = character(0) + ) { + file_path <- file.path(self$path, paste0(table_name, ".", self$default_format)) + + private$register_new_data_v(x, map_cols) + on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) + + # Warn if overriding 
existing IDs + if (length(intersect(autoincrement_cols, names(x))) > 0) { + warning(glue::glue( + "Overriding existing IDs ({toString(autoincrement_cols)}) with row numbers;\n", + "Assign these IDs manually and do not pass any autoincrement_cols to avoid this warning" + )) + } + + # Build SELECT expression + ordinary_cols <- setdiff(names(x), autoincrement_cols) + select_expr <- glue::glue_collapse( + c(glue::glue("row_number() over () as {autoincrement_cols}"), ordinary_cols), + sep = ",\n " + ) + + self$execute(glue::glue( + r"{ + copy ( + select {select_expr} + from new_data_v + ) to '{file_path}' ({self$writeopts}) + }" + )) + + invisible(NULL) + }, + + #' @description + #' Commit data in append mode + #' @param x Data frame to commit + #' @param table_name Character string table name + #' @param autoincrement_cols Character vector of column names to auto-increment + #' @param map_cols Character vector of columns to convert to MAP format + #' @return Invisible NULL (called for side effects) + commit_append = function( + x, + table_name, + autoincrement_cols = character(0), + map_cols = character(0) + ) { + file_info <- private$get_file_path(table_name) + + if (!file_info$exists) { + return(self$commit_overwrite(x, table_name, autoincrement_cols, map_cols)) + } + + self$attach_table(table_name) + on.exit(self$detach_table(table_name), add = TRUE) + private$set_autoincrement_vars(table_name, autoincrement_cols) + + private$register_new_data_v(x, map_cols) + on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) + + ordinary_cols <- setdiff(names(x), autoincrement_cols) + select_new <- glue::glue_collapse( + c( + glue::glue( + "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" + ), + ordinary_cols + ), + sep = ",\n " + ) + + # Concatenation using UNION ALL; "by name" handles missing columns + self$execute(glue::glue( + r"{ + copy ( + select * from {table_name} + union all by name + select {select_new} + from new_data_v + ) 
+ to '{file_info$path}' ({self$writeopts}) + }" + )) + + invisible(NULL) + }, + + #' @description + #' Commit data in upsert mode (update existing, insert new) + #' @param x Data frame to commit + #' @param table_name Character string table name + #' @param key_cols Character vector of columns that define uniqueness + #' @param autoincrement_cols Character vector of column names to auto-increment + #' @param map_cols Character vector of columns to convert to MAP format + #' @return Invisible NULL (called for side effects) + commit_upsert = function( + x, + table_name, + key_cols = grep("^id_", names(x), value = TRUE), + autoincrement_cols = character(0), + map_cols = character(0) + ) { + file_info <- private$get_file_path(table_name) + + if (!file_info$exists) { + return(self$commit_overwrite(x, table_name, autoincrement_cols, map_cols)) + } else if (length(key_cols) == 0) { + return(self$commit_append(x, table_name, autoincrement_cols, map_cols)) + } + + self$attach_table(table_name) + on.exit(self$detach_table(table_name), add = TRUE) + + private$set_autoincrement_vars(table_name, autoincrement_cols) + + private$register_new_data_v(x, map_cols) + on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) + + # Update existing data + ordinary_cols <- setdiff(names(x), union(key_cols, autoincrement_cols)) + update_select_expr <- glue::glue_collapse( + glue::glue("{ordinary_cols} = new_data_v.{ordinary_cols}"), + sep = ",\n " + ) + update_join_condition <- glue::glue_collapse( + glue::glue("{table_name}.{key_cols} = new_data_v.{key_cols}"), + sep = "\nand " + ) + + self$execute(glue::glue( + r"{ + update {table_name} set + {update_select_expr} + from new_data_v + where + {update_join_condition}; + }" + )) + + # Insert new data + insert_select_expr <- glue::glue_collapse( + c( + glue::glue( + "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" + ), + glue::glue("new_data_v.{setdiff(names(x), autoincrement_cols)}") + ), + sep = ",\n 
" + ) + null_condition <- glue::glue_collapse( + glue::glue("{table_name}.{key_cols} is null"), + sep = "\nand " + ) + + self$execute(glue::glue( + r"{ + insert into {table_name} + select + {insert_select_expr} + from + new_data_v + left join + {table_name} + on + {update_join_condition} + where + {null_condition} + ; + }" + )) + + self$execute(glue::glue("copy {table_name} to '{file_info$path}' ({self$writeopts})")) + + invisible(NULL) + }, + + #' @description + #' Print method for parquet_duckdb + #' @param ... Not used + #' @return self (invisibly) + print = function(...) { + cat("\n") + cat(sprintf("Database path: %s\n", self$path)) + cat(sprintf("Default format: %s\n", self$default_format)) + cat(sprintf("Tables: %d\n", length(self$list_tables()))) + invisible(self) + } + ), + + ## Private Methods ---- + private = list( + # R6 hook called on gc(); Close the database connection + finalize = function() { + if (!is.null(self$connection)) { + DBI::dbDisconnect(self$connection) + self$connection <- NULL + } + }, + + # Get file path and format for a table + # + # @param table_name Character string table name + # @return List with path, format, and exists flag + get_file_path = function(table_name) { + # Check for parquet first, then csv + parquet_path <- file.path(self$path, paste0(table_name, ".parquet")) + csv_path <- file.path(self$path, paste0(table_name, ".csv")) + + if (file.exists(parquet_path)) { + return(list(path = parquet_path, format = "parquet", exists = TRUE)) + } else if (file.exists(csv_path)) { + return(list(path = csv_path, format = "csv", exists = TRUE)) + } else { + # Return default format for new file + default_path <- file.path( + self$path, + paste0(table_name, ".", self$default_format) + ) + return(list( + path = default_path, + format = self$default_format, + exists = FALSE + )) + } + }, + + # Register new_data_v table, optionally converting MAP columns + # + # @param x Data to register + # @param map_cols Character vector of columns to 
convert to MAP format
+    # @return NULL (called for side effects)
+    register_new_data_v = function(x, map_cols = character(0)) {
+      if (length(map_cols) == 0) {
+        # No MAP conversion needed - register directly
+        duckdb::duckdb_register(self$connection, "new_data_v", x)
+      } else {
+        # Convert list columns to key-value dataframes
+        x <-
+          data.table::copy(x) |>
+          convert_list_cols(map_cols, list_to_kv_df)
+
+        # Register as intermediate table
+        duckdb::duckdb_register(self$connection, "new_data_raw", x)
+
+        # Build SELECT expression with map_from_entries for MAP columns
+        map_exprs <- glue::glue("map_from_entries({map_cols}) as {map_cols}")
+        other_cols <- setdiff(names(x), map_cols)
+        all_exprs <- c(other_cols, map_exprs)
+        select_expr <- glue::glue_collapse(all_exprs, sep = ", ")
+
+        # Create new_data_v from new_data_raw
+        self$execute(glue::glue(
+          "create temp table new_data_v as select {select_expr} from new_data_raw"
+        ))
+      }
+
+      invisible(NULL)
+    },
+
+    # Cleanup new_data_v and related tables
+    #
+    # @param map_cols Character vector of MAP column names as passed to
+    #   register_new_data_v; non-empty means MAP conversion was used
+    # @return NULL (called for side effects)
+    cleanup_new_data_v = function(map_cols = character(0)) {
+      if (length(map_cols) == 0) {
+        duckdb::duckdb_unregister(self$connection, "new_data_v")
+      } else {
+        self$execute(
+          "drop table if exists new_data_v;
+          drop view if exists new_data_v"
+        )
+        duckdb::duckdb_unregister(self$connection, "new_data_raw")
+      }
+
+      invisible(NULL)
+    },
+
+    # Set DuckDB variables max_{colname} for autoincrement columns
+    #
+    # @param table_name Character string table name
+    # @param autoincrement_cols Character vector of column names
+    # @return NULL (called for side effects)
+    set_autoincrement_vars = function(table_name, autoincrement_cols) {
+      if (length(autoincrement_cols) == 0L) {
+        return(NULL)
+      }
+
+      if (!DBI::dbExistsTable(self$connection, table_name)) {
+        # Attach/detach if not already attached
+        self$attach_table(table_name)
+        
on.exit(self$detach_table(table_name)) + } + + existing_cols <- + glue::glue("select column_name from (describe {table_name})") |> + self$get_query() |> + (\(x) x[[1]])() |> + intersect(autoincrement_cols) + + missing_cols <- setdiff(autoincrement_cols, existing_cols) + + set_exprs <- glue::glue_collapse( + c( + glue::glue( + "set variable max_{existing_cols} = + (select coalesce(max({existing_cols}), 0) from {table_name});" + ), + glue::glue( + "set variable max_{missing_cols} = 0;" + ) + ), + sep = "\n" + ) + + self$execute(set_exprs) + invisible(NULL) + } + ) +) diff --git a/inst/tinytest/test_evoland_db.R b/inst/tinytest/test_evoland_db.R index b84f060..4a73bc6 100644 --- a/inst/tinytest/test_evoland_db.R +++ b/inst/tinytest/test_evoland_db.R @@ -1,10 +1,12 @@ -# Test that a new database can be set up using evoland_db$new() +# Test evoland_db domain-specific functionality +# Generic parquet_duckdb tests are in test_parquet_duckdb.R library(tinytest) # Create temporary directory for testing test_dir <- tempfile("evoland_test_") on.exit(unlink(test_dir, recursive = TRUE), add = TRUE) +# Test 1: evoland_db initialization with reporting expect_silent( db <- evoland_db$new( path = test_dir, @@ -13,13 +15,15 @@ expect_silent( ) ) expect_true(inherits(db, "evoland_db")) +expect_true(inherits(db, "parquet_duckdb")) # Should inherit from parent -# In folder-based storage, only reporting_t exists initially -# Other tables are created on demand -expected_tables_initial <- c("reporting_t") -expect_identical(db$list_tables(), expected_tables_initial) - +# Test 2: Reporting table is created and populated +expect_identical(db$list_tables(), "reporting_t") reporting1 <- db$fetch("reporting_t") +expect_true("report_name" %in% reporting1$key) +expect_true("tinytest" %in% reporting1$value) + +# Test 3: Persistence of reporting data rm(db) gc() db <- evoland_db$new( @@ -34,9 +38,7 @@ expect_equal( sort(reporting2$key) ) - -# Check that accessing non-existent tables returns empty 
data.tables -# (these tables don't appear in list_tables() until they have data) +# Test 4: Domain-specific empty table structures via active bindings empty_tables <- c( "alloc_params_t", "coords_t", @@ -57,7 +59,7 @@ for (table in empty_tables) { expect_equal(nrow(db[[table]]), 0L) } -# Create synthetic, minimal test data +# Test 5: Create synthetic evoland-specific test data coords_t <- create_coords_t_square( epsg = 2056, extent = terra::ext(c( @@ -230,8 +232,7 @@ intrv_masks_t <- as_intrv_masks_t( ) ) -# Test DB roundtrips & integrity checks, repeated assignments (~upserts) -# TODO make this into a testing harness that iterates over a list of sample data +# Test 6: Active bindings - coords_t expect_silent(db$coords_t <- coords_t) expect_silent(db$coords_t <- coords_t) expect_identical(db$coords_t, coords_t) @@ -240,12 +241,15 @@ expect_identical( data.table::as.data.table(coords_t[, 1:3]) ) +# Test 7: Active bindings - lulc_meta_t expect_silent(db$lulc_meta_t <- lulc_meta_t) expect_identical(db$lulc_meta_t, lulc_meta_t) +# Test 8: Active bindings - periods_t expect_silent(db$periods_t <- periods_t) expect_identical(db$periods_t, periods_t) +# Test 9: Active bindings - pred_meta_t with auto-increment expect_silent(db$pred_meta_t <- pred_meta_t[1, ]) expect_silent(db$pred_meta_t <- pred_meta_t) # After committing, id_pred should be assigned (1:2 in this case) @@ -254,20 +258,24 @@ expect_equal(retrieved_pred_meta[["id_pred"]], 1:2) expect_equal(retrieved_pred_meta[["name"]], c("noise", "distance_to_lake")) db$delete_from("pred_meta_t") # clear up, we want to ensure this works with add_predictor +# Test 10: Active bindings - intrv_meta_t (with MAP columns) expect_silent(db$intrv_meta_t <- intrv_meta_t) expect_equal(db$intrv_meta_t, intrv_meta_t) +# Test 11: Active bindings - trans_meta_t expect_silent(db$trans_meta_t <- trans_meta_t) expect_equal(db$trans_meta_t[, c(-1)], trans_meta_t) +# Test 12: Active bindings - trans_models_t (with MAP and BLOB columns) 
expect_silent(db$trans_models_t <- trans_models_t) expect_equal(db$trans_models_t, trans_models_t) -# now that we have the trans_meta we shouldn't be violating the FK anymore +# Test 13: Active bindings - alloc_params_t (with MAP columns) expect_silent(db$alloc_params_t <- alloc_params_t) expect_equal(db$alloc_params_t, alloc_params_t) -# repeated upsert should be idempotent +# Test 14: add_predictor method - first predictor +# Test 15: add_predictor method - second predictor expect_silent( db$add_predictor( pred_spec = pred_spec["noise"], @@ -284,17 +292,19 @@ expect_silent( pred_type = "float" ) ) -# check that add_predictor works for metadata +# Test 16: add_predictor correctly updates metadata retrieved_pred_meta <- db$pred_meta_t expect_equal(retrieved_pred_meta[["id_pred"]], 1:2) expect_equal(retrieved_pred_meta[["name"]], c("noise", "distance_to_lake")) -# check that add_predictor works for data +# Test 17: add_predictor correctly updates data expect_equal(db$row_count("pred_data_t_float"), 48L) +# Test 18: Active bindings - pred_data_t_float upsert is idempotent expect_silent(db$pred_data_t_float <- pred_data_t) expect_equal(db$row_count("pred_data_t_float"), 48L) expect_silent(db$pred_data_t_float <- pred_data_t) expect_equal(db$row_count("pred_data_t_float"), 48L) +# Test 19: Active bindings enforce type validation expect_error( # should only be able to insert the correct class db$pred_data_t_float <- data.table::data.table( @@ -306,6 +316,7 @@ expect_error( r"(^inherits.* is not TRUE$)" ) +# Test 20: lulc_data_t requires proper type expect_error( db$lulc_data_t <- lulc_data_dt <- data.table::data.table( id_coord = c(1L, 2L, 1L), @@ -318,146 +329,77 @@ expect_silent( db$lulc_data_t <- as_lulc_data_t(lulc_data_dt) ) -# Test delete_from functionality -# Setup: we already have pred_data_t_float with 48 rows (id_pred 1:2, id_coord 3:50, id_period 1) -expect_equal(db$row_count("pred_data_t_float"), 48L) - -# Test 1: Delete with WHERE clause - delete specific 
predictor -deleted_count <- db$delete_from("pred_data_t_float", where = "id_pred = 1") -expect_equal(deleted_count, 24L) -expect_equal(db$row_count("pred_data_t_float"), 24L) - -# Verify only id_pred = 2 remains -remaining <- db$pred_data_t_float -expect_equal(unique(remaining$id_pred), 2L) -expect_equal(nrow(remaining), 24L) +# Test 21: Domain-specific view - coords_minimal +coords_minimal <- db$coords_minimal +expect_true(inherits(coords_minimal, "data.table")) +expect_equal(ncol(coords_minimal), 3L) +expect_true(all(c("id_coord", "lon", "lat") %in% names(coords_minimal))) +expect_equal(nrow(coords_minimal), nrow(coords_t)) + +# Test 22: Domain-specific view - extent +# Set up coords first +db$coords_t <- coords_t +extent <- db$extent +expect_true(inherits(extent, "SpatExtent")) + +# Test 23: Domain-specific view - lulc_meta_long_v +db$lulc_meta_t <- lulc_meta_t +lulc_long <- db$lulc_meta_long_v +expect_true(inherits(lulc_long, "data.table")) +expect_true("src_class" %in% names(lulc_long)) +# Should have one row per src_class +expect_true(nrow(lulc_long) > nrow(lulc_meta_t)) + +# Test 24: Domain-specific view - pred_sources_v +db$pred_meta_t <- pred_meta_t +sources <- db$pred_sources_v +expect_true(inherits(sources, "data.table")) +expect_true(all(c("url", "md5sum") %in% names(sources))) +expect_true(nrow(sources) > 0L) + +# Test 25: set_coords method +test_dir_coords <- tempfile("evoland_coords_") +on.exit(unlink(test_dir_coords, recursive = TRUE), add = TRUE) +db_coords <- evoland_db$new(test_dir_coords) -# Test 2: Delete with complex WHERE clause -# Add back some data first -db$pred_data_t_float <- pred_data_t -expect_equal(db$row_count("pred_data_t_float"), 48L) - -# Delete only specific coordinates -deleted_count <- db$delete_from("pred_data_t_float", where = "id_coord < 10") -expect_true(deleted_count > 0L) -remaining <- db$pred_data_t_float -expect_true(all(remaining$id_coord >= 10)) - -# Test 3: Delete all rows (NULL where clause) -count_before_delete <- 
db$row_count("pred_data_t_float") -deleted_count <- db$delete_from("pred_data_t_float", where = NULL) -expect_equal(deleted_count, count_before_delete) -expect_equal(db$row_count("pred_data_t_float"), 0L) - -# Test 4: Delete from non-existent table returns 0 -deleted_count <- db$delete_from("nonexistent_table", where = "id = 1") -expect_equal(deleted_count, 0L) - -# Test 5: Delete with WHERE that matches nothing -db$pred_data_t_float <- pred_data_t -initial_count <- db$row_count("pred_data_t_float") -deleted_count <- db$delete_from("pred_data_t_float", where = "id_pred = 999") -expect_equal(deleted_count, 0L) -expect_equal(db$row_count("pred_data_t_float"), initial_count) - -# Test auto-increment functionality -# Create a new test database for auto-increment tests -test_dir_autoinc <- tempfile("evoland_autoinc_") -on.exit(unlink(test_dir_autoinc, recursive = TRUE), add = TRUE) -db_autoinc <- evoland_db$new(test_dir_autoinc) - -# Test 1: Auto-increment on overwrite mode (new table) -test_data_1 <- data.table::data.table( - name = c("predictor_a", "predictor_b", "predictor_c"), - unit = c("m", "kg", "s") -) -db_autoinc$commit_overwrite( - test_data_1, - "test_autoinc_t", - autoincrement_cols = "id_test" -) -result_1 <- db_autoinc$fetch("test_autoinc_t") -expect_equal(result_1$id_test, 1:3) -expect_equal(result_1$name, c("predictor_a", "predictor_b", "predictor_c")) - -# Test 2: Auto-increment on append mode -test_data_2 <- data.table::data.table( - name = c("predictor_d", "predictor_e"), - unit = c("A", "V") -) -db_autoinc$commit_append( - test_data_2, - "test_autoinc_t", - autoincrement_cols = "id_test" -) -result_2 <- db_autoinc$fetch("test_autoinc_t") -expect_equal(nrow(result_2), 5L) -expect_equal(result_2$id_test, 1:5) -expect_equal(result_2$name[4:5], c("predictor_d", "predictor_e")) - -# Test 3: Auto-increment on upsert mode with new rows -test_data_3 <- data.table::data.table( - name = c("predictor_f", "predictor_g"), - unit = c("W", "J") -) 
-db_autoinc$commit_upsert( - test_data_3, - "test_autoinc_t", - autoincrement_cols = "id_test" -) -result_3 <- db_autoinc$fetch("test_autoinc_t") -expect_equal(nrow(result_3), 7L) -expect_equal(result_3$id_test, 1:7) - -# Test 4: Auto-increment preserves existing IDs in data -test_data_4 <- data.table::data.table( - id_test = c(NA, 100L, NA), - name = c("new_a", "existing", "new_b"), - unit = c("x", "y", "z") +expect_silent( + db_coords$set_coords( + type = "square", + epsg = 2056, + extent = terra::ext(c(xmin = 2697000, xmax = 2698000, ymin = 1252000, ymax = 1253000)), + resolution = 100 + ) ) +expect_true(db_coords$row_count("coords_t") > 0L) + +# Should refuse to overwrite expect_warning( - db_autoinc$commit_overwrite( - test_data_4, - "test_autoinc2_t", - autoincrement_cols = "id_test" + db_coords$set_coords( + type = "square", + epsg = 2056, + extent = terra::ext(c(xmin = 2697000, xmax = 2698000, ymin = 1252000, ymax = 1253000)), + resolution = 100 ), - "Overriding existing IDs" + "not empty" ) -result_4 <- db_autoinc$fetch("test_autoinc2_t") -expect_equal(result_4$id_test, 1:3) -# Test 5: Multiple auto-increment columns -test_data_5 <- data.table::data.table( - name = c("item1", "item2"), - value = c(10, 20) -) -db_autoinc$commit_overwrite( - test_data_5, - "test_multi_autoinc_t", - autoincrement_cols = c("id_a", "id_b") -) -result_5 <- db_autoinc$fetch("test_multi_autoinc_t") -expect_equal(result_5$id_a, 1:2) -expect_equal(result_5$id_b, 1:2) - -# Test 6: Auto-increment continues from max in append -test_data_6a <- data.table::data.table( - id_seq = c(5L, 10L, 15L), - value = c(100, 200, 300) -) -db_autoinc$commit_overwrite( - test_data_6a, - "test_continue_t" -) -test_data_6b <- data.table::data.table( - value = c(400, 500) +# Test 26: set_periods method +test_dir_periods <- tempfile("evoland_periods_") +on.exit(unlink(test_dir_periods, recursive = TRUE), add = TRUE) +db_periods <- evoland_db$new(test_dir_periods) + +expect_silent( + db_periods$set_periods( 
+ period_length_str = "P10Y", + start_observed = "1985-01-01", + end_observed = "2020-01-01", + end_extrapolated = "2060-01-01" + ) ) -db_autoinc$commit_append( - test_data_6b, - "test_continue_t", - autoincrement_cols = "id_seq" +expect_true(db_periods$row_count("periods_t") > 0L) + +# Should refuse to overwrite +expect_warning( + db_periods$set_periods(), + "not empty" ) -result_6 <- db_autoinc$fetch("test_continue_t") -expect_equal(nrow(result_6), 5L) -expect_equal(result_6$id_seq[4:5], c(16L, 17L)) -expect_equal(result_6$value[4:5], c(400, 500)) diff --git a/inst/tinytest/test_parquet_duckdb.R b/inst/tinytest/test_parquet_duckdb.R new file mode 100644 index 0000000..fe9b188 --- /dev/null +++ b/inst/tinytest/test_parquet_duckdb.R @@ -0,0 +1,349 @@ +# Test generic parquet_duckdb functionality +library(tinytest) + +# Create temporary directory for testing +test_dir <- tempfile("parquet_duckdb_test_") +on.exit(unlink(test_dir, recursive = TRUE), add = TRUE) + +# Test 1: Initialization +expect_silent( + db <- parquet_duckdb$new( + path = test_dir, + default_format = "parquet" + ) +) +expect_true(inherits(db, "parquet_duckdb")) +expect_true(dir.exists(test_dir)) +expect_equal(db$default_format, "parquet") +expect_true(!is.null(db$connection)) +expect_true(inherits(db$connection, "duckdb_connection")) + +# Test 2: Initial state - no tables +expect_identical(db$list_tables(), character(0)) + +# Test 3: Fetch from non-existent table returns empty data.table +result <- db$fetch("nonexistent_table") +expect_true(inherits(result, "data.table")) +expect_equal(nrow(result), 0L) + +# Test 4: Row count for non-existent table +expect_equal(db$row_count("nonexistent_table"), 0L) + +# Test 5: commit_overwrite creates new table +test_data_1 <- data.table::data.table( + id = 1:5, + name = letters[1:5], + value = c(10.1, 20.2, 30.3, 40.4, 50.5) +) +expect_silent( + db$commit_overwrite(test_data_1, "test_table_1") +) +expect_true("test_table_1" %in% db$list_tables()) 
+expect_equal(db$row_count("test_table_1"), 5L) + +# Test 6: Fetch retrieves committed data +retrieved <- db$fetch("test_table_1") +expect_equal(retrieved, test_data_1) + +# Test 7: commit_overwrite replaces existing data +test_data_1b <- data.table::data.table( + id = 10:12, + name = letters[24:26], + value = c(100.1, 200.2, 300.3) +) +expect_silent( + db$commit_overwrite(test_data_1b, "test_table_1") +) +expect_equal(db$row_count("test_table_1"), 3L) +retrieved <- db$fetch("test_table_1") +expect_equal(retrieved, test_data_1b) + +# Test 8: commit_append adds to existing data +test_data_1c <- data.table::data.table( + id = 13:15, + name = letters[1:3], + value = c(111.1, 222.2, 333.3) +) +expect_silent( + db$commit_append(test_data_1c, "test_table_1") +) +expect_equal(db$row_count("test_table_1"), 6L) +retrieved <- db$fetch("test_table_1") +expect_equal(nrow(retrieved), 6L) +expect_true(all(c(10:15) %in% retrieved$id)) + +# Test 9: commit_append on non-existent table creates it +expect_silent( + db$commit_append(test_data_1, "test_table_2") +) +expect_true("test_table_2" %in% db$list_tables()) +expect_equal(db$row_count("test_table_2"), 5L) + +# Test 10: commit_upsert on non-existent table creates it +test_data_3 <- data.table::data.table( + id_key = 1:3, + name = c("a", "b", "c"), + value = c(1.1, 2.2, 3.3) +) +expect_silent( + db$commit_upsert(test_data_3, "test_table_3", key_cols = "id_key") +) +expect_true("test_table_3" %in% db$list_tables()) +expect_equal(db$row_count("test_table_3"), 3L) + +# Test 11: commit_upsert updates existing rows and inserts new ones +test_data_3b <- data.table::data.table( + id_key = c(2L, 3L, 4L), + name = c("b_updated", "c_updated", "d"), + value = c(22.2, 33.3, 44.4) +) +expect_silent( + db$commit_upsert(test_data_3b, "test_table_3", key_cols = "id_key") +) +expect_equal(db$row_count("test_table_3"), 4L) +retrieved <- db$fetch("test_table_3") +expect_equal(nrow(retrieved), 4L) +expect_equal(retrieved[id_key == 2]$name, 
"b_updated") +expect_equal(retrieved[id_key == 3]$value, 33.3) +expect_equal(retrieved[id_key == 4]$name, "d") +expect_equal(retrieved[id_key == 1]$name, "a") # unchanged + +# Test 12: Fetch with WHERE clause +result <- db$fetch("test_table_3", where = "id_key >= 3") +expect_equal(nrow(result), 2L) +expect_true(all(result$id_key >= 3)) + +# Test 13: Fetch with LIMIT +result <- db$fetch("test_table_3", limit = 2) +expect_equal(nrow(result), 2L) + +# Test 14: Fetch with WHERE and LIMIT +result <- db$fetch("test_table_3", where = "value > 20", limit = 1) +expect_equal(nrow(result), 1L) +expect_true(result$value > 20) + +# Test 15: delete_from with WHERE clause +deleted_count <- db$delete_from("test_table_3", where = "id_key = 1") +expect_equal(deleted_count, 1L) +expect_equal(db$row_count("test_table_3"), 3L) +retrieved <- db$fetch("test_table_3") +expect_false(1L %in% retrieved$id_key) + +# Test 16: delete_from with complex WHERE +db$commit_overwrite(test_data_1, "test_table_4") +deleted_count <- db$delete_from("test_table_4", where = "id < 3") +expect_equal(deleted_count, 2L) +retrieved <- db$fetch("test_table_4") +expect_true(all(retrieved$id >= 3)) + +# Test 17: delete_from with NULL (delete all) +count_before <- db$row_count("test_table_4") +deleted_count <- db$delete_from("test_table_4", where = NULL) +expect_equal(deleted_count, count_before) +expect_equal(db$row_count("test_table_4"), 0L) + +# Test 18: delete_from on non-existent table +deleted_count <- db$delete_from("nonexistent", where = "id = 1") +expect_equal(deleted_count, 0L) + +# Test 19: delete_from with WHERE that matches nothing +db$commit_overwrite(test_data_1, "test_table_5") +initial_count <- db$row_count("test_table_5") +deleted_count <- db$delete_from("test_table_5", where = "id = 999") +expect_equal(deleted_count, 0L) +expect_equal(db$row_count("test_table_5"), initial_count) + +# Test 20: Auto-increment on overwrite (new table) +test_autoinc_1 <- data.table::data.table( + name = c("item_a", 
"item_b", "item_c"), + value = c(10, 20, 30) +) +db$commit_overwrite( + test_autoinc_1, + "test_autoinc_1", + autoincrement_cols = "id" +) +result <- db$fetch("test_autoinc_1") +expect_equal(result$id, 1:3) +expect_equal(result$name, c("item_a", "item_b", "item_c")) + +# Test 21: Auto-increment on append +test_autoinc_2 <- data.table::data.table( + name = c("item_d", "item_e"), + value = c(40, 50) +) +db$commit_append( + test_autoinc_2, + "test_autoinc_1", + autoincrement_cols = "id" +) +result <- db$fetch("test_autoinc_1") +expect_equal(nrow(result), 5L) +expect_equal(result$id, 1:5) +expect_equal(result$name[4:5], c("item_d", "item_e")) + +# Test 22: Auto-increment on upsert with new rows +test_autoinc_3 <- data.table::data.table( + name = c("item_f", "item_g"), + value = c(60, 70) +) +db$commit_upsert( + test_autoinc_3, + "test_autoinc_1", + autoincrement_cols = "id" +) +result <- db$fetch("test_autoinc_1") +expect_equal(nrow(result), 7L) +expect_equal(result$id, 1:7) + +# Test 23: Auto-increment warning when overriding existing IDs +test_autoinc_with_ids <- data.table::data.table( + id = c(NA, 100L, NA), + name = c("new_a", "existing", "new_b"), + value = c(1, 2, 3) +) +expect_warning( + db$commit_overwrite( + test_autoinc_with_ids, + "test_autoinc_2", + autoincrement_cols = "id" + ), + "Overriding existing IDs" +) +result <- db$fetch("test_autoinc_2") +expect_equal(result$id, 1:3) + +# Test 24: Multiple auto-increment columns +test_multi_autoinc <- data.table::data.table( + name = c("item1", "item2"), + value = c(10, 20) +) +db$commit_overwrite( + test_multi_autoinc, + "test_multi_autoinc", + autoincrement_cols = c("id_a", "id_b") +) +result <- db$fetch("test_multi_autoinc") +expect_equal(result$id_a, 1:2) +expect_equal(result$id_b, 1:2) + +# Test 25: Auto-increment continues from max +test_continue_a <- data.table::data.table( + id_seq = c(5L, 10L, 15L), + value = c(100, 200, 300) +) +db$commit_overwrite(test_continue_a, "test_continue") + +test_continue_b <- 
data.table::data.table( + value = c(400, 500) +) +db$commit_append( + test_continue_b, + "test_continue", + autoincrement_cols = "id_seq" +) +result <- db$fetch("test_continue") +expect_equal(nrow(result), 5L) +expect_equal(result$id_seq[4:5], c(16L, 17L)) +expect_equal(result$value[4:5], c(400, 500)) + +# Test 26: attach_table and detach_table +db$commit_overwrite(test_data_1, "test_attach") +expect_silent(db$attach_table("test_attach")) +# Verify table is attached by querying it directly +result <- db$get_query("SELECT COUNT(*) as n FROM test_attach") +expect_equal(result$n, 5L) +expect_silent(db$detach_table("test_attach")) +# After detach, table should not be accessible +expect_error( + db$get_query("SELECT COUNT(*) as n FROM test_attach"), + "test_attach" +) + +# Test 27: attach_table with column selection +db$attach_table("test_attach", columns = c("id", "name")) +result <- db$get_query("SELECT * FROM test_attach") +expect_equal(ncol(result), 2L) +expect_true(all(c("id", "name") %in% names(result))) +expect_false("value" %in% names(result)) +db$detach_table("test_attach") + +# Test 28: attach_table with WHERE clause +db$attach_table("test_attach", where = "id > 3") +result <- db$get_query("SELECT * FROM test_attach") +expect_equal(nrow(result), 2L) +expect_true(all(result$id > 3)) +db$detach_table("test_attach") + +# Test 29: execute() method +db$attach_table("test_attach") +rows_affected <- db$execute("DELETE FROM test_attach WHERE id = 1") +expect_true(rows_affected >= 0) # DuckDB returns number of affected rows +db$detach_table("test_attach") + +# Test 30: get_query() method +db$attach_table("test_attach") +result <- db$get_query("SELECT MAX(id) as max_id FROM test_attach") +expect_true(inherits(result, "data.table")) +expect_true("max_id" %in% names(result)) +db$detach_table("test_attach") + +# Test 31: CSV format support +test_dir_csv <- tempfile("parquet_duckdb_csv_") +on.exit(unlink(test_dir_csv, recursive = TRUE), add = TRUE) + +db_csv <- 
parquet_duckdb$new( + path = test_dir_csv, + default_format = "csv" +) +expect_equal(db_csv$default_format, "csv") + +test_csv_data <- data.table::data.table( + id = 1:3, + name = c("a", "b", "c") +) +db_csv$commit_overwrite(test_csv_data, "csv_table") +expect_true("csv_table" %in% db_csv$list_tables()) +retrieved <- db_csv$fetch("csv_table") +expect_equal(retrieved, test_csv_data) + +# Test 32: Extension loading +test_dir_ext <- tempfile("parquet_duckdb_ext_") +on.exit(unlink(test_dir_ext, recursive = TRUE), add = TRUE) + +db_ext <- parquet_duckdb$new( + path = test_dir_ext, + extensions = "spatial" +) +# Verify spatial extension is loaded by using a spatial function +expect_silent( + db_ext$get_query("SELECT ST_Point(0, 0) as geom") +) + +# Test 33: Persistence across connections +db$commit_overwrite(test_data_1, "persist_test") +rm(db) +gc() + +# Reconnect to same path +db <- parquet_duckdb$new(path = test_dir) +expect_true("persist_test" %in% db$list_tables()) +retrieved <- db$fetch("persist_test") +expect_equal(retrieved, test_data_1) + +# Test 34: commit_upsert with no key_cols defaults to append +test_no_keys <- data.table::data.table( + name = c("x", "y"), + value = c(1, 2) +) +db$commit_overwrite(test_no_keys, "no_keys_test") +expect_equal(db$row_count("no_keys_test"), 2L) + +db$commit_upsert(test_no_keys, "no_keys_test", key_cols = character(0)) +expect_equal(db$row_count("no_keys_test"), 4L) # Should append + +# Test 35: Print method +output <- capture.output(print(db)) +expect_true(any(grepl("parquet_duckdb", output))) +expect_true(any(grepl("Database path", output))) +expect_true(any(grepl("Default format", output))) diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index af49fa0..49cf8f8 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -8,19 +8,11 @@ An R6 class that provides an interface to a folder-based data storage system for the evoland package. Each table is stored as a parquet (or CSV) file. 
This class uses DuckDB for in-memory SQL operations while persisting data to disk in parquet format for better compression. -} -\section{Public fields}{ -\if{html}{\out{
}} -\describe{ -\item{\code{connection}}{DBI connection object to an in-memory DuckDB database} - -\item{\code{path}}{Character string path to the data folder} -\item{\code{default_format}}{Default file format for new tables} - -\item{\code{writeopts}}{Default write options for DuckDB, see} +Inherits from \link{parquet_duckdb} for generic database operations. } -\if{html}{\out{
}} +\section{Super class}{ +\code{\link[evoland:parquet_duckdb]{evoland::parquet_duckdb}} -> \code{evoland_db} } \section{Active bindings}{ \if{html}{\out{
}} @@ -87,17 +79,7 @@ of object to assign. Assigning is an upsert operation.} \subsection{Public methods}{ \itemize{ \item \href{#method-evoland_db-new}{\code{evoland_db$new()}} -\item \href{#method-evoland_db-commit_overwrite}{\code{evoland_db$commit_overwrite()}} -\item \href{#method-evoland_db-commit_append}{\code{evoland_db$commit_append()}} -\item \href{#method-evoland_db-commit_upsert}{\code{evoland_db$commit_upsert()}} \item \href{#method-evoland_db-fetch}{\code{evoland_db$fetch()}} -\item \href{#method-evoland_db-list_tables}{\code{evoland_db$list_tables()}} -\item \href{#method-evoland_db-execute}{\code{evoland_db$execute()}} -\item \href{#method-evoland_db-get_query}{\code{evoland_db$get_query()}} -\item \href{#method-evoland_db-attach_table}{\code{evoland_db$attach_table()}} -\item \href{#method-evoland_db-detach_table}{\code{evoland_db$detach_table()}} -\item \href{#method-evoland_db-row_count}{\code{evoland_db$row_count()}} -\item \href{#method-evoland_db-delete_from}{\code{evoland_db$delete_from()}} \item \href{#method-evoland_db-set_report}{\code{evoland_db$set_report()}} \item \href{#method-evoland_db-set_coords}{\code{evoland_db$set_coords()}} \item \href{#method-evoland_db-set_periods}{\code{evoland_db$set_periods()}} @@ -106,6 +88,23 @@ of object to assign. Assigning is an upsert operation.} \item \href{#method-evoland_db-clone}{\code{evoland_db$clone()}} } } +\if{html}{\out{ +
Inherited methods + +
+}} \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-new}{}}} @@ -132,98 +131,10 @@ A new \code{evoland_db} object } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-commit_overwrite}{}}} -\subsection{Method \code{commit_overwrite()}}{ -Commit data in overwrite mode -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$commit_overwrite( - x, - table_name, - autoincrement_cols = character(0), - map_cols = character(0) -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Data frame to commit} - -\item{\code{table_name}}{Character string table name} - -\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} - -\item{\code{map_cols}}{Character vector of columns to convert to MAP format} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-commit_append}{}}} -\subsection{Method \code{commit_append()}}{ -Commit data in append mode -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$commit_append( - x, - table_name, - autoincrement_cols = character(0), - map_cols = character(0) -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Data frame to commit} - -\item{\code{table_name}}{Character string table name} - -\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} - -\item{\code{map_cols}}{Character vector of columns to convert to MAP format} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-commit_upsert}{}}} -\subsection{Method \code{commit_upsert()}}{ -Commit data in upsert mode -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$commit_upsert( - x, - table_name, - key_cols = grep("^id_", names(x), value = TRUE), - autoincrement_cols = character(0), - map_cols = character(0) -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Data frame to commit} - -\item{\code{table_name}}{Character string table name} - -\item{\code{key_cols}}{Identify unique columns - heuristic: if prefixed with -id_, the set of all columns designates a uniqueness condition} - -\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} - -\item{\code{map_cols}}{Character vector of columns to convert to MAP format} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-fetch}{}}} \subsection{Method \code{fetch()}}{ -Fetch data from storage +Fetch data from storage with evoland-specific view support \subsection{Usage}{ \if{html}{\out{
}}\preformatted{evoland_db$fetch(table_name, where = NULL, limit = NULL)}\if{html}{\out{
}} } @@ -244,141 +155,6 @@ A data.table } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-list_tables}{}}} -\subsection{Method \code{list_tables()}}{ -List all tables (files) in storage -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$list_tables()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -Character vector of table names -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-execute}{}}} -\subsection{Method \code{execute()}}{ -Execute statement -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$execute(statement)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{statement}}{A SQL statement} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -No. of rows affected by statement -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-get_query}{}}} -\subsection{Method \code{get_query()}}{ -Get Query -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$get_query(statement)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{statement}}{A SQL statement} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -No. of rows affected by statement -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-attach_table}{}}} -\subsection{Method \code{attach_table()}}{ -Attach a single table as a temporary table in DuckDB memory. This is -useful for working with multiple tables in SQL queries instead of loading -them into R objects. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$attach_table(table_name, columns = "*", where = NULL)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{table_name}}{Character vector. Names of table to attach.} - -\item{\code{columns}}{Character vector. Optional sql column selection, defaults to "*"} - -\item{\code{where}}{A SQL where statement to optionally subset the table being attached.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-detach_table}{}}} -\subsection{Method \code{detach_table()}}{ -Detach one or more tables from the database. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$detach_table(table_name)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{table_name}}{Character. Name of table to drop.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-row_count}{}}} -\subsection{Method \code{row_count()}}{ -Get table row count -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$row_count(table_name)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{table_name}}{Character string. Name of the table to query.} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -No. of rows -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-delete_from}{}}} -\subsection{Method \code{delete_from()}}{ -Delete rows from a table -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$delete_from(table_name, where = NULL)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{table_name}}{Character string. Name of the table to delete from.} - -\item{\code{where}}{Character string, defaults to NULL: delete everything in table.} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -No. of rows affected -} -} -\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-set_report}{}}} \subsection{Method \code{set_report()}}{ diff --git a/man/evoland_db_views.Rd b/man/evoland_db_views.Rd index d149f48..a080c24 100644 --- a/man/evoland_db_views.Rd +++ b/man/evoland_db_views.Rd @@ -7,6 +7,7 @@ \alias{make_coords_minimal} \alias{make_extent_db} \alias{make_transitions_v} +\alias{make_trans_pred_data_v} \title{Views on the evoland-plus data model} \usage{ make_pred_sources_v(self, private, where = NULL) @@ -18,6 +19,14 @@ make_coords_minimal(self, private, where = NULL) make_extent_db(self, private) make_transitions_v(self, private, where = NULL) + +make_trans_pred_data_v(self, private, id_trans) +} +\arguments{ +\item{id_trans}{Integer, the transition ID to generate data for} +} +\value{ +data.table with columns: result (0/1), id_pred_1, id_pred_2, ..., id_pred_N } \description{ Functions to generate views on the database @@ -35,4 +44,7 @@ md5sum \item \code{make_transitions_v()}: Returns transitions based on lulc_data_t +\item \code{make_trans_pred_data_v()}: Returns wide table of transition results and predictor data +for a specific transition. Used as input to covariance filtering. + }} diff --git a/man/parquet_duckdb.Rd b/man/parquet_duckdb.Rd new file mode 100644 index 0000000..964cd90 --- /dev/null +++ b/man/parquet_duckdb.Rd @@ -0,0 +1,395 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parquet_duckdb.R +\name{parquet_duckdb} +\alias{parquet_duckdb} +\title{R6 Base Class for Parquet-Backed DuckDB Storage} +\description{ +A domain-agnostic R6 class that provides an interface to a folder-based data +storage system using DuckDB for in-memory SQL operations and parquet files +for efficient on-disk persistence. This class can be inherited by +domain-specific database classes. +} +\section{Public fields}{ +\if{html}{\out{
}} +\describe{ +\item{\code{connection}}{DBI connection object to an in-memory DuckDB database} + +\item{\code{path}}{Character string path to the data folder} + +\item{\code{default_format}}{Default file format for new tables} + +\item{\code{writeopts}}{Default write options for DuckDB} +} +\if{html}{\out{
}} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-parquet_duckdb-new}{\code{parquet_duckdb$new()}} +\item \href{#method-parquet_duckdb-execute}{\code{parquet_duckdb$execute()}} +\item \href{#method-parquet_duckdb-get_query}{\code{parquet_duckdb$get_query()}} +\item \href{#method-parquet_duckdb-attach_table}{\code{parquet_duckdb$attach_table()}} +\item \href{#method-parquet_duckdb-detach_table}{\code{parquet_duckdb$detach_table()}} +\item \href{#method-parquet_duckdb-row_count}{\code{parquet_duckdb$row_count()}} +\item \href{#method-parquet_duckdb-list_tables}{\code{parquet_duckdb$list_tables()}} +\item \href{#method-parquet_duckdb-with_tables}{\code{parquet_duckdb$with_tables()}} +\item \href{#method-parquet_duckdb-fetch}{\code{parquet_duckdb$fetch()}} +\item \href{#method-parquet_duckdb-delete_from}{\code{parquet_duckdb$delete_from()}} +\item \href{#method-parquet_duckdb-commit_overwrite}{\code{parquet_duckdb$commit_overwrite()}} +\item \href{#method-parquet_duckdb-commit_append}{\code{parquet_duckdb$commit_append()}} +\item \href{#method-parquet_duckdb-commit_upsert}{\code{parquet_duckdb$commit_upsert()}} +\item \href{#method-parquet_duckdb-print}{\code{parquet_duckdb$print()}} +\item \href{#method-parquet_duckdb-clone}{\code{parquet_duckdb$clone()}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-new}{}}} +\subsection{Method \code{new()}}{ +Initialize a new parquet_duckdb object +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$new( + path, + default_format = c("parquet", "csv"), + extensions = character(0) +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{path}}{Character string. Path to the data folder.} + +\item{\code{default_format}}{Character. Default file format ("parquet" or "csv"). +Default is "parquet".} + +\item{\code{extensions}}{Character vector of DuckDB extensions to load (e.g., "spatial")} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A new \code{parquet_duckdb} object +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-execute}{}}} +\subsection{Method \code{execute()}}{ +Execute a SQL statement +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$execute(statement)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{statement}}{A SQL statement} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Number of rows affected by statement +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-get_query}{}}} +\subsection{Method \code{get_query()}}{ +Execute a SQL query and return results +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$get_query(statement)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{statement}}{A SQL query statement} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A data.table with query results +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-attach_table}{}}} +\subsection{Method \code{attach_table()}}{ +Attach a table from parquet/CSV file as a temporary table in DuckDB +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$attach_table(table_name, columns = "*", where = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{table_name}}{Character. Name of table to attach.} + +\item{\code{columns}}{Character vector. Optional SQL column selection, defaults to "*"} + +\item{\code{where}}{Character. Optional SQL WHERE clause to subset the table.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Invisible NULL (called for side effects) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-detach_table}{}}} +\subsection{Method \code{detach_table()}}{ +Detach a table from the in-memory database +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$detach_table(table_name)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{table_name}}{Character. Name of table to drop.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Invisible NULL (called for side effects) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-row_count}{}}} +\subsection{Method \code{row_count()}}{ +Get row count for a table +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$row_count(table_name)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{table_name}}{Character string. Name of the table to query.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Integer number of rows +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-list_tables}{}}} +\subsection{Method \code{list_tables()}}{ +List all tables (files) in storage +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$list_tables()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +Character vector of table names +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-with_tables}{}}} +\subsection{Method \code{with_tables()}}{ +Execute a function with specified tables attached, handling attach/detach automatically. +If a table is already attached in the DuckDB instance, it won't be re-attached or detached. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$with_tables(tables, func, ...)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{tables}}{Character vector of table names to attach} + +\item{\code{func}}{Function to execute with tables attached} + +\item{\code{...}}{Additional arguments passed to func} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Result of func +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-fetch}{}}} +\subsection{Method \code{fetch()}}{ +Fetch data from a table +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$fetch(table_name, where = NULL, limit = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{table_name}}{Character string. Name of the table to query.} + +\item{\code{where}}{Character string. Optional WHERE clause for the SQL query.} + +\item{\code{limit}}{Integer. Optional limit on number of rows to return.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A data.table +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-delete_from}{}}} +\subsection{Method \code{delete_from()}}{ +Delete rows from a table +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$delete_from(table_name, where = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{table_name}}{Character string. Name of the table to delete from.} + +\item{\code{where}}{Character string. Optional WHERE clause; if NULL, deletes all rows.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Number of rows deleted +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-commit_overwrite}{}}} +\subsection{Method \code{commit_overwrite()}}{ +Commit data in overwrite mode +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$commit_overwrite( + x, + table_name, + autoincrement_cols = character(0), + map_cols = character(0) +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Data frame to commit} + +\item{\code{table_name}}{Character string table name} + +\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} + +\item{\code{map_cols}}{Character vector of columns to convert to MAP format} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Invisible NULL (called for side effects) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-commit_append}{}}} +\subsection{Method \code{commit_append()}}{ +Commit data in append mode +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$commit_append( + x, + table_name, + autoincrement_cols = character(0), + map_cols = character(0) +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Data frame to commit} + +\item{\code{table_name}}{Character string table name} + +\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} + +\item{\code{map_cols}}{Character vector of columns to convert to MAP format} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Invisible NULL (called for side effects) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-commit_upsert}{}}} +\subsection{Method \code{commit_upsert()}}{ +Commit data in upsert mode (update existing, insert new) +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$commit_upsert( + x, + table_name, + key_cols = grep("^id_", names(x), value = TRUE), + autoincrement_cols = character(0), + map_cols = character(0) +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Data frame to commit} + +\item{\code{table_name}}{Character string table name} + +\item{\code{key_cols}}{Character vector of columns that define uniqueness} + +\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} + +\item{\code{map_cols}}{Character vector of columns to convert to MAP format} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Invisible NULL (called for side effects) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-print}{}}} +\subsection{Method \code{print()}}{ +Print method for parquet_duckdb +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$print(...)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{...}}{Not used} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +self (invisibly) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +} From 0aa5a46bb7ed083aeec951314a2d89be7a8cc403 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Thu, 27 Nov 2025 19:18:03 +0100 Subject: [PATCH 06/27] initial trans-preds implementation --- R/trans_preds_t.R | 112 +++++++++++++++- inst/tinytest/test_trans_preds_t.R | 203 ++++++++++++++++++++++++++++- 2 files changed, 311 insertions(+), 4 deletions(-) diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index 6d7c5dc..9e7c420 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -19,6 +19,8 @@ as_trans_preds_t <- function(x) { id_trans = integer(0) ) } + cast_dt_col(x, "id_pred", as.integer) + cast_dt_col(x, "id_trans", as.integer) new_evoland_table( x, "trans_preds_t", @@ -27,10 +29,114 @@ as_trans_preds_t <- function(x) { } #' @describeIn trans_preds_t Create a transition-predictor relation, i.e. records the -#' result of a predictor selection step. +#' result of a predictor selection step. Runs covariance filtering for each viable +#' transition and stores the selected predictors. +#' @param db An [evoland_db] instance with populated tables +#' @param corcut Numeric threshold (0-1) for correlation filtering passed to [covariance_filter()] +#' @param rank_fun Optional ranking function passed to [covariance_filter()] +#' @param weights Optional weights passed to [covariance_filter()] +#' @param ... Additional arguments passed to rank_fun via [covariance_filter()] #' @export -create_trans_preds_t <- function() { - as_trans_preds_t() +create_trans_preds_t <- function( + db, + corcut = 0.7, + rank_fun = rank_poly_glm, + weights = NULL, + ... 
+) { + stopifnot( + "db must be an evoland_db instance" = inherits(db, "evoland_db") + ) + + viable_trans <- db$trans_meta_t[is_viable == TRUE] + pred_meta <- db$pred_meta_t + stopifnot( + "No viable transitions found in trans_meta_t" = nrow(viable_trans) > 0L, + "No predictors found in pred_meta_t" = nrow(pred_meta) > 0L + ) + + results_list <- list() + + # Iterate over transitions (anterior/posterior pairs) + for (i in seq_len(nrow(viable_trans))) { + id_trans <- viable_trans$id_trans[i] + id_lulc_ant <- viable_trans$id_lulc_anterior[i] + id_lulc_post <- viable_trans$id_lulc_posterior[i] + + message(glue::glue( + "Processing transition {i}/{nrow(viable_trans)}: ", + "id_trans={id_trans} ({id_lulc_ant} -> {id_lulc_post})" + )) + + # Get wide transition-predictor data + tryCatch( + { + trans_pred_data <- make_trans_pred_data_v(db, list(), id_trans) + + # Check if we have any data + if (nrow(trans_pred_data) == 0L) { + warning(glue::glue( + "No data for transition {id_trans}, skipping" + )) + next + } + + # Check if we have any predictor columns + pred_cols <- grep("^id_pred_", names(trans_pred_data), value = TRUE) + if (length(pred_cols) == 0L) { + warning(glue::glue( + "No predictor columns for transition {id_trans}, skipping" + )) + next + } + + filtered_data <- covariance_filter( + data = trans_pred_data, + result_col = "result", + rank_fun = rank_fun, + corcut = corcut, + ... 
+ ) + + # Extract selected predictor IDs from column names + selected_cols <- setdiff(names(filtered_data), "result") + + if (length(selected_cols) > 0L) { + # Parse id_pred values from column names (e.g., "id_pred_1" -> 1) + selected_ids <- as.integer(sub("^id_pred_", "", selected_cols)) + + # Create result rows + results_list[[length(results_list) + 1]] <- data.table::data.table( + id_pred = selected_ids, + id_trans = id_trans + ) + + message(glue::glue( + " Selected {length(selected_ids)} predictor(s) for transition {id_trans}" + )) + } else { + message(glue::glue( + " No predictors selected for transition {id_trans}" + )) + } + }, + error = function(e) { + warning(glue::glue( + "Error processing transition {id_trans}: {e$message}" + )) + } + ) + } + + # Combine all results + if (length(results_list) == 0L) { + warning("No predictors selected for any transition") + return(as_trans_preds_t()) + } + + result <- data.table::rbindlist(results_list) + + as_trans_preds_t(result) } #' @export diff --git a/inst/tinytest/test_trans_preds_t.R b/inst/tinytest/test_trans_preds_t.R index 866c455..3c5feda 100644 --- a/inst/tinytest/test_trans_preds_t.R +++ b/inst/tinytest/test_trans_preds_t.R @@ -2,6 +2,207 @@ library(tinytest) # Test creation and validation -trans_preds_t <- create_trans_preds_t() +trans_preds_t <- as_trans_preds_t() expect_silent(print(trans_preds_t)) expect_true(nrow(trans_preds_t) >= 0L) + + +#' Actually running the predictor selection logic is heavily dependent on previous +#' steps. Hence, this is a more heavy-handed test verging on integration testing. 
+test_dir_trans_preds <- tempfile("evoland_trans_preds_") +on.exit(unlink(test_dir_trans_preds, recursive = TRUE), add = TRUE) +db_tps <- evoland_db$new(test_dir_trans_preds) + +# Set up minimal coords and periods +db_tps$coords_t <- create_coords_t_square( + epsg = 2056, + extent = terra::ext(c(xmin = 2697000, xmax = 2697500, ymin = 1252000, ymax = 1252500)), + resolution = 100 +) + +db_tps$periods_t <- create_periods_t( + period_length_str = "P10Y", + start_observed = "1985-01-01", + end_observed = "2005-01-01", + end_extrapolated = "2015-01-01" +) + +# Set up LULC classes +db_tps$lulc_meta_t <- create_lulc_meta_t(list( + forest = list(pretty_name = "Forest", src_classes = 1L), + urban = list(pretty_name = "Urban", src_classes = 2L), + agriculture = list(pretty_name = "Agriculture", src_classes = 3L) +)) + +# Create synthetic LULC data with known transitions +set.seed(42) +n_coords <- nrow(db_tps$coords_t) +lulc_data <- data.table::rbindlist(list( + data.table::data.table( + id_coord = 1:n_coords, + id_lulc = sample(1:2, n_coords, replace = TRUE, prob = c(0.7, 0.3)), + id_period = 1L + ), + data.table::data.table( + id_coord = 1:n_coords, + id_lulc = sample(1:2, n_coords, replace = TRUE, prob = c(0.5, 0.5)), + id_period = 2L + ), + data.table::data.table( + id_coord = 1:n_coords, + id_lulc = sample(1:2, n_coords, replace = TRUE, prob = c(0.4, 0.6)), + id_period = 3L + ) +)) +db_tps$lulc_data_t <- as_lulc_data_t(lulc_data) + +# Create transition metadata +transitions <- db_tps$fetch("transitions_v") +db_tps$trans_meta_t <- create_trans_meta_t( + transitions, + min_cardinality_abs = 5L +) + +# Add predictor metadata with multiple predictors +pred_spec_tps <- list( + elevation = list( + unit = "m", + pretty_name = "Elevation", + description = "Elevation above sea level", + sources = list(list(url = "https://example.com/elevation.tif", md5sum = "abc123")) + ), + slope = list( + unit = "degrees", + pretty_name = "Slope", + description = "Terrain slope", + sources = 
list(list(url = "https://example.com/slope.tif", md5sum = "def456")) + ), + distance_to_road = list( + unit = "m", + pretty_name = "Distance to road", + description = "Distance to nearest road", + sources = list(list(url = "https://example.com/roads.gpkg", md5sum = "ghi789")) + ), + aspect = list( + unit = "degrees", + pretty_name = "Aspect", + description = "Terrain aspect", + sources = list(list(url = "https://example.com/aspect.tif", md5sum = "jkl012")) + ), + soil_type = list( + pretty_name = "Soil type", + description = "Soil classification", + sources = list(list(url = "https://example.com/soil.tif", md5sum = "mno345")) + ) +) +db_tps$pred_meta_t <- create_pred_meta_t(pred_spec_tps) + +# Add predictor data - mix of static and time-varying +set.seed(43) +pred_data_static <- data.table::rbindlist(list( + # Elevation (static, period 0) + data.table::data.table( + id_pred = 1L, + id_coord = 1:n_coords, + id_period = 0L, + value = runif(n_coords, 400, 800) + ), + # Slope (static, period 0) + data.table::data.table( + id_pred = 2L, + id_coord = 1:n_coords, + id_period = 0L, + value = runif(n_coords, 0, 30) + ), + # Aspect (static, period 0) + data.table::data.table( + id_pred = 4L, + id_coord = 1:n_coords, + id_period = 0L, + value = runif(n_coords, 0, 360) + ) +)) + +# Time-varying predictor +pred_data_varying <- data.table::rbindlist(lapply(1:3, function(period) { + data.table::data.table( + id_pred = 3L, # distance_to_road + id_coord = 1:n_coords, + id_period = period, + value = runif(n_coords, 0, 5000) + ) +})) + +db_tps$pred_data_t_float <- as_pred_data_t( + rbind(pred_data_static, pred_data_varying), + type = "float" +) + +# Add integer predictor (soil_type) +pred_data_int <- data.table::data.table( + id_pred = 5L, + id_coord = 1:n_coords, + id_period = 0L, + value = sample(1:5, n_coords, replace = TRUE) +) +db_tps$pred_data_t_int <- as_pred_data_t(pred_data_int, type = "int") + +# Test create_trans_preds_t +expect_message( + trans_preds_result <- + 
create_trans_preds_t( + db = db_tps, + corcut = 0.7 + ), + "Processing transition 1/2" +) + +expect_true(inherits(trans_preds_result, "trans_preds_t")) +expect_equal(nrow(trans_preds_result), 8L) + +# Verify structure +expect_true(all(c("id_pred", "id_trans") %in% names(trans_preds_result))) +expect_true(is.integer(trans_preds_result$id_pred)) +expect_true(is.integer(trans_preds_result$id_trans)) + +# Verify that all id_trans in result are viable +viable_trans_ids <- db_tps$trans_meta_t[is_viable == TRUE]$id_trans +expect_true(all(trans_preds_result$id_trans %in% viable_trans_ids)) + +# Verify that all id_pred in result exist in pred_meta_t +expect_true(all(trans_preds_result$id_pred %in% db_tps$pred_meta_t$id_pred)) + +# Test error handling - empty database +test_dir_empty <- tempfile("evoland_empty_") +on.exit(unlink(test_dir_empty, recursive = TRUE), add = TRUE) +db_empty <- evoland_db$new(test_dir_empty) + +expect_error( + create_trans_preds_t(db = "not_a_db"), + "must be an evoland_db" +) + +expect_error( + create_trans_preds_t(db = db_empty), + "No viable transitions" +) + +# Test with no predictors +test_dir_no_pred <- tempfile("evoland_no_pred_") +on.exit(unlink(test_dir_no_pred, recursive = TRUE), add = TRUE) +db_no_pred <- evoland_db$new(test_dir_no_pred) +db_no_pred$coords_t <- db_tps$coords_t +db_no_pred$periods_t <- db_tps$periods_t +db_no_pred$lulc_meta_t <- db_tps$lulc_meta_t +db_no_pred$lulc_data_t <- db_tps$lulc_data_t +db_no_pred$trans_meta_t <- db_tps$trans_meta_t +expect_error( + create_trans_preds_t(db = db_no_pred), + "No predictors found" +) + +# Test print method +expect_stdout( + print(trans_preds_result), + "Transition-Predictor|Total relationships" +) From f394e85c4fb6b96099cb5304f0a423e4ae88045e Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Thu, 27 Nov 2025 19:18:34 +0100 Subject: [PATCH 07/27] binomial -> quasibinomial --- R/covariance_filter.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/R/covariance_filter.R b/R/covariance_filter.R index b2475f6..6860798 100644 --- a/R/covariance_filter.R +++ b/R/covariance_filter.R @@ -100,7 +100,7 @@ rank_poly_glm <- function(x, y, weights = NULL, ...) { fit <- glm.fit( x = cbind(1, poly(x, degree = 2, simple = TRUE)), y = y, - family = binomial(), + family = quasibinomial(), weights = weights ) From 5625308c2d4a4fae28f9cf88c9c733a821a079c7 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Fri, 28 Nov 2025 14:54:41 +0100 Subject: [PATCH 08/27] move neighbors logic into codebase + refactor: use $set to add methods/fields --- DESCRIPTION | 3 + NAMESPACE | 3 + R/evoland_db.R | 287 +++--------------------------------- R/evoland_db_neighbors.R | 173 ++++++++++++++++++++++ R/evoland_db_tables.R | 206 ++++++++++++++++++++++++++ R/evoland_db_views.R | 142 ++++++++---------- R/neighbors_t.R | 81 ++++++++++ R/parquet_duckdb.R | 47 ++++++ R/trans_preds_t.R | 2 +- R/util_terra.R | 2 + man/evoland_db.Rd | 104 ++++++------- man/evoland_db_neighbors.Rd | 26 ++++ man/evoland_db_tables.Rd | 30 ++++ man/evoland_db_views.Rd | 55 ++----- man/neighbors_t.Rd | 41 ++++++ man/trans_preds_t.Rd | 21 ++- 16 files changed, 767 insertions(+), 456 deletions(-) create mode 100644 R/evoland_db_neighbors.R create mode 100644 R/evoland_db_tables.R create mode 100644 R/neighbors_t.R create mode 100644 man/evoland_db_neighbors.Rd create mode 100644 man/evoland_db_tables.Rd create mode 100644 man/neighbors_t.Rd diff --git a/DESCRIPTION b/DESCRIPTION index fb2d27f..c21e25f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,12 +40,15 @@ Collate: 'covariance_filter.R' 'parquet_duckdb.R' 'evoland_db.R' + 'evoland_db_neighbors.R' + 'evoland_db_tables.R' 'evoland_db_views.R' 'init.R' 'intrv_masks_t.R' 'intrv_meta_t.R' 'lulc_data_t.R' 'lulc_meta_t.R' + 'neighbors_t.R' 'periods_t.R' 'pred_data_t.R' 'pred_meta_t.R' diff --git a/NAMESPACE b/NAMESPACE index 50faaf3..f111398 100644 --- a/NAMESPACE +++ 
b/NAMESPACE @@ -8,6 +8,7 @@ S3method(print,intrv_masks_t) S3method(print,intrv_meta_t) S3method(print,lulc_data_t) S3method(print,lulc_meta_t) +S3method(print,neighbors_t) S3method(print,periods_t) S3method(print,pred_data_t) S3method(print,pred_meta_t) @@ -22,6 +23,7 @@ S3method(validate,intrv_masks_t) S3method(validate,intrv_meta_t) S3method(validate,lulc_data_t) S3method(validate,lulc_meta_t) +S3method(validate,neighbors_t) S3method(validate,periods_t) S3method(validate,pred_data_t) S3method(validate,pred_data_t_bool) @@ -37,6 +39,7 @@ export(as_intrv_masks_t) export(as_intrv_meta_t) export(as_lulc_data_t) export(as_lulc_meta_t) +export(as_neighbors_t) export(as_periods_t) export(as_pred_data_t) export(as_pred_meta_t) diff --git a/R/evoland_db.R b/R/evoland_db.R index f40bd82..628dfb2 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -8,6 +8,13 @@ #' #' Inherits from [parquet_duckdb] for generic database operations. #' +#' @seealso +#' Additional methods and active bindings are added to this class in separate files: +#' +#' - [evoland_db_tables] - Table active bindings (coords_t, lulc_data_t, etc.) +#' - [evoland_db_views] - View active bindings (lulc_meta_long_v, etc.) and methods +#' - [evoland_db_neighbors] - Neighbor analysis methods +#' #' @include parquet_duckdb.R #' @export @@ -43,8 +50,6 @@ evoland_db <- R6::R6Class( invisible(self) }, - ### Evoland-specific methods ---- - #' @description #' Fetch data from storage with evoland-specific view support #' @param table_name Character string. Name of the table to query. 
@@ -53,26 +58,21 @@ evoland_db <- R6::R6Class( #' #' @return A data.table fetch = function(table_name, where = NULL, limit = NULL) { - # Check if this is a view that needs special handling - view_result <- switch( - table_name, - lulc_meta_long_v = make_lulc_meta_long_v(self, private, where), - pred_sources_v = make_pred_sources_v(self, private, where), - transitions_v = make_transitions_v(self, private, where), - NULL - ) - - if (!is.null(view_result)) { - return(view_result) + # Check if this is a view (active binding) + if ( + table_name %in% + c("lulc_meta_long_v", "pred_sources_v", "transitions_v", "extent", "coords_minimal") + ) { + return(self[[table_name]]) } file_info <- private$get_file_path(table_name) if (!file_info$exists) { + # TODO make this an error return(private$get_empty_table(table_name)) } - # Call parent method super$fetch(table_name, where, limit) }, @@ -214,196 +214,7 @@ evoland_db <- R6::R6Class( ), ## Active Bindings ---- - active = list( - ### Bindings for tables ---- - #' @field coords_t A `coords_t` instance; see [create_coords_t()] for the type of - #' object to assign. Assigning is an upsert operation. - coords_t = function(x) { - create_active_binding(self, "coords_t", as_coords_t)(x) - }, - - #' @field periods_t A `periods_t` instance; see [create_periods_t()] for the type of - #' object to assign. Assigning is an upsert operation. - periods_t = function(x) { - create_active_binding(self, "periods_t", as_periods_t)(x) - }, - - #' @field lulc_meta_t A `lulc_meta_t` instance; see [create_lulc_meta_t()] for the type of - #' object to assign. Assigning is an upsert operation. - lulc_meta_t = function(x) { - create_active_binding(self, "lulc_meta_t", as_lulc_meta_t)(x) - }, - - #' @field lulc_meta_long_v Return a `lulc_meta_long_v` instance, i.e. unrolled `lulc_meta_t`. 
- lulc_meta_long_v = function() { - make_lulc_meta_long_v(self, private) - }, - - #' @field lulc_data_t A `lulc_data_t` instance; see [as_lulc_data_t()] for the type of - #' object to assign. Assigning is an upsert operation. - lulc_data_t = function(x) { - create_active_binding(self, "lulc_data_t", as_lulc_data_t)(x) - }, - - #' @field transitions_v Get the transitions from `lulc_data_t`. - transitions_v = function() { - make_transitions_v(self, private) - }, - - #' @field pred_data_t_float A `pred_data_t_float` instance; see - #' [create_pred_data_t()] for the type of object to assign. Assigning is an - #' upsert operation. - pred_data_t_float = function(x) { - create_active_binding( - self, - "pred_data_t_float", - as_pred_data_t, - type = "float" - )(x) - }, - - #' @field pred_data_t_int A `pred_data_t_int` instance; see - #' [create_pred_data_t()] for the type of object to assign. Assigning is an - #' upsert operation. - pred_data_t_int = function(x) { - create_active_binding( - self, - "pred_data_t_int", - as_pred_data_t, - type = "int" - )(x) - }, - - #' @field pred_data_t_bool A `pred_data_t_bool` instance; see - #' [create_pred_data_t()] for the type of object to assign. Assigning is an - #' upsert operation. - pred_data_t_bool = function(x) { - create_active_binding( - self, - "pred_data_t_bool", - as_pred_data_t, - type = "bool" - )(x) - }, - - ### Bindings for descriptions, views, etc. ---- - #' @field extent Return a terra SpatExtent based on coords_t - extent = function() { - make_extent_db(self, private) - }, - - #' @field coords_minimal data.table with only (id_coord, lon, lat) - coords_minimal = function() { - make_coords_minimal(self, private) - }, - - #' @field pred_meta_t A `pred_meta_t` instance; see [create_pred_meta_t()] for the type of - #' object to assign. Assigning is an upsert operation. 
- pred_meta_t = function(x) { - if (missing(x)) { - x <- self$fetch("pred_meta_t") - return(as_pred_meta_t(x)) - } - stopifnot(inherits(x, "pred_meta_t")) - self$commit_upsert( - x, - table_name = "pred_meta_t", - key_cols = "name", - autoincrement_cols = "id_pred" - ) - }, - - #' @field pred_sources_v Retrieve a table of distinct predictor urls and their - #' md5sum - pred_sources_v = function() { - make_pred_sources_v(self, private) - }, - - #' @field trans_meta_t A `trans_meta_t` instance; see [create_trans_meta_t()] for the type of - #' object to assign. Assigning is an upsert operation. - trans_meta_t = function(x) { - if (missing(x)) { - x <- self$fetch("trans_meta_t") - return(as_trans_meta_t(x)) - } - stopifnot(inherits(x, "trans_meta_t")) - self$commit_upsert( - x, - table_name = "trans_meta_t", - key_cols = c("id_lulc_anterior", "id_lulc_posterior"), - autoincrement_cols = "id_trans" - ) - }, - - #' @field trans_preds_t A `trans_preds_t` instance; see [create_trans_preds_t()] for the type of - #' object to assign. Assigning is an upsert operation. - trans_preds_t = function(x) { - create_active_binding(self, "trans_preds_t", as_trans_preds_t)(x) - }, - - #' @field intrv_meta_t A `intrv_meta_t` instance; see [create_intrv_meta_t()] for the type of - #' object to assign. Assigning is an upsert operation. - intrv_meta_t = function(x) { - if (missing(x)) { - return(as_intrv_meta_t( - convert_list_cols(self$fetch("intrv_meta_t"), "params", kv_df_to_list) - )) - } - stopifnot(inherits(x, "intrv_meta_t")) - - self$commit_upsert(x, "intrv_meta_t", key_cols = "id_intrv", map_cols = "params") - }, - - #' @field intrv_masks_t A `intrv_masks_t` instance; see [as_intrv_masks_t()] for the type of - #' object to assign. Assigning is an upsert operation. 
- intrv_masks_t = function(x) { - create_active_binding(self, "intrv_masks_t", as_intrv_masks_t)(x) - }, - - #' @field trans_models_t A `trans_models_t` instance; see [create_trans_models_t()] for the type - #' of object to assign. Assigning is an upsert operation. - trans_models_t = function(x) { - if (missing(x)) { - return(as_trans_models_t( - convert_list_cols( - self$fetch("trans_models_t"), - c("model_params", "goodness_of_fit"), - kv_df_to_list - ) - )) - } - stopifnot(inherits(x, "trans_models_t")) - - self$commit_upsert( - x, - "trans_models_t", - key_cols = c("id_trans", "id_period"), - map_cols = c("model_params", "goodness_of_fit") - ) - }, - - #' @field alloc_params_t A `alloc_params_t` instance; see [as_alloc_params_t()] for the type - #' of object to assign. Assigning is an upsert operation. - alloc_params_t = function(x) { - if (missing(x)) { - return(as_alloc_params_t( - convert_list_cols( - self$fetch("alloc_params_t"), - c("alloc_params", "goodness_of_fit"), - kv_df_to_list - ) - )) - } - stopifnot(inherits(x, "alloc_params_t")) - - self$commit_upsert( - x, - "alloc_params_t", - key_cols = c("id_trans", "id_period"), - map_cols = c("alloc_params", "goodness_of_fit") - ) - } - ), + active = list(), ## Private Methods ---- private = list( @@ -431,7 +242,8 @@ evoland_db <- R6::R6Class( intrv_meta_t = as_intrv_meta_t(), intrv_masks_t = as_intrv_masks_t(), trans_models_t = as_trans_models_t(), - alloc_params_t = as_alloc_params_t() + alloc_params_t = as_alloc_params_t(), + neighbors_t = as_neighbors_t() ) if (table_name %in% names(empty_tables)) { @@ -443,68 +255,3 @@ evoland_db <- R6::R6Class( } ) ) - -# Helper function to create simple Bindings with standard fetch/commit pattern -# param self The R6 instance (self). -# param table_name Character string. Name of the table. -# param as_fn Function to convert fetched data to the appropriate type. -# param ... Additional arguments passed to as_fn. 
-# return A function suitable for use as an active binding. -create_active_binding <- function(self, table_name, as_fn, ...) { - extra_args <- list(...) - function(x) { - if (missing(x)) { - x <- self$fetch(table_name) - return(do.call(as_fn, c(list(x), extra_args))) - } - stopifnot(inherits(x, table_name)) - self$commit_upsert(x, table_name) - } -} - - -# Helper functions for converting between list and data.frame formats for DuckDB MAPs -convert_list_cols <- function(x, cols, fn) { - for (col in cols) { - x[[col]] <- lapply(x[[col]], fn) - } - x -} - -list_to_kv_df <- function(x) { - if (is.null(x) || length(x) == 0) { - return(data.frame( - key = character(0), - value = character(0), - stringsAsFactors = FALSE - )) - } - data.frame( - key = names(x), - value = as.character(unlist(x)), - stringsAsFactors = FALSE - ) -} - -kv_df_to_list <- function(x) { - if (is.null(x) || nrow(x) == 0) { - return(NULL) - } - - out <- list() - - for (row in seq_len(nrow(x))) { - key <- x$key[row] - val <- x$value[row] - - # Try numeric conversion - num_val <- suppressWarnings(as.numeric(val)) - if (!is.na(num_val)) { - out[[key]] <- num_val - } else { - out[[key]] <- val - } - } - - out -} diff --git a/R/evoland_db_neighbors.R b/R/evoland_db_neighbors.R new file mode 100644 index 0000000..65f29f8 --- /dev/null +++ b/R/evoland_db_neighbors.R @@ -0,0 +1,173 @@ +#' Neighbor analysis methods for evoland_db +#' +#' @description +#' This file adds neighbor analysis methods to the `evoland_db` class using R6's `$set()` method. +#' These methods compute neighbor relationships and generate neighbor-based predictors. +#' +#' @section Methods Added: +#' +#' - `compute_neighbors(max_distance, distance_breaks, resolution, overwrite)` - +#' Computes neighbor relationships between coordinates. 
+#' - `max_distance`: Maximum distance for neighbors (default: 1000) +#' - `distance_breaks`: Vector of breaks for distance classes (default: c(0, 100, 500, 1000)) +#' - `resolution`: Grid resolution for distance calculations (default: 100) +#' - `overwrite`: Whether to overwrite existing neighbors_t (default: FALSE) +#' - `generate_neighbor_predictors()` - Generates predictor variables based on neighbor +#' land use counts by distance class. Requires neighbors_t with distance_class column +#' (compute_neighbors with distance_breaks). +#' +#' @name evoland_db_neighbors +#' @include evoland_db.R +NULL + +evoland_db$set( + "public", + "compute_neighbors", + function( + max_distance = 1000, + distance_breaks = c(0, 100, 500, 1000), + resolution = 100, + overwrite = FALSE + ) { + if (!overwrite && self$row_count("neighbors_t") > 0) { + message("neighbors_t already exists. Use overwrite = TRUE to recompute.") + return(invisible(self)) + } + + coords <- self$coords_t + + neighbors <- compute_neighbors( + coords, + max_distance = max_distance, + distance_breaks = distance_breaks, + resolution = resolution + ) + + self$commit_overwrite( + as_neighbors_t(neighbors), + table_name = "neighbors_t" + ) + + message(glue::glue("Computed {nrow(neighbors)} neighbor relationships")) + invisible(self) + } +) + +evoland_db$set("public", "generate_neighbor_predictors", function() { + if (self$row_count("neighbors_t") == 0) { + stop("No neighbor data found. Run $compute_neighbors() first.") + } + + if (self$row_count("lulc_meta_t") == 0) { + stop("No LULC metadata found. Add lulc_meta_t before generating neighbor predictors.") + } + + if (self$row_count("lulc_data_t") == 0) { + stop("No LULC data found. Add lulc_data_t before generating neighbor predictors.") + } + + neighbors_sample <- self$fetch("neighbors_t", limit = 1) + if (!"distance_class" %in% names(neighbors_sample)) { + stop( + "neighbors_t does not have distance_class column. 
Run $compute_neighbors() with distance_breaks parameter." + ) + } + + self$attach_table("neighbors_t") + self$attach_table("lulc_data_t") + self$attach_table("lulc_meta_t") + + has_pred_meta <- self$row_count("pred_meta_t") > 0 + if (has_pred_meta) { + self$attach_table("pred_meta_t") + } + + on.exit({ + self$detach_table("neighbors_t") + self$detach_table("lulc_data_t") + self$detach_table("lulc_meta_t") + if (has_pred_meta) self$detach_table("pred_meta_t") + }) + + max_id_query <- if (has_pred_meta) { + "(select coalesce(max(id_pred), 0) as max_pred from pred_meta_t)" + } else { + "(select 0 as max_pred)" + } + + self$execute(glue::glue( + r"{ + create temp table pred_meta_neighbors_t as + with + all_distance_classes as (select distinct distance_class from neighbors_t), + max_id as }", + max_id_query, + r"{ + select + row_number() over () + (select max_pred from max_id) as id_pred, + concat('id_lulc_', l.id_lulc, '_dist_', c.distance_class) as name, + concat('Count of ', l.pretty_name, ' within distance class ', c.distance_class) as pretty_name, + 'Number of neighbors by land use class and distance interval' as description, + 'land use coordinate data' as orig_format, + NULL as sources, + 'number of neighbors' as unit, + NULL as factor_levels, + c.distance_class, + l.id_lulc + from + lulc_meta_t l + cross join + all_distance_classes c + }" + )) + + pred_meta_neighbors <- self$get_query( + "select id_pred, name, pretty_name, description, orig_format, unit + from pred_meta_neighbors_t" + ) + + pred_meta_neighbors$sources <- lapply(1:nrow(pred_meta_neighbors), function(i) list()) + pred_meta_neighbors$factor_levels <- lapply(1:nrow(pred_meta_neighbors), function(i) list()) + + self$pred_meta_t <- as_pred_meta_t(pred_meta_neighbors) + + self$execute( + r"{ + create temp table pred_neighbors_t as + select + p.id_pred, + n.id_coord_origin as id_coord, + t.id_period, + count(n.id_coord_neighbor) as value + from + neighbors_t n, + lulc_data_t t, + pred_meta_neighbors_t p 
+ where + n.id_coord_neighbor = t.id_coord + and p.id_lulc = t.id_lulc + and p.distance_class = n.distance_class + group by + n.id_coord_origin, + n.distance_class, + t.id_period, + t.id_lulc, + p.id_pred + }" + ) + + pred_neighbors <- self$get_query("select * from pred_neighbors_t") + pred_neighbors$id_pred <- as.integer(pred_neighbors$id_pred) + pred_neighbors$id_coord <- as.integer(pred_neighbors$id_coord) + pred_neighbors$id_period <- as.integer(pred_neighbors$id_period) + pred_neighbors$value <- as.integer(pred_neighbors$value) + + self$pred_data_t_int <- as_pred_data_t(pred_neighbors, type = "int") + + message(glue::glue( + "Generated {nrow(pred_meta_neighbors)} neighbor predictor variables with ", + "{nrow(pred_neighbors)} data points" + )) + + invisible(self) +}) diff --git a/R/evoland_db_tables.R b/R/evoland_db_tables.R new file mode 100644 index 0000000..d477ab3 --- /dev/null +++ b/R/evoland_db_tables.R @@ -0,0 +1,206 @@ +#' Table active bindings for evoland_db +#' +#' @description +#' This file adds table active bindings to the `evoland_db` class using R6's `$set()` method. +#' These bindings provide read/write access to database tables with automatic validation. +#' +#' @section Active Bindings Added: +#' +#' - `coords_t` - Coordinates table. See [as_coords_t()] +#' - `periods_t` - Time periods table. See [as_periods_t()] +#' - `lulc_meta_t` - Land use/land cover metadata. See [as_lulc_meta_t()] +#' - `lulc_data_t` - Land use/land cover data. See [as_lulc_data_t()] +#' - `pred_data_t_float` - Float predictor data. See [as_pred_data_t()] +#' - `pred_data_t_int` - Integer predictor data. See [as_pred_data_t()] +#' - `pred_data_t_bool` - Boolean predictor data. See [as_pred_data_t()] +#' - `pred_meta_t` - Predictor metadata. See [as_pred_meta_t()] +#' - `trans_meta_t` - Transition metadata. See [as_trans_meta_t()] +#' - `trans_preds_t` - Transition-predictor relationships. See [as_trans_preds_t()] +#' - `intrv_meta_t` - Intervention metadata. 
See [as_intrv_meta_t()] +#' - `intrv_masks_t` - Intervention masks. See [as_intrv_masks_t()] +#' - `trans_models_t` - Transition models. See [as_trans_models_t()] +#' - `alloc_params_t` - Allocation parameters. See [as_alloc_params_t()] +#' - `neighbors_t` - Neighbor relationships. See [as_neighbors_t()] +#' +#' @name evoland_db_tables +#' @include evoland_db.R +NULL + +create_table_binding <- function( + self, + table_name, + as_fn, + key_cols = NULL, + autoincrement_cols = NULL, + map_cols = NULL, + ... +) { + extra_args <- list(...) + + function(x) { + if (missing(x)) { + fetched <- self$fetch(table_name) + + if (!is.null(map_cols) && nrow(fetched) > 0) { + fetched <- convert_list_cols(fetched, map_cols, kv_df_to_list) + } + + return(do.call(as_fn, c(list(fetched), extra_args))) + } + + stopifnot(inherits(x, table_name)) + + self$commit_upsert( + x, + table_name = table_name, + key_cols = key_cols, + autoincrement_cols = autoincrement_cols, + map_cols = map_cols + ) + } +} + +evoland_db$set("active", "coords_t", function(x) { + create_table_binding( + self, + "coords_t", + as_coords_t, + key_cols = "id_coord" + )(x) +}) + +evoland_db$set("active", "periods_t", function(x) { + create_table_binding( + self, + "periods_t", + as_periods_t, + key_cols = "id_period" + )(x) +}) + +evoland_db$set("active", "lulc_meta_t", function(x) { + create_table_binding( + self, + "lulc_meta_t", + as_lulc_meta_t, + key_cols = "id_lulc" + )(x) +}) + +evoland_db$set("active", "lulc_data_t", function(x) { + create_table_binding( + self, + "lulc_data_t", + as_lulc_data_t, + key_cols = c("id_coord", "id_period") + )( + x + ) +}) + +evoland_db$set("active", "pred_data_t_float", function(x) { + create_table_binding( + self, + "pred_data_t_float", + as_pred_data_t, + key_cols = c("id_pred", "id_coord", "id_period"), + type = "float" + )(x) +}) + +evoland_db$set("active", "pred_data_t_int", function(x) { + create_table_binding( + self, + "pred_data_t_int", + as_pred_data_t, + key_cols = 
c("id_pred", "id_coord", "id_period"), + type = "int" + )(x) +}) + +evoland_db$set("active", "pred_data_t_bool", function(x) { + create_table_binding( + self, + "pred_data_t_bool", + as_pred_data_t, + key_cols = c("id_pred", "id_coord", "id_period"), + type = "bool" + )(x) +}) + +evoland_db$set("active", "pred_meta_t", function(x) { + create_table_binding( + self, + "pred_meta_t", + as_pred_meta_t, + key_cols = "name", + autoincrement_cols = "id_pred" + )(x) +}) + +evoland_db$set("active", "trans_meta_t", function(x) { + create_table_binding( + self, + "trans_meta_t", + as_trans_meta_t, + key_cols = c("id_lulc_anterior", "id_lulc_posterior"), + autoincrement_cols = "id_trans" + )(x) +}) + +evoland_db$set("active", "trans_preds_t", function(x) { + create_table_binding( + self, + "trans_preds_t", + as_trans_preds_t, + key_cols = c("id_trans", "id_pred") + )(x) +}) + +evoland_db$set("active", "intrv_meta_t", function(x) { + create_table_binding( + self, + "intrv_meta_t", + as_intrv_meta_t, + key_cols = "id_intrv", + map_cols = "params" + )(x) +}) + +evoland_db$set("active", "intrv_masks_t", function(x) { + create_table_binding( + self, + "intrv_masks_t", + as_intrv_masks_t, + key_cols = c("id_coord", "id_intrv") + )(x) +}) + +evoland_db$set("active", "trans_models_t", function(x) { + create_table_binding( + self, + "trans_models_t", + as_trans_models_t, + key_cols = c("id_trans", "id_period"), + map_cols = c("model_params", "goodness_of_fit") + )(x) +}) + +evoland_db$set("active", "alloc_params_t", function(x) { + create_table_binding( + self, + "alloc_params_t", + as_alloc_params_t, + key_cols = c("id_trans", "id_period"), + map_cols = c("alloc_params", "goodness_of_fit") + )(x) +}) + +evoland_db$set("active", "neighbors_t", function(x) { + create_table_binding( + self, + "neighbors_t", + as_neighbors_t, + key_cols = c("id_coord_origin", "id_coord_neighbor") + )(x) +}) diff --git a/R/evoland_db_views.R b/R/evoland_db_views.R index b0533e9..6f41fae 100644 --- 
a/R/evoland_db_views.R +++ b/R/evoland_db_views.R @@ -1,35 +1,27 @@ #' Views on the evoland-plus data model #' #' @description -#' Functions to generate views on the database +#' This file adds view active bindings and methods to the `evoland_db` class using R6's `$set()` method. +#' These provide computed views on the database without storing additional data. +#' +#' @section Active Bindings Added: +#' +#' - `lulc_meta_long_v` - Unrolled LULC metadata with one row per source class +#' - `pred_sources_v` - Distinct predictor URLs and their MD5 checksums +#' - `transitions_v` - Land use transitions derived from lulc_data_t +#' - `extent` - Spatial extent of coords_t as terra::SpatExtent +#' - `coords_minimal` - Minimal coordinate representation (id_coord, lon, lat) +#' +#' @section Methods Added: +#' +#' - `trans_pred_data_v(id_trans)` - Returns wide table of transition results and predictor data for a specific transition. Used as input to covariance filtering. #' #' @name evoland_db_views +#' @include evoland_db.R NULL -#' @describeIn evoland_db_views Retrieve a table of distinct predictor urls and their -#' md5sum -make_pred_sources_v <- function(self, private, where = NULL) { - self$with_tables("pred_meta_t", function() { - where_clause <- if (!is.null(where)) paste("AND", where) else "" - - self$get_query(glue::glue( - r"{ - select distinct - unnest(sources).url as url, - unnest(sources).md5sum as md5sum - from pred_meta_t - where sources is not null {where_clause} - }" - )) - }) -} - - -#' @describeIn evoland_db_views Return a `lulc_meta_long_v` instance, i.e. unrolled `lulc_meta_t`. 
-make_lulc_meta_long_v <- function(self, private, where = NULL) { +evoland_db$set("active", "lulc_meta_long_v", function() { self$with_tables("lulc_meta_t", function() { - where_clause <- if (!is.null(where)) paste("WHERE", where) else "" - self$get_query(glue::glue( r"{ select @@ -38,53 +30,27 @@ make_lulc_meta_long_v <- function(self, private, where = NULL) { unnest(src_classes) as src_class from lulc_meta_t - {where_clause} }" )) }) -} - -#' @describeIn evoland_db_views Minimal coordinate representation (id_coord, lon, lat) -make_coords_minimal <- function(self, private, where = NULL) { - self$with_tables("coords_t", function() { - where_clause <- if (!is.null(where)) paste("WHERE", where) else "" +}) +evoland_db$set("active", "pred_sources_v", function() { + self$with_tables("pred_meta_t", function() { self$get_query(glue::glue( r"{ - select id_coord, lon, lat - from coords_t - {where_clause} + select distinct + unnest(sources).url as url, + unnest(sources).md5sum as md5sum + from pred_meta_t + where sources is not null }" - )) |> - cast_dt_col("id_coord", as.integer) |> - data.table::setkeyv("id_coord") - }) -} - -#' @describeIn evoland_db_views Returns the extent of the coords_t as terra::SpatExtent -make_extent_db <- function(self, private) { - self$with_tables("coords_t", function() { - self$get_query(glue::glue( - r"{ - SELECT - min(lon) as xmin, - max(lon) as xmax, - min(lat) as ymin, - max(lat) as ymax - FROM - coords_t - }" - )) |> - unlist() |> - terra::ext() + )) }) -} +}) -#' @describeIn evoland_db_views Returns transitions based on lulc_data_t -make_transitions_v <- function(self, private, where = NULL) { +evoland_db$set("active", "transitions_v", function() { self$with_tables("lulc_data_t", function() { - where_clause <- if (!is.null(where)) paste("WHERE", where) else "" - self$get_query(glue::glue( r"{ SELECT @@ -99,34 +65,56 @@ make_transitions_v <- function(self, private, where = NULL) { ON curr.id_coord = prev.id_coord AND curr.id_period = 
prev.id_period + 1 - {where_clause} }" )) }) -} +}) -#' @describeIn evoland_db_views Returns wide table of transition results and predictor data -#' for a specific transition. Used as input to covariance filtering. -#' -#' @param id_trans Integer, the transition ID to generate data for -#' @return data.table with columns: result (0/1), id_pred_1, id_pred_2, ..., id_pred_N -make_trans_pred_data_v <- function(self, private, id_trans) { +evoland_db$set("active", "extent", function() { + self$with_tables("coords_t", function() { + self$get_query(glue::glue( + r"{ + SELECT + min(lon) as xmin, + max(lon) as xmax, + min(lat) as ymin, + max(lat) as ymax + FROM + coords_t + }" + )) |> + unlist() |> + terra::ext() + }) +}) + +evoland_db$set("active", "coords_minimal", function() { + self$with_tables("coords_t", function() { + self$get_query(glue::glue( + r"{ + select id_coord, lon, lat + from coords_t + }" + )) |> + cast_dt_col("id_coord", as.integer) |> + data.table::setkeyv("id_coord") + }) +}) + +evoland_db$set("public", "trans_pred_data_v", function(id_trans) { stopifnot( "id_trans must be a single integer" = length(id_trans) == 1L && is.numeric(id_trans) ) - # Determine which predictor tables exist all_tables <- self$list_tables() pred_tables <- c("pred_data_t_float", "pred_data_t_int", "pred_data_t_bool") existing_pred_tables <- intersect(pred_tables, all_tables) - # Build list of tables to attach tables_to_attach <- c("trans_meta_t", "lulc_data_t", "pred_meta_t", existing_pred_tables) self$with_tables( tables_to_attach, function() { - # Get transition metadata trans_info <- self$get_query(glue::glue( "SELECT id_lulc_anterior, id_lulc_posterior FROM trans_meta_t @@ -140,10 +128,8 @@ make_trans_pred_data_v <- function(self, private, id_trans) { id_lulc_ant <- trans_info$id_lulc_anterior id_lulc_post <- trans_info$id_lulc_posterior - # Build CTEs dynamically based on which predictor tables exist ctes <- list() - # Always include trans_result ctes$trans_result <- 
glue::glue( "trans_result AS ( SELECT @@ -162,8 +148,6 @@ make_trans_pred_data_v <- function(self, private, id_trans) { )" ) - # Add predictor CTEs for each type that exists - # UNION period-specific data (period >= 1) with cross-joined static data (period 0) if ("pred_data_t_float" %in% existing_pred_tables) { ctes$pred_float_combined <- "pred_float_combined AS ( SELECT id_coord, id_period, id_pred, value @@ -215,7 +199,6 @@ make_trans_pred_data_v <- function(self, private, id_trans) { )" } - # Build SELECT columns select_cols <- "tr.result" if ("pred_data_t_float" %in% existing_pred_tables) { select_cols <- paste0(select_cols, ", pf.* EXCLUDE (id_coord, id_period)") @@ -227,7 +210,6 @@ make_trans_pred_data_v <- function(self, private, id_trans) { select_cols <- paste0(select_cols, ", pb.* EXCLUDE (id_coord, id_period)") } - # Build JOINs joins <- "" if ("pred_data_t_float" %in% existing_pred_tables) { joins <- paste0( @@ -248,7 +230,6 @@ make_trans_pred_data_v <- function(self, private, id_trans) { ) } - # Combine everything into final query cte_string <- paste(unlist(ctes), collapse = ",\n\n ") query <- glue::glue( @@ -261,7 +242,6 @@ make_trans_pred_data_v <- function(self, private, id_trans) { result <- self$get_query(query) - # Rename columns to id_pred_{N} format old_names <- names(result) new_names <- old_names for (i in seq_along(old_names)) { @@ -274,4 +254,4 @@ make_trans_pred_data_v <- function(self, private, id_trans) { result } ) -} +}) diff --git a/R/neighbors_t.R b/R/neighbors_t.R new file mode 100644 index 0000000..5d2e98f --- /dev/null +++ b/R/neighbors_t.R @@ -0,0 +1,81 @@ +#' Create Neighbors Table +#' +#' Creates a `neighbors_t` table and validates that it matches the schema. 
+#' +#' @name neighbors_t +#' +#' @param x An object that can be passed to [data.table::setDT()] +#' +#' @return A data.table of class "neighbors_t" with columns: +#' - `id_coord_origin`: Foreign key to coords_t (origin coordinate) +#' - `id_coord_neighbor`: Foreign key to coords_t (neighbor coordinate) +#' - `distance`: Numeric distance between coordinates +#' - `distance_class`: Optional factor representing distance intervals +#' @export +as_neighbors_t <- function(x) { + if (missing(x)) { + x <- data.table::data.table( + id_coord_origin = integer(0), + id_coord_neighbor = integer(0), + distance = numeric(0) + ) + } + cast_dt_col(x, "id_coord_origin", as.integer) + cast_dt_col(x, "id_coord_neighbor", as.integer) + if ("distance_class" %in% names(x)) { + cast_dt_col(x, "distance_class", as.factor) + } + new_evoland_table( + x, + "neighbors_t", + c("id_coord_origin", "id_coord_neighbor") + ) +} + +#' @describeIn neighbors_t Validate a neighbors_t object +#' @export +validate.neighbors_t <- function(x, ...) { + NextMethod() + + required_cols <- c("id_coord_origin", "id_coord_neighbor", "distance") + + data.table::setcolorder(x, required_cols) + + stopifnot( + is.integer(x[["id_coord_origin"]]), + is.integer(x[["id_coord_neighbor"]]), + is.numeric(x[["distance"]]) + ) + + return(x) +} + +#' @describeIn neighbors_t Print a neighbors_t object +#' @param nrow Maximum number of rows to print. See [data.table::print.data.table] +#' @param ... Passed to [data.table::print.data.table] +#' @export +print.neighbors_t <- function(x, nrow = 10, ...) 
{ + if (nrow(x) > 0) { + n_origins <- data.table::uniqueN(x[["id_coord_origin"]]) + n_neighbors <- data.table::uniqueN(x[["id_coord_neighbor"]]) + total_pairs <- nrow(x) + + extra_info <- "" + if ("distance_class" %in% names(x)) { + n_classes <- data.table::uniqueN(x[["distance_class"]]) + extra_info <- glue::glue(", Distance classes: {n_classes}") + } + + cat(glue::glue( + "Neighbors Table\n", + "Neighbor pairs: {total_pairs}\n", + "Origin coordinates: {n_origins}, ", + "Neighbor coordinates: {n_neighbors}", + "{extra_info}\n\n" + )) + } else { + cat("Neighbors Table (empty)\n") + } + NextMethod(nrow = nrow, ...) + invisible(x) +} diff --git a/R/parquet_duckdb.R b/R/parquet_duckdb.R index 0fa7d51..11aa465 100644 --- a/R/parquet_duckdb.R +++ b/R/parquet_duckdb.R @@ -574,3 +574,50 @@ parquet_duckdb <- R6::R6Class( } ) ) + + +# Helper functions for converting between list and data.frame formats for DuckDB MAPs +convert_list_cols <- function(x, cols, fn) { + for (col in cols) { + x[[col]] <- lapply(x[[col]], fn) + } + x +} + +list_to_kv_df <- function(x) { + if (is.null(x) || length(x) == 0) { + return(data.frame( + key = character(0), + value = character(0), + stringsAsFactors = FALSE + )) + } + data.frame( + key = names(x), + value = as.character(unlist(x)), + stringsAsFactors = FALSE + ) +} + +kv_df_to_list <- function(x) { + if (is.null(x) || nrow(x) == 0) { + return(NULL) + } + + out <- list() + + for (row in seq_len(nrow(x))) { + key <- x$key[row] + val <- x$value[row] + + # Try numeric conversion + num_val <- suppressWarnings(as.numeric(val)) + if (!is.na(num_val)) { + out[[key]] <- num_val + } else { + out[[key]] <- val + } + } + + out +} diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index 9e7c420..ae324a5 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -71,7 +71,7 @@ create_trans_preds_t <- function( # Get wide transition-predictor data tryCatch( { - trans_pred_data <- make_trans_pred_data_v(db, list(), id_trans) + trans_pred_data <- 
db$trans_pred_data_v(id_trans) # Check if we have any data if (nrow(trans_pred_data) == 0L) { diff --git a/R/util_terra.R b/R/util_terra.R index 00d8825..42bd2f5 100644 --- a/R/util_terra.R +++ b/R/util_terra.R @@ -89,6 +89,8 @@ extract_using_coords_t.SpatVector <- function(x, coords_t, na_omit = TRUE) { out } +# TODO move this to a neighbors_t.R file that includes a formal class definition +# for the return table, including validation, print, coercion #' @describeIn util_terra Compute neighboring coordinates within specified distances. In #' order to be computationally feasible, the coordinates' IDs are rasterized before #' their actual Euclidean distance is calculated. If coordinates are so close that they diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index 49cf8f8..3e005d7 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -11,73 +11,23 @@ to disk in parquet format for better compression. Inherits from \link{parquet_duckdb} for generic database operations. } -\section{Super class}{ -\code{\link[evoland:parquet_duckdb]{evoland::parquet_duckdb}} -> \code{evoland_db} +\seealso{ +Additional methods and active bindings are added to this class in separate files: +\itemize{ +\item \link{evoland_db_tables} - Table active bindings (coords_t, lulc_data_t, etc.) +\item \link{evoland_db_views} - View active bindings (lulc_meta_long_v, etc.) and methods +\item \link{evoland_db_neighbors} - Neighbor analysis methods } -\section{Active bindings}{ -\if{html}{\out{
}} -\describe{ -\item{\code{coords_t}}{A \code{coords_t} instance; see \code{\link[=create_coords_t]{create_coords_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{periods_t}}{A \code{periods_t} instance; see \code{\link[=create_periods_t]{create_periods_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{lulc_meta_t}}{A \code{lulc_meta_t} instance; see \code{\link[=create_lulc_meta_t]{create_lulc_meta_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{lulc_meta_long_v}}{Return a \code{lulc_meta_long_v} instance, i.e. unrolled \code{lulc_meta_t}.} - -\item{\code{lulc_data_t}}{A \code{lulc_data_t} instance; see \code{\link[=as_lulc_data_t]{as_lulc_data_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{transitions_v}}{Get the transitions from \code{lulc_data_t}.} - -\item{\code{pred_data_t_float}}{A \code{pred_data_t_float} instance; see -\code{\link[=create_pred_data_t]{create_pred_data_t()}} for the type of object to assign. Assigning is an -upsert operation.} - -\item{\code{pred_data_t_int}}{A \code{pred_data_t_int} instance; see -\code{\link[=create_pred_data_t]{create_pred_data_t()}} for the type of object to assign. Assigning is an -upsert operation.} - -\item{\code{pred_data_t_bool}}{A \code{pred_data_t_bool} instance; see -\code{\link[=create_pred_data_t]{create_pred_data_t()}} for the type of object to assign. Assigning is an -upsert operation.} - -\item{\code{extent}}{Return a terra SpatExtent based on coords_t} - -\item{\code{coords_minimal}}{data.table with only (id_coord, lon, lat)} - -\item{\code{pred_meta_t}}{A \code{pred_meta_t} instance; see \code{\link[=create_pred_meta_t]{create_pred_meta_t()}} for the type of -object to assign. 
Assigning is an upsert operation.} - -\item{\code{pred_sources_v}}{Retrieve a table of distinct predictor urls and their -md5sum} - -\item{\code{trans_meta_t}}{A \code{trans_meta_t} instance; see \code{\link[=create_trans_meta_t]{create_trans_meta_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{trans_preds_t}}{A \code{trans_preds_t} instance; see \code{\link[=create_trans_preds_t]{create_trans_preds_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{intrv_meta_t}}{A \code{intrv_meta_t} instance; see \code{\link[=create_intrv_meta_t]{create_intrv_meta_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{intrv_masks_t}}{A \code{intrv_masks_t} instance; see \code{\link[=as_intrv_masks_t]{as_intrv_masks_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{trans_models_t}}{A \code{trans_models_t} instance; see \code{\link[=create_trans_models_t]{create_trans_models_t()}} for the type -of object to assign. Assigning is an upsert operation.} - -\item{\code{alloc_params_t}}{A \code{alloc_params_t} instance; see \code{\link[=as_alloc_params_t]{as_alloc_params_t()}} for the type -of object to assign. Assigning is an upsert operation.} } -\if{html}{\out{
}} +\section{Super class}{ +\code{\link[evoland:parquet_duckdb]{evoland::parquet_duckdb}} -> \code{evoland_db} } \section{Methods}{ \subsection{Public methods}{ \itemize{ +\item \href{#method-evoland_db-compute_neighbors}{\code{evoland_db$compute_neighbors()}} +\item \href{#method-evoland_db-generate_neighbor_predictors}{\code{evoland_db$generate_neighbor_predictors()}} +\item \href{#method-evoland_db-trans_pred_data_v}{\code{evoland_db$trans_pred_data_v()}} \item \href{#method-evoland_db-new}{\code{evoland_db$new()}} \item \href{#method-evoland_db-fetch}{\code{evoland_db$fetch()}} \item \href{#method-evoland_db-set_report}{\code{evoland_db$set_report()}} @@ -106,6 +56,38 @@ of object to assign. Assigning is an upsert operation.} }} \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-compute_neighbors}{}}} +\subsection{Method \code{compute_neighbors()}}{ +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{evoland_db$compute_neighbors( + max_distance = 1000, + distance_breaks = c(0, 100, 500, 1000), + resolution = 100, + overwrite = FALSE +)}\if{html}{\out{
}} +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-generate_neighbor_predictors}{}}} +\subsection{Method \code{generate_neighbor_predictors()}}{ +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{evoland_db$generate_neighbor_predictors()}\if{html}{\out{
}} +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-trans_pred_data_v}{}}} +\subsection{Method \code{trans_pred_data_v()}}{ +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{evoland_db$trans_pred_data_v(id_trans)}\if{html}{\out{
}} +} + +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-new}{}}} \subsection{Method \code{new()}}{ diff --git a/man/evoland_db_neighbors.Rd b/man/evoland_db_neighbors.Rd new file mode 100644 index 0000000..09e3e93 --- /dev/null +++ b/man/evoland_db_neighbors.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/evoland_db_neighbors.R +\name{evoland_db_neighbors} +\alias{evoland_db_neighbors} +\title{Neighbor analysis methods for evoland_db} +\description{ +This file adds neighbor analysis methods to the \code{evoland_db} class using R6's \verb{$set()} method. +These methods compute neighbor relationships and generate neighbor-based predictors. +} +\section{Methods Added}{ + +\itemize{ +\item \code{compute_neighbors(max_distance, distance_breaks, resolution, overwrite)} - +Computes neighbor relationships between coordinates. +\itemize{ +\item \code{max_distance}: Maximum distance for neighbors (default: 1000) +\item \code{distance_breaks}: Vector of breaks for distance classes (default: c(0, 100, 500, 1000)) +\item \code{resolution}: Grid resolution for distance calculations (default: 100) +\item \code{overwrite}: Whether to overwrite existing neighbors_t (default: FALSE) +} +\item \code{generate_neighbor_predictors()} - Generates predictor variables based on neighbor +land use counts by distance class. Requires neighbors_t with distance_class column +(compute_neighbors with distance_breaks). +} +} + diff --git a/man/evoland_db_tables.Rd b/man/evoland_db_tables.Rd new file mode 100644 index 0000000..59cc49d --- /dev/null +++ b/man/evoland_db_tables.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/evoland_db_tables.R +\name{evoland_db_tables} +\alias{evoland_db_tables} +\title{Table active bindings for evoland_db} +\description{ +This file adds table active bindings to the \code{evoland_db} class using R6's \verb{$set()} method. 
+These bindings provide read/write access to database tables with automatic validation. +} +\section{Active Bindings Added}{ + +\itemize{ +\item \code{coords_t} - Coordinates table. See \code{\link[=as_coords_t]{as_coords_t()}} +\item \code{periods_t} - Time periods table. See \code{\link[=as_periods_t]{as_periods_t()}} +\item \code{lulc_meta_t} - Land use/land cover metadata. See \code{\link[=as_lulc_meta_t]{as_lulc_meta_t()}} +\item \code{lulc_data_t} - Land use/land cover data. See \code{\link[=as_lulc_data_t]{as_lulc_data_t()}} +\item \code{pred_data_t_float} - Float predictor data. See \code{\link[=as_pred_data_t]{as_pred_data_t()}} +\item \code{pred_data_t_int} - Integer predictor data. See \code{\link[=as_pred_data_t]{as_pred_data_t()}} +\item \code{pred_data_t_bool} - Boolean predictor data. See \code{\link[=as_pred_data_t]{as_pred_data_t()}} +\item \code{pred_meta_t} - Predictor metadata. See \code{\link[=as_pred_meta_t]{as_pred_meta_t()}} +\item \code{trans_meta_t} - Transition metadata. See \code{\link[=as_trans_meta_t]{as_trans_meta_t()}} +\item \code{trans_preds_t} - Transition-predictor relationships. See \code{\link[=as_trans_preds_t]{as_trans_preds_t()}} +\item \code{intrv_meta_t} - Intervention metadata. See \code{\link[=as_intrv_meta_t]{as_intrv_meta_t()}} +\item \code{intrv_masks_t} - Intervention masks. See \code{\link[=as_intrv_masks_t]{as_intrv_masks_t()}} +\item \code{trans_models_t} - Transition models. See \code{\link[=as_trans_models_t]{as_trans_models_t()}} +\item \code{alloc_params_t} - Allocation parameters. See \code{\link[=as_alloc_params_t]{as_alloc_params_t()}} +\item \code{neighbors_t} - Neighbor relationships. 
See \code{\link[=as_neighbors_t]{as_neighbors_t()}} +} +} + diff --git a/man/evoland_db_views.Rd b/man/evoland_db_views.Rd index a080c24..d0a7c93 100644 --- a/man/evoland_db_views.Rd +++ b/man/evoland_db_views.Rd @@ -2,49 +2,26 @@ % Please edit documentation in R/evoland_db_views.R \name{evoland_db_views} \alias{evoland_db_views} -\alias{make_pred_sources_v} -\alias{make_lulc_meta_long_v} -\alias{make_coords_minimal} -\alias{make_extent_db} -\alias{make_transitions_v} -\alias{make_trans_pred_data_v} \title{Views on the evoland-plus data model} -\usage{ -make_pred_sources_v(self, private, where = NULL) - -make_lulc_meta_long_v(self, private, where = NULL) - -make_coords_minimal(self, private, where = NULL) - -make_extent_db(self, private) - -make_transitions_v(self, private, where = NULL) - -make_trans_pred_data_v(self, private, id_trans) -} -\arguments{ -\item{id_trans}{Integer, the transition ID to generate data for} -} -\value{ -data.table with columns: result (0/1), id_pred_1, id_pred_2, ..., id_pred_N -} \description{ -Functions to generate views on the database +This file adds view active bindings and methods to the \code{evoland_db} class using R6's \verb{$set()} method. +These provide computed views on the database without storing additional data. } -\section{Functions}{ -\itemize{ -\item \code{make_pred_sources_v()}: Retrieve a table of distinct predictor urls and their -md5sum +\section{Active Bindings Added}{ -\item \code{make_lulc_meta_long_v()}: Return a \code{lulc_meta_long_v} instance, i.e. unrolled \code{lulc_meta_t}. 
- -\item \code{make_coords_minimal()}: Minimal coordinate representation (id_coord, lon, lat) - -\item \code{make_extent_db()}: Returns the extent of the coords_t as terra::SpatExtent +\itemize{ +\item \code{lulc_meta_long_v} - Unrolled LULC metadata with one row per source class +\item \code{pred_sources_v} - Distinct predictor URLs and their MD5 checksums +\item \code{transitions_v} - Land use transitions derived from lulc_data_t +\item \code{extent} - Spatial extent of coords_t as terra::SpatExtent +\item \code{coords_minimal} - Minimal coordinate representation (id_coord, lon, lat) +} +} -\item \code{make_transitions_v()}: Returns transitions based on lulc_data_t +\section{Methods Added}{ -\item \code{make_trans_pred_data_v()}: Returns wide table of transition results and predictor data -for a specific transition. Used as input to covariance filtering. +\itemize{ +\item \code{trans_pred_data_v(id_trans)} - Returns wide table of transition results and predictor data for a specific transition. Used as input to covariance filtering. +} +} -}} diff --git a/man/neighbors_t.Rd b/man/neighbors_t.Rd new file mode 100644 index 0000000..bc95b6a --- /dev/null +++ b/man/neighbors_t.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/neighbors_t.R +\name{neighbors_t} +\alias{neighbors_t} +\alias{as_neighbors_t} +\alias{validate.neighbors_t} +\alias{print.neighbors_t} +\title{Create Neighbors Table} +\usage{ +as_neighbors_t(x) + +\method{validate}{neighbors_t}(x, ...) + +\method{print}{neighbors_t}(x, nrow = 10, ...) +} +\arguments{ +\item{x}{An object that can be passed to \code{\link[data.table:setDT]{data.table::setDT()}}} + +\item{...}{Passed to \link[data.table:print.data.table]{data.table::print.data.table}} + +\item{nrow}{Maximum number of rows to print. 
See \link[data.table:print.data.table]{data.table::print.data.table}} +} +\value{ +A data.table of class "neighbors_t" with columns: +\itemize{ +\item \code{id_coord_origin}: Foreign key to coords_t (origin coordinate) +\item \code{id_coord_neighbor}: Foreign key to coords_t (neighbor coordinate) +\item \code{distance}: Numeric distance between coordinates +\item \code{distance_class}: Optional factor representing distance intervals +} +} +\description{ +Creates a \code{neighbors_t} table and validates that it matches the schema. +} +\section{Methods (by generic)}{ +\itemize{ +\item \code{validate(neighbors_t)}: Validate a neighbors_t object + +\item \code{print(neighbors_t)}: Print a neighbors_t object + +}} diff --git a/man/trans_preds_t.Rd b/man/trans_preds_t.Rd index 545381f..a555322 100644 --- a/man/trans_preds_t.Rd +++ b/man/trans_preds_t.Rd @@ -9,16 +9,28 @@ \usage{ as_trans_preds_t(x) -create_trans_preds_t() +create_trans_preds_t( + db, + corcut = 0.7, + rank_fun = rank_poly_glm, + weights = NULL, + ... +) \method{print}{trans_preds_t}(x, nrow = 10, ...) } \arguments{ -\item{nrow}{see \link[data.table:print.data.table]{data.table::print.data.table}} +\item{db}{An \link{evoland_db} instance with populated tables} + +\item{corcut}{Numeric threshold (0-1) for correlation filtering passed to \code{\link[=covariance_filter]{covariance_filter()}}} + +\item{rank_fun}{Optional ranking function passed to \code{\link[=covariance_filter]{covariance_filter()}}} + +\item{weights}{Optional weights passed to \code{\link[=covariance_filter]{covariance_filter()}}} \item{...}{passed to \link[data.table:print.data.table]{data.table::print.data.table}} -\item{db}{An \link{evoland_db} instance with populated trans_meta_t and pred_meta_t tables} +\item{nrow}{see \link[data.table:print.data.table]{data.table::print.data.table}} } \value{ A data.table of class "trans_preds_t" with columns: @@ -40,6 +52,7 @@ modelling each transition type. 
\section{Functions}{ \itemize{ \item \code{create_trans_preds_t()}: Create a transition-predictor relation, i.e. records the -result of a predictor selection step. +result of a predictor selection step. Runs covariance filtering for each viable +transition and stores the selected predictors. }} From e83d1199a7ec232e00288e48a37da618c71c4c98 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Fri, 28 Nov 2025 15:35:35 +0100 Subject: [PATCH 09/27] cast_dt_col - check before coercing --- R/coords_t.R | 4 ++-- R/evoland_db_views.R | 2 +- R/intrv_masks_t.R | 4 ++-- R/lulc_data_t.R | 2 +- R/neighbors_t.R | 22 ++++++++++++---------- R/pred_data_t.R | 14 +++----------- R/trans_preds_t.R | 4 ++-- R/util.R | 22 ++++++++++++++++++++-- man/util.Rd | 2 +- 9 files changed, 44 insertions(+), 32 deletions(-) diff --git a/R/coords_t.R b/R/coords_t.R index e4c77f2..e99609a 100644 --- a/R/coords_t.R +++ b/R/coords_t.R @@ -34,9 +34,9 @@ as_coords_t <- function(x) { geom_polygon = list() ) } - cast_dt_col(x, "id_coord", as.integer) + cast_dt_col(x, "id_coord", "int") if (!is.null(x[["region"]])) { - cast_dt_col(x, "region", as.factor) + cast_dt_col(x, "region", "factor") } new_evoland_table( x, diff --git a/R/evoland_db_views.R b/R/evoland_db_views.R index 6f41fae..3201935 100644 --- a/R/evoland_db_views.R +++ b/R/evoland_db_views.R @@ -96,7 +96,7 @@ evoland_db$set("active", "coords_minimal", function() { from coords_t }" )) |> - cast_dt_col("id_coord", as.integer) |> + cast_dt_col("id_coord", "int") |> data.table::setkeyv("id_coord") }) }) diff --git a/R/intrv_masks_t.R b/R/intrv_masks_t.R index 501daf7..8aee5f3 100644 --- a/R/intrv_masks_t.R +++ b/R/intrv_masks_t.R @@ -18,8 +18,8 @@ as_intrv_masks_t <- function(x) { id_coord = integer(0) ) } - cast_dt_col(x, "id_coord", as.integer) - cast_dt_col(x, "id_coord", as.integer) + cast_dt_col(x, "id_coord", "int") + cast_dt_col(x, "id_coord", "int") new_evoland_table( x, "intrv_masks_t", diff --git 
a/R/lulc_data_t.R b/R/lulc_data_t.R index 91c4bcc..817df05 100644 --- a/R/lulc_data_t.R +++ b/R/lulc_data_t.R @@ -19,7 +19,7 @@ as_lulc_data_t <- function(x) { id_period = integer(0) ) } - cast_dt_col(x, "id_coord", as.integer) + cast_dt_col(x, "id_coord", "int") new_evoland_table( x, "lulc_data_t", diff --git a/R/neighbors_t.R b/R/neighbors_t.R index 5d2e98f..d4b142e 100644 --- a/R/neighbors_t.R +++ b/R/neighbors_t.R @@ -20,10 +20,10 @@ as_neighbors_t <- function(x) { distance = numeric(0) ) } - cast_dt_col(x, "id_coord_origin", as.integer) - cast_dt_col(x, "id_coord_neighbor", as.integer) + cast_dt_col(x, "id_coord_origin", "int") + cast_dt_col(x, "id_coord_neighbor", "int") if ("distance_class" %in% names(x)) { - cast_dt_col(x, "distance_class", as.factor) + cast_dt_col(x, "distance_class", "factor") } new_evoland_table( x, @@ -56,21 +56,23 @@ validate.neighbors_t <- function(x, ...) { #' @export print.neighbors_t <- function(x, nrow = 10, ...) { if (nrow(x) > 0) { - n_origins <- data.table::uniqueN(x[["id_coord_origin"]]) - n_neighbors <- data.table::uniqueN(x[["id_coord_neighbor"]]) - total_pairs <- nrow(x) + total_pairs <- format( + nrow(x), + big.mark = "_", + scientific = FALSE + ) extra_info <- "" if ("distance_class" %in% names(x)) { - n_classes <- data.table::uniqueN(x[["distance_class"]]) - extra_info <- glue::glue(", Distance classes: {n_classes}") + extra_info <- paste( + "Distance classes:", + paste(levels(x[["distance_class"]]), collapse = ", ") + ) } cat(glue::glue( "Neighbors Table\n", "Neighbor pairs: {total_pairs}\n", - "Origin coordinates: {n_origins}, ", - "Neighbor coordinates: {n_neighbors}", "{extra_info}\n\n" )) } else { diff --git a/R/pred_data_t.R b/R/pred_data_t.R index d3c7972..45ee49e 100644 --- a/R/pred_data_t.R +++ b/R/pred_data_t.R @@ -18,26 +18,18 @@ as_pred_data_t <- function(x, type) { stopifnot(type %in% c("float", "int", "bool")) - # Create empty table with proper value type - coercion_fn <- switch( - type, - float = 
as.numeric, - int = as.integer, - bool = as.logical - ) - if (missing(x)) { x <- data.table::data.table( id_pred = integer(0), id_coord = integer(0), id_period = integer(0), - value = coercion_fn(integer(0)) + value = integer(0) ) } data.table::setDT(x, key = c("id_pred", "id_coord", "id_period")) |> - cast_dt_col("value", coercion_fn) |> - cast_dt_col("id_coord", as.integer) + cast_dt_col("value", type) |> + cast_dt_col("id_coord", "int") class_name <- paste0("pred_data_t_", type) new_evoland_table(x, c(class_name, "pred_data_t")) diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index ae324a5..f6ec30f 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -19,8 +19,8 @@ as_trans_preds_t <- function(x) { id_trans = integer(0) ) } - cast_dt_col(x, "id_pred", as.integer) - cast_dt_col(x, "id_trans", as.integer) + cast_dt_col(x, "id_pred", "int") + cast_dt_col(x, "id_trans", "int") new_evoland_table( x, "trans_preds_t", diff --git a/R/util.R b/R/util.R index ceebbcb..7b8e1c0 100644 --- a/R/util.R +++ b/R/util.R @@ -143,11 +143,29 @@ print_rowwise_yaml <- function(df) { } #' @describeIn util Cast a data.table column; invisibly returns x -cast_dt_col <- function(x, colname, castfun) { +cast_dt_col <- function(x, colname, type) { + predicate_fn <- switch( + type, + float = is.numeric, + int = is.integer, + bool = is.logical, + factor = is.factor + ) + if (predicate_fn(x[[colname]])) { + return(invisible(x)) + } + + coercion_fn <- switch( + type, + float = as.numeric, + int = as.integer, + bool = as.logical, + factor = as.factor + ) data.table::set( x = x, j = colname, - value = castfun(x[[colname]]) + value = coercion_fn(x[[colname]]) ) invisible(x) } diff --git a/man/util.Rd b/man/util.Rd index 575d7f8..a83b6fc 100644 --- a/man/util.Rd +++ b/man/util.Rd @@ -26,7 +26,7 @@ ensure_dir(dir) print_rowwise_yaml(df) -cast_dt_col(x, colname, castfun) +cast_dt_col(x, colname, type) } \arguments{ \item{x}{Left-hand side value} From 46ee96aa39c7847721312cb20b56041f14aa768f 
Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Fri, 28 Nov 2025 16:11:45 +0100 Subject: [PATCH 10/27] error if table is missing --- R/evoland_db.R | 47 ++---------------------------- R/trans_preds_t.R | 3 +- inst/tinytest/test_evoland_db.R | 21 +------------ inst/tinytest/test_trans_preds_t.R | 9 ++++-- 4 files changed, 11 insertions(+), 69 deletions(-) diff --git a/R/evoland_db.R b/R/evoland_db.R index 628dfb2..99d498d 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -60,6 +60,8 @@ evoland_db <- R6::R6Class( fetch = function(table_name, where = NULL, limit = NULL) { # Check if this is a view (active binding) if ( + # TODO these should probably not be active bindings, but instead methods with + # predefined query parameters table_name %in% c("lulc_meta_long_v", "pred_sources_v", "transitions_v", "extent", "coords_minimal") ) { @@ -69,8 +71,7 @@ evoland_db <- R6::R6Class( file_info <- private$get_file_path(table_name) if (!file_info$exists) { - # TODO make this an error - return(private$get_empty_table(table_name)) + stop("Table `", table_name, "` does not exist") } super$fetch(table_name, where, limit) @@ -211,47 +212,5 @@ evoland_db <- R6::R6Class( invisible(self) } - ), - - ## Active Bindings ---- - active = list(), - - ## Private Methods ---- - private = list( - # Get empty table with proper structure (evoland-specific) - # - # param table_name Character string table name - # return Empty data.table with correct columns - get_empty_table = function(table_name) { - # Define empty table structures - empty_tables <- list( - reporting_t = suppressWarnings(data.table::data.table( - key = character(0), - value = character(0) - )), - coords_t = as_coords_t(), - periods_t = as_periods_t(), - lulc_meta_t = as_lulc_meta_t(), - lulc_data_t = as_lulc_data_t(), - pred_meta_t = as_pred_meta_t(), - pred_data_t_float = as_pred_data_t(type = "float"), - pred_data_t_int = as_pred_data_t(type = "int"), - pred_data_t_bool = 
as_pred_data_t(type = "bool"), - trans_meta_t = as_trans_meta_t(), - trans_preds_t = as_trans_preds_t(), - intrv_meta_t = as_intrv_meta_t(), - intrv_masks_t = as_intrv_masks_t(), - trans_models_t = as_trans_models_t(), - alloc_params_t = as_alloc_params_t(), - neighbors_t = as_neighbors_t() - ) - - if (table_name %in% names(empty_tables)) { - return(empty_tables[[table_name]]) - } - - # Default: return empty data.table - data.table::data.table() - } ) ) diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index f6ec30f..dc9e10c 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -51,8 +51,7 @@ create_trans_preds_t <- function( viable_trans <- db$trans_meta_t[is_viable == TRUE] pred_meta <- db$pred_meta_t stopifnot( - "No viable transitions found in trans_meta_t" = nrow(viable_trans) > 0L, - "No predictors found in pred_meta_t" = nrow(pred_meta) > 0L + "No viable transitions found in trans_meta_t" = nrow(viable_trans) > 0L ) results_list <- list() diff --git a/inst/tinytest/test_evoland_db.R b/inst/tinytest/test_evoland_db.R index 4a73bc6..c48f2df 100644 --- a/inst/tinytest/test_evoland_db.R +++ b/inst/tinytest/test_evoland_db.R @@ -38,26 +38,7 @@ expect_equal( sort(reporting2$key) ) -# Test 4: Domain-specific empty table structures via active bindings -empty_tables <- c( - "alloc_params_t", - "coords_t", - "intrv_masks_t", - "intrv_meta_t", - "lulc_data_t", - "lulc_meta_t", - "periods_t", - "pred_data_t_bool", - "pred_data_t_float", - "pred_data_t_int", - "pred_meta_t", - "trans_meta_t", - "trans_models_t", - "trans_preds_t" -) -for (table in empty_tables) { - expect_equal(nrow(db[[table]]), 0L) -} +# Test 4 removed: Fetching a missing table now produces an error # Test 5: Create synthetic evoland-specific test data coords_t <- create_coords_t_square( diff --git a/inst/tinytest/test_trans_preds_t.R b/inst/tinytest/test_trans_preds_t.R index 3c5feda..737dc6a 100644 --- a/inst/tinytest/test_trans_preds_t.R +++ b/inst/tinytest/test_trans_preds_t.R @@ -184,7 +184,7 
@@ expect_error( expect_error( create_trans_preds_t(db = db_empty), - "No viable transitions" + "Table `trans_meta_t` does not exist" ) # Test with no predictors @@ -195,10 +195,13 @@ db_no_pred$coords_t <- db_tps$coords_t db_no_pred$periods_t <- db_tps$periods_t db_no_pred$lulc_meta_t <- db_tps$lulc_meta_t db_no_pred$lulc_data_t <- db_tps$lulc_data_t -db_no_pred$trans_meta_t <- db_tps$trans_meta_t +expect_warning( + db_no_pred$trans_meta_t <- db_tps$trans_meta_t, + "Overriding existing IDs" +) expect_error( create_trans_preds_t(db = db_no_pred), - "No predictors found" + "Table `pred_meta_t` does not exist" ) # Test print method From 176da53a59c8db38856c887441dc0d088ad6375f Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Fri, 28 Nov 2025 17:10:08 +0100 Subject: [PATCH 11/27] weights makes no sense at this level --- R/trans_preds_t.R | 2 -- man/trans_preds_t.Rd | 10 +--------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index dc9e10c..4e2dd87 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -34,14 +34,12 @@ as_trans_preds_t <- function(x) { #' @param db An [evoland_db] instance with populated tables #' @param corcut Numeric threshold (0-1) for correlation filtering passed to [covariance_filter()] #' @param rank_fun Optional ranking function passed to [covariance_filter()] -#' @param weights Optional weights passed to [covariance_filter()] #' @param ... Additional arguments passed to rank_fun via [covariance_filter()] #' @export create_trans_preds_t <- function( db, corcut = 0.7, rank_fun = rank_poly_glm, - weights = NULL, ... ) { stopifnot( diff --git a/man/trans_preds_t.Rd b/man/trans_preds_t.Rd index a555322..e2cbebf 100644 --- a/man/trans_preds_t.Rd +++ b/man/trans_preds_t.Rd @@ -9,13 +9,7 @@ \usage{ as_trans_preds_t(x) -create_trans_preds_t( - db, - corcut = 0.7, - rank_fun = rank_poly_glm, - weights = NULL, - ... 
-) +create_trans_preds_t(db, corcut = 0.7, rank_fun = rank_poly_glm, ...) \method{print}{trans_preds_t}(x, nrow = 10, ...) } @@ -26,8 +20,6 @@ create_trans_preds_t( \item{rank_fun}{Optional ranking function passed to \code{\link[=covariance_filter]{covariance_filter()}}} -\item{weights}{Optional weights passed to \code{\link[=covariance_filter]{covariance_filter()}}} - \item{...}{passed to \link[data.table:print.data.table]{data.table::print.data.table}} \item{nrow}{see \link[data.table:print.data.table]{data.table::print.data.table}} From e8f2ef7438032c38dac12df1b52c402ed0d1222e Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Fri, 28 Nov 2025 17:10:58 +0100 Subject: [PATCH 12/27] compute_neighbors -> create_neighbors_t --- NAMESPACE | 2 +- R/evoland_db_neighbors.R | 12 ++--- R/neighbors_t.R | 66 ++++++++++++++++++++++++++++ R/util_terra.R | 78 --------------------------------- inst/tinytest/test_util_terra.R | 20 ++++----- man/evoland_db.Rd | 10 ++--- man/evoland_db_neighbors.Rd | 4 +- man/neighbors_t.Rd | 34 ++++++++++++++ man/util_terra.Rd | 31 ------------- 9 files changed, 124 insertions(+), 133 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index f111398..35a656a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,12 +46,12 @@ export(as_pred_meta_t) export(as_trans_meta_t) export(as_trans_models_t) export(as_trans_preds_t) -export(compute_neighbors) export(covariance_filter) export(create_coords_t_square) export(create_intrv_meta_t) export(create_intrv_meta_t_row) export(create_lulc_meta_t) +export(create_neighbors_t) export(create_periods_t) export(create_pred_meta_t) export(create_trans_meta_t) diff --git a/R/evoland_db_neighbors.R b/R/evoland_db_neighbors.R index 65f29f8..e4d02cc 100644 --- a/R/evoland_db_neighbors.R +++ b/R/evoland_db_neighbors.R @@ -6,7 +6,7 @@ #' #' @section Methods Added: #' -#' - `compute_neighbors(max_distance, distance_breaks, resolution, overwrite)` - +#' - `create_neighbors_t(max_distance, 
distance_breaks, resolution, overwrite)` - #' Computes neighbor relationships between coordinates. #' - `max_distance`: Maximum distance for neighbors (default: 1000) #' - `distance_breaks`: Vector of breaks for distance classes (default: c(0, 100, 500, 1000)) @@ -14,7 +14,7 @@ #' - `overwrite`: Whether to overwrite existing neighbors_t (default: FALSE) #' - `generate_neighbor_predictors()` - Generates predictor variables based on neighbor #' land use counts by distance class. Requires neighbors_t with distance_class column -#' (compute_neighbors with distance_breaks). +#' (create_neighbors_t with distance_breaks). #' #' @name evoland_db_neighbors #' @include evoland_db.R @@ -22,7 +22,7 @@ NULL evoland_db$set( "public", - "compute_neighbors", + "create_neighbors_t", function( max_distance = 1000, distance_breaks = c(0, 100, 500, 1000), @@ -36,7 +36,7 @@ evoland_db$set( coords <- self$coords_t - neighbors <- compute_neighbors( + neighbors <- create_neighbors_t( coords, max_distance = max_distance, distance_breaks = distance_breaks, @@ -55,7 +55,7 @@ evoland_db$set( evoland_db$set("public", "generate_neighbor_predictors", function() { if (self$row_count("neighbors_t") == 0) { - stop("No neighbor data found. Run $compute_neighbors() first.") + stop("No neighbor data found. Run $create_neighbors_t() first.") } if (self$row_count("lulc_meta_t") == 0) { @@ -69,7 +69,7 @@ evoland_db$set("public", "generate_neighbor_predictors", function() { neighbors_sample <- self$fetch("neighbors_t", limit = 1) if (!"distance_class" %in% names(neighbors_sample)) { stop( - "neighbors_t does not have distance_class column. Run $compute_neighbors() with distance_breaks parameter." + "neighbors_t does not have distance_class column. Run $create_neighbors_t() with distance_breaks parameter." 
) } diff --git a/R/neighbors_t.R b/R/neighbors_t.R index d4b142e..f4d09fd 100644 --- a/R/neighbors_t.R +++ b/R/neighbors_t.R @@ -32,6 +32,72 @@ as_neighbors_t <- function(x) { ) } +#' @describeIn neighbors_t Compute neighboring coordinates within specified distances. In +#' order to be computationally feasible, the coordinates' IDs are rasterized before +#' their actual Euclidean distance is calculated. If coordinates are so close that they +#' get rasterized to the same cell, the first one is used and a warning is emitted. If +#' this happens, try again using a lower resolution. +#' @param max_distance Maximum distance to search for neighbors (in same units as +#' coordinates and resolution) +#' @param distance_breaks Optional numeric vector defining distance class boundaries. +#' If NULL, no distance classification is performed. +#' If provided, must have at least 2 elements defining interval breaks. +#' @param resolution Grid cell size for rasterization (default: 100.0, in same units as coordinates) +#' @return A data.table with columns: +#' - id_coord_origin: ID of the origin coordinate +#' - id_coord_neighbor: ID of the neighboring coordinate +#' - distance: Distance between origin and neighbor +#' - distance_class: Factor indicating distance class (if distance_breaks provided) +#' @export +create_neighbors_t <- function( + coords_t, + max_distance, + distance_breaks = NULL, + resolution = 100.0 +) { + # Validate inputs + if (!inherits(coords_t, "coords_t")) { + stop("coords_t must be a coords_t object") + } + + if (!is.numeric(max_distance) || length(max_distance) != 1 || max_distance <= 0) { + stop("max_distance must be a positive scalar numeric") + } + + if (!is.null(distance_breaks)) { + if (!is.numeric(distance_breaks) || length(distance_breaks) < 2) { + stop("distance_breaks must be NULL or a numeric vector with at least 2 elements") + } + } + + # Call C++ function + dt <- distance_neighbors_cpp( + coords_t = coords_t, + max_distance = max_distance, + 
resolution = resolution + ) + + data.table::setalloccol(dt) + + # Rename distance_approx to distance + data.table::setnames(dt, "distance_approx", "distance") + + # Add distance class if breaks provided + if (!is.null(distance_breaks)) { + dt[, + distance_class := cut( + distance, + breaks = distance_breaks, + right = FALSE, + include.lowest = TRUE + ) + ] + } + + as_neighbors_t(dt) +} + + #' @describeIn neighbors_t Validate a neighbors_t object #' @export validate.neighbors_t <- function(x, ...) { diff --git a/R/util_terra.R b/R/util_terra.R index 42bd2f5..463c772 100644 --- a/R/util_terra.R +++ b/R/util_terra.R @@ -91,81 +91,3 @@ extract_using_coords_t.SpatVector <- function(x, coords_t, na_omit = TRUE) { # TODO move this to a neighbors_t.R file that includes a formal class definition # for the return table, including validation, print, coercion -#' @describeIn util_terra Compute neighboring coordinates within specified distances. In -#' order to be computationally feasible, the coordinates' IDs are rasterized before -#' their actual Euclidean distance is calculated. If coordinates are so close that they -#' get rasterized to the same cell, the first one is used and a warning is emitted. If -#' this happens, try again using a lower resolution. -#' @param max_distance Maximum distance to search for neighbors (in same units as -#' coordinates and resolution) -#' @param distance_breaks Optional numeric vector defining distance class boundaries. -#' If NULL, no distance classification is performed. -#' If provided, must have at least 2 elements defining interval breaks. 
-#' @param resolution Grid cell size for rasterization (default: 100.0, in same units as coordinates) -#' @return A data.table with columns: -#' - id_coord_origin: ID of the origin coordinate -#' - id_coord_neighbor: ID of the neighboring coordinate -#' - distance: Distance between origin and neighbor -#' - distance_class: Factor indicating distance class (if distance_breaks provided) -#' @export -compute_neighbors <- function( - coords_t, - max_distance, - distance_breaks = NULL, - resolution = 100.0 -) { - # Validate inputs - if (!inherits(coords_t, "coords_t")) { - stop("coords_t must be a coords_t object") - } - - if (!is.numeric(max_distance) || length(max_distance) != 1 || max_distance <= 0) { - stop("max_distance must be a positive scalar numeric") - } - - if (!is.null(distance_breaks)) { - if (!is.numeric(distance_breaks) || length(distance_breaks) < 2) { - stop("distance_breaks must be NULL or a numeric vector with at least 2 elements") - } - } - - # Call C++ function - dt <- distance_neighbors_cpp( - coords_t = coords_t, - max_distance = max_distance, - resolution = resolution - ) - - # Set data.table allocation - dt <- data.table::setalloccol(dt) - - # Rename distance_approx to distance - data.table::setnames(dt, "distance_approx", "distance") - - # Add distance class if breaks provided - if (!is.null(distance_breaks)) { - dt[, - distance_class := cut( - distance, - breaks = distance_breaks, - right = FALSE, - include.lowest = TRUE - ) - ] - } - - # Reorder columns - if (!is.null(distance_breaks)) { - data.table::setcolorder( - dt, - c("id_coord_origin", "id_coord_neighbor", "distance_class", "distance") - ) - } else { - data.table::setcolorder( - dt, - c("id_coord_origin", "id_coord_neighbor", "distance") - ) - } - - return(dt) -} diff --git a/inst/tinytest/test_util_terra.R b/inst/tinytest/test_util_terra.R index b3089d3..efb0dac 100644 --- a/inst/tinytest/test_util_terra.R +++ b/inst/tinytest/test_util_terra.R @@ -140,7 +140,7 @@ 
expect_true(!anyNA(vector_na_omit_true$value)) vector_na_omit_false <- extract_using_coords_t(vect_with_na, coords_t, na_omit = FALSE) expect_true(anyNA(vector_na_omit_false$value)) -# Test compute_neighbors +# Test create_neighbors_t # Create a simple test coords_t with known distances test_coords <- data.table::data.table( id_coord = 1L:5L, @@ -152,7 +152,7 @@ test_coords <- data.table::data.table( test_coords <- as_coords_t(test_coords) # Test basic neighbor computation with max_distance = 150 -neighbors <- compute_neighbors(test_coords, max_distance = 150) +neighbors <- create_neighbors_t(test_coords, max_distance = 150) # Check structure expect_true(data.table::is.data.table(neighbors)) @@ -204,7 +204,7 @@ expect_equal( ) # Test with distance_breaks -neighbors_classified <- compute_neighbors( +neighbors_classified <- create_neighbors_t( test_coords, max_distance = 150, distance_breaks = c(0, 100, 150) @@ -234,7 +234,7 @@ expect_equal( ) # Test with smaller max_distance -neighbors_small <- compute_neighbors(test_coords, max_distance = 110) +neighbors_small <- create_neighbors_t(test_coords, max_distance = 110) # With max_distance = 110, point 1 should only have neighbors 2 and 4 (distance 100) # but not 5 (distance ~141.4) @@ -246,27 +246,27 @@ expect_false(5L %in% neighbors_from_1_small$id_coord_neighbor) # Test error handling expect_error( - compute_neighbors("not_coords_t", max_distance = 100), + create_neighbors_t("not_coords_t", max_distance = 100), "coords_t must be a coords_t object" ) expect_error( - compute_neighbors(test_coords, max_distance = -10), + create_neighbors_t(test_coords, max_distance = -10), "max_distance must be a positive scalar numeric" ) expect_error( - compute_neighbors(test_coords, max_distance = 100, distance_breaks = c(1)), + create_neighbors_t(test_coords, max_distance = 100, distance_breaks = c(1)), "distance_breaks must be NULL or a numeric vector with at least 2 elements" ) expect_error( - compute_neighbors(test_coords, 
max_distance = 100, distance_breaks = "invalid"), + create_neighbors_t(test_coords, max_distance = 100, distance_breaks = "invalid"), "distance_breaks must be NULL or a numeric vector with at least 2 elements" ) # Test with real coords_t from earlier in the test file -real_neighbors <- compute_neighbors(coords_t, max_distance = 300) +real_neighbors <- create_neighbors_t(coords_t, max_distance = 300) # Each point in a regular 100m grid should have neighbors # Interior points should have 8 neighbors within 300m (8-connectivity) @@ -291,7 +291,7 @@ for (i in seq_len( } # Test with distance classification on real data -real_neighbors_class <- compute_neighbors( +real_neighbors_class <- create_neighbors_t( coords_t, max_distance = 300, distance_breaks = c(0, 150, 300) diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index 3e005d7..fbdc608 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -25,7 +25,7 @@ Additional methods and active bindings are added to this class in separate files \section{Methods}{ \subsection{Public methods}{ \itemize{ -\item \href{#method-evoland_db-compute_neighbors}{\code{evoland_db$compute_neighbors()}} +\item \href{#method-evoland_db-create_neighbors_t}{\code{evoland_db$create_neighbors_t()}} \item \href{#method-evoland_db-generate_neighbor_predictors}{\code{evoland_db$generate_neighbor_predictors()}} \item \href{#method-evoland_db-trans_pred_data_v}{\code{evoland_db$trans_pred_data_v()}} \item \href{#method-evoland_db-new}{\code{evoland_db$new()}} @@ -56,11 +56,11 @@ Additional methods and active bindings are added to this class in separate files }} \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-compute_neighbors}{}}} -\subsection{Method \code{compute_neighbors()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-create_neighbors_t}{}}} +\subsection{Method \code{create_neighbors_t()}}{ \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$compute_neighbors( +\if{html}{\out{
}}\preformatted{evoland_db$create_neighbors_t( max_distance = 1000, distance_breaks = c(0, 100, 500, 1000), resolution = 100, diff --git a/man/evoland_db_neighbors.Rd b/man/evoland_db_neighbors.Rd index 09e3e93..91db904 100644 --- a/man/evoland_db_neighbors.Rd +++ b/man/evoland_db_neighbors.Rd @@ -10,7 +10,7 @@ These methods compute neighbor relationships and generate neighbor-based predict \section{Methods Added}{ \itemize{ -\item \code{compute_neighbors(max_distance, distance_breaks, resolution, overwrite)} - +\item \code{create_neighbors_t(max_distance, distance_breaks, resolution, overwrite)} - Computes neighbor relationships between coordinates. \itemize{ \item \code{max_distance}: Maximum distance for neighbors (default: 1000) @@ -20,7 +20,7 @@ Computes neighbor relationships between coordinates. } \item \code{generate_neighbor_predictors()} - Generates predictor variables based on neighbor land use counts by distance class. Requires neighbors_t with distance_class column -(compute_neighbors with distance_breaks). +(create_neighbors_t with distance_breaks). } } diff --git a/man/neighbors_t.Rd b/man/neighbors_t.Rd index bc95b6a..2376f02 100644 --- a/man/neighbors_t.Rd +++ b/man/neighbors_t.Rd @@ -3,12 +3,20 @@ \name{neighbors_t} \alias{neighbors_t} \alias{as_neighbors_t} +\alias{create_neighbors_t} \alias{validate.neighbors_t} \alias{print.neighbors_t} \title{Create Neighbors Table} \usage{ as_neighbors_t(x) +create_neighbors_t( + coords_t, + max_distance, + distance_breaks = NULL, + resolution = 100 +) + \method{validate}{neighbors_t}(x, ...) \method{print}{neighbors_t}(x, nrow = 10, ...) @@ -16,6 +24,15 @@ as_neighbors_t(x) \arguments{ \item{x}{An object that can be passed to \code{\link[data.table:setDT]{data.table::setDT()}}} +\item{max_distance}{Maximum distance to search for neighbors (in same units as +coordinates and resolution)} + +\item{distance_breaks}{Optional numeric vector defining distance class boundaries. 
+If NULL, no distance classification is performed. +If provided, must have at least 2 elements defining interval breaks.} + +\item{resolution}{Grid cell size for rasterization (default: 100.0, in same units as coordinates)} + \item{...}{Passed to \link[data.table:print.data.table]{data.table::print.data.table}} \item{nrow}{Maximum number of rows to print. See \link[data.table:print.data.table]{data.table::print.data.table}} @@ -28,6 +45,14 @@ A data.table of class "neighbors_t" with columns: \item \code{distance}: Numeric distance between coordinates \item \code{distance_class}: Optional factor representing distance intervals } + +A data.table with columns: +\itemize{ +\item id_coord_origin: ID of the origin coordinate +\item id_coord_neighbor: ID of the neighboring coordinate +\item distance: Distance between origin and neighbor +\item distance_class: Factor indicating distance class (if distance_breaks provided) +} } \description{ Creates a \code{neighbors_t} table and validates that it matches the schema. @@ -39,3 +64,12 @@ Creates a \code{neighbors_t} table and validates that it matches the schema. \item \code{print(neighbors_t)}: Print a neighbors_t object }} +\section{Functions}{ +\itemize{ +\item \code{create_neighbors_t()}: Compute neighboring coordinates within specified distances. In +order to be computationally feasible, the coordinates' IDs are rasterized before +their actual Euclidean distance is calculated. If coordinates are so close that they +get rasterized to the same cell, the first one is used and a warning is emitted. If +this happens, try again using a lower resolution. 
+ +}} diff --git a/man/util_terra.Rd b/man/util_terra.Rd index d46c499..e429d0a 100644 --- a/man/util_terra.Rd +++ b/man/util_terra.Rd @@ -3,43 +3,18 @@ \name{util_terra} \alias{util_terra} \alias{extract_using_coords_t} -\alias{compute_neighbors} \title{evoland utility functions to work with terra objects} \usage{ extract_using_coords_t(x, coords_t, na_omit = TRUE) - -compute_neighbors( - coords_t, - max_distance, - distance_breaks = NULL, - resolution = 100 -) } \arguments{ \item{x}{The object to extract from; use "simple" extraction for rasters, i.e. no resampling is done.} \item{coords_t}{A coords_t object containing coordinate points} - -\item{max_distance}{Maximum distance to search for neighbors (in same units as -coordinates and resolution)} - -\item{distance_breaks}{Optional numeric vector defining distance class boundaries. -If NULL, no distance classification is performed. -If provided, must have at least 2 elements defining interval breaks.} - -\item{resolution}{Grid cell size for rasterization (default: 100.0, in same units as coordinates)} } \value{ A long data.table with \code{id_coord}, a \code{layer}/\code{attribute} column, and a \code{value} column. NAs are omitted - -A data.table with columns: -\itemize{ -\item id_coord_origin: ID of the origin coordinate -\item id_coord_neighbor: ID of the neighboring coordinate -\item distance: Distance between origin and neighbor -\item distance_class: Factor indicating distance class (if distance_breaks provided) -} } \description{ Useful to coax terra raster and vector data into evoland tabular form. @@ -48,10 +23,4 @@ Useful to coax terra raster and vector data into evoland tabular form. \itemize{ \item \code{extract_using_coords_t()}: Extract values from a SpatRaster or SpatVector object using a (minimal) \code{coords_t} -\item \code{compute_neighbors()}: Compute neighboring coordinates within specified distances. 
In -order to be computationally feasible, the coordinates' IDs are rasterized before -their actual Euclidean distance is calculated. If coordinates are so close that they -get rasterized to the same cell, the first one is used and a warning is emitted. If -this happens, try again using a lower resolution. - }} From a0a6256833cf0a598b23f57e42332a8f070d6b36 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Fri, 28 Nov 2025 17:31:24 +0100 Subject: [PATCH 13/27] make transition results bool --- R/covariance_filter.R | 13 ++++++------- R/evoland_db_views.R | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/R/covariance_filter.R b/R/covariance_filter.R index 6860798..fb06530 100644 --- a/R/covariance_filter.R +++ b/R/covariance_filter.R @@ -56,7 +56,6 @@ covariance_filter <- function( # Validate binary outcome stopifnot( - "result_col must be binary (0/1)" = length(unique(data[[result_col]])) == 2, "corcut must be between 0 and 1" = corcut >= 0 && corcut <= 1 ) @@ -119,8 +118,8 @@ rank_poly_glm <- function(x, y, weights = NULL, ...) 
{ #' @keywords internal compute_balanced_weights <- function(trans_result, legacy = FALSE) { n_total <- length(trans_result) - n_trans <- sum(trans_result == 1) - n_non_trans <- sum(trans_result == 0) + n_trans <- sum(trans_result) + n_non_trans <- sum(!trans_result) # Compute inverse frequency weights weights <- numeric(n_total) @@ -129,16 +128,16 @@ compute_balanced_weights <- function(trans_result, legacy = FALSE) { # I found this weighting in evoland-plus-legacy, but the models wouldn't converge # https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.splitforcovselection.r # This is actually just setting the underrepresented class to the rounded imbalance ratio - weights[trans_result == 0] <- 1 - weights[trans_result == 1] <- round(n_non_trans / n_trans) + weights[!trans_result] <- 1 + weights[trans_result] <- round(n_non_trans / n_trans) return(weights) } # This is the heuristic in scikit-learn, n_samples / (n_classes * np.bincount(y)) # https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html #nolint # This weighting maintains the exact imbalance ratio - weights[trans_result == 1] <- n_total / (2 * n_trans) - weights[trans_result == 0] <- n_total / (2 * n_non_trans) + weights[trans_result] <- n_total / (2 * n_trans) + weights[!trans_result] <- n_total / (2 * n_non_trans) weights } diff --git a/R/evoland_db_views.R b/R/evoland_db_views.R index 3201935..42429a3 100644 --- a/R/evoland_db_views.R +++ b/R/evoland_db_views.R @@ -136,8 +136,8 @@ evoland_db$set("public", "trans_pred_data_v", function(id_trans) { curr.id_coord, curr.id_period, CASE - WHEN prev.id_lulc = {id_lulc_ant} AND curr.id_lulc = {id_lulc_post} THEN 1 - WHEN prev.id_lulc = {id_lulc_ant} AND curr.id_lulc != {id_lulc_post} THEN 0 + WHEN prev.id_lulc = {id_lulc_ant} AND curr.id_lulc = {id_lulc_post} THEN TRUE + WHEN prev.id_lulc = {id_lulc_ant} AND curr.id_lulc != {id_lulc_post} THEN FALSE ELSE NULL END AS result FROM lulc_data_t AS curr 
From 3cc787ee75856574da8c573aaef96ed1c68d6da5 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Sat, 29 Nov 2025 14:38:43 +0100 Subject: [PATCH 14/27] more informative / reflective printing method --- R/evoland_db.R | 37 ------------------- R/parquet_duckdb.R | 88 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 84 insertions(+), 41 deletions(-) diff --git a/R/evoland_db.R b/R/evoland_db.R index 99d498d..4279ad0 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -174,43 +174,6 @@ evoland_db <- R6::R6Class( as_pred_data_t(pred_data, pred_type), paste0("pred_data_t_", pred_type) ) - }, - - #' @description - #' Print method for evoland_db - #' @param ... Not used - #' @return self (invisibly) - print = function(...) { - cat(" (extends parquet_duckdb)\n") - cat(sprintf("Database path: %s\n", self$path)) - cat(sprintf("Default format: %s\n\n", self$default_format)) - - cat("Database methods (inherited):\n") - cat(" Commit: commit_overwrite(x, table_name, autoincrement_cols, map_cols),\n") - cat(" commit_append(x, table_name, autoincrement_cols, map_cols),\n") - cat(" commit_upsert(x, table_name, key_cols, autoincrement_cols, map_cols)\n") - cat(" Fetch: fetch(table_name, where, limit)\n") - cat(" Delete: delete_from(table_name, where)\n") - cat(" Other: list_tables(), execute(statement), get_query(statement),\n") - cat(" attach_table(table_name, columns), detach_table(table_name),\n") - cat(" row_count(table_name)\n\n") - - cat("Set / Add methods:\n") - cat(" set_report(...), set_coords(type, ...),\n") - cat(" set_periods(period_length_str, start_observed, end_observed, end_extrapolated),\n") - cat(" add_predictor(pred_spec, pred_data, pred_type)\n\n") - - cat("Active bindings (read-write):\n") - cat(" coords_t, periods_t, lulc_meta_t, lulc_data_t, pred_data_t_float,\n") - cat(" pred_data_t_int, pred_data_t_bool, pred_meta_t, trans_meta_t, trans_preds_t,\n") - cat(" intrv_meta_t, intrv_masks_t, trans_models_t, 
alloc_params_t\n\n") - - cat("Active bindings (read-only):\n") - cat(" extent, coords_minimal, lulc_meta_long_v, pred_sources_v\n\n") - - cat(sprintf("Tables: %d\n", length(self$list_tables()))) - - invisible(self) } ) ) diff --git a/R/parquet_duckdb.R b/R/parquet_duckdb.R index 11aa465..e84f13e 100644 --- a/R/parquet_duckdb.R +++ b/R/parquet_duckdb.R @@ -436,10 +436,90 @@ parquet_duckdb <- R6::R6Class( #' @param ... Not used #' @return self (invisibly) print = function(...) { - cat("\n") - cat(sprintf("Database path: %s\n", self$path)) - cat(sprintf("Default format: %s\n", self$default_format)) - cat(sprintf("Tables: %d\n", length(self$list_tables()))) + # gather data to be printed + classes <- class(self) + classes <- classes[classes != "R6"] + + all_names <- names(self) + methods <- character(0) + active_bindings <- character(0) + + names(self$.__enclos_env__$private) + if (!is.null(self$.__enclos_env__$super)) { + # exclude private super names + super_names <- setdiff( + ls(self$.__enclos_env__$super), + ls(self$.__enclos_env__$super$.__enclos_env__$private) + ) + } else { + super_names <- character(0) + } + nonsuper_names <- setdiff(all_names, super_names) + + for (name in nonsuper_names) { + # Check if it's an active binding first; subset2 would evaluate it + if (bindingIsActive(name, self$.__enclos_env__$self)) { + active_bindings <- c(active_bindings, name) + } else { + obj <- .subset2(self, name) + if (is.function(obj) && !name %in% c("initialize", "print", "clone")) { + methods <- c(methods, name) + } + } + } + + methods <- sort(methods) + active_bindings <- + active_bindings[!grepl("_t($|_)", active_bindings)] |> + sort() + + # actually start printing + if (length(classes) == 1) { + cat("<", classes[1], "> Object\n") + } else { + cat(classes[1], "Object. 
Inherits from", toString(classes[-1]), "\n") + } + + compression <- if (grepl("compression\\s+(\\w+)", self$writeopts)) { + sub(".*compression\\s+(\\w+).*", "\\1", self$writeopts) + } else { + "none" + } + + # Database info on one line + cat( + glue::glue( + "Database: {self$path} | Format: {self$default_format} | Compression: {compression}" + ), + "\n\n" + ) + + tables <- self$list_tables() + if (length(tables) > 0) { + cat("Tables present:\n ") + cat(strwrap(toString(tables), width = 80), sep = "\n ") + cat("\n") + } else { + cat("Tables present: (none)\n\n") + } + + if (length(super_names) > 0) { + cat("DB Methods:\n ") + cat(strwrap(toString(super_names), width = 80), sep = "\n ") + cat("\n") + } + + if (length(methods) > 0) { + cat("Public methods:\n ") + cat(strwrap(toString(methods), width = 80), sep = "\n ") + cat("\n") + } + + if (length(active_bindings) > 0) { + cat("Active bindings:\n ") + cat(strwrap(toString(active_bindings), width = 80), sep = "\n ") + } + invisible(self) } ), From 51efad09b550b960f8bd21f66ab1c8141c1d701f Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Sat, 29 Nov 2025 14:39:51 +0100 Subject: [PATCH 15/27] set_neighbors: more idiomatic --- R/evoland_db_neighbors.R | 5 +++-- man/evoland_db.Rd | 32 ++++++-------------------------- 2 files changed, 9 insertions(+), 28 deletions(-) diff --git a/R/evoland_db_neighbors.R b/R/evoland_db_neighbors.R index e4d02cc..9b5896a 100644 --- a/R/evoland_db_neighbors.R +++ b/R/evoland_db_neighbors.R @@ -22,7 +22,7 @@ NULL evoland_db$set( "public", - "create_neighbors_t", + "set_neighbors", function( max_distance = 1000, distance_breaks = c(0, 100, 500, 1000), @@ -55,7 +55,7 @@ evoland_db$set( evoland_db$set("public", "generate_neighbor_predictors", function() { if (self$row_count("neighbors_t") == 0) { - stop("No neighbor data found. Run $create_neighbors_t() first.") + stop("No neighbor data found. 
Run $set_neighbors() first.") } if (self$row_count("lulc_meta_t") == 0) { @@ -95,6 +95,7 @@ evoland_db$set("public", "generate_neighbor_predictors", function() { "(select 0 as max_pred)" } + # TODO this could possibly be done using the upsert, since it now takes care of the IDs self$execute(glue::glue( r"{ create temp table pred_meta_neighbors_t as diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index fbdc608..676de84 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -25,7 +25,7 @@ Additional methods and active bindings are added to this class in separate files \section{Methods}{ \subsection{Public methods}{ \itemize{ -\item \href{#method-evoland_db-create_neighbors_t}{\code{evoland_db$create_neighbors_t()}} +\item \href{#method-evoland_db-set_neighbors}{\code{evoland_db$set_neighbors()}} \item \href{#method-evoland_db-generate_neighbor_predictors}{\code{evoland_db$generate_neighbor_predictors()}} \item \href{#method-evoland_db-trans_pred_data_v}{\code{evoland_db$trans_pred_data_v()}} \item \href{#method-evoland_db-new}{\code{evoland_db$new()}} @@ -34,7 +34,6 @@ Additional methods and active bindings are added to this class in separate files \item \href{#method-evoland_db-set_coords}{\code{evoland_db$set_coords()}} \item \href{#method-evoland_db-set_periods}{\code{evoland_db$set_periods()}} \item \href{#method-evoland_db-add_predictor}{\code{evoland_db$add_predictor()}} -\item \href{#method-evoland_db-print}{\code{evoland_db$print()}} \item \href{#method-evoland_db-clone}{\code{evoland_db$clone()}} } } @@ -50,17 +49,18 @@ Additional methods and active bindings are added to this class in separate files
  • evoland::parquet_duckdb$execute()
  • evoland::parquet_duckdb$get_query()
  • evoland::parquet_duckdb$list_tables()
  • +
  • evoland::parquet_duckdb$print()
  • evoland::parquet_duckdb$row_count()
  • evoland::parquet_duckdb$with_tables()
  • }} \if{html}{\out{
    }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-create_neighbors_t}{}}} -\subsection{Method \code{create_neighbors_t()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-set_neighbors}{}}} +\subsection{Method \code{set_neighbors()}}{ \subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{evoland_db$create_neighbors_t( +\if{html}{\out{
    }}\preformatted{evoland_db$set_neighbors( max_distance = 1000, distance_breaks = c(0, 100, 500, 1000), resolution = 100, @@ -225,26 +225,6 @@ Add a predictor to the database } } \if{html}{\out{
    }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-print}{}}} -\subsection{Method \code{print()}}{ -Print method for evoland_db -\subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{evoland_db$print(...)}\if{html}{\out{
    }} -} - -\subsection{Arguments}{ -\if{html}{\out{
    }} -\describe{ -\item{\code{...}}{Not used} -} -\if{html}{\out{
    }} -} -\subsection{Returns}{ -self (invisibly) -} -} -\if{html}{\out{
    }} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-clone}{}}} \subsection{Method \code{clone()}}{ From f3223dfcc4488c866ab227106ac58c36333c971a Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Sun, 30 Nov 2025 13:57:39 +0100 Subject: [PATCH 16/27] changed print method --- inst/tinytest/test_parquet_duckdb.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inst/tinytest/test_parquet_duckdb.R b/inst/tinytest/test_parquet_duckdb.R index fe9b188..ad58bc8 100644 --- a/inst/tinytest/test_parquet_duckdb.R +++ b/inst/tinytest/test_parquet_duckdb.R @@ -343,7 +343,7 @@ db$commit_upsert(test_no_keys, "no_keys_test", key_cols = character(0)) expect_equal(db$row_count("no_keys_test"), 4L) # Should append # Test 35: Print method -output <- capture.output(print(db)) -expect_true(any(grepl("parquet_duckdb", output))) -expect_true(any(grepl("Database path", output))) -expect_true(any(grepl("Default format", output))) +expect_stdout( + print(db), + "Public methods:|Active bindings:|Format|Compression" +) From ea5ed53668aa531ad6d587dc53035f3862b5ff60 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Sun, 30 Nov 2025 13:58:04 +0100 Subject: [PATCH 17/27] cast for safety --- R/evoland_db.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/evoland_db.R b/R/evoland_db.R index 4279ad0..310e317 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -167,7 +167,7 @@ evoland_db <- R6::R6Class( where = glue::glue("name = '{names(pred_spec)}'") ) - data.table::set(pred_data, j = "id_pred", value = existing_meta[["id_pred"]]) + data.table::set(pred_data, j = "id_pred", value = as.integer(existing_meta[["id_pred"]])) data.table::setcolorder(pred_data, c("id_pred", "id_coord", "id_period", "value")) self$commit_upsert( From 0feb2d7a7ef11bbb33ce49ab2747ba16e875182a Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Sun, 30 Nov 
2025 13:58:38 +0100 Subject: [PATCH 18/27] enable committing from existing in-memory tables --- R/parquet_duckdb.R | 64 +++++++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/R/parquet_duckdb.R b/R/parquet_duckdb.R index e84f13e..6307d87 100644 --- a/R/parquet_duckdb.R +++ b/R/parquet_duckdb.R @@ -246,7 +246,7 @@ parquet_duckdb <- R6::R6Class( #' @description #' Commit data in overwrite mode - #' @param x Data frame to commit + #' @param x Data frame to commit. If character, in-duckdb-memory table. #' @param table_name Character string table name #' @param autoincrement_cols Character vector of column names to auto-increment #' @param map_cols Character vector of columns to convert to MAP format @@ -260,10 +260,11 @@ parquet_duckdb <- R6::R6Class( file_path <- file.path(self$path, paste0(table_name, ".", self$default_format)) private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) + on.exit(private$cleanup_new_data_v(), add = TRUE) + new_data_names <- private$new_data_names() # Warn if overriding existing IDs - if (length(intersect(autoincrement_cols, names(x))) > 0) { + if (length(intersect(autoincrement_cols, new_data_names)) > 0) { warning(glue::glue( "Overriding existing IDs ({toString(autoincrement_cols)}) with row numbers;\n", "Assign these IDs manually and do not pass any autoincrement_cols to avoid this warning" @@ -271,7 +272,7 @@ parquet_duckdb <- R6::R6Class( } # Build SELECT expression - ordinary_cols <- setdiff(names(x), autoincrement_cols) + ordinary_cols <- setdiff(new_data_names, autoincrement_cols) select_expr <- glue::glue_collapse( c(glue::glue("row_number() over () as {autoincrement_cols}"), ordinary_cols), sep = ",\n " @@ -291,7 +292,7 @@ parquet_duckdb <- R6::R6Class( #' @description #' Commit data in append mode - #' @param x Data frame to commit + #' @param x Data frame to commit. If character, in-duckdb-memory table. 
#' @param table_name Character string table name #' @param autoincrement_cols Character vector of column names to auto-increment #' @param map_cols Character vector of columns to convert to MAP format @@ -313,9 +314,10 @@ parquet_duckdb <- R6::R6Class( private$set_autoincrement_vars(table_name, autoincrement_cols) private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) + on.exit(private$cleanup_new_data_v(), add = TRUE) + new_data_names <- private$get_new_data_names() - ordinary_cols <- setdiff(names(x), autoincrement_cols) + ordinary_cols <- setdiff(new_data_names, autoincrement_cols) select_new <- glue::glue_collapse( c( glue::glue( @@ -344,16 +346,17 @@ parquet_duckdb <- R6::R6Class( #' @description #' Commit data in upsert mode (update existing, insert new) - #' @param x Data frame to commit + #' @param x Data frame to commit. If character, in-duckdb-memory table. #' @param table_name Character string table name - #' @param key_cols Character vector of columns that define uniqueness + #' @param key_cols Character vector of columns that define uniqueness. 
If missing, + #' use all columns starting with `id_` #' @param autoincrement_cols Character vector of column names to auto-increment #' @param map_cols Character vector of columns to convert to MAP format #' @return Invisible NULL (called for side effects) commit_upsert = function( x, table_name, - key_cols = grep("^id_", names(x), value = TRUE), + key_cols, autoincrement_cols = character(0), map_cols = character(0) ) { @@ -361,8 +364,6 @@ parquet_duckdb <- R6::R6Class( if (!file_info$exists) { return(self$commit_overwrite(x, table_name, autoincrement_cols, map_cols)) - } else if (length(key_cols) == 0) { - return(self$commit_append(x, table_name, autoincrement_cols, map_cols)) } self$attach_table(table_name) @@ -371,10 +372,17 @@ parquet_duckdb <- R6::R6Class( private$set_autoincrement_vars(table_name, autoincrement_cols) private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) + on.exit(private$cleanup_new_data_v(), add = TRUE) + new_data_names <- private$new_data_names() + if (missing(key_cols)) { + key_cols <- grep("^id_", new_data_names, value = TRUE) + } + if (length(key_cols) == 0) { + return(self$commit_append(x, table_name, autoincrement_cols, map_cols)) + } # Update existing data - ordinary_cols <- setdiff(names(x), union(key_cols, autoincrement_cols)) + ordinary_cols <- setdiff(new_data_names, union(key_cols, autoincrement_cols)) update_select_expr <- glue::glue_collapse( glue::glue("{ordinary_cols} = new_data_v.{ordinary_cols}"), sep = ",\n " @@ -400,7 +408,7 @@ parquet_duckdb <- R6::R6Class( glue::glue( "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" ), - glue::glue("new_data_v.{setdiff(names(x), autoincrement_cols)}") + glue::glue("new_data_v.{setdiff(new_data_names, autoincrement_cols)}") ), sep = ",\n " ) @@ -567,6 +575,12 @@ parquet_duckdb <- R6::R6Class( # @param map_cols Character vector of columns to convert to MAP format # @return NULL (called for side effects) 
register_new_data_v = function(x, map_cols = character(0)) { + if (is.character(x)) { + # TODO add tests + self$execute(glue::glue("create view new_data_v as from {x}")) + return(invisible(NULL)) + } + if (length(map_cols) == 0) { # No MAP conversion needed - register directly duckdb::duckdb_register(self$connection, "new_data_v", x) @@ -598,20 +612,22 @@ parquet_duckdb <- R6::R6Class( # # @param map_cols Character vector indicating if MAP conversion was used # @return NULL (called for side effects) - cleanup_new_data_v = function(map_cols = character(0)) { - if (length(map_cols) == 0) { - duckdb::duckdb_unregister(self$connection, "new_data_v") - } else { - self$execute( - "drop table if exists new_data_v; + cleanup_new_data_v = function() { + duckdb::duckdb_unregister(self$connection, "new_data_v") + duckdb::duckdb_unregister(self$connection, "new_data_raw") + self$execute( + "drop table if exists new_data_v; drop view if exists new_data_v" - ) - duckdb::duckdb_unregister(self$connection, "new_data_raw") - } + ) invisible(NULL) }, + # Get the names of the current new_data_v + new_data_names = function() { + self$get_query("select column_name from (describe new_data_v)")[[1]] + }, + # Set DuckDB variables max_{colname} for autoincrement columns # # @param table_name Character string table name From a9fea629b798d7a77e8017472184969da15b6f2a Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Sun, 30 Nov 2025 18:40:12 +0100 Subject: [PATCH 19/27] corrected implementation of neighbour predictors --- R/evoland_db_neighbors.R | 67 ++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 40 deletions(-) diff --git a/R/evoland_db_neighbors.R b/R/evoland_db_neighbors.R index 9b5896a..b07c749 100644 --- a/R/evoland_db_neighbors.R +++ b/R/evoland_db_neighbors.R @@ -66,46 +66,33 @@ evoland_db$set("public", "generate_neighbor_predictors", function() { stop("No LULC data found. 
Add lulc_data_t before generating neighbor predictors.") } - neighbors_sample <- self$fetch("neighbors_t", limit = 1) + neighbors_sample <- self$fetch("neighbors_t", limit = 0L) if (!"distance_class" %in% names(neighbors_sample)) { stop( "neighbors_t does not have distance_class column. Run $create_neighbors_t() with distance_breaks parameter." ) } - self$attach_table("neighbors_t") + self$attach_table( + "neighbors_t", + columns = c("id_coord_origin", "id_coord_neighbor", "distance_class") + ) self$attach_table("lulc_data_t") self$attach_table("lulc_meta_t") - has_pred_meta <- self$row_count("pred_meta_t") > 0 - if (has_pred_meta) { - self$attach_table("pred_meta_t") - } - on.exit({ self$detach_table("neighbors_t") self$detach_table("lulc_data_t") self$detach_table("lulc_meta_t") - if (has_pred_meta) self$detach_table("pred_meta_t") }) - max_id_query <- if (has_pred_meta) { - "(select coalesce(max(id_pred), 0) as max_pred from pred_meta_t)" - } else { - "(select 0 as max_pred)" - } - - # TODO this could possibly be done using the upsert, since it now takes care of the IDs - self$execute(glue::glue( + n_predictors <- self$execute( r"{ - create temp table pred_meta_neighbors_t as + create or replace temp table pred_meta_neighbors_t as with - all_distance_classes as (select distinct distance_class from neighbors_t), - max_id as }", - max_id_query, - r"{ + all_distance_classes as (select distinct distance_class from neighbors_t) select - row_number() over () + (select max_pred from max_id) as id_pred, + NULL as id_pred, concat('id_lulc_', l.id_lulc, '_dist_', c.distance_class) as name, concat('Count of ', l.pretty_name, ' within distance class ', c.distance_class) as pretty_name, 'Number of neighbors by land use class and distance interval' as description, @@ -120,19 +107,25 @@ evoland_db$set("public", "generate_neighbor_predictors", function() { cross join all_distance_classes c }" - )) - - pred_meta_neighbors <- self$get_query( - "select id_pred, name, pretty_name, 
description, orig_format, unit + ) + self$execute( + "create or replace view pred_meta_upsert_v as + select name, pretty_name, description, orig_format, sources, unit, factor_levels from pred_meta_neighbors_t" ) - pred_meta_neighbors$sources <- lapply(1:nrow(pred_meta_neighbors), function(i) list()) - pred_meta_neighbors$factor_levels <- lapply(1:nrow(pred_meta_neighbors), function(i) list()) - - self$pred_meta_t <- as_pred_meta_t(pred_meta_neighbors) - + self$attach_table("pred_meta_t") + on.exit(self$detach_table("pred_meta_t"), add = TRUE) self$execute( + r"{ + update pred_meta_neighbors_t + set id_pred = pred_meta_t.id_pred + from pred_meta_t + where pred_meta_neighbors_t.name = pred_meta_t.name + }" + ) + + n_data_points <- self$execute( r"{ create temp table pred_neighbors_t as select @@ -157,17 +150,11 @@ evoland_db$set("public", "generate_neighbor_predictors", function() { }" ) - pred_neighbors <- self$get_query("select * from pred_neighbors_t") - pred_neighbors$id_pred <- as.integer(pred_neighbors$id_pred) - pred_neighbors$id_coord <- as.integer(pred_neighbors$id_coord) - pred_neighbors$id_period <- as.integer(pred_neighbors$id_period) - pred_neighbors$value <- as.integer(pred_neighbors$value) - - self$pred_data_t_int <- as_pred_data_t(pred_neighbors, type = "int") + self$commit_upsert("pred_neighbors_t", "pred_neighbors_t_int") message(glue::glue( - "Generated {nrow(pred_meta_neighbors)} neighbor predictor variables with ", - "{nrow(pred_neighbors)} data points" + "Generated {n_predictors} neighbor predictor variables with ", + "{n_data_points} data points" )) invisible(self) From 6bd2277539dd6533ade0862faaca6c515846849c Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Mon, 1 Dec 2025 15:03:45 +0100 Subject: [PATCH 20/27] add progress bar to neighbourhood calc --- R/RcppExports.R | 4 ++-- inst/tinytest/test_util_terra.R | 23 ++++++++++++++++------- src/RcppExports.cpp | 9 +++++---- src/distances.cpp | 33 
+++++++++++++++++++++++++++------ 4 files changed, 50 insertions(+), 19 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 3a54496..395824b 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,7 +1,7 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -distance_neighbors_cpp <- function(coords_t, max_distance, resolution = 100.0) { - .Call(`_evoland_distance_neighbors_cpp`, coords_t, max_distance, resolution) +distance_neighbors_cpp <- function(coords_t, max_distance, resolution = 100.0, quiet = FALSE) { + .Call(`_evoland_distance_neighbors_cpp`, coords_t, max_distance, resolution, quiet) } diff --git a/inst/tinytest/test_util_terra.R b/inst/tinytest/test_util_terra.R index efb0dac..82f998a 100644 --- a/inst/tinytest/test_util_terra.R +++ b/inst/tinytest/test_util_terra.R @@ -152,7 +152,10 @@ test_coords <- data.table::data.table( test_coords <- as_coords_t(test_coords) # Test basic neighbor computation with max_distance = 150 -neighbors <- create_neighbors_t(test_coords, max_distance = 150) +expect_stdout( + neighbors <- create_neighbors_t(test_coords, max_distance = 150), + "Progress" +) # Check structure expect_true(data.table::is.data.table(neighbors)) @@ -204,10 +207,13 @@ expect_equal( ) # Test with distance_breaks -neighbors_classified <- create_neighbors_t( - test_coords, - max_distance = 150, - distance_breaks = c(0, 100, 150) +expect_stdout( + neighbors_classified <- create_neighbors_t( + test_coords, + max_distance = 150, + distance_breaks = c(0, 100, 150) + ), + "Progress" ) # Check that distance_class is populated @@ -234,7 +240,10 @@ expect_equal( ) # Test with smaller max_distance -neighbors_small <- create_neighbors_t(test_coords, max_distance = 110) +expect_stdout( + neighbors_small <- create_neighbors_t(test_coords, max_distance = 110), + "Progress" +) # With max_distance = 110, point 1 should only have neighbors 2 and 4 (distance 100) # but not 5 
(distance ~141.4) @@ -266,7 +275,7 @@ expect_error( ) # Test with real coords_t from earlier in the test file -real_neighbors <- create_neighbors_t(coords_t, max_distance = 300) +expect_stdout(real_neighbors <- create_neighbors_t(coords_t, max_distance = 300), "Progress") # Each point in a regular 100m grid should have neighbors # Interior points should have 8 neighbors within 300m (8-connectivity) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index c2f3be0..13f1a2b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -11,21 +11,22 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // distance_neighbors_cpp -List distance_neighbors_cpp(DataFrame coords_t, double max_distance, double resolution); -RcppExport SEXP _evoland_distance_neighbors_cpp(SEXP coords_tSEXP, SEXP max_distanceSEXP, SEXP resolutionSEXP) { +List distance_neighbors_cpp(DataFrame coords_t, double max_distance, double resolution, bool quiet); +RcppExport SEXP _evoland_distance_neighbors_cpp(SEXP coords_tSEXP, SEXP max_distanceSEXP, SEXP resolutionSEXP, SEXP quietSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< DataFrame >::type coords_t(coords_tSEXP); Rcpp::traits::input_parameter< double >::type max_distance(max_distanceSEXP); Rcpp::traits::input_parameter< double >::type resolution(resolutionSEXP); - rcpp_result_gen = Rcpp::wrap(distance_neighbors_cpp(coords_t, max_distance, resolution)); + Rcpp::traits::input_parameter< bool >::type quiet(quietSEXP); + rcpp_result_gen = Rcpp::wrap(distance_neighbors_cpp(coords_t, max_distance, resolution, quiet)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_evoland_distance_neighbors_cpp", (DL_FUNC) &_evoland_distance_neighbors_cpp, 3}, + {"_evoland_distance_neighbors_cpp", (DL_FUNC) &_evoland_distance_neighbors_cpp, 4}, {NULL, NULL, 0} }; diff --git a/src/distances.cpp b/src/distances.cpp index cd41ffb..2e23a21 100644 --- 
a/src/distances.cpp +++ b/src/distances.cpp @@ -24,17 +24,16 @@ using namespace Rcpp; * - id_coord_neighbor: ID of the neighboring coordinate * - distance_approx: Approximate distance between origin and neighbor * - * @note The function uses an approximate distance calculation based on grid cells. - * If multiple points fall into the same cell, a warning is issued and only the first - * point is retained for that cell. + * @note The function uses an approximate distance calculation based on grid + * cells. If multiple points fall into the same cell, a warning is issued and + * only the first point is retained for that cell. * * @warning Distance calculations are approximate and based on Euclidean distance in * coordinate space. For geographic coordinates, this may not reflect true geodesic distance. */ // [[Rcpp::export]] -List distance_neighbors_cpp(DataFrame coords_t, - double max_distance, - double resolution = 100.0) { +List distance_neighbors_cpp(DataFrame coords_t, double max_distance, + double resolution = 100.0, bool quiet = false) { // Extract columns IntegerVector id_coord = coords_t["id_coord"]; @@ -112,7 +111,25 @@ List distance_neighbors_cpp(DataFrame coords_t, // 5. 
Find neighbors within max_distance + // Progress reporting setup + int progress_interval = std::max(1000, n_points / 20); // Report every 5% + + if (!quiet) { + Rcpp::Rcout << "\rProgress: 0% (0/" << n_points << " points)" << std::flush; + } + for (int pt_idx = 0; pt_idx < n_points; pt_idx++) { + // Check for user interrupt periodically + if (pt_idx % 1000 == 0) { + Rcpp::checkUserInterrupt(); + } + + // Report progress + if (!quiet && pt_idx > 0 && pt_idx % progress_interval == 0) { + int pct = (int)(100.0 * pt_idx / n_points); + Rcpp::Rcout << "\rProgress: " << pct << "% (" << pt_idx << "/" << n_points + << " points)" << std::flush; + } int origin_id = id_coord[pt_idx]; int origin_row = row_indices[pt_idx]; int origin_col = col_indices[pt_idx]; @@ -160,6 +177,10 @@ List distance_neighbors_cpp(DataFrame coords_t, Named("id_coord_neighbor") = neighbor_ids, Named("distance_approx") = distances_approx ); + if (!quiet) { + Rcpp::Rcout << "\rProgress: 100% (" << n_points << "/" << n_points + << " points)" << std::endl; + } res.attr("class") = CharacterVector::create("data.table", "data.frame"); From 7d6c453e473fb1b22d16112e20be50509fde49a8 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Mon, 1 Dec 2025 15:10:58 +0100 Subject: [PATCH 21/27] commit: again a single public method; needed to manage table attachment --- R/evoland_db.R | 14 +- R/evoland_db_neighbors.R | 7 +- R/evoland_db_tables.R | 5 +- R/parquet_duckdb.R | 370 ++++++++++++++-------------- inst/tinytest/test_parquet_duckdb.R | 103 +++++--- man/parquet_duckdb.Rd | 105 +++----- vignettes/evoland.qmd | 4 +- 7 files changed, 310 insertions(+), 298 deletions(-) diff --git a/R/evoland_db.R b/R/evoland_db.R index 310e317..9aebd16 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -98,13 +98,14 @@ evoland_db <- R6::R6Class( params[["last_opened"]] <- format(Sys.time(), "%Y-%m-%dT%H:%M:%SZ", tz = "UTC") params[["last_opened_username"]] <- Sys.getenv("USER", unset = "unknown") - 
self$commit_upsert( + self$commit( data.table::as.data.table(list( key = names(params), # cannot name a column "key" in data.table() value = unlist(params) )), table_name = "reporting_t", - key_cols = "key" + key_cols = "key", + method = "upsert" ) }, @@ -123,7 +124,7 @@ evoland_db <- R6::R6Class( function(x) stop("Unsupported coordinate type specified.") ) - self$commit_overwrite(as_coords_t(create_fun(...)), "coords_t") + self$commit(as_coords_t(create_fun(...)), "coords_t", method = "overwrite") }, #' @description @@ -144,7 +145,7 @@ evoland_db <- R6::R6Class( return(invisible(NULL)) } - self$commit_append( + self$commit( do.call(create_periods_t, as.list(environment())), "periods_t" ) @@ -170,9 +171,10 @@ evoland_db <- R6::R6Class( data.table::set(pred_data, j = "id_pred", value = as.integer(existing_meta[["id_pred"]])) data.table::setcolorder(pred_data, c("id_pred", "id_coord", "id_period", "value")) - self$commit_upsert( + self$commit( as_pred_data_t(pred_data, pred_type), - paste0("pred_data_t_", pred_type) + paste0("pred_data_t_", pred_type), + method = "upsert" ) } ) diff --git a/R/evoland_db_neighbors.R b/R/evoland_db_neighbors.R index b07c749..f8f6d20 100644 --- a/R/evoland_db_neighbors.R +++ b/R/evoland_db_neighbors.R @@ -43,9 +43,10 @@ evoland_db$set( resolution = resolution ) - self$commit_overwrite( + self$commit( as_neighbors_t(neighbors), - table_name = "neighbors_t" + table_name = "neighbors_t", + method = "overwrite" ) message(glue::glue("Computed {nrow(neighbors)} neighbor relationships")) @@ -150,7 +151,7 @@ evoland_db$set("public", "generate_neighbor_predictors", function() { }" ) - self$commit_upsert("pred_neighbors_t", "pred_neighbors_t_int") + self$commit("pred_neighbors_t", "pred_neighbors_t_int", method = "upsert") message(glue::glue( "Generated {n_predictors} neighbor predictor variables with ", diff --git a/R/evoland_db_tables.R b/R/evoland_db_tables.R index d477ab3..1ff2043 100644 --- a/R/evoland_db_tables.R +++ b/R/evoland_db_tables.R 
@@ -50,12 +50,13 @@ create_table_binding <- function( stopifnot(inherits(x, table_name)) - self$commit_upsert( + self$commit( x, table_name = table_name, key_cols = key_cols, autoincrement_cols = autoincrement_cols, - map_cols = map_cols + map_cols = map_cols, + method = "upsert" ) } } diff --git a/R/parquet_duckdb.R b/R/parquet_duckdb.R index 6307d87..e7c5538 100644 --- a/R/parquet_duckdb.R +++ b/R/parquet_duckdb.R @@ -104,7 +104,6 @@ parquet_duckdb <- R6::R6Class( sql <- glue::glue("{sql} where {where}") } - # Execute SQL self$execute(sql) invisible(NULL) }, @@ -242,201 +241,72 @@ parquet_duckdb <- R6::R6Class( return(count_before - count_after) }, - ### Commit Methods ---- - #' @description - #' Commit data in overwrite mode - #' @param x Data frame to commit. If character, in-duckdb-memory table. - #' @param table_name Character string table name - #' @param autoincrement_cols Character vector of column names to auto-increment - #' @param map_cols Character vector of columns to convert to MAP format - #' @return Invisible NULL (called for side effects) - commit_overwrite = function( - x, - table_name, - autoincrement_cols = character(0), - map_cols = character(0) - ) { - file_path <- file.path(self$path, paste0(table_name, ".", self$default_format)) - - private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(), add = TRUE) - new_data_names <- private$new_data_names() - - # Warn if overriding existing IDs - if (length(intersect(autoincrement_cols, new_data_names)) > 0) { - warning(glue::glue( - "Overriding existing IDs ({toString(autoincrement_cols)}) with row numbers;\n", - "Assign these IDs manually and do not pass any autoincrement_cols to avoid this warning" - )) - } - - # Build SELECT expression - ordinary_cols <- setdiff(new_data_names, autoincrement_cols) - select_expr <- glue::glue_collapse( - c(glue::glue("row_number() over () as {autoincrement_cols}"), ordinary_cols), - sep = ",\n " - ) - - self$execute(glue::glue( - r"{ - copy ( 
- select {select_expr} - from new_data_v - ) to '{file_path}' ({self$writeopts}) - }" - )) - - invisible(NULL) - }, - - #' @description - #' Commit data in append mode + #' Commit data using overwrite, append, or upsert modes. Handles autoincrement, key + #' identity columns, and list-to-MAP conversion. #' @param x Data frame to commit. If character, in-duckdb-memory table. #' @param table_name Character string table name + #' @param key_cols Character vector of columns that define uniqueness. If missing, + #' use all columns starting with `id_` #' @param autoincrement_cols Character vector of column names to auto-increment #' @param map_cols Character vector of columns to convert to MAP format + #' @param method Character, one of "overwrite", "append", "upsert" (upsert being an + #' update for existing rows, and insert for new rows) #' @return Invisible NULL (called for side effects) - commit_append = function( + commit = function( x, table_name, + key_cols, autoincrement_cols = character(0), - map_cols = character(0) + map_cols = character(0), + method = c("overwrite", "append", "upsert") ) { - file_info <- private$get_file_path(table_name) - - if (!file_info$exists) { - return(self$commit_overwrite(x, table_name, autoincrement_cols, map_cols)) - } - - self$attach_table(table_name) - on.exit(self$detach_table(table_name), add = TRUE) - private$set_autoincrement_vars(table_name, autoincrement_cols) + method <- match.arg(method) private$register_new_data_v(x, map_cols) on.exit(private$cleanup_new_data_v(), add = TRUE) - new_data_names <- private$get_new_data_names() - - ordinary_cols <- setdiff(new_data_names, autoincrement_cols) - select_new <- glue::glue_collapse( - c( - glue::glue( - "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" - ), - ordinary_cols - ), - sep = ",\n " - ) - - # Concatenation using UNION ALL; "by name" handles missing columns - self$execute(glue::glue( - r"{ - copy ( - select * from {table_name} - union 
all by name - select {select_new} - from new_data_v - ) - to '{file_info$path}' ({self$writeopts}) - }" - )) - - invisible(NULL) - }, + all_cols <- self$get_query( + "select column_name from (describe new_data_v)" + )[[1]] - #' @description - #' Commit data in upsert mode (update existing, insert new) - #' @param x Data frame to commit. If character, in-duckdb-memory table. - #' @param table_name Character string table name - #' @param key_cols Character vector of columns that define uniqueness. If missing, - #' use all columns starting with `id_` - #' @param autoincrement_cols Character vector of column names to auto-increment - #' @param map_cols Character vector of columns to convert to MAP format - #' @return Invisible NULL (called for side effects) - commit_upsert = function( - x, - table_name, - key_cols, - autoincrement_cols = character(0), - map_cols = character(0) - ) { file_info <- private$get_file_path(table_name) - if (!file_info$exists) { - return(self$commit_overwrite(x, table_name, autoincrement_cols, map_cols)) + if (method == "overwrite" || !file_info$exists) { + # in case overwrite explicitly required, or no previously existing data to + # append or upsert to; rest of logic can be skipped + return(private$commit_overwrite( + table_name = table_name, + all_cols = all_cols, + autoincrement_cols = autoincrement_cols, + file_info = file_info + )) } self$attach_table(table_name) on.exit(self$detach_table(table_name), add = TRUE) - private$set_autoincrement_vars(table_name, autoincrement_cols) - private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(), add = TRUE) - new_data_names <- private$new_data_names() if (missing(key_cols)) { - key_cols <- grep("^id_", new_data_names, value = TRUE) - } - if (length(key_cols) == 0) { - return(self$commit_append(x, table_name, autoincrement_cols, map_cols)) + key_cols <- grep("^id_", all_cols, value = TRUE) } - # Update existing data - ordinary_cols <- setdiff(new_data_names, union(key_cols, 
autoincrement_cols)) - update_select_expr <- glue::glue_collapse( - glue::glue("{ordinary_cols} = new_data_v.{ordinary_cols}"), - sep = ",\n " - ) - update_join_condition <- glue::glue_collapse( - glue::glue("{table_name}.{key_cols} = new_data_v.{key_cols}"), - sep = "\nand " - ) - - self$execute(glue::glue( - r"{ - update {table_name} set - {update_select_expr} - from new_data_v - where - {update_join_condition}; - }" - )) - - # Insert new data - insert_select_expr <- glue::glue_collapse( - c( - glue::glue( - "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" - ), - glue::glue("new_data_v.{setdiff(new_data_names, autoincrement_cols)}") - ), - sep = ",\n " - ) - null_condition <- glue::glue_collapse( - glue::glue("{table_name}.{key_cols} is null"), - sep = "\nand " - ) - - self$execute(glue::glue( - r"{ - insert into {table_name} - select - {insert_select_expr} - from - new_data_v - left join - {table_name} - on - {update_join_condition} - where - {null_condition} - ; - }" - )) - - self$execute(glue::glue("copy {table_name} to '{file_info$path}' ({self$writeopts})")) - - invisible(NULL) + if (method == "append" || length(key_cols) == 0L) { + # if there are no key columns to join on, upsert becomes append + private$commit_append( + table_name = table_name, + all_cols = all_cols, + autoincrement_cols = autoincrement_cols, + file_info = file_info + ) + } else { + private$commit_upsert( + table_name = table_name, + all_cols = all_cols, + key_cols = key_cols, + autoincrement_cols = autoincrement_cols, + file_info = file_info + ) + } }, #' @description @@ -542,6 +412,149 @@ parquet_duckdb <- R6::R6Class( } }, + ### Commit Methods ---- + #' param x Data frame to commit. If character, in-duckdb-memory table. 
+    #' param table_name Character string table name
+    #' param all_cols Character vector of column names present in new_data_v
+    #' param autoincrement_cols Character vector of column names to auto-increment
+    #' param file_info File path info list from private$get_file_path(table_name)
+    commit_overwrite = function(
+      table_name,
+      all_cols,
+      autoincrement_cols = character(0),
+      file_info
+    ) {
+      # Warn if overriding existing IDs
+      if (length(intersect(autoincrement_cols, all_cols)) > 0) {
+        warning(glue::glue(
+          "Overriding existing IDs ({toString(autoincrement_cols)}) with row numbers;\n",
+          "Assign these IDs manually and do not pass any autoincrement_cols to avoid this warning"
+        ))
+      }
+
+      # Build SELECT expression
+      ordinary_cols <- setdiff(all_cols, autoincrement_cols)
+      select_expr <- glue::glue_collapse(
+        c(glue::glue("row_number() over () as {autoincrement_cols}"), ordinary_cols),
+        sep = ",\n  "
+      )
+
+      self$execute(glue::glue(
+        r"{
+        copy (
+          select {select_expr}
+          from new_data_v
+        ) to '{file_info$path}' ({self$writeopts})
+        }"
+      ))
+    },
+
+    #' Append rows from new_data_v to an existing table file
+    #' param table_name Character string table name
+    #' param all_cols Character vector of column names present in new_data_v
+    #' param autoincrement_cols Character vector of column names to auto-increment
+    #' param file_info File path info list from private$get_file_path(table_name)
+    commit_append = function(
+      table_name,
+      all_cols,
+      autoincrement_cols = character(0),
+      file_info
+    ) {
+      ordinary_cols <- setdiff(all_cols, autoincrement_cols)
+      select_new <- glue::glue_collapse(
+        c(
+          glue::glue(
+            "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}"
+          ),
+          ordinary_cols
+        ),
+        sep = ",\n  "
+      )
+
+      # Concatenation using UNION ALL; "by name" handles missing columns
+      self$execute(glue::glue(
+        r"{
+        copy (
+          select * from {table_name}
+          union all by name
+          select {select_new}
+          from new_data_v
+        )
+        to '{file_info$path}' ({self$writeopts})
+        }"
+      ))
+    },
+
+    #' Update matching rows and insert new ones from new_data_v (upsert)
+    #' param table_name Character string table name
+    #' param key_cols Character vector of columns that define uniqueness. 
If missing, + #' use all columns starting with `id_` + #' param autoincrement_cols Character vector of column names to auto-increment + #' param map_cols Character vector of columns to convert to MAP format + #' return Invisible NULL (called for side effects) + commit_upsert = function( + table_name, + all_cols, + key_cols, + autoincrement_cols = character(0), + file_info + ) { + # Update existing data + ordinary_cols <- setdiff(all_cols, union(key_cols, autoincrement_cols)) + update_select_expr <- glue::glue_collapse( + glue::glue("{ordinary_cols} = new_data_v.{ordinary_cols}"), + sep = ",\n " + ) + update_join_condition <- glue::glue_collapse( + glue::glue("{table_name}.{key_cols} = new_data_v.{key_cols}"), + sep = "\nand " + ) + + self$execute(glue::glue( + r"{ + update {table_name} set + {update_select_expr} + from new_data_v + where + {update_join_condition}; + }" + )) + + # Insert new data + insert_select_expr <- glue::glue_collapse( + c( + glue::glue( + "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" + ), + glue::glue("new_data_v.{setdiff(all_cols, autoincrement_cols)}") + ), + sep = ",\n " + ) + null_condition <- glue::glue_collapse( + glue::glue("{table_name}.{key_cols} is null"), + sep = "\nand " + ) + + self$execute(glue::glue( + r"{ + insert into {table_name} + select + {insert_select_expr} + from + new_data_v + left join + {table_name} + on + {update_join_condition} + where + {null_condition} + ; + }" + )) + + self$execute(glue::glue("copy {table_name} to '{file_info$path}' ({self$writeopts})")) + }, + # Get file path and format for a table # # @param table_name Character string table name @@ -613,21 +626,14 @@ parquet_duckdb <- R6::R6Class( # @param map_cols Character vector indicating if MAP conversion was used # @return NULL (called for side effects) cleanup_new_data_v = function() { - duckdb::duckdb_unregister(self$connection, "new_data_v") - duckdb::duckdb_unregister(self$connection, "new_data_raw") - 
self$execute( - "drop table if exists new_data_v; - drop view if exists new_data_v" - ) + try(duckdb::duckdb_unregister(self$connection, "new_data_v"), silent = TRUE) + try(duckdb::duckdb_unregister(self$connection, "new_data_raw"), silent = TRUE) + try(self$execute("drop table if exists new_data_v"), silent = TRUE) + try(self$execute("drop view if exists new_data_v"), silent = TRUE) invisible(NULL) }, - # Get the names of the current new_data_v - new_data_names = function() { - self$get_query("select column_name from (describe new_data_v)")[[1]] - }, - # Set DuckDB variables max_{colname} for autoincrement columns # # @param table_name Character string table name diff --git a/inst/tinytest/test_parquet_duckdb.R b/inst/tinytest/test_parquet_duckdb.R index ad58bc8..5f1c3db 100644 --- a/inst/tinytest/test_parquet_duckdb.R +++ b/inst/tinytest/test_parquet_duckdb.R @@ -29,14 +29,18 @@ expect_equal(nrow(result), 0L) # Test 4: Row count for non-existent table expect_equal(db$row_count("nonexistent_table"), 0L) -# Test 5: commit_overwrite creates new table +# Test 5: commit overwrite creates new table test_data_1 <- data.table::data.table( id = 1:5, name = letters[1:5], value = c(10.1, 20.2, 30.3, 40.4, 50.5) ) expect_silent( - db$commit_overwrite(test_data_1, "test_table_1") + db$commit( + method = "overwrite", + test_data_1, + "test_table_1" + ) ) expect_true("test_table_1" %in% db$list_tables()) expect_equal(db$row_count("test_table_1"), 5L) @@ -45,60 +49,64 @@ expect_equal(db$row_count("test_table_1"), 5L) retrieved <- db$fetch("test_table_1") expect_equal(retrieved, test_data_1) -# Test 7: commit_overwrite replaces existing data +# Test 7: commit overwrite replaces existing data test_data_1b <- data.table::data.table( id = 10:12, name = letters[24:26], value = c(100.1, 200.2, 300.3) ) expect_silent( - db$commit_overwrite(test_data_1b, "test_table_1") + db$commit( + method = "overwrite", + test_data_1b, + "test_table_1" + ) ) expect_equal(db$row_count("test_table_1"), 
3L) retrieved <- db$fetch("test_table_1") expect_equal(retrieved, test_data_1b) -# Test 8: commit_append adds to existing data +# Test 8: commit append adds to existing data test_data_1c <- data.table::data.table( id = 13:15, name = letters[1:3], value = c(111.1, 222.2, 333.3) ) expect_silent( - db$commit_append(test_data_1c, "test_table_1") + db$commit(test_data_1c, "test_table_1", method = "append") ) expect_equal(db$row_count("test_table_1"), 6L) retrieved <- db$fetch("test_table_1") expect_equal(nrow(retrieved), 6L) expect_true(all(c(10:15) %in% retrieved$id)) -# Test 9: commit_append on non-existent table creates it +# Test 9: commit append on non-existent table creates it expect_silent( - db$commit_append(test_data_1, "test_table_2") + db$commit(test_data_1, "test_table_2", method = "append") ) expect_true("test_table_2" %in% db$list_tables()) expect_equal(db$row_count("test_table_2"), 5L) -# Test 10: commit_upsert on non-existent table creates it +# Test 10: commit w/ upsert on non-existent table creates it test_data_3 <- data.table::data.table( id_key = 1:3, name = c("a", "b", "c"), value = c(1.1, 2.2, 3.3) ) expect_silent( - db$commit_upsert(test_data_3, "test_table_3", key_cols = "id_key") + db$commit(test_data_3, "test_table_3", key_cols = "id_key", method = "upsert") ) expect_true("test_table_3" %in% db$list_tables()) expect_equal(db$row_count("test_table_3"), 3L) -# Test 11: commit_upsert updates existing rows and inserts new ones +# Test 11: commit w/ upsert updates existing rows and inserts new ones test_data_3b <- data.table::data.table( id_key = c(2L, 3L, 4L), name = c("b_updated", "c_updated", "d"), value = c(22.2, 33.3, 44.4) ) expect_silent( - db$commit_upsert(test_data_3b, "test_table_3", key_cols = "id_key") + db$commit(test_data_3b, "test_table_3", key_cols = "id_key", method = "upsert") ) expect_equal(db$row_count("test_table_3"), 4L) retrieved <- db$fetch("test_table_3") @@ -130,7 +138,11 @@ retrieved <- db$fetch("test_table_3") 
expect_false(1L %in% retrieved$id_key) # Test 16: delete_from with complex WHERE -db$commit_overwrite(test_data_1, "test_table_4") +db$commit( + method = "overwrite", + test_data_1, + "test_table_4" +) deleted_count <- db$delete_from("test_table_4", where = "id < 3") expect_equal(deleted_count, 2L) retrieved <- db$fetch("test_table_4") @@ -147,7 +159,11 @@ deleted_count <- db$delete_from("nonexistent", where = "id = 1") expect_equal(deleted_count, 0L) # Test 19: delete_from with WHERE that matches nothing -db$commit_overwrite(test_data_1, "test_table_5") +db$commit( + method = "overwrite", + test_data_1, + "test_table_5" +) initial_count <- db$row_count("test_table_5") deleted_count <- db$delete_from("test_table_5", where = "id = 999") expect_equal(deleted_count, 0L) @@ -158,7 +174,8 @@ test_autoinc_1 <- data.table::data.table( name = c("item_a", "item_b", "item_c"), value = c(10, 20, 30) ) -db$commit_overwrite( +db$commit( + method = "overwrite", test_autoinc_1, "test_autoinc_1", autoincrement_cols = "id" @@ -172,10 +189,11 @@ test_autoinc_2 <- data.table::data.table( name = c("item_d", "item_e"), value = c(40, 50) ) -db$commit_append( +db$commit( test_autoinc_2, "test_autoinc_1", - autoincrement_cols = "id" + autoincrement_cols = "id", + method = "append" ) result <- db$fetch("test_autoinc_1") expect_equal(nrow(result), 5L) @@ -187,10 +205,11 @@ test_autoinc_3 <- data.table::data.table( name = c("item_f", "item_g"), value = c(60, 70) ) -db$commit_upsert( +db$commit( test_autoinc_3, "test_autoinc_1", - autoincrement_cols = "id" + autoincrement_cols = "id", + method = "upsert" ) result <- db$fetch("test_autoinc_1") expect_equal(nrow(result), 7L) @@ -203,7 +222,8 @@ test_autoinc_with_ids <- data.table::data.table( value = c(1, 2, 3) ) expect_warning( - db$commit_overwrite( + db$commit( + method = "overwrite", test_autoinc_with_ids, "test_autoinc_2", autoincrement_cols = "id" @@ -218,7 +238,8 @@ test_multi_autoinc <- data.table::data.table( name = c("item1", 
"item2"), value = c(10, 20) ) -db$commit_overwrite( +db$commit( + method = "overwrite", test_multi_autoinc, "test_multi_autoinc", autoincrement_cols = c("id_a", "id_b") @@ -232,15 +253,20 @@ test_continue_a <- data.table::data.table( id_seq = c(5L, 10L, 15L), value = c(100, 200, 300) ) -db$commit_overwrite(test_continue_a, "test_continue") +db$commit( + method = "overwrite", + test_continue_a, + "test_continue" +) test_continue_b <- data.table::data.table( value = c(400, 500) ) -db$commit_append( +db$commit( test_continue_b, "test_continue", - autoincrement_cols = "id_seq" + autoincrement_cols = "id_seq", + method = "append" ) result <- db$fetch("test_continue") expect_equal(nrow(result), 5L) @@ -248,7 +274,11 @@ expect_equal(result$id_seq[4:5], c(16L, 17L)) expect_equal(result$value[4:5], c(400, 500)) # Test 26: attach_table and detach_table -db$commit_overwrite(test_data_1, "test_attach") +db$commit( + method = "overwrite", + test_data_1, + "test_attach" +) expect_silent(db$attach_table("test_attach")) # Verify table is attached by querying it directly result <- db$get_query("SELECT COUNT(*) as n FROM test_attach") @@ -302,7 +332,7 @@ test_csv_data <- data.table::data.table( id = 1:3, name = c("a", "b", "c") ) -db_csv$commit_overwrite(test_csv_data, "csv_table") +db_csv$commit(test_csv_data, "csv_table", method = "overwrite") expect_true("csv_table" %in% db_csv$list_tables()) retrieved <- db_csv$fetch("csv_table") expect_equal(retrieved, test_csv_data) @@ -321,7 +351,11 @@ expect_silent( ) # Test 33: Persistence across connections -db$commit_overwrite(test_data_1, "persist_test") +db$commit( + method = "overwrite", + test_data_1, + "persist_test" +) rm(db) gc() @@ -331,15 +365,24 @@ expect_true("persist_test" %in% db$list_tables()) retrieved <- db$fetch("persist_test") expect_equal(retrieved, test_data_1) -# Test 34: commit_upsert with no key_cols defaults to append +# Test 34: commit with no key_cols defaults to append test_no_keys <- data.table::data.table( 
name = c("x", "y"), value = c(1, 2) ) -db$commit_overwrite(test_no_keys, "no_keys_test") +db$commit( + method = "overwrite", + test_no_keys, + "no_keys_test" +) expect_equal(db$row_count("no_keys_test"), 2L) -db$commit_upsert(test_no_keys, "no_keys_test", key_cols = character(0)) +db$commit( + test_no_keys, + "no_keys_test", + key_cols = character(0), + method = "upsert" +) expect_equal(db$row_count("no_keys_test"), 4L) # Should append # Test 35: Print method diff --git a/man/parquet_duckdb.Rd b/man/parquet_duckdb.Rd index 964cd90..cb41afa 100644 --- a/man/parquet_duckdb.Rd +++ b/man/parquet_duckdb.Rd @@ -35,9 +35,7 @@ domain-specific database classes. \item \href{#method-parquet_duckdb-with_tables}{\code{parquet_duckdb$with_tables()}} \item \href{#method-parquet_duckdb-fetch}{\code{parquet_duckdb$fetch()}} \item \href{#method-parquet_duckdb-delete_from}{\code{parquet_duckdb$delete_from()}} -\item \href{#method-parquet_duckdb-commit_overwrite}{\code{parquet_duckdb$commit_overwrite()}} -\item \href{#method-parquet_duckdb-commit_append}{\code{parquet_duckdb$commit_append()}} -\item \href{#method-parquet_duckdb-commit_upsert}{\code{parquet_duckdb$commit_upsert()}} +\item \href{#method-parquet_duckdb-commit}{\code{parquet_duckdb$commit()}} \item \href{#method-parquet_duckdb-print}{\code{parquet_duckdb$print()}} \item \href{#method-parquet_duckdb-clone}{\code{parquet_duckdb$clone()}} } @@ -260,94 +258,38 @@ Number of rows deleted } } \if{html}{\out{
    }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-parquet_duckdb-commit_overwrite}{}}} -\subsection{Method \code{commit_overwrite()}}{ -Commit data in overwrite mode +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-commit}{}}} +\subsection{Method \code{commit()}}{ +Commit data using overwrite, append, or upsert modes. Handles autoincrement, key +identity columns, and list-to-MAP conversion. \subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{parquet_duckdb$commit_overwrite( +\if{html}{\out{
    }}\preformatted{parquet_duckdb$commit( x, table_name, + key_cols, autoincrement_cols = character(0), - map_cols = character(0) + map_cols = character(0), + method = c("overwrite", "append", "upsert") )}\if{html}{\out{
    }} } \subsection{Arguments}{ \if{html}{\out{
    }} \describe{ -\item{\code{x}}{Data frame to commit} +\item{\code{x}}{Data frame to commit. If character, in-duckdb-memory table.} \item{\code{table_name}}{Character string table name} -\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} - -\item{\code{map_cols}}{Character vector of columns to convert to MAP format} -} -\if{html}{\out{
    }} -} -\subsection{Returns}{ -Invisible NULL (called for side effects) -} -} -\if{html}{\out{
    }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-parquet_duckdb-commit_append}{}}} -\subsection{Method \code{commit_append()}}{ -Commit data in append mode -\subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{parquet_duckdb$commit_append( - x, - table_name, - autoincrement_cols = character(0), - map_cols = character(0) -)}\if{html}{\out{
    }} -} - -\subsection{Arguments}{ -\if{html}{\out{
    }} -\describe{ -\item{\code{x}}{Data frame to commit} - -\item{\code{table_name}}{Character string table name} +\item{\code{key_cols}}{Character vector of columns that define uniqueness. If missing, +use all columns starting with \code{id_}} \item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} \item{\code{map_cols}}{Character vector of columns to convert to MAP format} -} -\if{html}{\out{
    }} -} -\subsection{Returns}{ -Invisible NULL (called for side effects) -} -} -\if{html}{\out{
    }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-parquet_duckdb-commit_upsert}{}}} -\subsection{Method \code{commit_upsert()}}{ -Commit data in upsert mode (update existing, insert new) -\subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{parquet_duckdb$commit_upsert( - x, - table_name, - key_cols = grep("^id_", names(x), value = TRUE), - autoincrement_cols = character(0), - map_cols = character(0) -)}\if{html}{\out{
    }} -} - -\subsection{Arguments}{ -\if{html}{\out{
    }} -\describe{ -\item{\code{x}}{Data frame to commit} -\item{\code{table_name}}{Character string table name} - -\item{\code{key_cols}}{Character vector of columns that define uniqueness} - -\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} - -\item{\code{map_cols}}{Character vector of columns to convert to MAP format} +\item{\code{method}}{Character, one of "overwrite", "append", "upsert" (upsert being an +update for existing rows, and insert for new rows)} } \if{html}{\out{
    }} } @@ -373,6 +315,23 @@ Print method for parquet_duckdb } \subsection{Returns}{ self (invisibly) +param x Data frame to commit. If character, in-duckdb-memory table. +param table_name Character string table name +param autoincrement_cols Character vector of column names to auto-increment +param map_cols Character vector of columns to convert to MAP format +return Invisible NULL (called for side effects) +param x Data frame to commit. If character, in-duckdb-memory table. +param table_name Character string table name +param autoincrement_cols Character vector of column names to auto-increment +param map_cols Character vector of columns to convert to MAP format +return Invisible NULL (called for side effects) +param x Data frame to commit. If character, in-duckdb-memory table. +param table_name Character string table name +param key_cols Character vector of columns that define uniqueness. If missing, +use all columns starting with \code{id_} +param autoincrement_cols Character vector of column names to auto-increment +param map_cols Character vector of columns to convert to MAP format +return Invisible NULL (called for side effects) } } \if{html}{\out{
    }} diff --git a/vignettes/evoland.qmd b/vignettes/evoland.qmd index f06bef8..37da973 100644 --- a/vignettes/evoland.qmd +++ b/vignettes/evoland.qmd @@ -108,7 +108,7 @@ db$commit( value = c(lulc_files$url, lulc_files$md5sum, "BFS Arealstatistik") ), table_name = "reporting_t", - mode = "append" + method = "append" ) zippath <- file.path( @@ -322,7 +322,7 @@ id_coord_keep <- lulc_data_t[, id_coord] db$commit( x = db$coords_t[id_coord %in% id_coord_keep], table_name = "coords_t", - mode = "overwrite" + method = "overwrite" ) ``` From 8525327fd55c50c595c79d90b2320bfdf6e7b2e9 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Mon, 1 Dec 2025 16:18:13 +0100 Subject: [PATCH 22/27] enable filling trans_pred_data_v with 0s for NAs / simpler passing of id preds --- R/covariance_filter.R | 17 ++++++----------- R/evoland_db_views.R | 12 +++++++++++- R/trans_preds_t.R | 16 ++++++++-------- man/covariance_filter.Rd | 3 +-- man/evoland_db.Rd | 6 ++---- man/trans_preds_t.Rd | 10 +++++++++- 6 files changed, 37 insertions(+), 27 deletions(-) diff --git a/R/covariance_filter.R b/R/covariance_filter.R index fb06530..e6a813f 100644 --- a/R/covariance_filter.R +++ b/R/covariance_filter.R @@ -23,8 +23,7 @@ #' coefficients above this threshold will be filtered out. Default is 0 (no filtering). #' @param ... Additional arguments passed to rank_fun. #' -#' @return A filtered data.table containing only the selected covariates after ranking -#' by the specified method and filtering based on correlation threshold. 
+#' @return A set of column names (covariates) to retain #' #' @details #' The function first ranks covariates using the provided ranking function (default: @@ -70,22 +69,18 @@ covariance_filter <- function( ) # Sort by scores (lower = better/more significant) - ranked_order <- order(scores) - data_ranked <- data[, ..ranked_order] + ranked_order <- names(sort(scores)) - # If no correlation filtering needed, return ranked data + # If no correlation filtering needed, return ranked predictors if (corcut == 1) { - return(data_ranked) + return(ranked_order) } # Compute correlation matrix once - cor_mat <- abs(cor(data_ranked, use = "pairwise.complete.obs")) + cor_mat <- abs(cor(data[, ..ranked_order], use = "pairwise.complete.obs")) # Iteratively select covariates based on correlation threshold - selected <- select_by_correlation(cor_mat, corcut) - - # Return selected covariates - data_ranked[, ..selected, drop = FALSE] + select_by_correlation(cor_mat, corcut) } diff --git a/R/evoland_db_views.R b/R/evoland_db_views.R index 42429a3..3acc060 100644 --- a/R/evoland_db_views.R +++ b/R/evoland_db_views.R @@ -101,7 +101,10 @@ evoland_db$set("active", "coords_minimal", function() { }) }) -evoland_db$set("public", "trans_pred_data_v", function(id_trans) { +# get transitions along with their predictor data in a wide data.table +# id_trans - integer transition ID +# na_value - if not NA, replace all NULL/NA predictor values with this value +evoland_db$set("public", "trans_pred_data_v", function(id_trans, na_value = NA) { stopifnot( "id_trans must be a single integer" = length(id_trans) == 1L && is.numeric(id_trans) ) @@ -251,6 +254,13 @@ evoland_db$set("public", "trans_pred_data_v", function(id_trans) { } data.table::setnames(result, old_names, new_names) + if (!is.na(na_value)) { + pred_cols <- setdiff(names(result), "result") + for (col in pred_cols) { + data.table::set(result, i = which(is.na(result[[col]])), j = col, value = na_value) + } + } + result } ) diff --git 
a/R/trans_preds_t.R b/R/trans_preds_t.R index 4e2dd87..091e9da 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -34,12 +34,14 @@ as_trans_preds_t <- function(x) { #' @param db An [evoland_db] instance with populated tables #' @param corcut Numeric threshold (0-1) for correlation filtering passed to [covariance_filter()] #' @param rank_fun Optional ranking function passed to [covariance_filter()] +#' @param na_value Passed to db$trans_pred_data_v - if not NA, replace all NA predictor values with this value #' @param ... Additional arguments passed to rank_fun via [covariance_filter()] #' @export create_trans_preds_t <- function( db, corcut = 0.7, rank_fun = rank_poly_glm, + na_value = NA, ... ) { stopifnot( @@ -68,7 +70,7 @@ create_trans_preds_t <- function( # Get wide transition-predictor data tryCatch( { - trans_pred_data <- db$trans_pred_data_v(id_trans) + trans_pred_data <- db$trans_pred_data_v(id_trans, na_value) # Check if we have any data if (nrow(trans_pred_data) == 0L) { @@ -87,7 +89,8 @@ create_trans_preds_t <- function( next } - filtered_data <- covariance_filter( + # Return ranked + filtered predictor names as id_pred_{n} + filtered_preds <- covariance_filter( data = trans_pred_data, result_col = "result", rank_fun = rank_fun, @@ -95,15 +98,12 @@ create_trans_preds_t <- function( ... 
) - # Extract selected predictor IDs from column names - selected_cols <- setdiff(names(filtered_data), "result") - - if (length(selected_cols) > 0L) { + if (length(filtered_preds) > 0L) { # Parse id_pred values from column names (e.g., "id_pred_1" -> 1) - selected_ids <- as.integer(sub("^id_pred_", "", selected_cols)) + selected_ids <- as.integer(sub("^id_pred_", "", filtered_preds)) # Create result rows - results_list[[length(results_list) + 1]] <- data.table::data.table( + results_list[[id_trans]] <- data.table::data.table( id_pred = selected_ids, id_trans = id_trans ) diff --git a/man/covariance_filter.Rd b/man/covariance_filter.Rd index ffa87fc..0693652 100644 --- a/man/covariance_filter.Rd +++ b/man/covariance_filter.Rd @@ -50,8 +50,7 @@ Should take arguments (x, y, weights, ...) and return a single numeric value \item{cor_mat}{Absolute correlation matrix} } \value{ -A filtered data.table containing only the selected covariates after ranking -by the specified method and filtering based on correlation threshold. +A set of column names (covariates) to retain } \description{ The \code{covariance_filter} returns a set of covariates for land use land cover change diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index 676de84..bee1130 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -41,9 +41,7 @@ Additional methods and active bindings are added to this class in separate files
    Inherited methods
    • evoland::parquet_duckdb$attach_table()
    • -
    • evoland::parquet_duckdb$commit_append()
    • -
    • evoland::parquet_duckdb$commit_overwrite()
    • -
    • evoland::parquet_duckdb$commit_upsert()
    • +
    • evoland::parquet_duckdb$commit()
    • evoland::parquet_duckdb$delete_from()
    • evoland::parquet_duckdb$detach_table()
    • evoland::parquet_duckdb$execute()
    • @@ -83,7 +81,7 @@ Additional methods and active bindings are added to this class in separate files \if{latex}{\out{\hypertarget{method-evoland_db-trans_pred_data_v}{}}} \subsection{Method \code{trans_pred_data_v()}}{ \subsection{Usage}{ -\if{html}{\out{
      }}\preformatted{evoland_db$trans_pred_data_v(id_trans)}\if{html}{\out{
      }} +\if{html}{\out{
      }}\preformatted{evoland_db$trans_pred_data_v(id_trans, na_value = NA)}\if{html}{\out{
      }} } } diff --git a/man/trans_preds_t.Rd b/man/trans_preds_t.Rd index e2cbebf..e999d9f 100644 --- a/man/trans_preds_t.Rd +++ b/man/trans_preds_t.Rd @@ -9,7 +9,13 @@ \usage{ as_trans_preds_t(x) -create_trans_preds_t(db, corcut = 0.7, rank_fun = rank_poly_glm, ...) +create_trans_preds_t( + db, + corcut = 0.7, + rank_fun = rank_poly_glm, + na_value = NA, + ... +) \method{print}{trans_preds_t}(x, nrow = 10, ...) } @@ -20,6 +26,8 @@ create_trans_preds_t(db, corcut = 0.7, rank_fun = rank_poly_glm, ...) \item{rank_fun}{Optional ranking function passed to \code{\link[=covariance_filter]{covariance_filter()}}} +\item{na_value}{Passed to db$trans_pred_data_v - if not NA, replace all NA predictor values with this value} + \item{...}{passed to \link[data.table:print.data.table]{data.table::print.data.table}} \item{nrow}{see \link[data.table:print.data.table]{data.table::print.data.table}} From be909cc16f4d39c21d08a8ab76fc12ceff5b0d27 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Mon, 1 Dec 2025 16:56:10 +0100 Subject: [PATCH 23/27] trans_pred_data_v: enable filtering on id_pred --- R/evoland_db_views.R | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/R/evoland_db_views.R b/R/evoland_db_views.R index 3acc060..06325cb 100644 --- a/R/evoland_db_views.R +++ b/R/evoland_db_views.R @@ -103,17 +103,19 @@ evoland_db$set("active", "coords_minimal", function() { # get transitions along with their predictor data in a wide data.table # id_trans - integer transition ID +# id_pred - optional integer vector of predictor IDs to include (NULL = all predictors) # na_value - if not NA, replace all NULL/NA predictor values with this value -evoland_db$set("public", "trans_pred_data_v", function(id_trans, na_value = NA) { +evoland_db$set("public", "trans_pred_data_v", function(id_trans, id_pred = NULL, na_value = NA) { stopifnot( - "id_trans must be a single integer" = length(id_trans) == 1L && 
is.numeric(id_trans) + "id_trans must be a single integer" = length(id_trans) == 1L && is.numeric(id_trans), + "id_pred must be NULL or a numeric vector" = is.null(id_pred) || is.numeric(id_pred) ) all_tables <- self$list_tables() pred_tables <- c("pred_data_t_float", "pred_data_t_int", "pred_data_t_bool") existing_pred_tables <- intersect(pred_tables, all_tables) - tables_to_attach <- c("trans_meta_t", "lulc_data_t", "pred_meta_t", existing_pred_tables) + tables_to_attach <- c("trans_meta_t", "lulc_data_t", existing_pred_tables) self$with_tables( tables_to_attach, @@ -151,17 +153,26 @@ evoland_db$set("public", "trans_pred_data_v", function(id_trans, na_value = NA) )" ) + pred_filter <- "" + if (!is.null(id_pred)) { + pred_filter <- glue::glue(" AND id_pred IN ({toString(id_pred)})") + } + if ("pred_data_t_float" %in% existing_pred_tables) { - ctes$pred_float_combined <- "pred_float_combined AS ( + ctes$pred_float_combined <- glue::glue( + "pred_float_combined AS ( SELECT id_coord, id_period, id_pred, value FROM pred_data_t_float WHERE id_period >= 1 + {pred_filter} UNION ALL SELECT p0.id_coord, periods.id_period, p0.id_pred, p0.value FROM pred_data_t_float AS p0 CROSS JOIN (SELECT DISTINCT id_period FROM trans_result WHERE id_period >= 1) AS periods WHERE p0.id_period = 0 + {pred_filter} )" + ) ctes$pred_float_wide <- "pred_float_wide AS ( PIVOT pred_float_combined ON id_pred USING FIRST(value) GROUP BY id_coord, id_period @@ -169,16 +180,20 @@ evoland_db$set("public", "trans_pred_data_v", function(id_trans, na_value = NA) } if ("pred_data_t_int" %in% existing_pred_tables) { - ctes$pred_int_combined <- "pred_int_combined AS ( + ctes$pred_int_combined <- glue::glue( + "pred_int_combined AS ( SELECT id_coord, id_period, id_pred, value FROM pred_data_t_int WHERE id_period >= 1 + {pred_filter} UNION ALL SELECT p0.id_coord, periods.id_period, p0.id_pred, p0.value FROM pred_data_t_int AS p0 CROSS JOIN (SELECT DISTINCT id_period FROM trans_result WHERE id_period >= 1) AS 
periods WHERE p0.id_period = 0 + {pred_filter} )" + ) ctes$pred_int_wide <- "pred_int_wide AS ( PIVOT pred_int_combined ON id_pred USING FIRST(value) GROUP BY id_coord, id_period @@ -186,16 +201,20 @@ evoland_db$set("public", "trans_pred_data_v", function(id_trans, na_value = NA) } if ("pred_data_t_bool" %in% existing_pred_tables) { - ctes$pred_bool_combined <- "pred_bool_combined AS ( + ctes$pred_bool_combined <- glue::glue( + "pred_bool_combined AS ( SELECT id_coord, id_period, id_pred, value FROM pred_data_t_bool WHERE id_period >= 1 + {pred_filter} UNION ALL SELECT p0.id_coord, periods.id_period, p0.id_pred, p0.value FROM pred_data_t_bool AS p0 CROSS JOIN (SELECT DISTINCT id_period FROM trans_result WHERE id_period >= 1) AS periods WHERE p0.id_period = 0 + {pred_filter} )" + ) ctes$pred_bool_wide <- "pred_bool_wide AS ( PIVOT pred_bool_combined ON id_pred USING FIRST(value) GROUP BY id_coord, id_period @@ -217,23 +236,23 @@ evoland_db$set("public", "trans_pred_data_v", function(id_trans, na_value = NA) if ("pred_data_t_float" %in% existing_pred_tables) { joins <- paste0( joins, - "\n LEFT JOIN pred_float_wide AS pf ON tr.id_coord = pf.id_coord AND tr.id_period = pf.id_period" + "\n LEFT JOIN pred_float_wide AS pf ON tr.id_coord = pf.id_coord AND tr.id_period = pf.id_period" ) } if ("pred_data_t_int" %in% existing_pred_tables) { joins <- paste0( joins, - "\n LEFT JOIN pred_int_wide AS pi ON tr.id_coord = pi.id_coord AND tr.id_period = pi.id_period" + "\n LEFT JOIN pred_int_wide AS pi ON tr.id_coord = pi.id_coord AND tr.id_period = pi.id_period" ) } if ("pred_data_t_bool" %in% existing_pred_tables) { joins <- paste0( joins, - "\n LEFT JOIN pred_bool_wide AS pb ON tr.id_coord = pb.id_coord AND tr.id_period = pb.id_period" + "\n LEFT JOIN pred_bool_wide AS pb ON tr.id_coord = pb.id_coord AND tr.id_period = pb.id_period" ) } - cte_string <- paste(unlist(ctes), collapse = ",\n\n ") + cte_string <- paste(unlist(ctes), collapse = ",\n\n ") query <- glue::glue( "WITH 
{cte_string} From 1e8d2ec6fc655f6977105907a27fb893d80d2727 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Mon, 1 Dec 2025 17:45:51 +0100 Subject: [PATCH 24/27] make the predictor selection a pruning of a potentially existing trans_preds_t --- NAMESPACE | 1 - R/trans_preds_t.R | 217 ++++++++++++++++++++++++------------------- man/evoland_db.Rd | 27 +++++- man/trans_preds_t.Rd | 26 +----- 4 files changed, 149 insertions(+), 122 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 35a656a..3c5e473 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -55,7 +55,6 @@ export(create_neighbors_t) export(create_periods_t) export(create_pred_meta_t) export(create_trans_meta_t) -export(create_trans_preds_t) export(download_and_verify) export(evoland_db) export(extract_using_coords_t) diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index 091e9da..812bf7c 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -5,6 +5,7 @@ #' modelling each transition type. #' #' @name trans_preds_t +#' @include evoland_db.R #' #' @param db An [evoland_db] instance with populated trans_meta_t and pred_meta_t tables #' @@ -28,113 +29,137 @@ as_trans_preds_t <- function(x) { ) } -#' @describeIn trans_preds_t Create a transition-predictor relation, i.e. records the +# set an initial full set of transition / predictor relations +evoland_db$set( + "public", + "set_full_trans_preds", + function(overwrite = FALSE) { + if (self$row_count("trans_preds_t") > 0 && !overwrite) { + stop("Set overwrite to TRUE to overwrite existing trans_preds_t") + } + p <- self$pred_meta_t + t <- self$trans_meta_t[is_viable == TRUE] + + full <- expand.grid(id_pred = p[["id_pred"]], id_trans = t[["id_trans"]]) + self$trans_preds_t <- as_trans_preds_t(full) + } +) + +#' describeIn trans_preds_t Create a transition-predictor relation, i.e. records the #' result of a predictor selection step. Runs covariance filtering for each viable #' transition and stores the selected predictors. 
-#' @param db An [evoland_db] instance with populated tables -#' @param corcut Numeric threshold (0-1) for correlation filtering passed to [covariance_filter()] -#' @param rank_fun Optional ranking function passed to [covariance_filter()] -#' @param na_value Passed to db$trans_pred_data_v - if not NA, replace all NA predictor values with this value -#' @param ... Additional arguments passed to rank_fun via [covariance_filter()] -#' @export -create_trans_preds_t <- function( - db, - corcut = 0.7, - rank_fun = rank_poly_glm, - na_value = NA, - ... -) { - stopifnot( - "db must be an evoland_db instance" = inherits(db, "evoland_db") - ) - - viable_trans <- db$trans_meta_t[is_viable == TRUE] - pred_meta <- db$pred_meta_t - stopifnot( - "No viable transitions found in trans_meta_t" = nrow(viable_trans) > 0L - ) - - results_list <- list() - - # Iterate over transitions (anterior/posterior pairs) - for (i in seq_len(nrow(viable_trans))) { - id_trans <- viable_trans$id_trans[i] - id_lulc_ant <- viable_trans$id_lulc_anterior[i] - id_lulc_post <- viable_trans$id_lulc_posterior[i] - - message(glue::glue( - "Processing transition {i}/{nrow(viable_trans)}: ", - "id_trans={id_trans} ({id_lulc_ant} -> {id_lulc_post})" - )) - - # Get wide transition-predictor data - tryCatch( - { - trans_pred_data <- db$trans_pred_data_v(id_trans, na_value) +#' param corcut Numeric threshold (0-1) for correlation filtering passed to [covariance_filter()] +#' param rank_fun Optional ranking function passed to [covariance_filter()] +#' param na_value Passed to db$trans_pred_data_v - if not NA, replace all NA predictor values with this value +#' param ... Additional arguments passed to rank_fun via [covariance_filter()] +evoland_db$set( + "public", + "prune_trans_preds_two_stage_covar", + function( + corcut = 0.7, + rank_fun = rank_poly_glm, + na_value = NA, + ... 
+ ) { + viable_trans <- self$trans_meta_t[is_viable == TRUE] + pred_meta <- self$pred_meta_t + stopifnot( + "No viable transitions found in trans_meta_t" = nrow(viable_trans) > 0L + ) + if (self$row_count("trans_preds_t") == 0) { + self$set_full_trans_preds() + } + trans_preds_pre <- self$trans_preds_t + + results_list <- list() + + # Iterate over transitions (anterior/posterior pairs) + for (i in seq_len(nrow(viable_trans))) { + id_trans <- viable_trans$id_trans[i] + id_lulc_ant <- viable_trans$id_lulc_anterior[i] + id_lulc_post <- viable_trans$id_lulc_posterior[i] + id_preds <- trans_preds_pre$id_pred[ + trans_preds_pre$id_trans == id_trans + ] + + message(glue::glue( + "Processing transition {i}/{nrow(viable_trans)}: ", + "id_trans={id_trans} ({id_lulc_ant} -> {id_lulc_post})" + )) + + if (length(id_preds) == 0L) { + next + } - # Check if we have any data - if (nrow(trans_pred_data) == 0L) { - warning(glue::glue( - "No data for transition {id_trans}, skipping" - )) - next - } + # Get wide transition-predictor data + tryCatch( + { + trans_pred_data <- self$trans_pred_data_v(id_trans, id_preds, na_value) + + # Check if we have any data + if (nrow(trans_pred_data) == 0L) { + warning(glue::glue( + "No data for transition {id_trans}, skipping" + )) + next + } + + # Check if we have any predictor columns + pred_cols <- grep("^id_pred_", names(trans_pred_data), value = TRUE) + if (length(pred_cols) == 0L) { + warning(glue::glue( + "No predictor columns for transition {id_trans}, skipping" + )) + next + } + + # Return ranked + filtered predictor names as id_pred_{n} + filtered_preds <- covariance_filter( + data = trans_pred_data, + result_col = "result", + rank_fun = rank_fun, + corcut = corcut, + ... 
+ ) - # Check if we have any predictor columns - pred_cols <- grep("^id_pred_", names(trans_pred_data), value = TRUE) - if (length(pred_cols) == 0L) { + if (length(filtered_preds) > 0L) { + # Parse id_pred values from column names (e.g., "id_pred_1" -> 1) + selected_ids <- as.integer(sub("^id_pred_", "", filtered_preds)) + + # Create result rows + results_list[[id_trans]] <- data.table::data.table( + id_pred = selected_ids, + id_trans = id_trans + ) + + message(glue::glue( + " Selected {length(selected_ids)} predictor(s) for transition {id_trans}" + )) + } else { + message(glue::glue( + " No predictors selected for transition {id_trans}" + )) + } + }, + error = function(e) { warning(glue::glue( - "No predictor columns for transition {id_trans}, skipping" + "Error processing transition {id_trans}: {e$message}" )) - next } + ) + } - # Return ranked + filtered predictor names as id_pred_{n} - filtered_preds <- covariance_filter( - data = trans_pred_data, - result_col = "result", - rank_fun = rank_fun, - corcut = corcut, - ... 
- ) - - if (length(filtered_preds) > 0L) { - # Parse id_pred values from column names (e.g., "id_pred_1" -> 1) - selected_ids <- as.integer(sub("^id_pred_", "", filtered_preds)) - - # Create result rows - results_list[[id_trans]] <- data.table::data.table( - id_pred = selected_ids, - id_trans = id_trans - ) + # Combine all results + if (length(results_list) == 0L) { + warning("No predictors selected for any transition") + return(invisible(NULL)) + } - message(glue::glue( - " Selected {length(selected_ids)} predictor(s) for transition {id_trans}" - )) - } else { - message(glue::glue( - " No predictors selected for transition {id_trans}" - )) - } - }, - error = function(e) { - warning(glue::glue( - "Error processing transition {id_trans}: {e$message}" - )) - } - ) - } + result <- data.table::rbindlist(results_list) - # Combine all results - if (length(results_list) == 0L) { - warning("No predictors selected for any transition") - return(as_trans_preds_t()) + self$commit(as_trans_preds_t(result), "trans_preds_t", method = "overwrite") } - - result <- data.table::rbindlist(results_list) - - as_trans_preds_t(result) -} +) #' @export validate.trans_preds_t <- function(x, ...) 
{ diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index bee1130..531b09e 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -34,6 +34,8 @@ Additional methods and active bindings are added to this class in separate files \item \href{#method-evoland_db-set_coords}{\code{evoland_db$set_coords()}} \item \href{#method-evoland_db-set_periods}{\code{evoland_db$set_periods()}} \item \href{#method-evoland_db-add_predictor}{\code{evoland_db$add_predictor()}} +\item \href{#method-evoland_db-set_full_trans_preds}{\code{evoland_db$set_full_trans_preds()}} +\item \href{#method-evoland_db-prune_trans_preds_two_stage_covar}{\code{evoland_db$prune_trans_preds_two_stage_covar()}} \item \href{#method-evoland_db-clone}{\code{evoland_db$clone()}} } } @@ -81,7 +83,7 @@ Additional methods and active bindings are added to this class in separate files \if{latex}{\out{\hypertarget{method-evoland_db-trans_pred_data_v}{}}} \subsection{Method \code{trans_pred_data_v()}}{ \subsection{Usage}{ -\if{html}{\out{
      }}\preformatted{evoland_db$trans_pred_data_v(id_trans, na_value = NA)}\if{html}{\out{
      }} +\if{html}{\out{
      }}\preformatted{evoland_db$trans_pred_data_v(id_trans, id_pred = NULL, na_value = NA)}\if{html}{\out{
      }} } } @@ -221,6 +223,29 @@ Add a predictor to the database } \if{html}{\out{
    }} } +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-set_full_trans_preds}{}}} +\subsection{Method \code{set_full_trans_preds()}}{ +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{evoland_db$set_full_trans_preds(overwrite = FALSE)}\if{html}{\out{
    }} +} + +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-prune_trans_preds_two_stage_covar}{}}} +\subsection{Method \code{prune_trans_preds_two_stage_covar()}}{ +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{evoland_db$prune_trans_preds_two_stage_covar( + corcut = 0.7, + rank_fun = rank_poly_glm, + na_value = NA, + ... +)}\if{html}{\out{
    }} +} + } \if{html}{\out{
    }} \if{html}{\out{}} diff --git a/man/trans_preds_t.Rd b/man/trans_preds_t.Rd index e999d9f..202028e 100644 --- a/man/trans_preds_t.Rd +++ b/man/trans_preds_t.Rd @@ -3,34 +3,19 @@ \name{trans_preds_t} \alias{trans_preds_t} \alias{as_trans_preds_t} -\alias{create_trans_preds_t} \alias{print.trans_preds_t} \title{Create Transition-Predictor Relationship Table} \usage{ as_trans_preds_t(x) -create_trans_preds_t( - db, - corcut = 0.7, - rank_fun = rank_poly_glm, - na_value = NA, - ... -) - \method{print}{trans_preds_t}(x, nrow = 10, ...) } \arguments{ -\item{db}{An \link{evoland_db} instance with populated tables} - -\item{corcut}{Numeric threshold (0-1) for correlation filtering passed to \code{\link[=covariance_filter]{covariance_filter()}}} - -\item{rank_fun}{Optional ranking function passed to \code{\link[=covariance_filter]{covariance_filter()}}} - -\item{na_value}{Passed to db$trans_pred_data_v - if not NA, replace all NA predictor values with this value} +\item{nrow}{see \link[data.table:print.data.table]{data.table::print.data.table}} \item{...}{passed to \link[data.table:print.data.table]{data.table::print.data.table}} -\item{nrow}{see \link[data.table:print.data.table]{data.table::print.data.table}} +\item{db}{An \link{evoland_db} instance with populated trans_meta_t and pred_meta_t tables} } \value{ A data.table of class "trans_preds_t" with columns: @@ -49,10 +34,3 @@ modelling each transition type. \item \code{print(trans_preds_t)}: Print a trans_preds_t object, passing params to data.table print }} -\section{Functions}{ -\itemize{ -\item \code{create_trans_preds_t()}: Create a transition-predictor relation, i.e. records the -result of a predictor selection step. Runs covariance filtering for each viable -transition and stores the selected predictors. 
- -}} From bd16b400327309b897f8c7ee93fd2b230ce5e206 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Tue, 2 Dec 2025 10:54:02 +0100 Subject: [PATCH 25/27] shut up distance progress --- R/neighbors_t.R | 6 ++++-- inst/tinytest/test_util_terra.R | 11 +++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/R/neighbors_t.R b/R/neighbors_t.R index f4d09fd..4a50301 100644 --- a/R/neighbors_t.R +++ b/R/neighbors_t.R @@ -53,7 +53,8 @@ create_neighbors_t <- function( coords_t, max_distance, distance_breaks = NULL, - resolution = 100.0 + resolution = 100.0, + quiet = FALSE ) { # Validate inputs if (!inherits(coords_t, "coords_t")) { @@ -74,7 +75,8 @@ create_neighbors_t <- function( dt <- distance_neighbors_cpp( coords_t = coords_t, max_distance = max_distance, - resolution = resolution + resolution = resolution, + quiet = quiet ) data.table::setalloccol(dt) diff --git a/inst/tinytest/test_util_terra.R b/inst/tinytest/test_util_terra.R index 82f998a..35a0bab 100644 --- a/inst/tinytest/test_util_terra.R +++ b/inst/tinytest/test_util_terra.R @@ -300,10 +300,13 @@ for (i in seq_len( } # Test with distance classification on real data -real_neighbors_class <- create_neighbors_t( - coords_t, - max_distance = 300, - distance_breaks = c(0, 150, 300) +expect_silent( + real_neighbors_class <- create_neighbors_t( + coords_t, + max_distance = 300, + distance_breaks = c(0, 150, 300), + quiet = TRUE + ) ) expect_true(all(!is.na(real_neighbors_class$distance_class))) From d7e63034081be8cbe101261e6131d105850c373e Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Tue, 2 Dec 2025 10:54:46 +0100 Subject: [PATCH 26/27] add grrf filter --- DESCRIPTION | 8 +- NAMESPACE | 1 + R/covariance_filter.R | 4 +- R/grrf_filter.r | 136 +++++++++++++++++++++++++++++ R/trans_preds_t.R | 13 ++- inst/tinytest/test_trans_preds_t.R | 40 ++++----- man/evoland_db.Rd | 13 ++- man/grrf_filter.Rd | 70 +++++++++++++++ 
man/neighbors_t.Rd | 3 +- 9 files changed, 245 insertions(+), 43 deletions(-) create mode 100644 R/grrf_filter.r create mode 100644 man/grrf_filter.Rd diff --git a/DESCRIPTION b/DESCRIPTION index c21e25f..7b3e541 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,12 +28,13 @@ Imports: terra Suggests: tinytest, - quarto + quarto, + ranger VignetteBuilder: quarto Config/testthat/edition: 3 -LinkingTo: +LinkingTo: Rcpp -Collate: +Collate: 'RcppExports.R' 'alloc_params_t.R' 'coords_t.R' @@ -43,6 +44,7 @@ Collate: 'evoland_db_neighbors.R' 'evoland_db_tables.R' 'evoland_db_views.R' + 'grrf_filter.r' 'init.R' 'intrv_masks_t.R' 'intrv_meta_t.R' diff --git a/NAMESPACE b/NAMESPACE index 3c5e473..ee973a4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -58,6 +58,7 @@ export(create_trans_meta_t) export(download_and_verify) export(evoland_db) export(extract_using_coords_t) +export(grrf_filter) export(parquet_duckdb) export(print_rowwise_yaml) export(validate) diff --git a/R/covariance_filter.R b/R/covariance_filter.R index e6a813f..bb9fc0c 100644 --- a/R/covariance_filter.R +++ b/R/covariance_filter.R @@ -17,8 +17,8 @@ #' @param rank_fun Optional function to compute ranking scores for each covariate. #' Should take arguments (x, y, weights, ...) and return a single numeric value #' (lower = better). Defaults to polynomial GLM p-value ranking. -#' @param weights Optional vector of weights to be used in the ranking function. -#' If NULL and rank_fun uses default, class-balanced weights are computed automatically. +#' @param weights Optional vector of weights to be used in the ranking function. Defaults to +#' class-balanced weights #' @param corcut Numeric threshold (0-1) for correlation filtering. Covariates with correlation #' coefficients above this threshold will be filtered out. Default is 0 (no filtering). #' @param ... Additional arguments passed to rank_fun. 
diff --git a/R/grrf_filter.r b/R/grrf_filter.r new file mode 100644 index 0000000..28ce993 --- /dev/null +++ b/R/grrf_filter.r @@ -0,0 +1,136 @@ +#' Guided Regularized Random Forest Feature Selection +#' +#' The `grrf_filter` returns a set of covariates for land use land cover change (LULCC) models based +#' on feature selection with Guided Regularized Random Forests. This is a two-stage random forest +#' approach: a first unregularized random forest estimates variable importance scores. These scores +#' are then used to guide a second regularized random forest that penalizes less important features, +#' resulting in a more parsimonious feature set. +#' +#' @param data A data.table of target variable and candidate covariates to be filtered; wide format +#' with one predictor per column. +#' @param result_col Name of the column representing the transition results (0: no trans, 1: trans) +#' @param weights Optional named vector of class weights. If NULL, class-balanced weights +#' are computed automatically using compute_balanced_weights(). +#' @param gamma Numeric between 0-1 controlling the weight of the normalized importance +#' score (the "importance coefficient"). When gamma = 0, we perform unguided +#' regularized random forest (no guiding effect). When gamma = 1, we apply the +#' strongest guiding effect, leading to the most penalization of redundant +#' features and the most concise feature sets. Default is 0.5. +#' @param num.trees Number of trees to grow in each random forest. Default is 500. +#' @param ... Additional arguments passed to ranger::ranger(). +#' +#' @return A character vector of column names (covariates) to retain, ordered by +#' importance (most important first) +#' +#' @details +#' The Guided Regularized Random Forest (GRRF) algorithm works as follows: +#' 1. Fit an initial unregularized random forest to obtain variable importance scores +#' 2.
Normalize these importance scores and use them to compute regularization +#' coefficients: coefReg = (1 - gamma) + gamma * normalized_importance +#' 3. Fit a regularized random forest using these coefficients to penalize splits +#' on less important variables +#' 4. Return variables with positive importance in the regularized model +#' +#' Class weights are used to handle class imbalance. Variables in terminal nodes are +#' weighted by class, and splits are evaluated using weighted Gini impurity. +#' +#' The ranger implementation uses the `split.select.weights` parameter to apply +#' regularization penalties, approximating the RRF regularization approach. +#' +#' @references +#' Deng, H., & Runger, G. (2013). Gene selection with guided regularized random forest. +#' Pattern Recognition, 46(12), 3483-3489. https://arxiv.org/pdf/1306.0237.pdf +#' +#' Original implementation by Antoine Adde, edited by Ben Black and adapted for +#' ranger by the evoland-plus team. +#' +#' @name grrf_filter +#' +#' @export + +grrf_filter <- function( + data, + result_col = "result", + weights = compute_balanced_weights(data[[result_col]]), + gamma = 0.5, + num.trees = 500, + max.depth = 100, + ... +) { + # Check if ranger is available + if (!requireNamespace("ranger", quietly = TRUE)) { + stop( + "Package 'ranger' is required for grrf_filter but is not installed.\n", + "Please install it with: install.packages('ranger')", + call. 
= FALSE + ) + } + + data.table::setDT(data) + + # Validate inputs + stopifnot( + "gamma must be between 0 and 1" = gamma >= 0 && gamma <= 1, + "result_col must exist in data" = result_col %in% names(data), + "data must have at least one predictor column" = ncol(data) > 1 + ) + + # Prepare data: separate predictors from response + predictor_cols <- setdiff(names(data), result_col) + y <- as.factor(data[[result_col]]) + x <- data[, ..predictor_cols] + + # Step 1: Run initial unregularized random forest to get importance scores + rf_initial <- ranger::ranger( + x = x, + y = y, + num.trees = num.trees, + importance = "impurity", + case.weights = weights, + max.depth = max.depth, + ... + ) + + # Extract and normalize importance scores + imp_initial <- rf_initial$variable.importance + imp_normalized <- imp_initial / max(imp_initial) + + # Step 2: Calculate regularization coefficients (penalty weights) + # Higher importance -> higher coefficient -> less penalty + coef_reg <- (1 - gamma) + gamma * imp_normalized + + # Step 3: Run guided regularized random forest + # Use split.select.weights to penalize variables with lower importance + # Higher weight = more likely to be selected for splitting + rf_grrf <- ranger::ranger( + x = x, + y = y, + num.trees = num.trees, + importance = "impurity", + case.weights = weights, + max.depth = max.depth, + split.select.weights = coef_reg, + verbose = FALSE, # easiest way to get rid of cpp warning about split selected weights + ... + ) + + # Extract final importance scores + imp_final <- rf_grrf$variable.importance + + # Select variables with positive importance + selected_vars <- names(imp_final[imp_final > 0]) + + if (length(selected_vars) == 0) { + warning("No variables with positive importance found. 
Returning all variables.") + selected_vars <- predictor_cols + } + + # Order by importance (descending) + selected_vars <- selected_vars[order(imp_final[selected_vars], decreasing = TRUE)] + + message(glue::glue( + "Selected {length(selected_vars)}/{length(predictor_cols)} predictors" + )) + + return(selected_vars) +} diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index 812bf7c..75949b8 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -41,7 +41,7 @@ evoland_db$set( t <- self$trans_meta_t[is_viable == TRUE] full <- expand.grid(id_pred = p[["id_pred"]], id_trans = t[["id_trans"]]) - self$trans_preds_t <- as_trans_preds_t(full) + self$commit(as_trans_preds_t(full), "trans_preds_t", method = "overwrite") } ) @@ -49,15 +49,14 @@ evoland_db$set( #' result of a predictor selection step. Runs covariance filtering for each viable #' transition and stores the selected predictors. #' param corcut Numeric threshold (0-1) for correlation filtering passed to [covariance_filter()] -#' param rank_fun Optional ranking function passed to [covariance_filter()] +#' param filter_fun Defaults to [covariance_filter()], but can be any function that returns #' param na_value Passed to db$trans_pred_data_v - if not NA, replace all NA predictor values with this value #' param ... Additional arguments passed to rank_fun via [covariance_filter()] evoland_db$set( "public", - "prune_trans_preds_two_stage_covar", + "prune_trans_preds", function( - corcut = 0.7, - rank_fun = rank_poly_glm, + filter_fun = covariance_filter, na_value = NA, ... ) { @@ -114,11 +113,9 @@ evoland_db$set( } # Return ranked + filtered predictor names as id_pred_{n} - filtered_preds <- covariance_filter( + filtered_preds <- filter_fun( data = trans_pred_data, result_col = "result", - rank_fun = rank_fun, - corcut = corcut, ... 
) diff --git a/inst/tinytest/test_trans_preds_t.R b/inst/tinytest/test_trans_preds_t.R index 737dc6a..d9c9da2 100644 --- a/inst/tinytest/test_trans_preds_t.R +++ b/inst/tinytest/test_trans_preds_t.R @@ -147,30 +147,31 @@ pred_data_int <- data.table::data.table( ) db_tps$pred_data_t_int <- as_pred_data_t(pred_data_int, type = "int") -# Test create_trans_preds_t +# Test pruning expect_message( - trans_preds_result <- - create_trans_preds_t( - db = db_tps, - corcut = 0.7 - ), + db_tps$prune_trans_preds( + corcut = 0.2 + ), "Processing transition 1/2" ) - +trans_preds_result <- db_tps$trans_preds_t expect_true(inherits(trans_preds_result, "trans_preds_t")) -expect_equal(nrow(trans_preds_result), 8L) - -# Verify structure -expect_true(all(c("id_pred", "id_trans") %in% names(trans_preds_result))) -expect_true(is.integer(trans_preds_result$id_pred)) -expect_true(is.integer(trans_preds_result$id_trans)) +expect_equal(nrow(trans_preds_result), 4L) # Verify that all id_trans in result are viable viable_trans_ids <- db_tps$trans_meta_t[is_viable == TRUE]$id_trans expect_true(all(trans_preds_result$id_trans %in% viable_trans_ids)) -# Verify that all id_pred in result exist in pred_meta_t -expect_true(all(trans_preds_result$id_pred %in% db_tps$pred_meta_t$id_pred)) +# reset to full set of trans - preds +expect_silent(db_tps$set_full_trans_preds(overwrite = TRUE)) +expect_message( + db_tps$prune_trans_preds( + filter_fun = grrf_filter, + num.trees = 10, + gamma = 0.9 + ), + "Selected 5 predictor\\(s\\) for transition" +) # Test error handling - empty database test_dir_empty <- tempfile("evoland_empty_") @@ -178,12 +179,7 @@ on.exit(unlink(test_dir_empty, recursive = TRUE), add = TRUE) db_empty <- evoland_db$new(test_dir_empty) expect_error( - create_trans_preds_t(db = "not_a_db"), - "must be an evoland_db" -) - -expect_error( - create_trans_preds_t(db = db_empty), + db_empty$prune_trans_preds(), "Table `trans_meta_t` does not exist" ) @@ -200,7 +196,7 @@ expect_warning( 
"Overriding existing IDs" ) expect_error( - create_trans_preds_t(db = db_no_pred), + db_no_pred$prune_trans_preds(), "Table `pred_meta_t` does not exist" ) diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index 531b09e..c3775f3 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -35,7 +35,7 @@ Additional methods and active bindings are added to this class in separate files \item \href{#method-evoland_db-set_periods}{\code{evoland_db$set_periods()}} \item \href{#method-evoland_db-add_predictor}{\code{evoland_db$add_predictor()}} \item \href{#method-evoland_db-set_full_trans_preds}{\code{evoland_db$set_full_trans_preds()}} -\item \href{#method-evoland_db-prune_trans_preds_two_stage_covar}{\code{evoland_db$prune_trans_preds_two_stage_covar()}} +\item \href{#method-evoland_db-prune_trans_preds}{\code{evoland_db$prune_trans_preds()}} \item \href{#method-evoland_db-clone}{\code{evoland_db$clone()}} } } @@ -234,13 +234,12 @@ Add a predictor to the database } \if{html}{\out{
    }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-prune_trans_preds_two_stage_covar}{}}} -\subsection{Method \code{prune_trans_preds_two_stage_covar()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-prune_trans_preds}{}}} +\subsection{Method \code{prune_trans_preds()}}{ \subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{evoland_db$prune_trans_preds_two_stage_covar( - corcut = 0.7, - rank_fun = rank_poly_glm, +\if{html}{\out{
    }}\preformatted{evoland_db$prune_trans_preds( + filter_fun = covariance_filter, na_value = NA, ... )}\if{html}{\out{
    }} diff --git a/man/grrf_filter.Rd b/man/grrf_filter.Rd new file mode 100644 index 0000000..a3bbb93 --- /dev/null +++ b/man/grrf_filter.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/grrf_filter.r +\name{grrf_filter} +\alias{grrf_filter} +\title{Guided Regularized Random Forest Feature Selection} +\usage{ +grrf_filter( + data, + result_col = "result", + weights = compute_balanced_weights(data[[result_col]]), + gamma = 0.5, + num.trees = 500, + max.depth = 100, + ... +) +} +\arguments{ +\item{data}{A data.table of target variable and candidate covariates to be filtered; wide format +with one predictor per column.} + +\item{result_col}{Name of the column representing the transition results (0: no trans, 1: trans)} + +\item{weights}{Optional named vector of class weights. If NULL, class-balanced weights +are computed automatically using compute_grrf_weights().} + +\item{gamma}{Numeric between 0-1 controlling the weight of the normalized importance +score (the "importance coefficient"). When gamma = 0, we perform unguided +regularized random forest (no guiding effect). When gamma = 1, we apply the +strongest guiding effect, leading to the most penalization of redundant +features and the most concise feature sets. Default is 0.5.} + +\item{num.trees}{Number of trees to grow in each random forest. Default is 500.} + +\item{...}{Additional arguments passed to ranger::ranger().} +} +\value{ +A character vector of column names (covariates) to retain, ordered by +importance (most important first) +} +\description{ +The \code{grrf_filter} returns a set of covariates for land use land cover change (LULCC) models based +on feature selection with Guided Regularized Random Forests. This is a two-stage random forest +approach: a first unregularized random forest estimates variable importance scores. 
These scores +are then used to guide a second regularized random forest that penalizes less important features, +resulting in a more parsimonious feature set. +} +\details{ +The Guided Regularized Random Forest (GRRF) algorithm works as follows: +\enumerate{ +\item Fit an initial unregularized random forest to obtain variable importance scores +\item Normalize these importance scores and use them to compute regularization +coefficients: coefReg = (1 - gamma) + gamma * normalized_importance +\item Fit a regularized random forest using these coefficients to penalize splits +on less important variables +\item Return variables with positive importance in the regularized model +} + +Class weights are used to handle class imbalance. Variables in terminal nodes are +weighted by class, and splits are evaluated using weighted Gini impurity. + +The ranger implementation uses the \code{split.select.weights} parameter to apply +regularization penalties, approximating the RRF regularization approach. +} +\references{ +Deng, H., & Runger, G. (2013). Gene selection with guided regularized random forest. +Pattern Recognition, 46(12), 3483-3489. https://arxiv.org/pdf/1306.0237.pdf + +Original implementation by Antoine Adde, edited by Ben Black and adapted for +ranger by the evoland-plus team. +} diff --git a/man/neighbors_t.Rd b/man/neighbors_t.Rd index 2376f02..55d0260 100644 --- a/man/neighbors_t.Rd +++ b/man/neighbors_t.Rd @@ -14,7 +14,8 @@ create_neighbors_t( coords_t, max_distance, distance_breaks = NULL, - resolution = 100 + resolution = 100, + quiet = FALSE ) \method{validate}{neighbors_t}(x, ...) 
From e6c8e5c72c2684ca1275bfd69f066e53ecc99bbe Mon Sep 17 00:00:00 2001
From: mmyrte <24587121+mmyrte@users.noreply.github.com>
Date: Tue, 2 Dec 2025 12:26:39 +0100
Subject: [PATCH 27/27] make prune robust / normalize importance to [0,1]

---
 R/grrf_filter.r   | 7 +++++--
 R/trans_preds_t.R | 5 +++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/R/grrf_filter.r b/R/grrf_filter.r
index 28ce993..dc07e5a 100644
--- a/R/grrf_filter.r
+++ b/R/grrf_filter.r
@@ -93,7 +93,11 @@ grrf_filter <- function(
 
   # Extract and normalize importance scores
   imp_initial <- rf_initial$variable.importance
-  imp_normalized <- imp_initial / max(imp_initial)
+  # Normalize to [0,1] (importance values may be negative)
+  imp_normalized <- {
+    (imp_initial - min(imp_initial)) /
+      (max(imp_initial) - min(imp_initial))
+  }
 
   # Step 2: Calculate regularization coefficients (penalty weights)
   # Higher importance -> higher coefficient -> less penalty
@@ -110,7 +114,6 @@
     case.weights = weights,
     max.depth = max.depth,
     split.select.weights = coef_reg,
-    verbose = FALSE,
     # easiest way to get rid of cpp warning about split selected weights
     ...
   )
diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R
index 75949b8..4fc51c9 100644
--- a/R/trans_preds_t.R
+++ b/R/trans_preds_t.R
@@ -139,6 +139,11 @@ evoland_db$set(
       }
     }, error = function(e) {
+      # do not prune on error
+      results_list[[id_trans]] <<- data.table::data.table(
+        id_pred = id_preds,
+        id_trans = id_trans
+      )
       warning(glue::glue(
         "Error processing transition {id_trans}: {e$message}"
       ))