From 9185877dd484816b6a46ec9e23e4f9d40e1a9edb Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Thu, 27 Nov 2025 11:45:00 +0100 Subject: [PATCH 01/27] add a covariance filter function this is derived from the original by antoine adde --- NAMESPACE | 1 + R/covariance_filter.R | 169 +++++++++++++++++++++++++++++++++++++++ man/covariance_filter.Rd | 81 +++++++++++++++++++ 3 files changed, 251 insertions(+) create mode 100644 R/covariance_filter.R create mode 100644 man/covariance_filter.Rd diff --git a/NAMESPACE b/NAMESPACE index 77baa3d..888b193 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -44,6 +44,7 @@ export(as_trans_meta_t) export(as_trans_models_t) export(as_trans_preds_t) export(compute_neighbors) +export(covariance_filter) export(create_coords_t_square) export(create_intrv_meta_t) export(create_intrv_meta_t_row) diff --git a/R/covariance_filter.R b/R/covariance_filter.R new file mode 100644 index 0000000..42ed84f --- /dev/null +++ b/R/covariance_filter.R @@ -0,0 +1,169 @@ +#' Filter covariates for land use land cover change (LULCC) models +#' +#' This function filters a set of covariates for land use land cover change (LULCC) +#' models based on various statistical methods and correlation thresholds. +#' +#' @param data A data.table of target variable and candidate covariates to be filtered; +#' wide format with one predictor per column. +#' @param result_col Name of the column representing the transition results (0: no +#' trans, 1: trans) +#' @param rank_fun Optional function to compute ranking scores for each covariate. +#' Should take arguments (x, y, weights, ...) and return a single numeric value +#' (lower = better). Defaults to polynomial GLM p-value ranking. +#' @param weights Optional vector of weights to be used in the ranking function. +#' If NULL and rank_fun uses default, class-balanced weights are computed automatically. +#' @param corcut Numeric threshold (0-1) for correlation filtering. 
Covariates with correlation +#' coefficients above this threshold will be filtered out. Default is 0 (no filtering). +#' @param ... Additional arguments passed to rank_fun. +#' +#' @return A filtered data.table containing only the selected covariates after ranking +#' by the specified method and filtering based on correlation threshold. +#' +#' @details +#' The function first ranks covariates using the provided ranking function (default: +#' quasibinomial polynomial GLM). Then, it iteratively removes highly correlated variables +#' based on the correlation cutoff threshold, preserving variables in order of their +#' ranking. See +#' for +#' where the concept came from. The original author was Antoine Adde, with edits by +#' Benjamin Black. +#' +#' @name covariance_filter +#' +#' @export + +covariance_filter <- function( + data, + result_col = "result", + rank_fun = rank_poly_glm, + weights = compute_balanced_weights(data[[result_col]]), + corcut = 0.7, + ... +) { + # Early return for single covariate + if (ncol(data) == 1) { + return(data) + } + + data.table::setDT(data) + + # Validate binary outcome + stopifnot( + "result_col must be binary (0/1)" = length(unique(data[[result_col]])) == 2, + "corcut must be between 0 and 1" = corcut >= 0 && corcut <= 1 + ) + + # Compute ranking scores for all covariates (vectorized where possible) + scores <- vapply( + data[, -..result_col], + rank_fun, + FUN.VALUE = numeric(1), + y = data[[result_col]], + weights = weights, + ... 
+ ) + + # Sort by scores (lower = better/more significant) + ranked_order <- order(scores) + data_ranked <- data[, ..ranked_order] + + # If no correlation filtering needed, return ranked data + if (corcut == 1) { + return(data_ranked) + } + + # Compute correlation matrix once + cor_mat <- abs(cor(data_ranked, use = "pairwise.complete.obs")) + + # Iteratively select covariates based on correlation threshold + selected <- select_by_correlation(cor_mat, corcut) + + # Return selected covariates + data_ranked[, ..selected, drop = FALSE] +} + + +#' @describeIn covariance_filter Default ranking function using polynomial GLM. Returns +#' the lower p value for each of the polynomial terms +#' @param x A numeric vector representing a single covariate +#' @param y A binary outcome vector (0/1) +#' @param weights Optional weights vector +#' @param ... Additional arguments (ignored) +#' @keywords internal +rank_poly_glm <- function(x, y, weights = NULL, ...) { + fit <- glm.fit( + x = cbind(1, poly(x, degree = 2, simple = TRUE)), + y = y, + family = binomial(), + weights = weights + ) + + # Get p-values for linear and quadratic terms + coef_summary <- summary.glm(fit)$coefficients + + # Return minimum p-value (most significant term) + min(coef_summary[2:3, 4], na.rm = TRUE) +} + + +#' @describeIn covariance_filter Compute class-balanced weights for imbalanced binary +#' outcomes; returns a numeric vector +#' @param trans_result Binary outcome vector (0/1) +#' @param legacy Bool, use the legacy weighting? 
+#' @keywords internal +compute_balanced_weights <- function(trans_result, legacy = FALSE) { + n_total <- length(trans_result) + n_trans <- sum(trans_result == 1) + n_non_trans <- sum(trans_result == 0) + + # Compute inverse frequency weights + weights <- numeric(n_total) + + if (legacy) { + # I found this weighting in evoland-plus-legacy, but the models wouldn't converge + # https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.splitforcovselection.r + # This is actually just setting the underrepresented class to the rounded imbalance ratio + weights[trans_result == 0] <- 1 + weights[trans_result == 1] <- round(n_non_trans / n_trans) + return(weights) + } + + # This is the heuristic in scikit-learn, n_samples / (n_classes * np.bincount(y)) + # https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html #nolint + # This weighting maintains the exact imbalance ratio + weights[trans_result == 1] <- n_total / (2 * n_trans) + weights[trans_result == 0] <- n_total / (2 * n_non_trans) + + weights +} + + +#' @describeIn covariance_filter Select variables iteratively based on correlation +#' threshold; returns a character vector of selected variable names +#' @param cor_mat Absolute correlation matrix +#' @param corcut Correlation cutoff threshold +#' @keywords internal +select_by_correlation <- function(cor_mat, corcut) { + var_names <- colnames(cor_mat) + + # Early return if all correlations are below threshold + if (all(cor_mat[lower.tri(cor_mat)] < corcut)) { + return(var_names) + } + + selected <- character(0) + remaining_idx <- seq_along(var_names) + + while (length(remaining_idx) > 0) { + # Select the first remaining variable (highest ranked) + current_var <- remaining_idx[1] + selected <- c(selected, var_names[current_var]) + + # Find variables with correlation <= corcut with current variable + # (excluding the variable itself) + keep_idx <- which(cor_mat[remaining_idx, current_var] <= corcut) + remaining_idx 
<- remaining_idx[keep_idx] + } + + selected +} diff --git a/man/covariance_filter.Rd b/man/covariance_filter.Rd new file mode 100644 index 0000000..3e47752 --- /dev/null +++ b/man/covariance_filter.Rd @@ -0,0 +1,81 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/covariance_filter.R +\name{covariance_filter} +\alias{covariance_filter} +\alias{rank_poly_glm} +\alias{compute_balanced_weights} +\alias{select_by_correlation} +\title{Filter covariates for land use land cover change (LULCC) models} +\usage{ +covariance_filter( + data, + result_col = "result", + rank_fun = rank_poly_glm, + weights = compute_balanced_weights(data[[result_col]]), + corcut = 0.7, + ... +) + +rank_poly_glm(x, y, weights = NULL, ...) + +compute_balanced_weights(trans_result, legacy = FALSE) + +select_by_correlation(cor_mat, corcut) +} +\arguments{ +\item{data}{A data.table of target variable and candidate covariates to be filtered; +wide format with one predictor per column.} + +\item{result_col}{Name of the column representing the transition results (0: no +trans, 1: trans)} + +\item{rank_fun}{Optional function to compute ranking scores for each covariate. +Should take arguments (x, y, weights, ...) and return a single numeric value +(lower = better). Defaults to polynomial GLM p-value ranking.} + +\item{weights}{Optional weights vector} + +\item{corcut}{Correlation cutoff threshold} + +\item{...}{Additional arguments (ignored)} + +\item{x}{A numeric vector representing a single covariate} + +\item{y}{A binary outcome vector (0/1)} + +\item{trans_result}{Binary outcome vector (0/1)} + +\item{legacy}{Bool, use the legacy weighting?} + +\item{cor_mat}{Absolute correlation matrix} +} +\value{ +A filtered data.table containing only the selected covariates after ranking +by the specified method and filtering based on correlation threshold. 
+} +\description{ +This function filters a set of covariates for land use land cover change (LULCC) +models based on various statistical methods and correlation thresholds. +} +\details{ +The function first ranks covariates using the provided ranking function (default: +quasibinomial polynomial GLM). Then, it iteratively removes highly correlated variables +based on the correlation cutoff threshold, preserving variables in order of their +ranking. See +\url{https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.covfilter.r} for +where the concept came from. The original author was Antoine Adde, with edits by +Benjamin Black. +} +\section{Functions}{ +\itemize{ +\item \code{rank_poly_glm()}: Default ranking function using polynomial GLM. Returns +the lower p value for each of the polynomial terms + +\item \code{compute_balanced_weights()}: Compute class-balanced weights for imbalanced binary +outcomes; returns a numeric vector + +\item \code{select_by_correlation()}: Select variables iteratively based on correlation +threshold; returns a character vector of selected variable names + +}} +\keyword{internal} From 1c7a1f5aec1735414e6c9bcaf9759e66d24a0a6a Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Thu, 27 Nov 2025 11:45:26 +0100 Subject: [PATCH 02/27] update docs --- R/evoland_db.R | 1 + R/evoland_db_views.R | 2 +- man/evoland_db.Rd | 33 +++++++++++++++++++++++++++++---- man/evoland_db_views.Rd | 16 +++++++++++++--- man/trans_meta_t.Rd | 5 +++-- 5 files changed, 47 insertions(+), 10 deletions(-) diff --git a/R/evoland_db.R b/R/evoland_db.R index 5179ffd..ad67db3 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -320,6 +320,7 @@ evoland_db <- R6::R6Class( #' them into R objects. #' @param table_name Character vector. Names of table to attach. #' @param columns Character vector. Optional sql column selection, defaults to "*" + #' @param where A SQL where statement to optionally subset the table being attached. 
attach_table = function(table_name, columns = "*", where = NULL) { # TODO the attach/detach combo should be available in a DB method "with # table"; it should do nothing if the table is already attached diff --git a/R/evoland_db_views.R b/R/evoland_db_views.R index f17c8df..b679911 100644 --- a/R/evoland_db_views.R +++ b/R/evoland_db_views.R @@ -84,7 +84,7 @@ make_extent_db <- function(self, private) { terra::ext() } -#' @describeIn evoland_db_views +#' @describeIn evoland_db_views Returns transitions based on lulc_data_t make_transitions_v <- function(self, private, where = NULL) { self$attach_table("lulc_data_t") on.exit(self$detach_table("lulc_data_t")) diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index e555300..af49fa0 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -39,6 +39,8 @@ object to assign. Assigning is an upsert operation.} \item{\code{lulc_data_t}}{A \code{lulc_data_t} instance; see \code{\link[=as_lulc_data_t]{as_lulc_data_t()}} for the type of object to assign. Assigning is an upsert operation.} +\item{\code{transitions_v}}{Get the transitions from \code{lulc_data_t}.} + \item{\code{pred_data_t_float}}{A \code{pred_data_t_float} instance; see \code{\link[=create_pred_data_t]{create_pred_data_t()}} for the type of object to assign. Assigning is an upsert operation.} @@ -100,6 +102,7 @@ of object to assign. Assigning is an upsert operation.} \item \href{#method-evoland_db-set_coords}{\code{evoland_db$set_coords()}} \item \href{#method-evoland_db-set_periods}{\code{evoland_db$set_periods()}} \item \href{#method-evoland_db-add_predictor}{\code{evoland_db$add_predictor()}} +\item \href{#method-evoland_db-print}{\code{evoland_db$print()}} \item \href{#method-evoland_db-clone}{\code{evoland_db$clone()}} } } @@ -297,11 +300,11 @@ No. 
of rows affected by statement \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-attach_table}{}}} \subsection{Method \code{attach_table()}}{ -Attach one or more tables from the database folder as temporary tables in DuckDB. -This is useful for working with multiple tables in SQL queries without loading -them into R memory. +Attach a single table as a temporary table in DuckDB memory. This is +useful for working with multiple tables in SQL queries instead of loading +them into R objects. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$attach_table(table_name, columns = "*")}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{evoland_db$attach_table(table_name, columns = "*", where = NULL)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -310,6 +313,8 @@ them into R memory. \item{\code{table_name}}{Character vector. Names of table to attach.} \item{\code{columns}}{Character vector. Optional sql column selection, defaults to "*"} + +\item{\code{where}}{A SQL where statement to optionally subset the table being attached.} } \if{html}{\out{}} } @@ -462,6 +467,26 @@ Add a predictor to the database } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-print}{}}} +\subsection{Method \code{print()}}{ +Print method for evoland_db +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{evoland_db$print(...)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{...}}{Not used} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +self (invisibly) +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-clone}{}}} \subsection{Method \code{clone()}}{ diff --git a/man/evoland_db_views.Rd b/man/evoland_db_views.Rd index 68d5b66..d149f48 100644 --- a/man/evoland_db_views.Rd +++ b/man/evoland_db_views.Rd @@ -5,13 +5,19 @@ \alias{make_pred_sources_v} \alias{make_lulc_meta_long_v} \alias{make_coords_minimal} +\alias{make_extent_db} +\alias{make_transitions_v} \title{Views on the evoland-plus data model} \usage{ -make_pred_sources_v(self, private) +make_pred_sources_v(self, private, where = NULL) -make_lulc_meta_long_v(self, private) +make_lulc_meta_long_v(self, private, where = NULL) -make_coords_minimal(self, private) +make_coords_minimal(self, private, where = NULL) + +make_extent_db(self, private) + +make_transitions_v(self, private, where = NULL) } \description{ Functions to generate views on the database @@ -25,4 +31,8 @@ md5sum \item \code{make_coords_minimal()}: Minimal coordinate representation (id_coord, lon, lat) +\item \code{make_extent_db()}: Returns the extent of the coords_t as terra::SpatExtent + +\item \code{make_transitions_v()}: Returns transitions based on lulc_data_t + }} diff --git a/man/trans_meta_t.Rd b/man/trans_meta_t.Rd index 8c1d23b..b65c9e8 100644 --- a/man/trans_meta_t.Rd +++ b/man/trans_meta_t.Rd @@ -10,7 +10,7 @@ as_trans_meta_t(x) create_trans_meta_t( - lulc_data, + transitions, min_cardinality_abs = NULL, min_frequency_rel = NULL, exclude_anterior = NULL, @@ -20,7 +20,8 @@ create_trans_meta_t( \method{print}{trans_meta_t}(x, nrow = 10, ...) 
} \arguments{ -\item{lulc_data}{A \link{lulc_data_t} table with land use observations} +\item{transitions}{A transitions_v table, with columns id_coord, id_lulc_anterior, +id_lulc_posterior, id_period} \item{min_cardinality_abs}{Minimum absolute number of transitions for viability (optional)} From 6809db2f00e1a69d71c4dc37c26cf8235d02ab42 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Thu, 27 Nov 2025 12:36:44 +0100 Subject: [PATCH 03/27] better docs --- R/covariance_filter.R | 27 ++++++++++++++++----------- man/covariance_filter.Rd | 28 +++++++++++++++++----------- 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/R/covariance_filter.R b/R/covariance_filter.R index 42ed84f..b2475f6 100644 --- a/R/covariance_filter.R +++ b/R/covariance_filter.R @@ -1,7 +1,14 @@ -#' Filter covariates for land use land cover change (LULCC) models +#' Two stage covariate filtering #' -#' This function filters a set of covariates for land use land cover change (LULCC) -#' models based on various statistical methods and correlation thresholds. +#' The `covariance_filter` returns a set of covariates for land use land cover change +#' (LULCC) models based on a two-stage variable selection: a first statistical fit +#' estimates a covariate's quality for a given prediction task. A second step selects +#' all variables below a given correlation threshold: We iterate over a correlation +#' matrix ordered in the first step. Starting within the leftmost column, all rows (i.e. +#' candidates) greater than the given threshold are dropped from the full set of +#' candidates. This candidate selection is retained and used to select the next column, +#' until no further columns are left to investigate. The columns that were iterated over +#' are those returned as a character vector of selected variable names. #' #' @param data A data.table of target variable and candidate covariates to be filtered; #' wide format with one predictor per column. 
@@ -21,12 +28,12 @@ #' #' @details #' The function first ranks covariates using the provided ranking function (default: -#' quasibinomial polynomial GLM). Then, it iteratively removes highly correlated variables -#' based on the correlation cutoff threshold, preserving variables in order of their -#' ranking. See +#' quasibinomial polynomial GLM). Then, it iteratively removes highly (Pearson) +#' correlated variables based on the correlation cutoff threshold, preserving variables +#' in order of their ranking. See #' for #' where the concept came from. The original author was Antoine Adde, with edits by -#' Benjamin Black. +#' Benjamin Black. A similar mechanism is found in . #' #' @name covariance_filter #' @@ -88,7 +95,6 @@ covariance_filter <- function( #' @param x A numeric vector representing a single covariate #' @param y A binary outcome vector (0/1) #' @param weights Optional weights vector -#' @param ... Additional arguments (ignored) #' @keywords internal rank_poly_glm <- function(x, y, weights = NULL, ...) { fit <- glm.fit( @@ -109,7 +115,7 @@ rank_poly_glm <- function(x, y, weights = NULL, ...) { #' @describeIn covariance_filter Compute class-balanced weights for imbalanced binary #' outcomes; returns a numeric vector #' @param trans_result Binary outcome vector (0/1) -#' @param legacy Bool, use the legacy weighting? +#' @param legacy Bool, use legacy weighting? #' @keywords internal compute_balanced_weights <- function(trans_result, legacy = FALSE) { n_total <- length(trans_result) @@ -138,8 +144,7 @@ compute_balanced_weights <- function(trans_result, legacy = FALSE) { } -#' @describeIn covariance_filter Select variables iteratively based on correlation -#' threshold; returns a character vector of selected variable names +#' @describeIn covariance_filter Implements the iterative selection procedure. 
#' @param cor_mat Absolute correlation matrix #' @param corcut Correlation cutoff threshold #' @keywords internal diff --git a/man/covariance_filter.Rd b/man/covariance_filter.Rd index 3e47752..ffa87fc 100644 --- a/man/covariance_filter.Rd +++ b/man/covariance_filter.Rd @@ -5,7 +5,7 @@ \alias{rank_poly_glm} \alias{compute_balanced_weights} \alias{select_by_correlation} -\title{Filter covariates for land use land cover change (LULCC) models} +\title{Two stage covariate filtering} \usage{ covariance_filter( data, @@ -37,7 +37,7 @@ Should take arguments (x, y, weights, ...) and return a single numeric value \item{corcut}{Correlation cutoff threshold} -\item{...}{Additional arguments (ignored)} +\item{...}{Additional arguments passed to rank_fun.} \item{x}{A numeric vector representing a single covariate} @@ -45,7 +45,7 @@ Should take arguments (x, y, weights, ...) and return a single numeric value \item{trans_result}{Binary outcome vector (0/1)} -\item{legacy}{Bool, use the legacy weighting?} +\item{legacy}{Bool, use legacy weighting?} \item{cor_mat}{Absolute correlation matrix} } @@ -54,17 +54,24 @@ A filtered data.table containing only the selected covariates after ranking by the specified method and filtering based on correlation threshold. } \description{ -This function filters a set of covariates for land use land cover change (LULCC) -models based on various statistical methods and correlation thresholds. +The \code{covariance_filter} returns a set of covariates for land use land cover change +(LULCC) models based on a two-stage variable selection: a first statistical fit +estimates a covariate's quality for a given prediction task. A second step selects +all variables below a given correlation threshold: We iterate over a correlation +matrix ordered in the first step. Starting within the leftmost column, all rows (i.e. +candidates) greater than the given threshold are dropped from the full set of +candidates. 
This candidate selection is retained and used to select the next column, +until no further columns are left to investigate. The columns that were iterated over +are those returned as a character vector of selected variable names. } \details{ The function first ranks covariates using the provided ranking function (default: -quasibinomial polynomial GLM). Then, it iteratively removes highly correlated variables -based on the correlation cutoff threshold, preserving variables in order of their -ranking. See +quasibinomial polynomial GLM). Then, it iteratively removes highly (Pearson) +correlated variables based on the correlation cutoff threshold, preserving variables +in order of their ranking. See \url{https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.covfilter.r} for where the concept came from. The original author was Antoine Adde, with edits by -Benjamin Black. +Benjamin Black. A similar mechanism is found in \url{https://github.com/antadde/covsel/}. } \section{Functions}{ \itemize{ @@ -74,8 +81,7 @@ the lower p value for each of the polynomial terms \item \code{compute_balanced_weights()}: Compute class-balanced weights for imbalanced binary outcomes; returns a numeric vector -\item \code{select_by_correlation()}: Select variables iteratively based on correlation -threshold; returns a character vector of selected variable names +\item \code{select_by_correlation()}: Implements the iterative selection procedure. 
}} \keyword{internal} From 6db820e8f60ac6480957534075afbe6b4673ffc3 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Thu, 27 Nov 2025 12:36:52 +0100 Subject: [PATCH 04/27] drop id_trans; will be set by DB --- R/trans_meta_t.R | 6 ++---- man/trans_meta_t.Rd | 3 ++- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/R/trans_meta_t.R b/R/trans_meta_t.R index 8613add..db81f8e 100644 --- a/R/trans_meta_t.R +++ b/R/trans_meta_t.R @@ -41,7 +41,8 @@ as_trans_meta_t <- function(x) { ) } -#' @describeIn trans_meta_t Calculate the transition metadata and mark for modelling feasibility +#' @describeIn trans_meta_t Calculate the transition metadata and mark for modelling +#' feasibility. Does not attribute `id_trans`; this only makes sense as part of a DB. #' @export create_trans_meta_t <- function( transitions, @@ -108,9 +109,6 @@ create_trans_meta_t <- function( } } - # Add id_trans - trans_summary[, id_trans := .I] - as_trans_meta_t(trans_summary) } diff --git a/man/trans_meta_t.Rd b/man/trans_meta_t.Rd index b65c9e8..9577213 100644 --- a/man/trans_meta_t.Rd +++ b/man/trans_meta_t.Rd @@ -59,6 +59,7 @@ entries for each viable transition type. }} \section{Functions}{ \itemize{ -\item \code{create_trans_meta_t()}: Calculate the transition metadata and mark for modelling feasibility +\item \code{create_trans_meta_t()}: Calculate the transition metadata and mark for modelling +feasibility. Does not attribute \code{id_trans}; this only makes sense as part of a DB. 
}} From 4d98d621a0a42734012c294ec832e2c34a340a06 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Thu, 27 Nov 2025 16:42:27 +0100 Subject: [PATCH 05/27] parquet_duckdb: move content-agnostic evoland_db logic plus adding a with_tables function --- DESCRIPTION | 22 ++ NAMESPACE | 1 + R/evoland_db.R | 519 ++----------------------- R/evoland_db_views.R | 330 ++++++++++++---- R/parquet_duckdb.R | 576 ++++++++++++++++++++++++++++ inst/tinytest/test_evoland_db.R | 244 +++++------- inst/tinytest/test_parquet_duckdb.R | 349 +++++++++++++++++ man/evoland_db.Rd | 266 +------------ man/evoland_db_views.Rd | 12 + man/parquet_duckdb.Rd | 395 +++++++++++++++++++ 10 files changed, 1748 insertions(+), 966 deletions(-) create mode 100644 R/parquet_duckdb.R create mode 100644 inst/tinytest/test_parquet_duckdb.R create mode 100644 man/parquet_duckdb.Rd diff --git a/DESCRIPTION b/DESCRIPTION index cb31905..fb2d27f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,3 +33,25 @@ VignetteBuilder: quarto Config/testthat/edition: 3 LinkingTo: Rcpp +Collate: + 'RcppExports.R' + 'alloc_params_t.R' + 'coords_t.R' + 'covariance_filter.R' + 'parquet_duckdb.R' + 'evoland_db.R' + 'evoland_db_views.R' + 'init.R' + 'intrv_masks_t.R' + 'intrv_meta_t.R' + 'lulc_data_t.R' + 'lulc_meta_t.R' + 'periods_t.R' + 'pred_data_t.R' + 'pred_meta_t.R' + 'trans_meta_t.R' + 'trans_models_t.R' + 'trans_preds_t.R' + 'util.R' + 'util_download.R' + 'util_terra.R' diff --git a/NAMESPACE b/NAMESPACE index 888b193..50faaf3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,7 @@ export(create_trans_preds_t) export(download_and_verify) export(evoland_db) export(extract_using_coords_t) +export(parquet_duckdb) export(print_rowwise_yaml) export(validate) importFrom(Rcpp,sourceCpp) diff --git a/R/evoland_db.R b/R/evoland_db.R index ad67db3..f40bd82 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -6,25 +6,17 @@ #' This class uses DuckDB for in-memory SQL operations while persisting data 
#' to disk in parquet format for better compression. #' +#' Inherits from [parquet_duckdb] for generic database operations. +#' +#' @include parquet_duckdb.R #' @export evoland_db <- R6::R6Class( classname = "evoland_db", + inherit = parquet_duckdb, ## Public Methods ---- public = list( - #' @field connection DBI connection object to an in-memory DuckDB database - connection = NULL, - - #' @field path Character string path to the data folder - path = NULL, - - #' @field default_format Default file format for new tables - default_format = NULL, - - #' @field writeopts Default write options for DuckDB, see - writeopts = NULL, - #' @description #' Initialize a new evoland_db object #' @param path Character string. Path to the data folder. @@ -38,217 +30,23 @@ evoland_db <- R6::R6Class( default_format = c("parquet", "csv"), ... ) { - # Create folder if it doesn't exist - self$path <- ensure_dir(path) - - # set format / writeopts - self$default_format <- match.arg(default_format) - self$writeopts <- switch( - self$default_format, - parquet = "FORMAT parquet, COMPRESSION zstd", - csv = "FORMAT csv", - stop(glue::glue("Unsupported format: {format}")) + # Initialize parent class with spatial extension + super$initialize( + path = path, + default_format = default_format, + extensions = "spatial" ) - # Create in-memory connection for SQL operations - self$connection <- DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:") - self$execute("INSTALL spatial; LOAD spatial;") - + # Set evoland-specific reporting metadata self$set_report(...) invisible(self) }, - ### DB methods ---- - # TODO the database methods should be spun out into a parent class "parquet_db" that is - # independent of domain specific logic. the domain logic would then live in evoland_db. 
- #' @description - #' Commit data in overwrite mode - #' @param x Data frame to commit - #' @param table_name Character string table name - #' @param autoincrement_cols Character vector of column names to auto-increment - #' @param map_cols Character vector of columns to convert to MAP format - commit_overwrite = function( - x, - table_name, - autoincrement_cols = character(0), - map_cols = character(0) - ) { - file_path <- file.path(self$path, paste0(table_name, ".", self$default_format)) - - private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) - - # if there are any of these, first get existing max values - if (length(intersect(autoincrement_cols, names(x))) > 0) { - warning(glue::glue( - "Overriding existing IDs ({toString(autoincrement_cols)}) with row numbers;\n", - "Assign these IDs manually and do not pass any autoincrement_cols to avoid this warning" - )) - } - - # autoincrement = row number; all other cols treated as ordinary - ordinary_cols <- setdiff(names(x), autoincrement_cols) - select_expr <- glue::glue_collapse( - c(glue::glue("row_number() over () as {autoincrement_cols}"), ordinary_cols), - sep = ",\n " - ) - - self$execute(glue::glue( - r"{ - copy ( - select {select_expr} - from new_data_v - ) to '{file_path}' ({self$writeopts}) - }" - )) - }, - - #' @description - #' Commit data in append mode - #' @param x Data frame to commit - #' @param table_name Character string table name - #' @param autoincrement_cols Character vector of column names to auto-increment - #' @param map_cols Character vector of columns to convert to MAP format - commit_append = function( - x, - table_name, - autoincrement_cols = character(0), - map_cols = character(0) - ) { - file_info <- private$get_file_path(table_name) - - if (!file_info$exists) { - self$commit_overwrite(x, table_name, autoincrement_cols, map_cols) - } - - self$attach_table(table_name) - on.exit(self$detach_table(table_name)) - 
private$set_autoincrement_vars(table_name, autoincrement_cols) - - private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) - - ordinary_cols <- setdiff(names(x), autoincrement_cols) - select_new <- glue::glue_collapse( - c( - glue::glue( - "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" - ), - ordinary_cols - ), - sep = ",\n " - ) - - # concatenation using union all; the "by name" option inserts NULLs if a column is - # missing in one of the queries. also makes it robust against differing col orders. - self$execute(glue::glue( - r"{ - copy ( - select - * - from - {table_name} - union all by name - select - {select_new} - from - new_data_v - ) - to '{file_info$path}' ({self$writeopts}) - }" - )) - }, - - #' @description - #' Commit data in upsert mode - #' @param x Data frame to commit - #' @param table_name Character string table name - #' @param key_cols Identify unique columns - heuristic: if prefixed with - #' id_, the set of all columns designates a uniqueness condition - #' @param autoincrement_cols Character vector of column names to auto-increment - #' @param map_cols Character vector of columns to convert to MAP format - commit_upsert = function( - x, - table_name, - key_cols = grep("^id_", names(x), value = TRUE), - autoincrement_cols = character(0), - map_cols = character(0) - ) { - file_info <- private$get_file_path(table_name) - - if (!file_info$exists) { - return(self$commit_overwrite(x, table_name, autoincrement_cols, map_cols)) - } else if (length(key_cols) == 0) { - return(self$commit_append(x, table_name, autoincrement_cols, map_cols)) - } - - self$attach_table(table_name) - on.exit(self$detach_table(table_name)) - - private$set_autoincrement_vars(table_name, autoincrement_cols) - - private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) - - # Update existing data without touching key_cols or autoincrement_cols - 
ordinary_cols <- setdiff(names(x), union(key_cols, autoincrement_cols)) - update_select_expr <- glue::glue_collapse( - glue::glue("{ordinary_cols} = new_data_v.{ordinary_cols}"), - sep = ",\n " - ) - update_join_condition <- glue::glue_collapse( - glue::glue("{table_name}.{key_cols} = new_data_v.{key_cols}"), - sep = "\nand " - ) - - self$execute(glue::glue( - r"{ - update {table_name} set - {update_select_expr} - from new_data_v - where - {update_join_condition}; - }" - )) - - # Insert new data with incrementing autoincrement_cols - insert_select_expr <- glue::glue_collapse( - c( - glue::glue( - "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" - ), - glue::glue("new_data_v.{setdiff(names(x), autoincrement_cols)}") - ), - sep = ",\n " - ) - null_condition <- glue::glue_collapse( - glue::glue("{table_name}.{key_cols} is null"), - sep = "\nand " - ) - - self$execute(glue::glue( - r"{ - insert into {table_name} - select - {insert_select_expr} - from - new_data_v - left join - {table_name} - on - {update_join_condition} - where - {null_condition} - ; - }" - )) - - self$execute(glue::glue("copy {table_name} to '{file_info$path}' ({self$writeopts})")) - }, + ### Evoland-specific methods ---- #' @description - #' Fetch data from storage + #' Fetch data from storage with evoland-specific view support #' @param table_name Character string. Name of the table to query. #' @param where Character string. Optional WHERE clause for the SQL query. 
#' @param limit integerish, limit the amount of rows to return @@ -274,128 +72,8 @@ evoland_db <- R6::R6Class( return(private$get_empty_table(table_name)) } - # Build SQL query - sql <- glue::glue("SELECT * FROM read_{file_info$format}('{file_info$path}')") - - if (!is.null(where)) { - sql <- glue::glue("{sql} WHERE {where}") - } - if (!is.null(limit)) { - sql <- glue::glue("{sql} LIMIT {limit}") - } - - self$get_query(sql) - }, - - #' @description - #' List all tables (files) in storage - #' @return Character vector of table names - list_tables = function() { - list.files(self$path, pattern = "\\.(parquet|csv)$", full.names = FALSE) |> - tools::file_path_sans_ext() |> - unique() |> - sort() - }, - - #' @description - #' Execute statement - #' @param statement A SQL statement - #' @return No. of rows affected by statement - execute = function(statement) { - DBI::dbExecute(self$connection, statement) - }, - - #' @description - #' Get Query - #' @param statement A SQL statement - #' @return No. of rows affected by statement - get_query = function(statement) { - DBI::dbGetQuery(self$connection, statement) |> - data.table::as.data.table() - }, - - #' @description - #' Attach a single table as a temporary table in DuckDB memory. This is - #' useful for working with multiple tables in SQL queries instead of loading - #' them into R objects. - #' @param table_name Character vector. Names of table to attach. - #' @param columns Character vector. Optional sql column selection, defaults to "*" - #' @param where A SQL where statement to optionally subset the table being attached. 
- attach_table = function(table_name, columns = "*", where = NULL) { - # TODO the attach/detach combo should be available in a DB method "with - # table"; it should do nothing if the table is already attached - file_info <- private$get_file_path(table_name) - - # Build SQL query - sql <- glue::glue( - "CREATE TEMP TABLE {table_name} AS ", - "SELECT {paste(columns, collapse = ', ')} ", - "FROM read_{file_info$format}('{file_info$path}')" - ) - - if (!is.null(where)) { - sql <- glue::glue("{sql} WHERE {where}") - } - - # Execute SQL - self$execute(sql) - }, - - #' @description - #' Detach one or more tables from the database. - #' @param table_name Character. Name of table to drop. - detach_table = function(table_name) { - self$execute(paste0("drop table ", table_name, ";")) - }, - - #' @description - #' Get table row count - #' @param table_name Character string. Name of the table to query. - #' @return No. of rows - row_count = function(table_name) { - file_info <- private$get_file_path(table_name) - - if (!file_info$exists) { - return(0L) - } - - self$get_query( - glue::glue("SELECT COUNT(*) as n FROM read_{file_info$format}('{file_info$path}')") - )[[1]] - }, - - #' @description - #' Delete rows from a table - #' @param table_name Character string. Name of the table to delete from. - #' @param where Character string, defaults to NULL: delete everything in table. - #' @return No. 
of rows affected - delete_from = function(table_name, where = NULL) { - file_info <- private$get_file_path(table_name) - - if (!file_info$exists) { - return(0L) - } - - count_before <- self$row_count(table_name) - - if (is.null(where)) { - file.remove(file_info$path) - return(count_before) - } - - self$execute(glue::glue( - r"{ - COPY ( - SELECT * FROM read_{file_info$format}('{file_info$path}') - WHERE NOT ({where}) - ) - TO '{file_info$path}' ({self$writeopts}) - }" - )) - - count_after <- self$row_count(table_name) - - return(count_before - count_after) + # Call parent method + super$fetch(table_name, where, limit) }, ### Setter methods ---- @@ -406,10 +84,15 @@ evoland_db <- R6::R6Class( set_report = function(...) { params <- list(...) if (self$row_count("reporting_t") == 0L) { - params[["report_name"]] <- "evoland_scenario" - params[["report_name_pretty"]] <- "Default Evoland Scenario" - params[["report_include_date"]] <- "TRUE" - params[["creator_username"]] <- Sys.getenv("USER", unset = "unknown") + # only upsert if these values are missing upon DB init + params[["report_name"]] <- + params[["report_name"]] %||% "evoland_scenario" + params[["report_name_pretty"]] <- + params[["report_name_pretty"]] %||% "Default Evoland Scenario" + params[["report_include_date"]] <- + params[["report_include_date"]] %||% "TRUE" + params[["creator_username"]] <- + params[["creator_username"]] %||% Sys.getenv("USER", unset = "unknown") } params[["last_opened"]] <- format(Sys.time(), "%Y-%m-%dT%H:%M:%SZ", tz = "UTC") params[["last_opened_username"]] <- Sys.getenv("USER", unset = "unknown") @@ -497,10 +180,11 @@ evoland_db <- R6::R6Class( #' @param ... Not used #' @return self (invisibly) print = function(...) 
{ - cat("\n") - cat(sprintf("Database path: %s\n\n", self$path)) + cat(" (extends parquet_duckdb)\n") + cat(sprintf("Database path: %s\n", self$path)) + cat(sprintf("Default format: %s\n\n", self$default_format)) - cat("Database methods:\n") + cat("Database methods (inherited):\n") cat(" Commit: commit_overwrite(x, table_name, autoincrement_cols, map_cols),\n") cat(" commit_append(x, table_name, autoincrement_cols, map_cols),\n") cat(" commit_upsert(x, table_name, key_cols, autoincrement_cols, map_cols)\n") @@ -521,7 +205,9 @@ evoland_db <- R6::R6Class( cat(" intrv_meta_t, intrv_masks_t, trans_models_t, alloc_params_t\n\n") cat("Active bindings (read-only):\n") - cat(" extent, coords_minimal, lulc_meta_long_v, pred_sources_v\n") + cat(" extent, coords_minimal, lulc_meta_long_v, pred_sources_v\n\n") + + cat(sprintf("Tables: %d\n", length(self$list_tables()))) invisible(self) } @@ -721,150 +407,7 @@ evoland_db <- R6::R6Class( ## Private Methods ---- private = list( - # r6 hook called on gc(); Close the database connection - # - # return NULL (called for side effects) - finalize = function() { - if (!is.null(self$connection)) { - DBI::dbDisconnect(self$connection) - self$connection <- NULL - } - }, - - # Get file path and format for a table - # - # param table_name Character string table name - # return List with path, format, and exists flag - get_file_path = function(table_name) { - # Check for parquet first, then csv - parquet_path <- file.path(self$path, paste0(table_name, ".parquet")) - csv_path <- file.path(self$path, paste0(table_name, ".csv")) - - if (file.exists(parquet_path)) { - return(list(path = parquet_path, format = "parquet", exists = TRUE)) - } else if (file.exists(csv_path)) { - return(list(path = csv_path, format = "csv", exists = TRUE)) - } else { - # Return default format for new file - default_path <- file.path( - self$path, - paste0(table_name, ".", self$default_format) - ) - return(list( - path = default_path, - format = self$default_format, - 
exists = FALSE - )) - } - }, - - # Register new_data_v table, optionally converting MAP columns - # - # param x Data to register or character name of existing table/view - # param map_cols Character vector of columns to convert to MAP format - # return NULL (called for side effects) - register_new_data_v = function(x, map_cols = character(0)) { - # If x is a table name, create alias to new_data_v - if (is.character(x) && length(x) == 1) { - self$execute(glue::glue("create view new_data_v as select * from {x}")) - return(invisible(NULL)) - } - - if (length(map_cols) == 0) { - # No MAP conversion needed - register directly - duckdb::duckdb_register(self$connection, "new_data_v", x) - } else { - # Convert list columns to key-value dataframes - x <- - data.table::copy(x) |> - convert_list_cols(map_cols, list_to_kv_df) - - # Register as intermediate table - duckdb::duckdb_register(self$connection, "new_data_raw", x) - - # Build SELECT expression with map_from_entries for MAP columns - map_exprs <- glue::glue("map_from_entries({map_cols}) as {map_cols}") - other_cols <- setdiff(names(x), map_cols) - all_exprs <- c(other_cols, map_exprs) - select_expr <- glue::glue_collapse(all_exprs, sep = ", ") - - # Create new_data_v from new_data_raw - self$execute(glue::glue( - "create temp table new_data_v as select {select_expr} from new_data_raw" - )) - } - - invisible(NULL) - }, - - # Cleanup new_data_v and related tables - # - # param map_cols Character vector indicating if MAP conversion was used - # return NULL (called for side effects) - cleanup_new_data_v = function(map_cols = character(0)) { - if (length(map_cols) == 0) { - duckdb::duckdb_unregister(self$connection, "new_data_v") - } else { - self$execute( - "drop table if exists new_data_v; - drop view if exists new_data_v" - ) - duckdb::duckdb_unregister(self$connection, "new_data_raw") - } - - invisible(NULL) - }, - - # Write data to file - # - # param x Data frame to write - # param table_name Character string table 
name - write_file = function(x, table_name) { - file_path <- file.path(self$path, paste0(table_name, ".", self$default_format)) - - duckdb::duckdb_register(self$connection, "temp_write_table", x) - on.exit(duckdb::duckdb_unregister(self$connection, "temp_write_table")) - - sql <- glue::glue("COPY temp_write_table TO '{file_path}' ({self$writeopts})") - DBI::dbExecute(self$connection, sql) - }, - - # Set one duckdb variable name max_{colname} to the maximum found for each - # autoincrement_col in table_name. if column is missing from table_name, set to 0. - set_autoincrement_vars = function(table_name, autoincrement_cols) { - if (length(autoincrement_cols) == 0L) { - return(NULL) - } - if (!DBI::dbExistsTable(self$connection, table_name)) { - # attach / detach unless it was already there - self$attach_table(table_name) - on.exit(self$detach_table(table_name)) - } - - existing_cols <- - glue::glue("select column_name from (describe {table_name})") |> - self$get_query() |> - (\(x) x[[1]])() |> - intersect(autoincrement_cols) - - missing_cols <- setdiff(autoincrement_cols, existing_cols) - - set_exprs <- glue::glue_collapse( - c( - glue::glue( - "set variable max_{existing_cols} = (select coalesce(max({existing_cols}), 0) from {table_name});" - ), - glue::glue( - "set variable max_{missing_cols} = 0;" - ) - ), - sep = "\n" - ) - - self$execute(set_exprs) - }, - - # Get empty table with proper structure + # Get empty table with proper structure (evoland-specific) # # param table_name Character string table name # return Empty data.table with correct columns diff --git a/R/evoland_db_views.R b/R/evoland_db_views.R index b679911..b0533e9 100644 --- a/R/evoland_db_views.R +++ b/R/evoland_db_views.R @@ -9,103 +9,269 @@ NULL #' @describeIn evoland_db_views Retrieve a table of distinct predictor urls and their #' md5sum make_pred_sources_v <- function(self, private, where = NULL) { - self$attach_table("pred_meta_t") - on.exit(self$detach_table("pred_meta_t")) - - where_clause 
<- if (!is.null(where)) paste("AND", where) else "" - - self$get_query(glue::glue( - r"{ - select distinct - unnest(sources).url as url, - unnest(sources).md5sum as md5sum - from pred_meta_t - where sources is not null {where_clause} - }" - )) + self$with_tables("pred_meta_t", function() { + where_clause <- if (!is.null(where)) paste("AND", where) else "" + + self$get_query(glue::glue( + r"{ + select distinct + unnest(sources).url as url, + unnest(sources).md5sum as md5sum + from pred_meta_t + where sources is not null {where_clause} + }" + )) + }) } #' @describeIn evoland_db_views Return a `lulc_meta_long_v` instance, i.e. unrolled `lulc_meta_t`. make_lulc_meta_long_v <- function(self, private, where = NULL) { - self$attach_table("lulc_meta_t") - on.exit(self$detach_table("lulc_meta_t")) - - where_clause <- if (!is.null(where)) paste("WHERE", where) else "" - - self$get_query(glue::glue( - r"{ - select - id_lulc, - name, - unnest(src_classes) as src_class - from - lulc_meta_t - {where_clause} - }" - )) + self$with_tables("lulc_meta_t", function() { + where_clause <- if (!is.null(where)) paste("WHERE", where) else "" + + self$get_query(glue::glue( + r"{ + select + id_lulc, + name, + unnest(src_classes) as src_class + from + lulc_meta_t + {where_clause} + }" + )) + }) } #' @describeIn evoland_db_views Minimal coordinate representation (id_coord, lon, lat) make_coords_minimal <- function(self, private, where = NULL) { - self$attach_table("coords_t", c("id_coord", "lon", "lat")) - on.exit(self$detach_table("coords_t")) - - where_clause <- if (!is.null(where)) paste("WHERE", where) else "" - - self$get_query(glue::glue( - r"{ - select id_coord, lon, lat - from coords_t - {where_clause} - }" - )) |> - cast_dt_col("id_coord", as.integer) |> - data.table::setkeyv("id_coord") + self$with_tables("coords_t", function() { + where_clause <- if (!is.null(where)) paste("WHERE", where) else "" + + self$get_query(glue::glue( + r"{ + select id_coord, lon, lat + from coords_t + 
{where_clause} + }" + )) |> + cast_dt_col("id_coord", as.integer) |> + data.table::setkeyv("id_coord") + }) } #' @describeIn evoland_db_views Returns the extent of the coords_t as terra::SpatExtent make_extent_db <- function(self, private) { - self$attach_table("coords_t", c("lon", "lat")) - on.exit(self$detach_table("coords_t")) - - self$get_query(glue::glue( - r"{ - SELECT - min(lon) as xmin, - max(lon) as xmax, - min(lat) as ymin, - max(lat) as ymax - FROM - coords_t - }" - )) |> - unlist() |> - terra::ext() + self$with_tables("coords_t", function() { + self$get_query(glue::glue( + r"{ + SELECT + min(lon) as xmin, + max(lon) as xmax, + min(lat) as ymin, + max(lat) as ymax + FROM + coords_t + }" + )) |> + unlist() |> + terra::ext() + }) } #' @describeIn evoland_db_views Returns transitions based on lulc_data_t make_transitions_v <- function(self, private, where = NULL) { - self$attach_table("lulc_data_t") - on.exit(self$detach_table("lulc_data_t")) - - where_clause <- if (!is.null(where)) paste("WHERE", where) else "" - - self$get_query(glue::glue( - r"{ - SELECT - curr.id_period, - prev.id_lulc as id_lulc_anterior, - curr.id_lulc as id_lulc_posterior, - curr.id_coord - FROM - lulc_data_t as curr - INNER JOIN - lulc_data_t as prev - ON - curr.id_coord = prev.id_coord - AND curr.id_period = prev.id_period + 1 - {where_clause} - }" - )) + self$with_tables("lulc_data_t", function() { + where_clause <- if (!is.null(where)) paste("WHERE", where) else "" + + self$get_query(glue::glue( + r"{ + SELECT + curr.id_period, + prev.id_lulc as id_lulc_anterior, + curr.id_lulc as id_lulc_posterior, + curr.id_coord + FROM + lulc_data_t as curr + INNER JOIN + lulc_data_t as prev + ON + curr.id_coord = prev.id_coord + AND curr.id_period = prev.id_period + 1 + {where_clause} + }" + )) + }) +} + +#' @describeIn evoland_db_views Returns wide table of transition results and predictor data +#' for a specific transition. Used as input to covariance filtering. 
+#' +#' @param id_trans Integer, the transition ID to generate data for +#' @return data.table with columns: result (0/1), id_pred_1, id_pred_2, ..., id_pred_N +make_trans_pred_data_v <- function(self, private, id_trans) { + stopifnot( + "id_trans must be a single integer" = length(id_trans) == 1L && is.numeric(id_trans) + ) + + # Determine which predictor tables exist + all_tables <- self$list_tables() + pred_tables <- c("pred_data_t_float", "pred_data_t_int", "pred_data_t_bool") + existing_pred_tables <- intersect(pred_tables, all_tables) + + # Build list of tables to attach + tables_to_attach <- c("trans_meta_t", "lulc_data_t", "pred_meta_t", existing_pred_tables) + + self$with_tables( + tables_to_attach, + function() { + # Get transition metadata + trans_info <- self$get_query(glue::glue( + "SELECT id_lulc_anterior, id_lulc_posterior + FROM trans_meta_t + WHERE id_trans = {id_trans}" + )) + + if (nrow(trans_info) == 0L) { + stop(glue::glue("Transition id_trans = {id_trans} not found in trans_meta_t")) + } + + id_lulc_ant <- trans_info$id_lulc_anterior + id_lulc_post <- trans_info$id_lulc_posterior + + # Build CTEs dynamically based on which predictor tables exist + ctes <- list() + + # Always include trans_result + ctes$trans_result <- glue::glue( + "trans_result AS ( + SELECT + curr.id_coord, + curr.id_period, + CASE + WHEN prev.id_lulc = {id_lulc_ant} AND curr.id_lulc = {id_lulc_post} THEN 1 + WHEN prev.id_lulc = {id_lulc_ant} AND curr.id_lulc != {id_lulc_post} THEN 0 + ELSE NULL + END AS result + FROM lulc_data_t AS curr + INNER JOIN lulc_data_t AS prev + ON curr.id_coord = prev.id_coord + AND curr.id_period = prev.id_period + 1 + WHERE prev.id_lulc = {id_lulc_ant} + )" + ) + + # Add predictor CTEs for each type that exists + # UNION period-specific data (period >= 1) with cross-joined static data (period 0) + if ("pred_data_t_float" %in% existing_pred_tables) { + ctes$pred_float_combined <- "pred_float_combined AS ( + SELECT id_coord, id_period, id_pred, 
value + FROM pred_data_t_float + WHERE id_period >= 1 + UNION ALL + SELECT p0.id_coord, periods.id_period, p0.id_pred, p0.value + FROM pred_data_t_float AS p0 + CROSS JOIN (SELECT DISTINCT id_period FROM trans_result WHERE id_period >= 1) AS periods + WHERE p0.id_period = 0 + )" + + ctes$pred_float_wide <- "pred_float_wide AS ( + PIVOT pred_float_combined ON id_pred USING FIRST(value) GROUP BY id_coord, id_period + )" + } + + if ("pred_data_t_int" %in% existing_pred_tables) { + ctes$pred_int_combined <- "pred_int_combined AS ( + SELECT id_coord, id_period, id_pred, value + FROM pred_data_t_int + WHERE id_period >= 1 + UNION ALL + SELECT p0.id_coord, periods.id_period, p0.id_pred, p0.value + FROM pred_data_t_int AS p0 + CROSS JOIN (SELECT DISTINCT id_period FROM trans_result WHERE id_period >= 1) AS periods + WHERE p0.id_period = 0 + )" + + ctes$pred_int_wide <- "pred_int_wide AS ( + PIVOT pred_int_combined ON id_pred USING FIRST(value) GROUP BY id_coord, id_period + )" + } + + if ("pred_data_t_bool" %in% existing_pred_tables) { + ctes$pred_bool_combined <- "pred_bool_combined AS ( + SELECT id_coord, id_period, id_pred, value + FROM pred_data_t_bool + WHERE id_period >= 1 + UNION ALL + SELECT p0.id_coord, periods.id_period, p0.id_pred, p0.value + FROM pred_data_t_bool AS p0 + CROSS JOIN (SELECT DISTINCT id_period FROM trans_result WHERE id_period >= 1) AS periods + WHERE p0.id_period = 0 + )" + + ctes$pred_bool_wide <- "pred_bool_wide AS ( + PIVOT pred_bool_combined ON id_pred USING FIRST(value) GROUP BY id_coord, id_period + )" + } + + # Build SELECT columns + select_cols <- "tr.result" + if ("pred_data_t_float" %in% existing_pred_tables) { + select_cols <- paste0(select_cols, ", pf.* EXCLUDE (id_coord, id_period)") + } + if ("pred_data_t_int" %in% existing_pred_tables) { + select_cols <- paste0(select_cols, ", pi.* EXCLUDE (id_coord, id_period)") + } + if ("pred_data_t_bool" %in% existing_pred_tables) { + select_cols <- paste0(select_cols, ", pb.* EXCLUDE 
(id_coord, id_period)") + } + + # Build JOINs + joins <- "" + if ("pred_data_t_float" %in% existing_pred_tables) { + joins <- paste0( + joins, + "\n LEFT JOIN pred_float_wide AS pf ON tr.id_coord = pf.id_coord AND tr.id_period = pf.id_period" + ) + } + if ("pred_data_t_int" %in% existing_pred_tables) { + joins <- paste0( + joins, + "\n LEFT JOIN pred_int_wide AS pi ON tr.id_coord = pi.id_coord AND tr.id_period = pi.id_period" + ) + } + if ("pred_data_t_bool" %in% existing_pred_tables) { + joins <- paste0( + joins, + "\n LEFT JOIN pred_bool_wide AS pb ON tr.id_coord = pb.id_coord AND tr.id_period = pb.id_period" + ) + } + + # Combine everything into final query + cte_string <- paste(unlist(ctes), collapse = ",\n\n ") + + query <- glue::glue( + "WITH {cte_string} + + SELECT {select_cols} + FROM trans_result AS tr{joins} + WHERE tr.result IS NOT NULL" + ) + + result <- self$get_query(query) + + # Rename columns to id_pred_{N} format + old_names <- names(result) + new_names <- old_names + for (i in seq_along(old_names)) { + if (old_names[i] != "result" && grepl("^\\d+$", old_names[i])) { + new_names[i] <- paste0("id_pred_", old_names[i]) + } + } + data.table::setnames(result, old_names, new_names) + + result + } + ) } diff --git a/R/parquet_duckdb.R b/R/parquet_duckdb.R new file mode 100644 index 0000000..0fa7d51 --- /dev/null +++ b/R/parquet_duckdb.R @@ -0,0 +1,576 @@ +#' R6 Base Class for Parquet-Backed DuckDB Storage +#' +#' @description +#' A domain-agnostic R6 class that provides an interface to a folder-based data +#' storage system using DuckDB for in-memory SQL operations and parquet files +#' for efficient on-disk persistence. This class can be inherited by +#' domain-specific database classes. 
+#' +#' @export + +parquet_duckdb <- R6::R6Class( + classname = "parquet_duckdb", + + ## Public Methods ---- + public = list( + #' @field connection DBI connection object to an in-memory DuckDB database + connection = NULL, + + #' @field path Character string path to the data folder + path = NULL, + + #' @field default_format Default file format for new tables + default_format = NULL, + + #' @field writeopts Default write options for DuckDB + writeopts = NULL, + + #' @description + #' Initialize a new parquet_duckdb object + #' @param path Character string. Path to the data folder. + #' @param default_format Character. Default file format ("parquet" or "csv"). + #' Default is "parquet". + #' @param extensions Character vector of DuckDB extensions to load (e.g., "spatial") + #' + #' @return A new `parquet_duckdb` object + initialize = function( + path, + default_format = c("parquet", "csv"), + extensions = character(0) + ) { + # Create folder if it doesn't exist + self$path <- ensure_dir(path) + + # Set format / writeopts + self$default_format <- match.arg(default_format) + self$writeopts <- switch( + self$default_format, + parquet = "format parquet, compression zstd", + csv = "format csv", + stop(glue::glue("Unsupported format: {self$default_format}")) + ) + + # Create in-memory connection for SQL operations + self$connection <- DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:") + + # load extensions + for (ext in extensions) { + self$execute(glue::glue("install {ext}; load {ext};")) + } + + invisible(self) + }, + + ### Core Database Methods ---- + + #' @description + #' Execute a SQL statement + #' @param statement A SQL statement + #' @return Number of rows affected by statement + execute = function(statement) { + DBI::dbExecute(self$connection, statement) + }, + + #' @description + #' Execute a SQL query and return results + #' @param statement A SQL query statement + #' @return A data.table with query results + get_query = function(statement) { + 
DBI::dbGetQuery(self$connection, statement) |> + data.table::as.data.table() + }, + + #' @description + #' Attach a table from parquet/CSV file as a temporary table in DuckDB + #' @param table_name Character. Name of table to attach. + #' @param columns Character vector. Optional SQL column selection, defaults to "*" + #' @param where Character. Optional SQL WHERE clause to subset the table. + #' @return Invisible NULL (called for side effects) + attach_table = function(table_name, columns = "*", where = NULL) { + file_info <- private$get_file_path(table_name) + + if (!file_info$exists) { + stop(glue::glue("Table '{table_name}' does not exist at path: {self$path}")) + } + + # Build SQL query + sql <- glue::glue( + "create temp table {table_name} as ", + "select {paste(columns, collapse = ', ')} ", + "from read_{file_info$format}('{file_info$path}')" + ) + + if (!is.null(where)) { + sql <- glue::glue("{sql} where {where}") + } + + # Execute SQL + self$execute(sql) + invisible(NULL) + }, + + #' @description + #' Detach a table from the in-memory database + #' @param table_name Character. Name of table to drop. + #' @return Invisible NULL (called for side effects) + detach_table = function(table_name) { + self$execute(paste0("drop table if exists ", table_name, ";")) + invisible(NULL) + }, + + #' @description + #' Get row count for a table + #' @param table_name Character string. Name of the table to query. 
+ #' @return Integer number of rows + row_count = function(table_name) { + file_info <- private$get_file_path(table_name) + + if (!file_info$exists) { + return(0L) + } + + self$get_query( + glue::glue("select count(*) as n from read_{file_info$format}('{file_info$path}')") + )[[1]] + }, + + #' @description + #' List all tables (files) in storage + #' @return Character vector of table names + list_tables = function() { + list.files(self$path, pattern = "\\.(parquet|csv)$", full.names = FALSE) |> + tools::file_path_sans_ext() |> + unique() |> + sort() + }, + + #' @description + #' Execute a function with specified tables attached, handling attach/detach automatically. + #' If a table is already attached in the DuckDB instance, it won't be re-attached or detached. + #' + #' @param tables Character vector of table names to attach + #' @param func Function to execute with tables attached + #' @param ... Additional arguments passed to func + #' @return Result of func + with_tables = function(tables, func, ...) { + # Track which tables we attach (so we know which to detach) + attached_tables <- character(0) + + # Check which tables are already attached + existing_tables <- DBI::dbListTables(self$connection) + + # Attach tables that aren't already present + for (table in tables) { + if (!table %in% existing_tables) { + self$attach_table(table) + attached_tables <- c(attached_tables, table) + } + } + + # Ensure cleanup on exit + on.exit( + { + for (table in attached_tables) { + self$detach_table(table) + } + }, + add = TRUE + ) + + # Execute the function + func(...) + }, + + #' @description + #' Fetch data from a table + #' @param table_name Character string. Name of the table to query. + #' @param where Character string. Optional WHERE clause for the SQL query. + #' @param limit Integer. Optional limit on number of rows to return. 
+ #' + #' @return A data.table + fetch = function(table_name, where = NULL, limit = NULL) { + file_info <- private$get_file_path(table_name) + + if (!file_info$exists) { + return(data.table::data.table()) + } + + # build sql query + sql <- glue::glue("select * from read_{file_info$format}('{file_info$path}')") + + if (!is.null(where)) { + sql <- glue::glue("{sql} where {where}") + } + if (!is.null(limit)) { + sql <- glue::glue("{sql} limit {limit}") + } + + self$get_query(sql) + }, + + #' @description + #' Delete rows from a table + #' @param table_name Character string. Name of the table to delete from. + #' @param where Character string. Optional WHERE clause; if NULL, deletes all rows. + #' @return Number of rows deleted + delete_from = function(table_name, where = NULL) { + file_info <- private$get_file_path(table_name) + + if (!file_info$exists) { + return(0L) + } + + count_before <- self$row_count(table_name) + + if (is.null(where)) { + file.remove(file_info$path) + return(count_before) + } + + self$execute(glue::glue( + r"{ + copy ( + select * from read_{file_info$format}('{file_info$path}') + where not ({where}) + ) + to '{file_info$path}' ({self$writeopts}) + }" + )) + + count_after <- self$row_count(table_name) + + return(count_before - count_after) + }, + + ### Commit Methods ---- + + #' @description + #' Commit data in overwrite mode + #' @param x Data frame to commit + #' @param table_name Character string table name + #' @param autoincrement_cols Character vector of column names to auto-increment + #' @param map_cols Character vector of columns to convert to MAP format + #' @return Invisible NULL (called for side effects) + commit_overwrite = function( + x, + table_name, + autoincrement_cols = character(0), + map_cols = character(0) + ) { + file_path <- file.path(self$path, paste0(table_name, ".", self$default_format)) + + private$register_new_data_v(x, map_cols) + on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) + + # Warn if overriding 
existing IDs + if (length(intersect(autoincrement_cols, names(x))) > 0) { + warning(glue::glue( + "Overriding existing IDs ({toString(autoincrement_cols)}) with row numbers;\n", + "Assign these IDs manually and do not pass any autoincrement_cols to avoid this warning" + )) + } + + # Build SELECT expression + ordinary_cols <- setdiff(names(x), autoincrement_cols) + select_expr <- glue::glue_collapse( + c(glue::glue("row_number() over () as {autoincrement_cols}"), ordinary_cols), + sep = ",\n " + ) + + self$execute(glue::glue( + r"{ + copy ( + select {select_expr} + from new_data_v + ) to '{file_path}' ({self$writeopts}) + }" + )) + + invisible(NULL) + }, + + #' @description + #' Commit data in append mode + #' @param x Data frame to commit + #' @param table_name Character string table name + #' @param autoincrement_cols Character vector of column names to auto-increment + #' @param map_cols Character vector of columns to convert to MAP format + #' @return Invisible NULL (called for side effects) + commit_append = function( + x, + table_name, + autoincrement_cols = character(0), + map_cols = character(0) + ) { + file_info <- private$get_file_path(table_name) + + if (!file_info$exists) { + return(self$commit_overwrite(x, table_name, autoincrement_cols, map_cols)) + } + + self$attach_table(table_name) + on.exit(self$detach_table(table_name), add = TRUE) + private$set_autoincrement_vars(table_name, autoincrement_cols) + + private$register_new_data_v(x, map_cols) + on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) + + ordinary_cols <- setdiff(names(x), autoincrement_cols) + select_new <- glue::glue_collapse( + c( + glue::glue( + "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" + ), + ordinary_cols + ), + sep = ",\n " + ) + + # Concatenation using UNION ALL; "by name" handles missing columns + self$execute(glue::glue( + r"{ + copy ( + select * from {table_name} + union all by name + select {select_new} + from new_data_v + ) 
+ to '{file_info$path}' ({self$writeopts}) + }" + )) + + invisible(NULL) + }, + + #' @description + #' Commit data in upsert mode (update existing, insert new) + #' @param x Data frame to commit + #' @param table_name Character string table name + #' @param key_cols Character vector of columns that define uniqueness + #' @param autoincrement_cols Character vector of column names to auto-increment + #' @param map_cols Character vector of columns to convert to MAP format + #' @return Invisible NULL (called for side effects) + commit_upsert = function( + x, + table_name, + key_cols = grep("^id_", names(x), value = TRUE), + autoincrement_cols = character(0), + map_cols = character(0) + ) { + file_info <- private$get_file_path(table_name) + + if (!file_info$exists) { + return(self$commit_overwrite(x, table_name, autoincrement_cols, map_cols)) + } else if (length(key_cols) == 0) { + return(self$commit_append(x, table_name, autoincrement_cols, map_cols)) + } + + self$attach_table(table_name) + on.exit(self$detach_table(table_name), add = TRUE) + + private$set_autoincrement_vars(table_name, autoincrement_cols) + + private$register_new_data_v(x, map_cols) + on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) + + # Update existing data + ordinary_cols <- setdiff(names(x), union(key_cols, autoincrement_cols)) + update_select_expr <- glue::glue_collapse( + glue::glue("{ordinary_cols} = new_data_v.{ordinary_cols}"), + sep = ",\n " + ) + update_join_condition <- glue::glue_collapse( + glue::glue("{table_name}.{key_cols} = new_data_v.{key_cols}"), + sep = "\nand " + ) + + self$execute(glue::glue( + r"{ + update {table_name} set + {update_select_expr} + from new_data_v + where + {update_join_condition}; + }" + )) + + # Insert new data + insert_select_expr <- glue::glue_collapse( + c( + glue::glue( + "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" + ), + glue::glue("new_data_v.{setdiff(names(x), autoincrement_cols)}") + ), + sep = ",\n 
" + ) + null_condition <- glue::glue_collapse( + glue::glue("{table_name}.{key_cols} is null"), + sep = "\nand " + ) + + self$execute(glue::glue( + r"{ + insert into {table_name} + select + {insert_select_expr} + from + new_data_v + left join + {table_name} + on + {update_join_condition} + where + {null_condition} + ; + }" + )) + + self$execute(glue::glue("copy {table_name} to '{file_info$path}' ({self$writeopts})")) + + invisible(NULL) + }, + + #' @description + #' Print method for parquet_duckdb + #' @param ... Not used + #' @return self (invisibly) + print = function(...) { + cat("\n") + cat(sprintf("Database path: %s\n", self$path)) + cat(sprintf("Default format: %s\n", self$default_format)) + cat(sprintf("Tables: %d\n", length(self$list_tables()))) + invisible(self) + } + ), + + ## Private Methods ---- + private = list( + # R6 hook called on gc(); Close the database connection + finalize = function() { + if (!is.null(self$connection)) { + DBI::dbDisconnect(self$connection) + self$connection <- NULL + } + }, + + # Get file path and format for a table + # + # @param table_name Character string table name + # @return List with path, format, and exists flag + get_file_path = function(table_name) { + # Check for parquet first, then csv + parquet_path <- file.path(self$path, paste0(table_name, ".parquet")) + csv_path <- file.path(self$path, paste0(table_name, ".csv")) + + if (file.exists(parquet_path)) { + return(list(path = parquet_path, format = "parquet", exists = TRUE)) + } else if (file.exists(csv_path)) { + return(list(path = csv_path, format = "csv", exists = TRUE)) + } else { + # Return default format for new file + default_path <- file.path( + self$path, + paste0(table_name, ".", self$default_format) + ) + return(list( + path = default_path, + format = self$default_format, + exists = FALSE + )) + } + }, + + # Register new_data_v table, optionally converting MAP columns + # + # @param x Data to register + # @param map_cols Character vector of columns to 
convert to MAP format
+    # @return NULL (called for side effects)
+    register_new_data_v = function(x, map_cols = character(0)) {
+      if (length(map_cols) == 0) {
+        # No MAP conversion needed - register directly
+        duckdb::duckdb_register(self$connection, "new_data_v", x)
+      } else {
+        # Convert list columns to key-value dataframes
+        x <-
+          data.table::copy(x) |>
+          convert_list_cols(map_cols, list_to_kv_df)
+
+        # Register as intermediate table
+        duckdb::duckdb_register(self$connection, "new_data_raw", x)
+
+        # Build SELECT expression with map_from_entries for MAP columns
+        map_exprs <- glue::glue("map_from_entries({map_cols}) as {map_cols}")
+        other_cols <- setdiff(names(x), map_cols)
+        all_exprs <- c(other_cols, map_exprs)
+        select_expr <- glue::glue_collapse(all_exprs, sep = ", ")
+
+        # Create new_data_v from new_data_raw
+        self$execute(glue::glue(
+          "create temp table new_data_v as select {select_expr} from new_data_raw"
+        ))
+      }
+
+      invisible(NULL)
+    },
+
+    # Cleanup new_data_v and related tables
+    #
+    # @param map_cols Character vector of MAP column names as passed to
+    #   register_new_data_v; non-empty means MAP conversion was used
+    # @return NULL (called for side effects)
+    cleanup_new_data_v = function(map_cols = character(0)) {
+      if (length(map_cols) == 0) {
+        duckdb::duckdb_unregister(self$connection, "new_data_v")
+      } else {
+        self$execute(
+          "drop table if exists new_data_v;
+          drop view if exists new_data_v"
+        )
+        duckdb::duckdb_unregister(self$connection, "new_data_raw")
+      }
+
+      invisible(NULL)
+    },
+
+    # Set DuckDB variables max_{colname} for autoincrement columns
+    #
+    # @param table_name Character string table name
+    # @param autoincrement_cols Character vector of column names
+    # @return NULL (called for side effects)
+    set_autoincrement_vars = function(table_name, autoincrement_cols) {
+      if (length(autoincrement_cols) == 0L) {
+        return(NULL)
+      }
+
+      if (!DBI::dbExistsTable(self$connection, table_name)) {
+        # Attach/detach if not already attached
+        self$attach_table(table_name)
+        
on.exit(self$detach_table(table_name)) + } + + existing_cols <- + glue::glue("select column_name from (describe {table_name})") |> + self$get_query() |> + (\(x) x[[1]])() |> + intersect(autoincrement_cols) + + missing_cols <- setdiff(autoincrement_cols, existing_cols) + + set_exprs <- glue::glue_collapse( + c( + glue::glue( + "set variable max_{existing_cols} = + (select coalesce(max({existing_cols}), 0) from {table_name});" + ), + glue::glue( + "set variable max_{missing_cols} = 0;" + ) + ), + sep = "\n" + ) + + self$execute(set_exprs) + invisible(NULL) + } + ) +) diff --git a/inst/tinytest/test_evoland_db.R b/inst/tinytest/test_evoland_db.R index b84f060..4a73bc6 100644 --- a/inst/tinytest/test_evoland_db.R +++ b/inst/tinytest/test_evoland_db.R @@ -1,10 +1,12 @@ -# Test that a new database can be set up using evoland_db$new() +# Test evoland_db domain-specific functionality +# Generic parquet_duckdb tests are in test_parquet_duckdb.R library(tinytest) # Create temporary directory for testing test_dir <- tempfile("evoland_test_") on.exit(unlink(test_dir, recursive = TRUE), add = TRUE) +# Test 1: evoland_db initialization with reporting expect_silent( db <- evoland_db$new( path = test_dir, @@ -13,13 +15,15 @@ expect_silent( ) ) expect_true(inherits(db, "evoland_db")) +expect_true(inherits(db, "parquet_duckdb")) # Should inherit from parent -# In folder-based storage, only reporting_t exists initially -# Other tables are created on demand -expected_tables_initial <- c("reporting_t") -expect_identical(db$list_tables(), expected_tables_initial) - +# Test 2: Reporting table is created and populated +expect_identical(db$list_tables(), "reporting_t") reporting1 <- db$fetch("reporting_t") +expect_true("report_name" %in% reporting1$key) +expect_true("tinytest" %in% reporting1$value) + +# Test 3: Persistence of reporting data rm(db) gc() db <- evoland_db$new( @@ -34,9 +38,7 @@ expect_equal( sort(reporting2$key) ) - -# Check that accessing non-existent tables returns empty 
data.tables -# (these tables don't appear in list_tables() until they have data) +# Test 4: Domain-specific empty table structures via active bindings empty_tables <- c( "alloc_params_t", "coords_t", @@ -57,7 +59,7 @@ for (table in empty_tables) { expect_equal(nrow(db[[table]]), 0L) } -# Create synthetic, minimal test data +# Test 5: Create synthetic evoland-specific test data coords_t <- create_coords_t_square( epsg = 2056, extent = terra::ext(c( @@ -230,8 +232,7 @@ intrv_masks_t <- as_intrv_masks_t( ) ) -# Test DB roundtrips & integrity checks, repeated assignments (~upserts) -# TODO make this into a testing harness that iterates over a list of sample data +# Test 6: Active bindings - coords_t expect_silent(db$coords_t <- coords_t) expect_silent(db$coords_t <- coords_t) expect_identical(db$coords_t, coords_t) @@ -240,12 +241,15 @@ expect_identical( data.table::as.data.table(coords_t[, 1:3]) ) +# Test 7: Active bindings - lulc_meta_t expect_silent(db$lulc_meta_t <- lulc_meta_t) expect_identical(db$lulc_meta_t, lulc_meta_t) +# Test 8: Active bindings - periods_t expect_silent(db$periods_t <- periods_t) expect_identical(db$periods_t, periods_t) +# Test 9: Active bindings - pred_meta_t with auto-increment expect_silent(db$pred_meta_t <- pred_meta_t[1, ]) expect_silent(db$pred_meta_t <- pred_meta_t) # After committing, id_pred should be assigned (1:2 in this case) @@ -254,20 +258,24 @@ expect_equal(retrieved_pred_meta[["id_pred"]], 1:2) expect_equal(retrieved_pred_meta[["name"]], c("noise", "distance_to_lake")) db$delete_from("pred_meta_t") # clear up, we want to ensure this works with add_predictor +# Test 10: Active bindings - intrv_meta_t (with MAP columns) expect_silent(db$intrv_meta_t <- intrv_meta_t) expect_equal(db$intrv_meta_t, intrv_meta_t) +# Test 11: Active bindings - trans_meta_t expect_silent(db$trans_meta_t <- trans_meta_t) expect_equal(db$trans_meta_t[, c(-1)], trans_meta_t) +# Test 12: Active bindings - trans_models_t (with MAP and BLOB columns) 
expect_silent(db$trans_models_t <- trans_models_t) expect_equal(db$trans_models_t, trans_models_t) -# now that we have the trans_meta we shouldn't be violating the FK anymore +# Test 13: Active bindings - alloc_params_t (with MAP columns) expect_silent(db$alloc_params_t <- alloc_params_t) expect_equal(db$alloc_params_t, alloc_params_t) -# repeated upsert should be idempotent +# Test 14: add_predictor method - first predictor +# Test 15: add_predictor method - second predictor expect_silent( db$add_predictor( pred_spec = pred_spec["noise"], @@ -284,17 +292,19 @@ expect_silent( pred_type = "float" ) ) -# check that add_predictor works for metadata +# Test 16: add_predictor correctly updates metadata retrieved_pred_meta <- db$pred_meta_t expect_equal(retrieved_pred_meta[["id_pred"]], 1:2) expect_equal(retrieved_pred_meta[["name"]], c("noise", "distance_to_lake")) -# check that add_predictor works for data +# Test 17: add_predictor correctly updates data expect_equal(db$row_count("pred_data_t_float"), 48L) +# Test 18: Active bindings - pred_data_t_float upsert is idempotent expect_silent(db$pred_data_t_float <- pred_data_t) expect_equal(db$row_count("pred_data_t_float"), 48L) expect_silent(db$pred_data_t_float <- pred_data_t) expect_equal(db$row_count("pred_data_t_float"), 48L) +# Test 19: Active bindings enforce type validation expect_error( # should only be able to insert the correct class db$pred_data_t_float <- data.table::data.table( @@ -306,6 +316,7 @@ expect_error( r"(^inherits.* is not TRUE$)" ) +# Test 20: lulc_data_t requires proper type expect_error( db$lulc_data_t <- lulc_data_dt <- data.table::data.table( id_coord = c(1L, 2L, 1L), @@ -318,146 +329,77 @@ expect_silent( db$lulc_data_t <- as_lulc_data_t(lulc_data_dt) ) -# Test delete_from functionality -# Setup: we already have pred_data_t_float with 48 rows (id_pred 1:2, id_coord 3:50, id_period 1) -expect_equal(db$row_count("pred_data_t_float"), 48L) - -# Test 1: Delete with WHERE clause - delete specific 
predictor -deleted_count <- db$delete_from("pred_data_t_float", where = "id_pred = 1") -expect_equal(deleted_count, 24L) -expect_equal(db$row_count("pred_data_t_float"), 24L) - -# Verify only id_pred = 2 remains -remaining <- db$pred_data_t_float -expect_equal(unique(remaining$id_pred), 2L) -expect_equal(nrow(remaining), 24L) +# Test 21: Domain-specific view - coords_minimal +coords_minimal <- db$coords_minimal +expect_true(inherits(coords_minimal, "data.table")) +expect_equal(ncol(coords_minimal), 3L) +expect_true(all(c("id_coord", "lon", "lat") %in% names(coords_minimal))) +expect_equal(nrow(coords_minimal), nrow(coords_t)) + +# Test 22: Domain-specific view - extent +# Set up coords first +db$coords_t <- coords_t +extent <- db$extent +expect_true(inherits(extent, "SpatExtent")) + +# Test 23: Domain-specific view - lulc_meta_long_v +db$lulc_meta_t <- lulc_meta_t +lulc_long <- db$lulc_meta_long_v +expect_true(inherits(lulc_long, "data.table")) +expect_true("src_class" %in% names(lulc_long)) +# Should have one row per src_class +expect_true(nrow(lulc_long) > nrow(lulc_meta_t)) + +# Test 24: Domain-specific view - pred_sources_v +db$pred_meta_t <- pred_meta_t +sources <- db$pred_sources_v +expect_true(inherits(sources, "data.table")) +expect_true(all(c("url", "md5sum") %in% names(sources))) +expect_true(nrow(sources) > 0L) + +# Test 25: set_coords method +test_dir_coords <- tempfile("evoland_coords_") +on.exit(unlink(test_dir_coords, recursive = TRUE), add = TRUE) +db_coords <- evoland_db$new(test_dir_coords) -# Test 2: Delete with complex WHERE clause -# Add back some data first -db$pred_data_t_float <- pred_data_t -expect_equal(db$row_count("pred_data_t_float"), 48L) - -# Delete only specific coordinates -deleted_count <- db$delete_from("pred_data_t_float", where = "id_coord < 10") -expect_true(deleted_count > 0L) -remaining <- db$pred_data_t_float -expect_true(all(remaining$id_coord >= 10)) - -# Test 3: Delete all rows (NULL where clause) -count_before_delete <- 
db$row_count("pred_data_t_float") -deleted_count <- db$delete_from("pred_data_t_float", where = NULL) -expect_equal(deleted_count, count_before_delete) -expect_equal(db$row_count("pred_data_t_float"), 0L) - -# Test 4: Delete from non-existent table returns 0 -deleted_count <- db$delete_from("nonexistent_table", where = "id = 1") -expect_equal(deleted_count, 0L) - -# Test 5: Delete with WHERE that matches nothing -db$pred_data_t_float <- pred_data_t -initial_count <- db$row_count("pred_data_t_float") -deleted_count <- db$delete_from("pred_data_t_float", where = "id_pred = 999") -expect_equal(deleted_count, 0L) -expect_equal(db$row_count("pred_data_t_float"), initial_count) - -# Test auto-increment functionality -# Create a new test database for auto-increment tests -test_dir_autoinc <- tempfile("evoland_autoinc_") -on.exit(unlink(test_dir_autoinc, recursive = TRUE), add = TRUE) -db_autoinc <- evoland_db$new(test_dir_autoinc) - -# Test 1: Auto-increment on overwrite mode (new table) -test_data_1 <- data.table::data.table( - name = c("predictor_a", "predictor_b", "predictor_c"), - unit = c("m", "kg", "s") -) -db_autoinc$commit_overwrite( - test_data_1, - "test_autoinc_t", - autoincrement_cols = "id_test" -) -result_1 <- db_autoinc$fetch("test_autoinc_t") -expect_equal(result_1$id_test, 1:3) -expect_equal(result_1$name, c("predictor_a", "predictor_b", "predictor_c")) - -# Test 2: Auto-increment on append mode -test_data_2 <- data.table::data.table( - name = c("predictor_d", "predictor_e"), - unit = c("A", "V") -) -db_autoinc$commit_append( - test_data_2, - "test_autoinc_t", - autoincrement_cols = "id_test" -) -result_2 <- db_autoinc$fetch("test_autoinc_t") -expect_equal(nrow(result_2), 5L) -expect_equal(result_2$id_test, 1:5) -expect_equal(result_2$name[4:5], c("predictor_d", "predictor_e")) - -# Test 3: Auto-increment on upsert mode with new rows -test_data_3 <- data.table::data.table( - name = c("predictor_f", "predictor_g"), - unit = c("W", "J") -) 
-db_autoinc$commit_upsert( - test_data_3, - "test_autoinc_t", - autoincrement_cols = "id_test" -) -result_3 <- db_autoinc$fetch("test_autoinc_t") -expect_equal(nrow(result_3), 7L) -expect_equal(result_3$id_test, 1:7) - -# Test 4: Auto-increment preserves existing IDs in data -test_data_4 <- data.table::data.table( - id_test = c(NA, 100L, NA), - name = c("new_a", "existing", "new_b"), - unit = c("x", "y", "z") +expect_silent( + db_coords$set_coords( + type = "square", + epsg = 2056, + extent = terra::ext(c(xmin = 2697000, xmax = 2698000, ymin = 1252000, ymax = 1253000)), + resolution = 100 + ) ) +expect_true(db_coords$row_count("coords_t") > 0L) + +# Should refuse to overwrite expect_warning( - db_autoinc$commit_overwrite( - test_data_4, - "test_autoinc2_t", - autoincrement_cols = "id_test" + db_coords$set_coords( + type = "square", + epsg = 2056, + extent = terra::ext(c(xmin = 2697000, xmax = 2698000, ymin = 1252000, ymax = 1253000)), + resolution = 100 ), - "Overriding existing IDs" + "not empty" ) -result_4 <- db_autoinc$fetch("test_autoinc2_t") -expect_equal(result_4$id_test, 1:3) -# Test 5: Multiple auto-increment columns -test_data_5 <- data.table::data.table( - name = c("item1", "item2"), - value = c(10, 20) -) -db_autoinc$commit_overwrite( - test_data_5, - "test_multi_autoinc_t", - autoincrement_cols = c("id_a", "id_b") -) -result_5 <- db_autoinc$fetch("test_multi_autoinc_t") -expect_equal(result_5$id_a, 1:2) -expect_equal(result_5$id_b, 1:2) - -# Test 6: Auto-increment continues from max in append -test_data_6a <- data.table::data.table( - id_seq = c(5L, 10L, 15L), - value = c(100, 200, 300) -) -db_autoinc$commit_overwrite( - test_data_6a, - "test_continue_t" -) -test_data_6b <- data.table::data.table( - value = c(400, 500) +# Test 26: set_periods method +test_dir_periods <- tempfile("evoland_periods_") +on.exit(unlink(test_dir_periods, recursive = TRUE), add = TRUE) +db_periods <- evoland_db$new(test_dir_periods) + +expect_silent( + db_periods$set_periods( 
+ period_length_str = "P10Y", + start_observed = "1985-01-01", + end_observed = "2020-01-01", + end_extrapolated = "2060-01-01" + ) ) -db_autoinc$commit_append( - test_data_6b, - "test_continue_t", - autoincrement_cols = "id_seq" +expect_true(db_periods$row_count("periods_t") > 0L) + +# Should refuse to overwrite +expect_warning( + db_periods$set_periods(), + "not empty" ) -result_6 <- db_autoinc$fetch("test_continue_t") -expect_equal(nrow(result_6), 5L) -expect_equal(result_6$id_seq[4:5], c(16L, 17L)) -expect_equal(result_6$value[4:5], c(400, 500)) diff --git a/inst/tinytest/test_parquet_duckdb.R b/inst/tinytest/test_parquet_duckdb.R new file mode 100644 index 0000000..fe9b188 --- /dev/null +++ b/inst/tinytest/test_parquet_duckdb.R @@ -0,0 +1,349 @@ +# Test generic parquet_duckdb functionality +library(tinytest) + +# Create temporary directory for testing +test_dir <- tempfile("parquet_duckdb_test_") +on.exit(unlink(test_dir, recursive = TRUE), add = TRUE) + +# Test 1: Initialization +expect_silent( + db <- parquet_duckdb$new( + path = test_dir, + default_format = "parquet" + ) +) +expect_true(inherits(db, "parquet_duckdb")) +expect_true(dir.exists(test_dir)) +expect_equal(db$default_format, "parquet") +expect_true(!is.null(db$connection)) +expect_true(inherits(db$connection, "duckdb_connection")) + +# Test 2: Initial state - no tables +expect_identical(db$list_tables(), character(0)) + +# Test 3: Fetch from non-existent table returns empty data.table +result <- db$fetch("nonexistent_table") +expect_true(inherits(result, "data.table")) +expect_equal(nrow(result), 0L) + +# Test 4: Row count for non-existent table +expect_equal(db$row_count("nonexistent_table"), 0L) + +# Test 5: commit_overwrite creates new table +test_data_1 <- data.table::data.table( + id = 1:5, + name = letters[1:5], + value = c(10.1, 20.2, 30.3, 40.4, 50.5) +) +expect_silent( + db$commit_overwrite(test_data_1, "test_table_1") +) +expect_true("test_table_1" %in% db$list_tables()) 
+expect_equal(db$row_count("test_table_1"), 5L) + +# Test 6: Fetch retrieves committed data +retrieved <- db$fetch("test_table_1") +expect_equal(retrieved, test_data_1) + +# Test 7: commit_overwrite replaces existing data +test_data_1b <- data.table::data.table( + id = 10:12, + name = letters[24:26], + value = c(100.1, 200.2, 300.3) +) +expect_silent( + db$commit_overwrite(test_data_1b, "test_table_1") +) +expect_equal(db$row_count("test_table_1"), 3L) +retrieved <- db$fetch("test_table_1") +expect_equal(retrieved, test_data_1b) + +# Test 8: commit_append adds to existing data +test_data_1c <- data.table::data.table( + id = 13:15, + name = letters[1:3], + value = c(111.1, 222.2, 333.3) +) +expect_silent( + db$commit_append(test_data_1c, "test_table_1") +) +expect_equal(db$row_count("test_table_1"), 6L) +retrieved <- db$fetch("test_table_1") +expect_equal(nrow(retrieved), 6L) +expect_true(all(c(10:15) %in% retrieved$id)) + +# Test 9: commit_append on non-existent table creates it +expect_silent( + db$commit_append(test_data_1, "test_table_2") +) +expect_true("test_table_2" %in% db$list_tables()) +expect_equal(db$row_count("test_table_2"), 5L) + +# Test 10: commit_upsert on non-existent table creates it +test_data_3 <- data.table::data.table( + id_key = 1:3, + name = c("a", "b", "c"), + value = c(1.1, 2.2, 3.3) +) +expect_silent( + db$commit_upsert(test_data_3, "test_table_3", key_cols = "id_key") +) +expect_true("test_table_3" %in% db$list_tables()) +expect_equal(db$row_count("test_table_3"), 3L) + +# Test 11: commit_upsert updates existing rows and inserts new ones +test_data_3b <- data.table::data.table( + id_key = c(2L, 3L, 4L), + name = c("b_updated", "c_updated", "d"), + value = c(22.2, 33.3, 44.4) +) +expect_silent( + db$commit_upsert(test_data_3b, "test_table_3", key_cols = "id_key") +) +expect_equal(db$row_count("test_table_3"), 4L) +retrieved <- db$fetch("test_table_3") +expect_equal(nrow(retrieved), 4L) +expect_equal(retrieved[id_key == 2]$name, 
"b_updated") +expect_equal(retrieved[id_key == 3]$value, 33.3) +expect_equal(retrieved[id_key == 4]$name, "d") +expect_equal(retrieved[id_key == 1]$name, "a") # unchanged + +# Test 12: Fetch with WHERE clause +result <- db$fetch("test_table_3", where = "id_key >= 3") +expect_equal(nrow(result), 2L) +expect_true(all(result$id_key >= 3)) + +# Test 13: Fetch with LIMIT +result <- db$fetch("test_table_3", limit = 2) +expect_equal(nrow(result), 2L) + +# Test 14: Fetch with WHERE and LIMIT +result <- db$fetch("test_table_3", where = "value > 20", limit = 1) +expect_equal(nrow(result), 1L) +expect_true(result$value > 20) + +# Test 15: delete_from with WHERE clause +deleted_count <- db$delete_from("test_table_3", where = "id_key = 1") +expect_equal(deleted_count, 1L) +expect_equal(db$row_count("test_table_3"), 3L) +retrieved <- db$fetch("test_table_3") +expect_false(1L %in% retrieved$id_key) + +# Test 16: delete_from with complex WHERE +db$commit_overwrite(test_data_1, "test_table_4") +deleted_count <- db$delete_from("test_table_4", where = "id < 3") +expect_equal(deleted_count, 2L) +retrieved <- db$fetch("test_table_4") +expect_true(all(retrieved$id >= 3)) + +# Test 17: delete_from with NULL (delete all) +count_before <- db$row_count("test_table_4") +deleted_count <- db$delete_from("test_table_4", where = NULL) +expect_equal(deleted_count, count_before) +expect_equal(db$row_count("test_table_4"), 0L) + +# Test 18: delete_from on non-existent table +deleted_count <- db$delete_from("nonexistent", where = "id = 1") +expect_equal(deleted_count, 0L) + +# Test 19: delete_from with WHERE that matches nothing +db$commit_overwrite(test_data_1, "test_table_5") +initial_count <- db$row_count("test_table_5") +deleted_count <- db$delete_from("test_table_5", where = "id = 999") +expect_equal(deleted_count, 0L) +expect_equal(db$row_count("test_table_5"), initial_count) + +# Test 20: Auto-increment on overwrite (new table) +test_autoinc_1 <- data.table::data.table( + name = c("item_a", 
"item_b", "item_c"), + value = c(10, 20, 30) +) +db$commit_overwrite( + test_autoinc_1, + "test_autoinc_1", + autoincrement_cols = "id" +) +result <- db$fetch("test_autoinc_1") +expect_equal(result$id, 1:3) +expect_equal(result$name, c("item_a", "item_b", "item_c")) + +# Test 21: Auto-increment on append +test_autoinc_2 <- data.table::data.table( + name = c("item_d", "item_e"), + value = c(40, 50) +) +db$commit_append( + test_autoinc_2, + "test_autoinc_1", + autoincrement_cols = "id" +) +result <- db$fetch("test_autoinc_1") +expect_equal(nrow(result), 5L) +expect_equal(result$id, 1:5) +expect_equal(result$name[4:5], c("item_d", "item_e")) + +# Test 22: Auto-increment on upsert with new rows +test_autoinc_3 <- data.table::data.table( + name = c("item_f", "item_g"), + value = c(60, 70) +) +db$commit_upsert( + test_autoinc_3, + "test_autoinc_1", + autoincrement_cols = "id" +) +result <- db$fetch("test_autoinc_1") +expect_equal(nrow(result), 7L) +expect_equal(result$id, 1:7) + +# Test 23: Auto-increment warning when overriding existing IDs +test_autoinc_with_ids <- data.table::data.table( + id = c(NA, 100L, NA), + name = c("new_a", "existing", "new_b"), + value = c(1, 2, 3) +) +expect_warning( + db$commit_overwrite( + test_autoinc_with_ids, + "test_autoinc_2", + autoincrement_cols = "id" + ), + "Overriding existing IDs" +) +result <- db$fetch("test_autoinc_2") +expect_equal(result$id, 1:3) + +# Test 24: Multiple auto-increment columns +test_multi_autoinc <- data.table::data.table( + name = c("item1", "item2"), + value = c(10, 20) +) +db$commit_overwrite( + test_multi_autoinc, + "test_multi_autoinc", + autoincrement_cols = c("id_a", "id_b") +) +result <- db$fetch("test_multi_autoinc") +expect_equal(result$id_a, 1:2) +expect_equal(result$id_b, 1:2) + +# Test 25: Auto-increment continues from max +test_continue_a <- data.table::data.table( + id_seq = c(5L, 10L, 15L), + value = c(100, 200, 300) +) +db$commit_overwrite(test_continue_a, "test_continue") + +test_continue_b <- 
data.table::data.table( + value = c(400, 500) +) +db$commit_append( + test_continue_b, + "test_continue", + autoincrement_cols = "id_seq" +) +result <- db$fetch("test_continue") +expect_equal(nrow(result), 5L) +expect_equal(result$id_seq[4:5], c(16L, 17L)) +expect_equal(result$value[4:5], c(400, 500)) + +# Test 26: attach_table and detach_table +db$commit_overwrite(test_data_1, "test_attach") +expect_silent(db$attach_table("test_attach")) +# Verify table is attached by querying it directly +result <- db$get_query("SELECT COUNT(*) as n FROM test_attach") +expect_equal(result$n, 5L) +expect_silent(db$detach_table("test_attach")) +# After detach, table should not be accessible +expect_error( + db$get_query("SELECT COUNT(*) as n FROM test_attach"), + "test_attach" +) + +# Test 27: attach_table with column selection +db$attach_table("test_attach", columns = c("id", "name")) +result <- db$get_query("SELECT * FROM test_attach") +expect_equal(ncol(result), 2L) +expect_true(all(c("id", "name") %in% names(result))) +expect_false("value" %in% names(result)) +db$detach_table("test_attach") + +# Test 28: attach_table with WHERE clause +db$attach_table("test_attach", where = "id > 3") +result <- db$get_query("SELECT * FROM test_attach") +expect_equal(nrow(result), 2L) +expect_true(all(result$id > 3)) +db$detach_table("test_attach") + +# Test 29: execute() method +db$attach_table("test_attach") +rows_affected <- db$execute("DELETE FROM test_attach WHERE id = 1") +expect_true(rows_affected >= 0) # DuckDB returns number of affected rows +db$detach_table("test_attach") + +# Test 30: get_query() method +db$attach_table("test_attach") +result <- db$get_query("SELECT MAX(id) as max_id FROM test_attach") +expect_true(inherits(result, "data.table")) +expect_true("max_id" %in% names(result)) +db$detach_table("test_attach") + +# Test 31: CSV format support +test_dir_csv <- tempfile("parquet_duckdb_csv_") +on.exit(unlink(test_dir_csv, recursive = TRUE), add = TRUE) + +db_csv <- 
parquet_duckdb$new( + path = test_dir_csv, + default_format = "csv" +) +expect_equal(db_csv$default_format, "csv") + +test_csv_data <- data.table::data.table( + id = 1:3, + name = c("a", "b", "c") +) +db_csv$commit_overwrite(test_csv_data, "csv_table") +expect_true("csv_table" %in% db_csv$list_tables()) +retrieved <- db_csv$fetch("csv_table") +expect_equal(retrieved, test_csv_data) + +# Test 32: Extension loading +test_dir_ext <- tempfile("parquet_duckdb_ext_") +on.exit(unlink(test_dir_ext, recursive = TRUE), add = TRUE) + +db_ext <- parquet_duckdb$new( + path = test_dir_ext, + extensions = "spatial" +) +# Verify spatial extension is loaded by using a spatial function +expect_silent( + db_ext$get_query("SELECT ST_Point(0, 0) as geom") +) + +# Test 33: Persistence across connections +db$commit_overwrite(test_data_1, "persist_test") +rm(db) +gc() + +# Reconnect to same path +db <- parquet_duckdb$new(path = test_dir) +expect_true("persist_test" %in% db$list_tables()) +retrieved <- db$fetch("persist_test") +expect_equal(retrieved, test_data_1) + +# Test 34: commit_upsert with no key_cols defaults to append +test_no_keys <- data.table::data.table( + name = c("x", "y"), + value = c(1, 2) +) +db$commit_overwrite(test_no_keys, "no_keys_test") +expect_equal(db$row_count("no_keys_test"), 2L) + +db$commit_upsert(test_no_keys, "no_keys_test", key_cols = character(0)) +expect_equal(db$row_count("no_keys_test"), 4L) # Should append + +# Test 35: Print method +output <- capture.output(print(db)) +expect_true(any(grepl("parquet_duckdb", output))) +expect_true(any(grepl("Database path", output))) +expect_true(any(grepl("Default format", output))) diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index af49fa0..49cf8f8 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -8,19 +8,11 @@ An R6 class that provides an interface to a folder-based data storage system for the evoland package. Each table is stored as a parquet (or CSV) file. 
This class uses DuckDB for in-memory SQL operations while persisting data to disk in parquet format for better compression. -} -\section{Public fields}{ -\if{html}{\out{
}} -\describe{ -\item{\code{connection}}{DBI connection object to an in-memory DuckDB database} - -\item{\code{path}}{Character string path to the data folder} -\item{\code{default_format}}{Default file format for new tables} - -\item{\code{writeopts}}{Default write options for DuckDB, see} +Inherits from \link{parquet_duckdb} for generic database operations. } -\if{html}{\out{
}} +\section{Super class}{ +\code{\link[evoland:parquet_duckdb]{evoland::parquet_duckdb}} -> \code{evoland_db} } \section{Active bindings}{ \if{html}{\out{
}} @@ -87,17 +79,7 @@ of object to assign. Assigning is an upsert operation.} \subsection{Public methods}{ \itemize{ \item \href{#method-evoland_db-new}{\code{evoland_db$new()}} -\item \href{#method-evoland_db-commit_overwrite}{\code{evoland_db$commit_overwrite()}} -\item \href{#method-evoland_db-commit_append}{\code{evoland_db$commit_append()}} -\item \href{#method-evoland_db-commit_upsert}{\code{evoland_db$commit_upsert()}} \item \href{#method-evoland_db-fetch}{\code{evoland_db$fetch()}} -\item \href{#method-evoland_db-list_tables}{\code{evoland_db$list_tables()}} -\item \href{#method-evoland_db-execute}{\code{evoland_db$execute()}} -\item \href{#method-evoland_db-get_query}{\code{evoland_db$get_query()}} -\item \href{#method-evoland_db-attach_table}{\code{evoland_db$attach_table()}} -\item \href{#method-evoland_db-detach_table}{\code{evoland_db$detach_table()}} -\item \href{#method-evoland_db-row_count}{\code{evoland_db$row_count()}} -\item \href{#method-evoland_db-delete_from}{\code{evoland_db$delete_from()}} \item \href{#method-evoland_db-set_report}{\code{evoland_db$set_report()}} \item \href{#method-evoland_db-set_coords}{\code{evoland_db$set_coords()}} \item \href{#method-evoland_db-set_periods}{\code{evoland_db$set_periods()}} @@ -106,6 +88,23 @@ of object to assign. Assigning is an upsert operation.} \item \href{#method-evoland_db-clone}{\code{evoland_db$clone()}} } } +\if{html}{\out{ +
Inherited methods + +
+}} \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-new}{}}} @@ -132,98 +131,10 @@ A new \code{evoland_db} object } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-commit_overwrite}{}}} -\subsection{Method \code{commit_overwrite()}}{ -Commit data in overwrite mode -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$commit_overwrite( - x, - table_name, - autoincrement_cols = character(0), - map_cols = character(0) -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Data frame to commit} - -\item{\code{table_name}}{Character string table name} - -\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} - -\item{\code{map_cols}}{Character vector of columns to convert to MAP format} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-commit_append}{}}} -\subsection{Method \code{commit_append()}}{ -Commit data in append mode -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$commit_append( - x, - table_name, - autoincrement_cols = character(0), - map_cols = character(0) -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Data frame to commit} - -\item{\code{table_name}}{Character string table name} - -\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} - -\item{\code{map_cols}}{Character vector of columns to convert to MAP format} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-commit_upsert}{}}} -\subsection{Method \code{commit_upsert()}}{ -Commit data in upsert mode -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$commit_upsert( - x, - table_name, - key_cols = grep("^id_", names(x), value = TRUE), - autoincrement_cols = character(0), - map_cols = character(0) -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Data frame to commit} - -\item{\code{table_name}}{Character string table name} - -\item{\code{key_cols}}{Identify unique columns - heuristic: if prefixed with -id_, the set of all columns designates a uniqueness condition} - -\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} - -\item{\code{map_cols}}{Character vector of columns to convert to MAP format} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-fetch}{}}} \subsection{Method \code{fetch()}}{ -Fetch data from storage +Fetch data from storage with evoland-specific view support \subsection{Usage}{ \if{html}{\out{
}}\preformatted{evoland_db$fetch(table_name, where = NULL, limit = NULL)}\if{html}{\out{
}} } @@ -244,141 +155,6 @@ A data.table } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-list_tables}{}}} -\subsection{Method \code{list_tables()}}{ -List all tables (files) in storage -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$list_tables()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -Character vector of table names -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-execute}{}}} -\subsection{Method \code{execute()}}{ -Execute statement -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$execute(statement)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{statement}}{A SQL statement} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -No. of rows affected by statement -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-get_query}{}}} -\subsection{Method \code{get_query()}}{ -Get Query -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$get_query(statement)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{statement}}{A SQL statement} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -No. of rows affected by statement -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-attach_table}{}}} -\subsection{Method \code{attach_table()}}{ -Attach a single table as a temporary table in DuckDB memory. This is -useful for working with multiple tables in SQL queries instead of loading -them into R objects. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$attach_table(table_name, columns = "*", where = NULL)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{table_name}}{Character vector. Names of table to attach.} - -\item{\code{columns}}{Character vector. Optional sql column selection, defaults to "*"} - -\item{\code{where}}{A SQL where statement to optionally subset the table being attached.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-detach_table}{}}} -\subsection{Method \code{detach_table()}}{ -Detach one or more tables from the database. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$detach_table(table_name)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{table_name}}{Character. Name of table to drop.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-row_count}{}}} -\subsection{Method \code{row_count()}}{ -Get table row count -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$row_count(table_name)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{table_name}}{Character string. Name of the table to query.} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -No. of rows -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-delete_from}{}}} -\subsection{Method \code{delete_from()}}{ -Delete rows from a table -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$delete_from(table_name, where = NULL)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{table_name}}{Character string. Name of the table to delete from.} - -\item{\code{where}}{Character string, defaults to NULL: delete everything in table.} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -No. of rows affected -} -} -\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-set_report}{}}} \subsection{Method \code{set_report()}}{ diff --git a/man/evoland_db_views.Rd b/man/evoland_db_views.Rd index d149f48..a080c24 100644 --- a/man/evoland_db_views.Rd +++ b/man/evoland_db_views.Rd @@ -7,6 +7,7 @@ \alias{make_coords_minimal} \alias{make_extent_db} \alias{make_transitions_v} +\alias{make_trans_pred_data_v} \title{Views on the evoland-plus data model} \usage{ make_pred_sources_v(self, private, where = NULL) @@ -18,6 +19,14 @@ make_coords_minimal(self, private, where = NULL) make_extent_db(self, private) make_transitions_v(self, private, where = NULL) + +make_trans_pred_data_v(self, private, id_trans) +} +\arguments{ +\item{id_trans}{Integer, the transition ID to generate data for} +} +\value{ +data.table with columns: result (0/1), id_pred_1, id_pred_2, ..., id_pred_N } \description{ Functions to generate views on the database @@ -35,4 +44,7 @@ md5sum \item \code{make_transitions_v()}: Returns transitions based on lulc_data_t +\item \code{make_trans_pred_data_v()}: Returns wide table of transition results and predictor data +for a specific transition. Used as input to covariance filtering. + }} diff --git a/man/parquet_duckdb.Rd b/man/parquet_duckdb.Rd new file mode 100644 index 0000000..964cd90 --- /dev/null +++ b/man/parquet_duckdb.Rd @@ -0,0 +1,395 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parquet_duckdb.R +\name{parquet_duckdb} +\alias{parquet_duckdb} +\title{R6 Base Class for Parquet-Backed DuckDB Storage} +\description{ +A domain-agnostic R6 class that provides an interface to a folder-based data +storage system using DuckDB for in-memory SQL operations and parquet files +for efficient on-disk persistence. This class can be inherited by +domain-specific database classes. +} +\section{Public fields}{ +\if{html}{\out{
}} +\describe{ +\item{\code{connection}}{DBI connection object to an in-memory DuckDB database} + +\item{\code{path}}{Character string path to the data folder} + +\item{\code{default_format}}{Default file format for new tables} + +\item{\code{writeopts}}{Default write options for DuckDB} +} +\if{html}{\out{
}} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-parquet_duckdb-new}{\code{parquet_duckdb$new()}} +\item \href{#method-parquet_duckdb-execute}{\code{parquet_duckdb$execute()}} +\item \href{#method-parquet_duckdb-get_query}{\code{parquet_duckdb$get_query()}} +\item \href{#method-parquet_duckdb-attach_table}{\code{parquet_duckdb$attach_table()}} +\item \href{#method-parquet_duckdb-detach_table}{\code{parquet_duckdb$detach_table()}} +\item \href{#method-parquet_duckdb-row_count}{\code{parquet_duckdb$row_count()}} +\item \href{#method-parquet_duckdb-list_tables}{\code{parquet_duckdb$list_tables()}} +\item \href{#method-parquet_duckdb-with_tables}{\code{parquet_duckdb$with_tables()}} +\item \href{#method-parquet_duckdb-fetch}{\code{parquet_duckdb$fetch()}} +\item \href{#method-parquet_duckdb-delete_from}{\code{parquet_duckdb$delete_from()}} +\item \href{#method-parquet_duckdb-commit_overwrite}{\code{parquet_duckdb$commit_overwrite()}} +\item \href{#method-parquet_duckdb-commit_append}{\code{parquet_duckdb$commit_append()}} +\item \href{#method-parquet_duckdb-commit_upsert}{\code{parquet_duckdb$commit_upsert()}} +\item \href{#method-parquet_duckdb-print}{\code{parquet_duckdb$print()}} +\item \href{#method-parquet_duckdb-clone}{\code{parquet_duckdb$clone()}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-new}{}}} +\subsection{Method \code{new()}}{ +Initialize a new parquet_duckdb object +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$new( + path, + default_format = c("parquet", "csv"), + extensions = character(0) +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{path}}{Character string. Path to the data folder.} + +\item{\code{default_format}}{Character. Default file format ("parquet" or "csv"). +Default is "parquet".} + +\item{\code{extensions}}{Character vector of DuckDB extensions to load (e.g., "spatial")} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A new \code{parquet_duckdb} object +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-execute}{}}} +\subsection{Method \code{execute()}}{ +Execute a SQL statement +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$execute(statement)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{statement}}{A SQL statement} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Number of rows affected by statement +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-get_query}{}}} +\subsection{Method \code{get_query()}}{ +Execute a SQL query and return results +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$get_query(statement)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{statement}}{A SQL query statement} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A data.table with query results +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-attach_table}{}}} +\subsection{Method \code{attach_table()}}{ +Attach a table from parquet/CSV file as a temporary table in DuckDB +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$attach_table(table_name, columns = "*", where = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{table_name}}{Character. Name of table to attach.} + +\item{\code{columns}}{Character vector. Optional SQL column selection, defaults to "*"} + +\item{\code{where}}{Character. Optional SQL WHERE clause to subset the table.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Invisible NULL (called for side effects) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-detach_table}{}}} +\subsection{Method \code{detach_table()}}{ +Detach a table from the in-memory database +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$detach_table(table_name)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{table_name}}{Character. Name of table to drop.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Invisible NULL (called for side effects) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-row_count}{}}} +\subsection{Method \code{row_count()}}{ +Get row count for a table +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$row_count(table_name)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{table_name}}{Character string. Name of the table to query.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Integer number of rows +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-list_tables}{}}} +\subsection{Method \code{list_tables()}}{ +List all tables (files) in storage +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$list_tables()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +Character vector of table names +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-with_tables}{}}} +\subsection{Method \code{with_tables()}}{ +Execute a function with specified tables attached, handling attach/detach automatically. +If a table is already attached in the DuckDB instance, it won't be re-attached or detached. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$with_tables(tables, func, ...)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{tables}}{Character vector of table names to attach} + +\item{\code{func}}{Function to execute with tables attached} + +\item{\code{...}}{Additional arguments passed to func} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Result of func +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-fetch}{}}} +\subsection{Method \code{fetch()}}{ +Fetch data from a table +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$fetch(table_name, where = NULL, limit = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{table_name}}{Character string. Name of the table to query.} + +\item{\code{where}}{Character string. Optional WHERE clause for the SQL query.} + +\item{\code{limit}}{Integer. Optional limit on number of rows to return.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A data.table +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-delete_from}{}}} +\subsection{Method \code{delete_from()}}{ +Delete rows from a table +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$delete_from(table_name, where = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{table_name}}{Character string. Name of the table to delete from.} + +\item{\code{where}}{Character string. Optional WHERE clause; if NULL, deletes all rows.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Number of rows deleted +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-commit_overwrite}{}}} +\subsection{Method \code{commit_overwrite()}}{ +Commit data in overwrite mode +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$commit_overwrite( + x, + table_name, + autoincrement_cols = character(0), + map_cols = character(0) +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Data frame to commit} + +\item{\code{table_name}}{Character string table name} + +\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} + +\item{\code{map_cols}}{Character vector of columns to convert to MAP format} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Invisible NULL (called for side effects) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-commit_append}{}}} +\subsection{Method \code{commit_append()}}{ +Commit data in append mode +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$commit_append( + x, + table_name, + autoincrement_cols = character(0), + map_cols = character(0) +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Data frame to commit} + +\item{\code{table_name}}{Character string table name} + +\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} + +\item{\code{map_cols}}{Character vector of columns to convert to MAP format} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Invisible NULL (called for side effects) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-commit_upsert}{}}} +\subsection{Method \code{commit_upsert()}}{ +Commit data in upsert mode (update existing, insert new) +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$commit_upsert( + x, + table_name, + key_cols = grep("^id_", names(x), value = TRUE), + autoincrement_cols = character(0), + map_cols = character(0) +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Data frame to commit} + +\item{\code{table_name}}{Character string table name} + +\item{\code{key_cols}}{Character vector of columns that define uniqueness} + +\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} + +\item{\code{map_cols}}{Character vector of columns to convert to MAP format} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Invisible NULL (called for side effects) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-print}{}}} +\subsection{Method \code{print()}}{ +Print method for parquet_duckdb +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$print(...)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{...}}{Not used} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +self (invisibly) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +} From 0aa5a46bb7ed083aeec951314a2d89be7a8cc403 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Thu, 27 Nov 2025 19:18:03 +0100 Subject: [PATCH 06/27] initial trans-preds implementation --- R/trans_preds_t.R | 112 +++++++++++++++- inst/tinytest/test_trans_preds_t.R | 203 ++++++++++++++++++++++++++++- 2 files changed, 311 insertions(+), 4 deletions(-) diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index 6d7c5dc..9e7c420 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -19,6 +19,8 @@ as_trans_preds_t <- function(x) { id_trans = integer(0) ) } + cast_dt_col(x, "id_pred", as.integer) + cast_dt_col(x, "id_trans", as.integer) new_evoland_table( x, "trans_preds_t", @@ -27,10 +29,114 @@ as_trans_preds_t <- function(x) { } #' @describeIn trans_preds_t Create a transition-predictor relation, i.e. records the -#' result of a predictor selection step. +#' result of a predictor selection step. Runs covariance filtering for each viable +#' transition and stores the selected predictors. +#' @param db An [evoland_db] instance with populated tables +#' @param corcut Numeric threshold (0-1) for correlation filtering passed to [covariance_filter()] +#' @param rank_fun Optional ranking function passed to [covariance_filter()] +#' @param weights Optional weights passed to [covariance_filter()] +#' @param ... Additional arguments passed to rank_fun via [covariance_filter()] #' @export -create_trans_preds_t <- function() { - as_trans_preds_t() +create_trans_preds_t <- function( + db, + corcut = 0.7, + rank_fun = rank_poly_glm, + weights = NULL, + ... 
+) { + stopifnot( + "db must be an evoland_db instance" = inherits(db, "evoland_db") + ) + + viable_trans <- db$trans_meta_t[is_viable == TRUE] + pred_meta <- db$pred_meta_t + stopifnot( + "No viable transitions found in trans_meta_t" = nrow(viable_trans) > 0L, + "No predictors found in pred_meta_t" = nrow(pred_meta) > 0L + ) + + results_list <- list() + + # Iterate over transitions (anterior/posterior pairs) + for (i in seq_len(nrow(viable_trans))) { + id_trans <- viable_trans$id_trans[i] + id_lulc_ant <- viable_trans$id_lulc_anterior[i] + id_lulc_post <- viable_trans$id_lulc_posterior[i] + + message(glue::glue( + "Processing transition {i}/{nrow(viable_trans)}: ", + "id_trans={id_trans} ({id_lulc_ant} -> {id_lulc_post})" + )) + + # Get wide transition-predictor data + tryCatch( + { + trans_pred_data <- make_trans_pred_data_v(db, list(), id_trans) + + # Check if we have any data + if (nrow(trans_pred_data) == 0L) { + warning(glue::glue( + "No data for transition {id_trans}, skipping" + )) + next + } + + # Check if we have any predictor columns + pred_cols <- grep("^id_pred_", names(trans_pred_data), value = TRUE) + if (length(pred_cols) == 0L) { + warning(glue::glue( + "No predictor columns for transition {id_trans}, skipping" + )) + next + } + + filtered_data <- covariance_filter( + data = trans_pred_data, + result_col = "result", + rank_fun = rank_fun, + corcut = corcut, + ... 
+ ) + + # Extract selected predictor IDs from column names + selected_cols <- setdiff(names(filtered_data), "result") + + if (length(selected_cols) > 0L) { + # Parse id_pred values from column names (e.g., "id_pred_1" -> 1) + selected_ids <- as.integer(sub("^id_pred_", "", selected_cols)) + + # Create result rows + results_list[[length(results_list) + 1]] <- data.table::data.table( + id_pred = selected_ids, + id_trans = id_trans + ) + + message(glue::glue( + " Selected {length(selected_ids)} predictor(s) for transition {id_trans}" + )) + } else { + message(glue::glue( + " No predictors selected for transition {id_trans}" + )) + } + }, + error = function(e) { + warning(glue::glue( + "Error processing transition {id_trans}: {e$message}" + )) + } + ) + } + + # Combine all results + if (length(results_list) == 0L) { + warning("No predictors selected for any transition") + return(as_trans_preds_t()) + } + + result <- data.table::rbindlist(results_list) + + as_trans_preds_t(result) } #' @export diff --git a/inst/tinytest/test_trans_preds_t.R b/inst/tinytest/test_trans_preds_t.R index 866c455..3c5feda 100644 --- a/inst/tinytest/test_trans_preds_t.R +++ b/inst/tinytest/test_trans_preds_t.R @@ -2,6 +2,207 @@ library(tinytest) # Test creation and validation -trans_preds_t <- create_trans_preds_t() +trans_preds_t <- as_trans_preds_t() expect_silent(print(trans_preds_t)) expect_true(nrow(trans_preds_t) >= 0L) + + +#' Actually running the predictor selection logic is heavily dependent on previous +#' steps. Hence, this is a more heavy-handed test verging on integration testing. 
+test_dir_trans_preds <- tempfile("evoland_trans_preds_") +on.exit(unlink(test_dir_trans_preds, recursive = TRUE), add = TRUE) +db_tps <- evoland_db$new(test_dir_trans_preds) + +# Set up minimal coords and periods +db_tps$coords_t <- create_coords_t_square( + epsg = 2056, + extent = terra::ext(c(xmin = 2697000, xmax = 2697500, ymin = 1252000, ymax = 1252500)), + resolution = 100 +) + +db_tps$periods_t <- create_periods_t( + period_length_str = "P10Y", + start_observed = "1985-01-01", + end_observed = "2005-01-01", + end_extrapolated = "2015-01-01" +) + +# Set up LULC classes +db_tps$lulc_meta_t <- create_lulc_meta_t(list( + forest = list(pretty_name = "Forest", src_classes = 1L), + urban = list(pretty_name = "Urban", src_classes = 2L), + agriculture = list(pretty_name = "Agriculture", src_classes = 3L) +)) + +# Create synthetic LULC data with known transitions +set.seed(42) +n_coords <- nrow(db_tps$coords_t) +lulc_data <- data.table::rbindlist(list( + data.table::data.table( + id_coord = 1:n_coords, + id_lulc = sample(1:2, n_coords, replace = TRUE, prob = c(0.7, 0.3)), + id_period = 1L + ), + data.table::data.table( + id_coord = 1:n_coords, + id_lulc = sample(1:2, n_coords, replace = TRUE, prob = c(0.5, 0.5)), + id_period = 2L + ), + data.table::data.table( + id_coord = 1:n_coords, + id_lulc = sample(1:2, n_coords, replace = TRUE, prob = c(0.4, 0.6)), + id_period = 3L + ) +)) +db_tps$lulc_data_t <- as_lulc_data_t(lulc_data) + +# Create transition metadata +transitions <- db_tps$fetch("transitions_v") +db_tps$trans_meta_t <- create_trans_meta_t( + transitions, + min_cardinality_abs = 5L +) + +# Add predictor metadata with multiple predictors +pred_spec_tps <- list( + elevation = list( + unit = "m", + pretty_name = "Elevation", + description = "Elevation above sea level", + sources = list(list(url = "https://example.com/elevation.tif", md5sum = "abc123")) + ), + slope = list( + unit = "degrees", + pretty_name = "Slope", + description = "Terrain slope", + sources = 
list(list(url = "https://example.com/slope.tif", md5sum = "def456")) + ), + distance_to_road = list( + unit = "m", + pretty_name = "Distance to road", + description = "Distance to nearest road", + sources = list(list(url = "https://example.com/roads.gpkg", md5sum = "ghi789")) + ), + aspect = list( + unit = "degrees", + pretty_name = "Aspect", + description = "Terrain aspect", + sources = list(list(url = "https://example.com/aspect.tif", md5sum = "jkl012")) + ), + soil_type = list( + pretty_name = "Soil type", + description = "Soil classification", + sources = list(list(url = "https://example.com/soil.tif", md5sum = "mno345")) + ) +) +db_tps$pred_meta_t <- create_pred_meta_t(pred_spec_tps) + +# Add predictor data - mix of static and time-varying +set.seed(43) +pred_data_static <- data.table::rbindlist(list( + # Elevation (static, period 0) + data.table::data.table( + id_pred = 1L, + id_coord = 1:n_coords, + id_period = 0L, + value = runif(n_coords, 400, 800) + ), + # Slope (static, period 0) + data.table::data.table( + id_pred = 2L, + id_coord = 1:n_coords, + id_period = 0L, + value = runif(n_coords, 0, 30) + ), + # Aspect (static, period 0) + data.table::data.table( + id_pred = 4L, + id_coord = 1:n_coords, + id_period = 0L, + value = runif(n_coords, 0, 360) + ) +)) + +# Time-varying predictor +pred_data_varying <- data.table::rbindlist(lapply(1:3, function(period) { + data.table::data.table( + id_pred = 3L, # distance_to_road + id_coord = 1:n_coords, + id_period = period, + value = runif(n_coords, 0, 5000) + ) +})) + +db_tps$pred_data_t_float <- as_pred_data_t( + rbind(pred_data_static, pred_data_varying), + type = "float" +) + +# Add integer predictor (soil_type) +pred_data_int <- data.table::data.table( + id_pred = 5L, + id_coord = 1:n_coords, + id_period = 0L, + value = sample(1:5, n_coords, replace = TRUE) +) +db_tps$pred_data_t_int <- as_pred_data_t(pred_data_int, type = "int") + +# Test create_trans_preds_t +expect_message( + trans_preds_result <- + 
create_trans_preds_t( + db = db_tps, + corcut = 0.7 + ), + "Processing transition 1/2" +) + +expect_true(inherits(trans_preds_result, "trans_preds_t")) +expect_equal(nrow(trans_preds_result), 8L) + +# Verify structure +expect_true(all(c("id_pred", "id_trans") %in% names(trans_preds_result))) +expect_true(is.integer(trans_preds_result$id_pred)) +expect_true(is.integer(trans_preds_result$id_trans)) + +# Verify that all id_trans in result are viable +viable_trans_ids <- db_tps$trans_meta_t[is_viable == TRUE]$id_trans +expect_true(all(trans_preds_result$id_trans %in% viable_trans_ids)) + +# Verify that all id_pred in result exist in pred_meta_t +expect_true(all(trans_preds_result$id_pred %in% db_tps$pred_meta_t$id_pred)) + +# Test error handling - empty database +test_dir_empty <- tempfile("evoland_empty_") +on.exit(unlink(test_dir_empty, recursive = TRUE), add = TRUE) +db_empty <- evoland_db$new(test_dir_empty) + +expect_error( + create_trans_preds_t(db = "not_a_db"), + "must be an evoland_db" +) + +expect_error( + create_trans_preds_t(db = db_empty), + "No viable transitions" +) + +# Test with no predictors +test_dir_no_pred <- tempfile("evoland_no_pred_") +on.exit(unlink(test_dir_no_pred, recursive = TRUE), add = TRUE) +db_no_pred <- evoland_db$new(test_dir_no_pred) +db_no_pred$coords_t <- db_tps$coords_t +db_no_pred$periods_t <- db_tps$periods_t +db_no_pred$lulc_meta_t <- db_tps$lulc_meta_t +db_no_pred$lulc_data_t <- db_tps$lulc_data_t +db_no_pred$trans_meta_t <- db_tps$trans_meta_t +expect_error( + create_trans_preds_t(db = db_no_pred), + "No predictors found" +) + +# Test print method +expect_stdout( + print(trans_preds_result), + "Transition-Predictor|Total relationships" +) From f394e85c4fb6b96099cb5304f0a423e4ae88045e Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Thu, 27 Nov 2025 19:18:34 +0100 Subject: [PATCH 07/27] binomial -> quasibinomial --- R/covariance_filter.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/R/covariance_filter.R b/R/covariance_filter.R index b2475f6..6860798 100644 --- a/R/covariance_filter.R +++ b/R/covariance_filter.R @@ -100,7 +100,7 @@ rank_poly_glm <- function(x, y, weights = NULL, ...) { fit <- glm.fit( x = cbind(1, poly(x, degree = 2, simple = TRUE)), y = y, - family = binomial(), + family = quasibinomial(), weights = weights ) From 5625308c2d4a4fae28f9cf88c9c733a821a079c7 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Fri, 28 Nov 2025 14:54:41 +0100 Subject: [PATCH 08/27] move neighbors logic into codebase + refactor: use $set to add methods/fields --- DESCRIPTION | 3 + NAMESPACE | 3 + R/evoland_db.R | 287 +++--------------------------------- R/evoland_db_neighbors.R | 173 ++++++++++++++++++++++ R/evoland_db_tables.R | 206 ++++++++++++++++++++++++++ R/evoland_db_views.R | 142 ++++++++---------- R/neighbors_t.R | 81 ++++++++++ R/parquet_duckdb.R | 47 ++++++ R/trans_preds_t.R | 2 +- R/util_terra.R | 2 + man/evoland_db.Rd | 104 ++++++------- man/evoland_db_neighbors.Rd | 26 ++++ man/evoland_db_tables.Rd | 30 ++++ man/evoland_db_views.Rd | 55 ++----- man/neighbors_t.Rd | 41 ++++++ man/trans_preds_t.Rd | 21 ++- 16 files changed, 767 insertions(+), 456 deletions(-) create mode 100644 R/evoland_db_neighbors.R create mode 100644 R/evoland_db_tables.R create mode 100644 R/neighbors_t.R create mode 100644 man/evoland_db_neighbors.Rd create mode 100644 man/evoland_db_tables.Rd create mode 100644 man/neighbors_t.Rd diff --git a/DESCRIPTION b/DESCRIPTION index fb2d27f..c21e25f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,12 +40,15 @@ Collate: 'covariance_filter.R' 'parquet_duckdb.R' 'evoland_db.R' + 'evoland_db_neighbors.R' + 'evoland_db_tables.R' 'evoland_db_views.R' 'init.R' 'intrv_masks_t.R' 'intrv_meta_t.R' 'lulc_data_t.R' 'lulc_meta_t.R' + 'neighbors_t.R' 'periods_t.R' 'pred_data_t.R' 'pred_meta_t.R' diff --git a/NAMESPACE b/NAMESPACE index 50faaf3..f111398 100644 --- a/NAMESPACE +++ 
b/NAMESPACE @@ -8,6 +8,7 @@ S3method(print,intrv_masks_t) S3method(print,intrv_meta_t) S3method(print,lulc_data_t) S3method(print,lulc_meta_t) +S3method(print,neighbors_t) S3method(print,periods_t) S3method(print,pred_data_t) S3method(print,pred_meta_t) @@ -22,6 +23,7 @@ S3method(validate,intrv_masks_t) S3method(validate,intrv_meta_t) S3method(validate,lulc_data_t) S3method(validate,lulc_meta_t) +S3method(validate,neighbors_t) S3method(validate,periods_t) S3method(validate,pred_data_t) S3method(validate,pred_data_t_bool) @@ -37,6 +39,7 @@ export(as_intrv_masks_t) export(as_intrv_meta_t) export(as_lulc_data_t) export(as_lulc_meta_t) +export(as_neighbors_t) export(as_periods_t) export(as_pred_data_t) export(as_pred_meta_t) diff --git a/R/evoland_db.R b/R/evoland_db.R index f40bd82..628dfb2 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -8,6 +8,13 @@ #' #' Inherits from [parquet_duckdb] for generic database operations. #' +#' @seealso +#' Additional methods and active bindings are added to this class in separate files: +#' +#' - [evoland_db_tables] - Table active bindings (coords_t, lulc_data_t, etc.) +#' - [evoland_db_views] - View active bindings (lulc_meta_long_v, etc.) and methods +#' - [evoland_db_neighbors] - Neighbor analysis methods +#' #' @include parquet_duckdb.R #' @export @@ -43,8 +50,6 @@ evoland_db <- R6::R6Class( invisible(self) }, - ### Evoland-specific methods ---- - #' @description #' Fetch data from storage with evoland-specific view support #' @param table_name Character string. Name of the table to query. 
@@ -53,26 +58,21 @@ evoland_db <- R6::R6Class( #' #' @return A data.table fetch = function(table_name, where = NULL, limit = NULL) { - # Check if this is a view that needs special handling - view_result <- switch( - table_name, - lulc_meta_long_v = make_lulc_meta_long_v(self, private, where), - pred_sources_v = make_pred_sources_v(self, private, where), - transitions_v = make_transitions_v(self, private, where), - NULL - ) - - if (!is.null(view_result)) { - return(view_result) + # Check if this is a view (active binding) + if ( + table_name %in% + c("lulc_meta_long_v", "pred_sources_v", "transitions_v", "extent", "coords_minimal") + ) { + return(self[[table_name]]) } file_info <- private$get_file_path(table_name) if (!file_info$exists) { + # TODO make this an error return(private$get_empty_table(table_name)) } - # Call parent method super$fetch(table_name, where, limit) }, @@ -214,196 +214,7 @@ evoland_db <- R6::R6Class( ), ## Active Bindings ---- - active = list( - ### Bindings for tables ---- - #' @field coords_t A `coords_t` instance; see [create_coords_t()] for the type of - #' object to assign. Assigning is an upsert operation. - coords_t = function(x) { - create_active_binding(self, "coords_t", as_coords_t)(x) - }, - - #' @field periods_t A `periods_t` instance; see [create_periods_t()] for the type of - #' object to assign. Assigning is an upsert operation. - periods_t = function(x) { - create_active_binding(self, "periods_t", as_periods_t)(x) - }, - - #' @field lulc_meta_t A `lulc_meta_t` instance; see [create_lulc_meta_t()] for the type of - #' object to assign. Assigning is an upsert operation. - lulc_meta_t = function(x) { - create_active_binding(self, "lulc_meta_t", as_lulc_meta_t)(x) - }, - - #' @field lulc_meta_long_v Return a `lulc_meta_long_v` instance, i.e. unrolled `lulc_meta_t`. 
- lulc_meta_long_v = function() { - make_lulc_meta_long_v(self, private) - }, - - #' @field lulc_data_t A `lulc_data_t` instance; see [as_lulc_data_t()] for the type of - #' object to assign. Assigning is an upsert operation. - lulc_data_t = function(x) { - create_active_binding(self, "lulc_data_t", as_lulc_data_t)(x) - }, - - #' @field transitions_v Get the transitions from `lulc_data_t`. - transitions_v = function() { - make_transitions_v(self, private) - }, - - #' @field pred_data_t_float A `pred_data_t_float` instance; see - #' [create_pred_data_t()] for the type of object to assign. Assigning is an - #' upsert operation. - pred_data_t_float = function(x) { - create_active_binding( - self, - "pred_data_t_float", - as_pred_data_t, - type = "float" - )(x) - }, - - #' @field pred_data_t_int A `pred_data_t_int` instance; see - #' [create_pred_data_t()] for the type of object to assign. Assigning is an - #' upsert operation. - pred_data_t_int = function(x) { - create_active_binding( - self, - "pred_data_t_int", - as_pred_data_t, - type = "int" - )(x) - }, - - #' @field pred_data_t_bool A `pred_data_t_bool` instance; see - #' [create_pred_data_t()] for the type of object to assign. Assigning is an - #' upsert operation. - pred_data_t_bool = function(x) { - create_active_binding( - self, - "pred_data_t_bool", - as_pred_data_t, - type = "bool" - )(x) - }, - - ### Bindings for descriptions, views, etc. ---- - #' @field extent Return a terra SpatExtent based on coords_t - extent = function() { - make_extent_db(self, private) - }, - - #' @field coords_minimal data.table with only (id_coord, lon, lat) - coords_minimal = function() { - make_coords_minimal(self, private) - }, - - #' @field pred_meta_t A `pred_meta_t` instance; see [create_pred_meta_t()] for the type of - #' object to assign. Assigning is an upsert operation. 
- pred_meta_t = function(x) { - if (missing(x)) { - x <- self$fetch("pred_meta_t") - return(as_pred_meta_t(x)) - } - stopifnot(inherits(x, "pred_meta_t")) - self$commit_upsert( - x, - table_name = "pred_meta_t", - key_cols = "name", - autoincrement_cols = "id_pred" - ) - }, - - #' @field pred_sources_v Retrieve a table of distinct predictor urls and their - #' md5sum - pred_sources_v = function() { - make_pred_sources_v(self, private) - }, - - #' @field trans_meta_t A `trans_meta_t` instance; see [create_trans_meta_t()] for the type of - #' object to assign. Assigning is an upsert operation. - trans_meta_t = function(x) { - if (missing(x)) { - x <- self$fetch("trans_meta_t") - return(as_trans_meta_t(x)) - } - stopifnot(inherits(x, "trans_meta_t")) - self$commit_upsert( - x, - table_name = "trans_meta_t", - key_cols = c("id_lulc_anterior", "id_lulc_posterior"), - autoincrement_cols = "id_trans" - ) - }, - - #' @field trans_preds_t A `trans_preds_t` instance; see [create_trans_preds_t()] for the type of - #' object to assign. Assigning is an upsert operation. - trans_preds_t = function(x) { - create_active_binding(self, "trans_preds_t", as_trans_preds_t)(x) - }, - - #' @field intrv_meta_t A `intrv_meta_t` instance; see [create_intrv_meta_t()] for the type of - #' object to assign. Assigning is an upsert operation. - intrv_meta_t = function(x) { - if (missing(x)) { - return(as_intrv_meta_t( - convert_list_cols(self$fetch("intrv_meta_t"), "params", kv_df_to_list) - )) - } - stopifnot(inherits(x, "intrv_meta_t")) - - self$commit_upsert(x, "intrv_meta_t", key_cols = "id_intrv", map_cols = "params") - }, - - #' @field intrv_masks_t A `intrv_masks_t` instance; see [as_intrv_masks_t()] for the type of - #' object to assign. Assigning is an upsert operation. 
- intrv_masks_t = function(x) { - create_active_binding(self, "intrv_masks_t", as_intrv_masks_t)(x) - }, - - #' @field trans_models_t A `trans_models_t` instance; see [create_trans_models_t()] for the type - #' of object to assign. Assigning is an upsert operation. - trans_models_t = function(x) { - if (missing(x)) { - return(as_trans_models_t( - convert_list_cols( - self$fetch("trans_models_t"), - c("model_params", "goodness_of_fit"), - kv_df_to_list - ) - )) - } - stopifnot(inherits(x, "trans_models_t")) - - self$commit_upsert( - x, - "trans_models_t", - key_cols = c("id_trans", "id_period"), - map_cols = c("model_params", "goodness_of_fit") - ) - }, - - #' @field alloc_params_t A `alloc_params_t` instance; see [as_alloc_params_t()] for the type - #' of object to assign. Assigning is an upsert operation. - alloc_params_t = function(x) { - if (missing(x)) { - return(as_alloc_params_t( - convert_list_cols( - self$fetch("alloc_params_t"), - c("alloc_params", "goodness_of_fit"), - kv_df_to_list - ) - )) - } - stopifnot(inherits(x, "alloc_params_t")) - - self$commit_upsert( - x, - "alloc_params_t", - key_cols = c("id_trans", "id_period"), - map_cols = c("alloc_params", "goodness_of_fit") - ) - } - ), + active = list(), ## Private Methods ---- private = list( @@ -431,7 +242,8 @@ evoland_db <- R6::R6Class( intrv_meta_t = as_intrv_meta_t(), intrv_masks_t = as_intrv_masks_t(), trans_models_t = as_trans_models_t(), - alloc_params_t = as_alloc_params_t() + alloc_params_t = as_alloc_params_t(), + neighbors_t = as_neighbors_t() ) if (table_name %in% names(empty_tables)) { @@ -443,68 +255,3 @@ evoland_db <- R6::R6Class( } ) ) - -# Helper function to create simple Bindings with standard fetch/commit pattern -# param self The R6 instance (self). -# param table_name Character string. Name of the table. -# param as_fn Function to convert fetched data to the appropriate type. -# param ... Additional arguments passed to as_fn. 
-# return A function suitable for use as an active binding. -create_active_binding <- function(self, table_name, as_fn, ...) { - extra_args <- list(...) - function(x) { - if (missing(x)) { - x <- self$fetch(table_name) - return(do.call(as_fn, c(list(x), extra_args))) - } - stopifnot(inherits(x, table_name)) - self$commit_upsert(x, table_name) - } -} - - -# Helper functions for converting between list and data.frame formats for DuckDB MAPs -convert_list_cols <- function(x, cols, fn) { - for (col in cols) { - x[[col]] <- lapply(x[[col]], fn) - } - x -} - -list_to_kv_df <- function(x) { - if (is.null(x) || length(x) == 0) { - return(data.frame( - key = character(0), - value = character(0), - stringsAsFactors = FALSE - )) - } - data.frame( - key = names(x), - value = as.character(unlist(x)), - stringsAsFactors = FALSE - ) -} - -kv_df_to_list <- function(x) { - if (is.null(x) || nrow(x) == 0) { - return(NULL) - } - - out <- list() - - for (row in seq_len(nrow(x))) { - key <- x$key[row] - val <- x$value[row] - - # Try numeric conversion - num_val <- suppressWarnings(as.numeric(val)) - if (!is.na(num_val)) { - out[[key]] <- num_val - } else { - out[[key]] <- val - } - } - - out -} diff --git a/R/evoland_db_neighbors.R b/R/evoland_db_neighbors.R new file mode 100644 index 0000000..65f29f8 --- /dev/null +++ b/R/evoland_db_neighbors.R @@ -0,0 +1,173 @@ +#' Neighbor analysis methods for evoland_db +#' +#' @description +#' This file adds neighbor analysis methods to the `evoland_db` class using R6's `$set()` method. +#' These methods compute neighbor relationships and generate neighbor-based predictors. +#' +#' @section Methods Added: +#' +#' - `compute_neighbors(max_distance, distance_breaks, resolution, overwrite)` - +#' Computes neighbor relationships between coordinates. 
+#' - `max_distance`: Maximum distance for neighbors (default: 1000) +#' - `distance_breaks`: Vector of breaks for distance classes (default: c(0, 100, 500, 1000)) +#' - `resolution`: Grid resolution for distance calculations (default: 100) +#' - `overwrite`: Whether to overwrite existing neighbors_t (default: FALSE) +#' - `generate_neighbor_predictors()` - Generates predictor variables based on neighbor +#' land use counts by distance class. Requires neighbors_t with distance_class column +#' (compute_neighbors with distance_breaks). +#' +#' @name evoland_db_neighbors +#' @include evoland_db.R +NULL + +evoland_db$set( + "public", + "compute_neighbors", + function( + max_distance = 1000, + distance_breaks = c(0, 100, 500, 1000), + resolution = 100, + overwrite = FALSE + ) { + if (!overwrite && self$row_count("neighbors_t") > 0) { + message("neighbors_t already exists. Use overwrite = TRUE to recompute.") + return(invisible(self)) + } + + coords <- self$coords_t + + neighbors <- compute_neighbors( + coords, + max_distance = max_distance, + distance_breaks = distance_breaks, + resolution = resolution + ) + + self$commit_overwrite( + as_neighbors_t(neighbors), + table_name = "neighbors_t" + ) + + message(glue::glue("Computed {nrow(neighbors)} neighbor relationships")) + invisible(self) + } +) + +evoland_db$set("public", "generate_neighbor_predictors", function() { + if (self$row_count("neighbors_t") == 0) { + stop("No neighbor data found. Run $compute_neighbors() first.") + } + + if (self$row_count("lulc_meta_t") == 0) { + stop("No LULC metadata found. Add lulc_meta_t before generating neighbor predictors.") + } + + if (self$row_count("lulc_data_t") == 0) { + stop("No LULC data found. Add lulc_data_t before generating neighbor predictors.") + } + + neighbors_sample <- self$fetch("neighbors_t", limit = 1) + if (!"distance_class" %in% names(neighbors_sample)) { + stop( + "neighbors_t does not have distance_class column. 
Run $compute_neighbors() with distance_breaks parameter." + ) + } + + self$attach_table("neighbors_t") + self$attach_table("lulc_data_t") + self$attach_table("lulc_meta_t") + + has_pred_meta <- self$row_count("pred_meta_t") > 0 + if (has_pred_meta) { + self$attach_table("pred_meta_t") + } + + on.exit({ + self$detach_table("neighbors_t") + self$detach_table("lulc_data_t") + self$detach_table("lulc_meta_t") + if (has_pred_meta) self$detach_table("pred_meta_t") + }) + + max_id_query <- if (has_pred_meta) { + "(select coalesce(max(id_pred), 0) as max_pred from pred_meta_t)" + } else { + "(select 0 as max_pred)" + } + + self$execute(glue::glue( + r"{ + create temp table pred_meta_neighbors_t as + with + all_distance_classes as (select distinct distance_class from neighbors_t), + max_id as }", + max_id_query, + r"{ + select + row_number() over () + (select max_pred from max_id) as id_pred, + concat('id_lulc_', l.id_lulc, '_dist_', c.distance_class) as name, + concat('Count of ', l.pretty_name, ' within distance class ', c.distance_class) as pretty_name, + 'Number of neighbors by land use class and distance interval' as description, + 'land use coordinate data' as orig_format, + NULL as sources, + 'number of neighbors' as unit, + NULL as factor_levels, + c.distance_class, + l.id_lulc + from + lulc_meta_t l + cross join + all_distance_classes c + }" + )) + + pred_meta_neighbors <- self$get_query( + "select id_pred, name, pretty_name, description, orig_format, unit + from pred_meta_neighbors_t" + ) + + pred_meta_neighbors$sources <- lapply(1:nrow(pred_meta_neighbors), function(i) list()) + pred_meta_neighbors$factor_levels <- lapply(1:nrow(pred_meta_neighbors), function(i) list()) + + self$pred_meta_t <- as_pred_meta_t(pred_meta_neighbors) + + self$execute( + r"{ + create temp table pred_neighbors_t as + select + p.id_pred, + n.id_coord_origin as id_coord, + t.id_period, + count(n.id_coord_neighbor) as value + from + neighbors_t n, + lulc_data_t t, + pred_meta_neighbors_t p 
+ where + n.id_coord_neighbor = t.id_coord + and p.id_lulc = t.id_lulc + and p.distance_class = n.distance_class + group by + n.id_coord_origin, + n.distance_class, + t.id_period, + t.id_lulc, + p.id_pred + }" + ) + + pred_neighbors <- self$get_query("select * from pred_neighbors_t") + pred_neighbors$id_pred <- as.integer(pred_neighbors$id_pred) + pred_neighbors$id_coord <- as.integer(pred_neighbors$id_coord) + pred_neighbors$id_period <- as.integer(pred_neighbors$id_period) + pred_neighbors$value <- as.integer(pred_neighbors$value) + + self$pred_data_t_int <- as_pred_data_t(pred_neighbors, type = "int") + + message(glue::glue( + "Generated {nrow(pred_meta_neighbors)} neighbor predictor variables with ", + "{nrow(pred_neighbors)} data points" + )) + + invisible(self) +}) diff --git a/R/evoland_db_tables.R b/R/evoland_db_tables.R new file mode 100644 index 0000000..d477ab3 --- /dev/null +++ b/R/evoland_db_tables.R @@ -0,0 +1,206 @@ +#' Table active bindings for evoland_db +#' +#' @description +#' This file adds table active bindings to the `evoland_db` class using R6's `$set()` method. +#' These bindings provide read/write access to database tables with automatic validation. +#' +#' @section Active Bindings Added: +#' +#' - `coords_t` - Coordinates table. See [as_coords_t()] +#' - `periods_t` - Time periods table. See [as_periods_t()] +#' - `lulc_meta_t` - Land use/land cover metadata. See [as_lulc_meta_t()] +#' - `lulc_data_t` - Land use/land cover data. See [as_lulc_data_t()] +#' - `pred_data_t_float` - Float predictor data. See [as_pred_data_t()] +#' - `pred_data_t_int` - Integer predictor data. See [as_pred_data_t()] +#' - `pred_data_t_bool` - Boolean predictor data. See [as_pred_data_t()] +#' - `pred_meta_t` - Predictor metadata. See [as_pred_meta_t()] +#' - `trans_meta_t` - Transition metadata. See [as_trans_meta_t()] +#' - `trans_preds_t` - Transition-predictor relationships. See [as_trans_preds_t()] +#' - `intrv_meta_t` - Intervention metadata. 
See [as_intrv_meta_t()] +#' - `intrv_masks_t` - Intervention masks. See [as_intrv_masks_t()] +#' - `trans_models_t` - Transition models. See [as_trans_models_t()] +#' - `alloc_params_t` - Allocation parameters. See [as_alloc_params_t()] +#' - `neighbors_t` - Neighbor relationships. See [as_neighbors_t()] +#' +#' @name evoland_db_tables +#' @include evoland_db.R +NULL + +create_table_binding <- function( + self, + table_name, + as_fn, + key_cols = NULL, + autoincrement_cols = NULL, + map_cols = NULL, + ... +) { + extra_args <- list(...) + + function(x) { + if (missing(x)) { + fetched <- self$fetch(table_name) + + if (!is.null(map_cols) && nrow(fetched) > 0) { + fetched <- convert_list_cols(fetched, map_cols, kv_df_to_list) + } + + return(do.call(as_fn, c(list(fetched), extra_args))) + } + + stopifnot(inherits(x, table_name)) + + self$commit_upsert( + x, + table_name = table_name, + key_cols = key_cols, + autoincrement_cols = autoincrement_cols, + map_cols = map_cols + ) + } +} + +evoland_db$set("active", "coords_t", function(x) { + create_table_binding( + self, + "coords_t", + as_coords_t, + key_cols = "id_coord" + )(x) +}) + +evoland_db$set("active", "periods_t", function(x) { + create_table_binding( + self, + "periods_t", + as_periods_t, + key_cols = "id_period" + )(x) +}) + +evoland_db$set("active", "lulc_meta_t", function(x) { + create_table_binding( + self, + "lulc_meta_t", + as_lulc_meta_t, + key_cols = "id_lulc" + )(x) +}) + +evoland_db$set("active", "lulc_data_t", function(x) { + create_table_binding( + self, + "lulc_data_t", + as_lulc_data_t, + key_cols = c("id_coord", "id_period") + )( + x + ) +}) + +evoland_db$set("active", "pred_data_t_float", function(x) { + create_table_binding( + self, + "pred_data_t_float", + as_pred_data_t, + key_cols = c("id_pred", "id_coord", "id_period"), + type = "float" + )(x) +}) + +evoland_db$set("active", "pred_data_t_int", function(x) { + create_table_binding( + self, + "pred_data_t_int", + as_pred_data_t, + key_cols = 
c("id_pred", "id_coord", "id_period"), + type = "int" + )(x) +}) + +evoland_db$set("active", "pred_data_t_bool", function(x) { + create_table_binding( + self, + "pred_data_t_bool", + as_pred_data_t, + key_cols = c("id_pred", "id_coord", "id_period"), + type = "bool" + )(x) +}) + +evoland_db$set("active", "pred_meta_t", function(x) { + create_table_binding( + self, + "pred_meta_t", + as_pred_meta_t, + key_cols = "name", + autoincrement_cols = "id_pred" + )(x) +}) + +evoland_db$set("active", "trans_meta_t", function(x) { + create_table_binding( + self, + "trans_meta_t", + as_trans_meta_t, + key_cols = c("id_lulc_anterior", "id_lulc_posterior"), + autoincrement_cols = "id_trans" + )(x) +}) + +evoland_db$set("active", "trans_preds_t", function(x) { + create_table_binding( + self, + "trans_preds_t", + as_trans_preds_t, + key_cols = c("id_trans", "id_pred") + )(x) +}) + +evoland_db$set("active", "intrv_meta_t", function(x) { + create_table_binding( + self, + "intrv_meta_t", + as_intrv_meta_t, + key_cols = "id_intrv", + map_cols = "params" + )(x) +}) + +evoland_db$set("active", "intrv_masks_t", function(x) { + create_table_binding( + self, + "intrv_masks_t", + as_intrv_masks_t, + key_cols = c("id_coord", "id_intrv") + )(x) +}) + +evoland_db$set("active", "trans_models_t", function(x) { + create_table_binding( + self, + "trans_models_t", + as_trans_models_t, + key_cols = c("id_trans", "id_period"), + map_cols = c("model_params", "goodness_of_fit") + )(x) +}) + +evoland_db$set("active", "alloc_params_t", function(x) { + create_table_binding( + self, + "alloc_params_t", + as_alloc_params_t, + key_cols = c("id_trans", "id_period"), + map_cols = c("alloc_params", "goodness_of_fit") + )(x) +}) + +evoland_db$set("active", "neighbors_t", function(x) { + create_table_binding( + self, + "neighbors_t", + as_neighbors_t, + key_cols = c("id_coord_origin", "id_coord_neighbor") + )(x) +}) diff --git a/R/evoland_db_views.R b/R/evoland_db_views.R index b0533e9..6f41fae 100644 --- 
a/R/evoland_db_views.R +++ b/R/evoland_db_views.R @@ -1,35 +1,27 @@ #' Views on the evoland-plus data model #' #' @description -#' Functions to generate views on the database +#' This file adds view active bindings and methods to the `evoland_db` class using R6's `$set()` method. +#' These provide computed views on the database without storing additional data. +#' +#' @section Active Bindings Added: +#' +#' - `lulc_meta_long_v` - Unrolled LULC metadata with one row per source class +#' - `pred_sources_v` - Distinct predictor URLs and their MD5 checksums +#' - `transitions_v` - Land use transitions derived from lulc_data_t +#' - `extent` - Spatial extent of coords_t as terra::SpatExtent +#' - `coords_minimal` - Minimal coordinate representation (id_coord, lon, lat) +#' +#' @section Methods Added: +#' +#' - `trans_pred_data_v(id_trans)` - Returns wide table of transition results and predictor data for a specific transition. Used as input to covariance filtering. #' #' @name evoland_db_views +#' @include evoland_db.R NULL -#' @describeIn evoland_db_views Retrieve a table of distinct predictor urls and their -#' md5sum -make_pred_sources_v <- function(self, private, where = NULL) { - self$with_tables("pred_meta_t", function() { - where_clause <- if (!is.null(where)) paste("AND", where) else "" - - self$get_query(glue::glue( - r"{ - select distinct - unnest(sources).url as url, - unnest(sources).md5sum as md5sum - from pred_meta_t - where sources is not null {where_clause} - }" - )) - }) -} - - -#' @describeIn evoland_db_views Return a `lulc_meta_long_v` instance, i.e. unrolled `lulc_meta_t`. 
-make_lulc_meta_long_v <- function(self, private, where = NULL) { +evoland_db$set("active", "lulc_meta_long_v", function() { self$with_tables("lulc_meta_t", function() { - where_clause <- if (!is.null(where)) paste("WHERE", where) else "" - self$get_query(glue::glue( r"{ select @@ -38,53 +30,27 @@ make_lulc_meta_long_v <- function(self, private, where = NULL) { unnest(src_classes) as src_class from lulc_meta_t - {where_clause} }" )) }) -} - -#' @describeIn evoland_db_views Minimal coordinate representation (id_coord, lon, lat) -make_coords_minimal <- function(self, private, where = NULL) { - self$with_tables("coords_t", function() { - where_clause <- if (!is.null(where)) paste("WHERE", where) else "" +}) +evoland_db$set("active", "pred_sources_v", function() { + self$with_tables("pred_meta_t", function() { self$get_query(glue::glue( r"{ - select id_coord, lon, lat - from coords_t - {where_clause} + select distinct + unnest(sources).url as url, + unnest(sources).md5sum as md5sum + from pred_meta_t + where sources is not null }" - )) |> - cast_dt_col("id_coord", as.integer) |> - data.table::setkeyv("id_coord") - }) -} - -#' @describeIn evoland_db_views Returns the extent of the coords_t as terra::SpatExtent -make_extent_db <- function(self, private) { - self$with_tables("coords_t", function() { - self$get_query(glue::glue( - r"{ - SELECT - min(lon) as xmin, - max(lon) as xmax, - min(lat) as ymin, - max(lat) as ymax - FROM - coords_t - }" - )) |> - unlist() |> - terra::ext() + )) }) -} +}) -#' @describeIn evoland_db_views Returns transitions based on lulc_data_t -make_transitions_v <- function(self, private, where = NULL) { +evoland_db$set("active", "transitions_v", function() { self$with_tables("lulc_data_t", function() { - where_clause <- if (!is.null(where)) paste("WHERE", where) else "" - self$get_query(glue::glue( r"{ SELECT @@ -99,34 +65,56 @@ make_transitions_v <- function(self, private, where = NULL) { ON curr.id_coord = prev.id_coord AND curr.id_period = 
prev.id_period + 1 - {where_clause} }" )) }) -} +}) -#' @describeIn evoland_db_views Returns wide table of transition results and predictor data -#' for a specific transition. Used as input to covariance filtering. -#' -#' @param id_trans Integer, the transition ID to generate data for -#' @return data.table with columns: result (0/1), id_pred_1, id_pred_2, ..., id_pred_N -make_trans_pred_data_v <- function(self, private, id_trans) { +evoland_db$set("active", "extent", function() { + self$with_tables("coords_t", function() { + self$get_query(glue::glue( + r"{ + SELECT + min(lon) as xmin, + max(lon) as xmax, + min(lat) as ymin, + max(lat) as ymax + FROM + coords_t + }" + )) |> + unlist() |> + terra::ext() + }) +}) + +evoland_db$set("active", "coords_minimal", function() { + self$with_tables("coords_t", function() { + self$get_query(glue::glue( + r"{ + select id_coord, lon, lat + from coords_t + }" + )) |> + cast_dt_col("id_coord", as.integer) |> + data.table::setkeyv("id_coord") + }) +}) + +evoland_db$set("public", "trans_pred_data_v", function(id_trans) { stopifnot( "id_trans must be a single integer" = length(id_trans) == 1L && is.numeric(id_trans) ) - # Determine which predictor tables exist all_tables <- self$list_tables() pred_tables <- c("pred_data_t_float", "pred_data_t_int", "pred_data_t_bool") existing_pred_tables <- intersect(pred_tables, all_tables) - # Build list of tables to attach tables_to_attach <- c("trans_meta_t", "lulc_data_t", "pred_meta_t", existing_pred_tables) self$with_tables( tables_to_attach, function() { - # Get transition metadata trans_info <- self$get_query(glue::glue( "SELECT id_lulc_anterior, id_lulc_posterior FROM trans_meta_t @@ -140,10 +128,8 @@ make_trans_pred_data_v <- function(self, private, id_trans) { id_lulc_ant <- trans_info$id_lulc_anterior id_lulc_post <- trans_info$id_lulc_posterior - # Build CTEs dynamically based on which predictor tables exist ctes <- list() - # Always include trans_result ctes$trans_result <- 
glue::glue( "trans_result AS ( SELECT @@ -162,8 +148,6 @@ make_trans_pred_data_v <- function(self, private, id_trans) { )" ) - # Add predictor CTEs for each type that exists - # UNION period-specific data (period >= 1) with cross-joined static data (period 0) if ("pred_data_t_float" %in% existing_pred_tables) { ctes$pred_float_combined <- "pred_float_combined AS ( SELECT id_coord, id_period, id_pred, value @@ -215,7 +199,6 @@ make_trans_pred_data_v <- function(self, private, id_trans) { )" } - # Build SELECT columns select_cols <- "tr.result" if ("pred_data_t_float" %in% existing_pred_tables) { select_cols <- paste0(select_cols, ", pf.* EXCLUDE (id_coord, id_period)") @@ -227,7 +210,6 @@ make_trans_pred_data_v <- function(self, private, id_trans) { select_cols <- paste0(select_cols, ", pb.* EXCLUDE (id_coord, id_period)") } - # Build JOINs joins <- "" if ("pred_data_t_float" %in% existing_pred_tables) { joins <- paste0( @@ -248,7 +230,6 @@ make_trans_pred_data_v <- function(self, private, id_trans) { ) } - # Combine everything into final query cte_string <- paste(unlist(ctes), collapse = ",\n\n ") query <- glue::glue( @@ -261,7 +242,6 @@ make_trans_pred_data_v <- function(self, private, id_trans) { result <- self$get_query(query) - # Rename columns to id_pred_{N} format old_names <- names(result) new_names <- old_names for (i in seq_along(old_names)) { @@ -274,4 +254,4 @@ make_trans_pred_data_v <- function(self, private, id_trans) { result } ) -} +}) diff --git a/R/neighbors_t.R b/R/neighbors_t.R new file mode 100644 index 0000000..5d2e98f --- /dev/null +++ b/R/neighbors_t.R @@ -0,0 +1,81 @@ +#' Create Neighbors Table +#' +#' Creates a `neighbors_t` table and validates that it matches the schema. 
+#' +#' @name neighbors_t +#' +#' @param x An object that can be passed to [data.table::setDT()] +#' +#' @return A data.table of class "neighbors_t" with columns: +#' - `id_coord_origin`: Foreign key to coords_t (origin coordinate) +#' - `id_coord_neighbor`: Foreign key to coords_t (neighbor coordinate) +#' - `distance`: Numeric distance between coordinates +#' - `distance_class`: Optional factor representing distance intervals +#' @export +as_neighbors_t <- function(x) { + if (missing(x)) { + x <- data.table::data.table( + id_coord_origin = integer(0), + id_coord_neighbor = integer(0), + distance = numeric(0) + ) + } + cast_dt_col(x, "id_coord_origin", as.integer) + cast_dt_col(x, "id_coord_neighbor", as.integer) + if ("distance_class" %in% names(x)) { + cast_dt_col(x, "distance_class", as.factor) + } + new_evoland_table( + x, + "neighbors_t", + c("id_coord_origin", "id_coord_neighbor") + ) +} + +#' @describeIn neighbors_t Validate a neighbors_t object +#' @export +validate.neighbors_t <- function(x, ...) { + NextMethod() + + required_cols <- c("id_coord_origin", "id_coord_neighbor", "distance") + + data.table::setcolorder(x, required_cols) + + stopifnot( + is.integer(x[["id_coord_origin"]]), + is.integer(x[["id_coord_neighbor"]]), + is.numeric(x[["distance"]]) + ) + + return(x) +} + +#' @describeIn neighbors_t Print a neighbors_t object +#' @param nrow Maximum number of rows to print. See [data.table::print.data.table] +#' @param ... Passed to [data.table::print.data.table] +#' @export +print.neighbors_t <- function(x, nrow = 10, ...) 
{ + if (nrow(x) > 0) { + n_origins <- data.table::uniqueN(x[["id_coord_origin"]]) + n_neighbors <- data.table::uniqueN(x[["id_coord_neighbor"]]) + total_pairs <- nrow(x) + + extra_info <- "" + if ("distance_class" %in% names(x)) { + n_classes <- data.table::uniqueN(x[["distance_class"]]) + extra_info <- glue::glue(", Distance classes: {n_classes}") + } + + cat(glue::glue( + "Neighbors Table\n", + "Neighbor pairs: {total_pairs}\n", + "Origin coordinates: {n_origins}, ", + "Neighbor coordinates: {n_neighbors}", + "{extra_info}\n\n" + )) + } else { + cat("Neighbors Table (empty)\n") + } + NextMethod(nrow = nrow, ...) + invisible(x) +} diff --git a/R/parquet_duckdb.R b/R/parquet_duckdb.R index 0fa7d51..11aa465 100644 --- a/R/parquet_duckdb.R +++ b/R/parquet_duckdb.R @@ -574,3 +574,50 @@ parquet_duckdb <- R6::R6Class( } ) ) + + +# Helper functions for converting between list and data.frame formats for DuckDB MAPs +convert_list_cols <- function(x, cols, fn) { + for (col in cols) { + x[[col]] <- lapply(x[[col]], fn) + } + x +} + +list_to_kv_df <- function(x) { + if (is.null(x) || length(x) == 0) { + return(data.frame( + key = character(0), + value = character(0), + stringsAsFactors = FALSE + )) + } + data.frame( + key = names(x), + value = as.character(unlist(x)), + stringsAsFactors = FALSE + ) +} + +kv_df_to_list <- function(x) { + if (is.null(x) || nrow(x) == 0) { + return(NULL) + } + + out <- list() + + for (row in seq_len(nrow(x))) { + key <- x$key[row] + val <- x$value[row] + + # Try numeric conversion + num_val <- suppressWarnings(as.numeric(val)) + if (!is.na(num_val)) { + out[[key]] <- num_val + } else { + out[[key]] <- val + } + } + + out +} diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index 9e7c420..ae324a5 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -71,7 +71,7 @@ create_trans_preds_t <- function( # Get wide transition-predictor data tryCatch( { - trans_pred_data <- make_trans_pred_data_v(db, list(), id_trans) + trans_pred_data <- 
db$trans_pred_data_v(id_trans) # Check if we have any data if (nrow(trans_pred_data) == 0L) { diff --git a/R/util_terra.R b/R/util_terra.R index 00d8825..42bd2f5 100644 --- a/R/util_terra.R +++ b/R/util_terra.R @@ -89,6 +89,8 @@ extract_using_coords_t.SpatVector <- function(x, coords_t, na_omit = TRUE) { out } +# TODO move this to a neighbors_t.R file that includes a formal class definition +# for the return table, including validation, print, coercion #' @describeIn util_terra Compute neighboring coordinates within specified distances. In #' order to be computationally feasible, the coordinates' IDs are rasterized before #' their actual Euclidean distance is calculated. If coordinates are so close that they diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index 49cf8f8..3e005d7 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -11,73 +11,23 @@ to disk in parquet format for better compression. Inherits from \link{parquet_duckdb} for generic database operations. } -\section{Super class}{ -\code{\link[evoland:parquet_duckdb]{evoland::parquet_duckdb}} -> \code{evoland_db} +\seealso{ +Additional methods and active bindings are added to this class in separate files: +\itemize{ +\item \link{evoland_db_tables} - Table active bindings (coords_t, lulc_data_t, etc.) +\item \link{evoland_db_views} - View active bindings (lulc_meta_long_v, etc.) and methods +\item \link{evoland_db_neighbors} - Neighbor analysis methods } -\section{Active bindings}{ -\if{html}{\out{
}} -\describe{ -\item{\code{coords_t}}{A \code{coords_t} instance; see \code{\link[=create_coords_t]{create_coords_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{periods_t}}{A \code{periods_t} instance; see \code{\link[=create_periods_t]{create_periods_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{lulc_meta_t}}{A \code{lulc_meta_t} instance; see \code{\link[=create_lulc_meta_t]{create_lulc_meta_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{lulc_meta_long_v}}{Return a \code{lulc_meta_long_v} instance, i.e. unrolled \code{lulc_meta_t}.} - -\item{\code{lulc_data_t}}{A \code{lulc_data_t} instance; see \code{\link[=as_lulc_data_t]{as_lulc_data_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{transitions_v}}{Get the transitions from \code{lulc_data_t}.} - -\item{\code{pred_data_t_float}}{A \code{pred_data_t_float} instance; see -\code{\link[=create_pred_data_t]{create_pred_data_t()}} for the type of object to assign. Assigning is an -upsert operation.} - -\item{\code{pred_data_t_int}}{A \code{pred_data_t_int} instance; see -\code{\link[=create_pred_data_t]{create_pred_data_t()}} for the type of object to assign. Assigning is an -upsert operation.} - -\item{\code{pred_data_t_bool}}{A \code{pred_data_t_bool} instance; see -\code{\link[=create_pred_data_t]{create_pred_data_t()}} for the type of object to assign. Assigning is an -upsert operation.} - -\item{\code{extent}}{Return a terra SpatExtent based on coords_t} - -\item{\code{coords_minimal}}{data.table with only (id_coord, lon, lat)} - -\item{\code{pred_meta_t}}{A \code{pred_meta_t} instance; see \code{\link[=create_pred_meta_t]{create_pred_meta_t()}} for the type of -object to assign. 
Assigning is an upsert operation.} - -\item{\code{pred_sources_v}}{Retrieve a table of distinct predictor urls and their -md5sum} - -\item{\code{trans_meta_t}}{A \code{trans_meta_t} instance; see \code{\link[=create_trans_meta_t]{create_trans_meta_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{trans_preds_t}}{A \code{trans_preds_t} instance; see \code{\link[=create_trans_preds_t]{create_trans_preds_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{intrv_meta_t}}{A \code{intrv_meta_t} instance; see \code{\link[=create_intrv_meta_t]{create_intrv_meta_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{intrv_masks_t}}{A \code{intrv_masks_t} instance; see \code{\link[=as_intrv_masks_t]{as_intrv_masks_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{trans_models_t}}{A \code{trans_models_t} instance; see \code{\link[=create_trans_models_t]{create_trans_models_t()}} for the type -of object to assign. Assigning is an upsert operation.} - -\item{\code{alloc_params_t}}{A \code{alloc_params_t} instance; see \code{\link[=as_alloc_params_t]{as_alloc_params_t()}} for the type -of object to assign. Assigning is an upsert operation.} } -\if{html}{\out{
}} +\section{Super class}{ +\code{\link[evoland:parquet_duckdb]{evoland::parquet_duckdb}} -> \code{evoland_db} } \section{Methods}{ \subsection{Public methods}{ \itemize{ +\item \href{#method-evoland_db-compute_neighbors}{\code{evoland_db$compute_neighbors()}} +\item \href{#method-evoland_db-generate_neighbor_predictors}{\code{evoland_db$generate_neighbor_predictors()}} +\item \href{#method-evoland_db-trans_pred_data_v}{\code{evoland_db$trans_pred_data_v()}} \item \href{#method-evoland_db-new}{\code{evoland_db$new()}} \item \href{#method-evoland_db-fetch}{\code{evoland_db$fetch()}} \item \href{#method-evoland_db-set_report}{\code{evoland_db$set_report()}} @@ -106,6 +56,38 @@ of object to assign. Assigning is an upsert operation.} }} \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-compute_neighbors}{}}} +\subsection{Method \code{compute_neighbors()}}{ +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{evoland_db$compute_neighbors( + max_distance = 1000, + distance_breaks = c(0, 100, 500, 1000), + resolution = 100, + overwrite = FALSE +)}\if{html}{\out{
}} +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-generate_neighbor_predictors}{}}} +\subsection{Method \code{generate_neighbor_predictors()}}{ +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{evoland_db$generate_neighbor_predictors()}\if{html}{\out{
}} +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-trans_pred_data_v}{}}} +\subsection{Method \code{trans_pred_data_v()}}{ +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{evoland_db$trans_pred_data_v(id_trans)}\if{html}{\out{
}} +} + +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-new}{}}} \subsection{Method \code{new()}}{ diff --git a/man/evoland_db_neighbors.Rd b/man/evoland_db_neighbors.Rd new file mode 100644 index 0000000..09e3e93 --- /dev/null +++ b/man/evoland_db_neighbors.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/evoland_db_neighbors.R +\name{evoland_db_neighbors} +\alias{evoland_db_neighbors} +\title{Neighbor analysis methods for evoland_db} +\description{ +This file adds neighbor analysis methods to the \code{evoland_db} class using R6's \verb{$set()} method. +These methods compute neighbor relationships and generate neighbor-based predictors. +} +\section{Methods Added}{ + +\itemize{ +\item \code{compute_neighbors(max_distance, distance_breaks, resolution, overwrite)} - +Computes neighbor relationships between coordinates. +\itemize{ +\item \code{max_distance}: Maximum distance for neighbors (default: 1000) +\item \code{distance_breaks}: Vector of breaks for distance classes (default: c(0, 100, 500, 1000)) +\item \code{resolution}: Grid resolution for distance calculations (default: 100) +\item \code{overwrite}: Whether to overwrite existing neighbors_t (default: FALSE) +} +\item \code{generate_neighbor_predictors()} - Generates predictor variables based on neighbor +land use counts by distance class. Requires neighbors_t with distance_class column +(compute_neighbors with distance_breaks). +} +} + diff --git a/man/evoland_db_tables.Rd b/man/evoland_db_tables.Rd new file mode 100644 index 0000000..59cc49d --- /dev/null +++ b/man/evoland_db_tables.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/evoland_db_tables.R +\name{evoland_db_tables} +\alias{evoland_db_tables} +\title{Table active bindings for evoland_db} +\description{ +This file adds table active bindings to the \code{evoland_db} class using R6's \verb{$set()} method. 
+These bindings provide read/write access to database tables with automatic validation. +} +\section{Active Bindings Added}{ + +\itemize{ +\item \code{coords_t} - Coordinates table. See \code{\link[=as_coords_t]{as_coords_t()}} +\item \code{periods_t} - Time periods table. See \code{\link[=as_periods_t]{as_periods_t()}} +\item \code{lulc_meta_t} - Land use/land cover metadata. See \code{\link[=as_lulc_meta_t]{as_lulc_meta_t()}} +\item \code{lulc_data_t} - Land use/land cover data. See \code{\link[=as_lulc_data_t]{as_lulc_data_t()}} +\item \code{pred_data_t_float} - Float predictor data. See \code{\link[=as_pred_data_t]{as_pred_data_t()}} +\item \code{pred_data_t_int} - Integer predictor data. See \code{\link[=as_pred_data_t]{as_pred_data_t()}} +\item \code{pred_data_t_bool} - Boolean predictor data. See \code{\link[=as_pred_data_t]{as_pred_data_t()}} +\item \code{pred_meta_t} - Predictor metadata. See \code{\link[=as_pred_meta_t]{as_pred_meta_t()}} +\item \code{trans_meta_t} - Transition metadata. See \code{\link[=as_trans_meta_t]{as_trans_meta_t()}} +\item \code{trans_preds_t} - Transition-predictor relationships. See \code{\link[=as_trans_preds_t]{as_trans_preds_t()}} +\item \code{intrv_meta_t} - Intervention metadata. See \code{\link[=as_intrv_meta_t]{as_intrv_meta_t()}} +\item \code{intrv_masks_t} - Intervention masks. See \code{\link[=as_intrv_masks_t]{as_intrv_masks_t()}} +\item \code{trans_models_t} - Transition models. See \code{\link[=as_trans_models_t]{as_trans_models_t()}} +\item \code{alloc_params_t} - Allocation parameters. See \code{\link[=as_alloc_params_t]{as_alloc_params_t()}} +\item \code{neighbors_t} - Neighbor relationships. 
See \code{\link[=as_neighbors_t]{as_neighbors_t()}} +} +} + diff --git a/man/evoland_db_views.Rd b/man/evoland_db_views.Rd index a080c24..d0a7c93 100644 --- a/man/evoland_db_views.Rd +++ b/man/evoland_db_views.Rd @@ -2,49 +2,26 @@ % Please edit documentation in R/evoland_db_views.R \name{evoland_db_views} \alias{evoland_db_views} -\alias{make_pred_sources_v} -\alias{make_lulc_meta_long_v} -\alias{make_coords_minimal} -\alias{make_extent_db} -\alias{make_transitions_v} -\alias{make_trans_pred_data_v} \title{Views on the evoland-plus data model} -\usage{ -make_pred_sources_v(self, private, where = NULL) - -make_lulc_meta_long_v(self, private, where = NULL) - -make_coords_minimal(self, private, where = NULL) - -make_extent_db(self, private) - -make_transitions_v(self, private, where = NULL) - -make_trans_pred_data_v(self, private, id_trans) -} -\arguments{ -\item{id_trans}{Integer, the transition ID to generate data for} -} -\value{ -data.table with columns: result (0/1), id_pred_1, id_pred_2, ..., id_pred_N -} \description{ -Functions to generate views on the database +This file adds view active bindings and methods to the \code{evoland_db} class using R6's \verb{$set()} method. +These provide computed views on the database without storing additional data. } -\section{Functions}{ -\itemize{ -\item \code{make_pred_sources_v()}: Retrieve a table of distinct predictor urls and their -md5sum +\section{Active Bindings Added}{ -\item \code{make_lulc_meta_long_v()}: Return a \code{lulc_meta_long_v} instance, i.e. unrolled \code{lulc_meta_t}. 
- -\item \code{make_coords_minimal()}: Minimal coordinate representation (id_coord, lon, lat) - -\item \code{make_extent_db()}: Returns the extent of the coords_t as terra::SpatExtent +\itemize{ +\item \code{lulc_meta_long_v} - Unrolled LULC metadata with one row per source class +\item \code{pred_sources_v} - Distinct predictor URLs and their MD5 checksums +\item \code{transitions_v} - Land use transitions derived from lulc_data_t +\item \code{extent} - Spatial extent of coords_t as terra::SpatExtent +\item \code{coords_minimal} - Minimal coordinate representation (id_coord, lon, lat) +} +} -\item \code{make_transitions_v()}: Returns transitions based on lulc_data_t +\section{Methods Added}{ -\item \code{make_trans_pred_data_v()}: Returns wide table of transition results and predictor data -for a specific transition. Used as input to covariance filtering. +\itemize{ +\item \code{trans_pred_data_v(id_trans)} - Returns wide table of transition results and predictor data for a specific transition. Used as input to covariance filtering. +} +} -}} diff --git a/man/neighbors_t.Rd b/man/neighbors_t.Rd new file mode 100644 index 0000000..bc95b6a --- /dev/null +++ b/man/neighbors_t.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/neighbors_t.R +\name{neighbors_t} +\alias{neighbors_t} +\alias{as_neighbors_t} +\alias{validate.neighbors_t} +\alias{print.neighbors_t} +\title{Create Neighbors Table} +\usage{ +as_neighbors_t(x) + +\method{validate}{neighbors_t}(x, ...) + +\method{print}{neighbors_t}(x, nrow = 10, ...) +} +\arguments{ +\item{x}{An object that can be passed to \code{\link[data.table:setDT]{data.table::setDT()}}} + +\item{...}{Passed to \link[data.table:print.data.table]{data.table::print.data.table}} + +\item{nrow}{Maximum number of rows to print. 
See \link[data.table:print.data.table]{data.table::print.data.table}} +} +\value{ +A data.table of class "neighbors_t" with columns: +\itemize{ +\item \code{id_coord_origin}: Foreign key to coords_t (origin coordinate) +\item \code{id_coord_neighbor}: Foreign key to coords_t (neighbor coordinate) +\item \code{distance}: Numeric distance between coordinates +\item \code{distance_class}: Optional factor representing distance intervals +} +} +\description{ +Creates a \code{neighbors_t} table and validates that it matches the schema. +} +\section{Methods (by generic)}{ +\itemize{ +\item \code{validate(neighbors_t)}: Validate a neighbors_t object + +\item \code{print(neighbors_t)}: Print a neighbors_t object + +}} diff --git a/man/trans_preds_t.Rd b/man/trans_preds_t.Rd index 545381f..a555322 100644 --- a/man/trans_preds_t.Rd +++ b/man/trans_preds_t.Rd @@ -9,16 +9,28 @@ \usage{ as_trans_preds_t(x) -create_trans_preds_t() +create_trans_preds_t( + db, + corcut = 0.7, + rank_fun = rank_poly_glm, + weights = NULL, + ... +) \method{print}{trans_preds_t}(x, nrow = 10, ...) } \arguments{ -\item{nrow}{see \link[data.table:print.data.table]{data.table::print.data.table}} +\item{db}{An \link{evoland_db} instance with populated tables} + +\item{corcut}{Numeric threshold (0-1) for correlation filtering passed to \code{\link[=covariance_filter]{covariance_filter()}}} + +\item{rank_fun}{Optional ranking function passed to \code{\link[=covariance_filter]{covariance_filter()}}} + +\item{weights}{Optional weights passed to \code{\link[=covariance_filter]{covariance_filter()}}} \item{...}{passed to \link[data.table:print.data.table]{data.table::print.data.table}} -\item{db}{An \link{evoland_db} instance with populated trans_meta_t and pred_meta_t tables} +\item{nrow}{see \link[data.table:print.data.table]{data.table::print.data.table}} } \value{ A data.table of class "trans_preds_t" with columns: @@ -40,6 +52,7 @@ modelling each transition type. 
\section{Functions}{ \itemize{ \item \code{create_trans_preds_t()}: Create a transition-predictor relation, i.e. records the -result of a predictor selection step. +result of a predictor selection step. Runs covariance filtering for each viable +transition and stores the selected predictors. }} From e83d1199a7ec232e00288e48a37da618c71c4c98 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Fri, 28 Nov 2025 15:35:35 +0100 Subject: [PATCH 09/27] cast_dt_col - check before coercing --- R/coords_t.R | 4 ++-- R/evoland_db_views.R | 2 +- R/intrv_masks_t.R | 4 ++-- R/lulc_data_t.R | 2 +- R/neighbors_t.R | 22 ++++++++++++---------- R/pred_data_t.R | 14 +++----------- R/trans_preds_t.R | 4 ++-- R/util.R | 22 ++++++++++++++++++++-- man/util.Rd | 2 +- 9 files changed, 44 insertions(+), 32 deletions(-) diff --git a/R/coords_t.R b/R/coords_t.R index e4c77f2..e99609a 100644 --- a/R/coords_t.R +++ b/R/coords_t.R @@ -34,9 +34,9 @@ as_coords_t <- function(x) { geom_polygon = list() ) } - cast_dt_col(x, "id_coord", as.integer) + cast_dt_col(x, "id_coord", "int") if (!is.null(x[["region"]])) { - cast_dt_col(x, "region", as.factor) + cast_dt_col(x, "region", "factor") } new_evoland_table( x, diff --git a/R/evoland_db_views.R b/R/evoland_db_views.R index 6f41fae..3201935 100644 --- a/R/evoland_db_views.R +++ b/R/evoland_db_views.R @@ -96,7 +96,7 @@ evoland_db$set("active", "coords_minimal", function() { from coords_t }" )) |> - cast_dt_col("id_coord", as.integer) |> + cast_dt_col("id_coord", "int") |> data.table::setkeyv("id_coord") }) }) diff --git a/R/intrv_masks_t.R b/R/intrv_masks_t.R index 501daf7..8aee5f3 100644 --- a/R/intrv_masks_t.R +++ b/R/intrv_masks_t.R @@ -18,8 +18,8 @@ as_intrv_masks_t <- function(x) { id_coord = integer(0) ) } - cast_dt_col(x, "id_coord", as.integer) - cast_dt_col(x, "id_coord", as.integer) + cast_dt_col(x, "id_coord", "int") + cast_dt_col(x, "id_coord", "int") new_evoland_table( x, "intrv_masks_t", diff --git 
a/R/lulc_data_t.R b/R/lulc_data_t.R index 91c4bcc..817df05 100644 --- a/R/lulc_data_t.R +++ b/R/lulc_data_t.R @@ -19,7 +19,7 @@ as_lulc_data_t <- function(x) { id_period = integer(0) ) } - cast_dt_col(x, "id_coord", as.integer) + cast_dt_col(x, "id_coord", "int") new_evoland_table( x, "lulc_data_t", diff --git a/R/neighbors_t.R b/R/neighbors_t.R index 5d2e98f..d4b142e 100644 --- a/R/neighbors_t.R +++ b/R/neighbors_t.R @@ -20,10 +20,10 @@ as_neighbors_t <- function(x) { distance = numeric(0) ) } - cast_dt_col(x, "id_coord_origin", as.integer) - cast_dt_col(x, "id_coord_neighbor", as.integer) + cast_dt_col(x, "id_coord_origin", "int") + cast_dt_col(x, "id_coord_neighbor", "int") if ("distance_class" %in% names(x)) { - cast_dt_col(x, "distance_class", as.factor) + cast_dt_col(x, "distance_class", "factor") } new_evoland_table( x, @@ -56,21 +56,23 @@ validate.neighbors_t <- function(x, ...) { #' @export print.neighbors_t <- function(x, nrow = 10, ...) { if (nrow(x) > 0) { - n_origins <- data.table::uniqueN(x[["id_coord_origin"]]) - n_neighbors <- data.table::uniqueN(x[["id_coord_neighbor"]]) - total_pairs <- nrow(x) + total_pairs <- format( + nrow(x), + big.mark = "_", + scientific = FALSE + ) extra_info <- "" if ("distance_class" %in% names(x)) { - n_classes <- data.table::uniqueN(x[["distance_class"]]) - extra_info <- glue::glue(", Distance classes: {n_classes}") + extra_info <- paste( + "Distance classes:", + paste(levels(x[["distance_class"]]), collapse = ", ") + ) } cat(glue::glue( "Neighbors Table\n", "Neighbor pairs: {total_pairs}\n", - "Origin coordinates: {n_origins}, ", - "Neighbor coordinates: {n_neighbors}", "{extra_info}\n\n" )) } else { diff --git a/R/pred_data_t.R b/R/pred_data_t.R index d3c7972..45ee49e 100644 --- a/R/pred_data_t.R +++ b/R/pred_data_t.R @@ -18,26 +18,18 @@ as_pred_data_t <- function(x, type) { stopifnot(type %in% c("float", "int", "bool")) - # Create empty table with proper value type - coercion_fn <- switch( - type, - float = 
as.numeric, - int = as.integer, - bool = as.logical - ) - if (missing(x)) { x <- data.table::data.table( id_pred = integer(0), id_coord = integer(0), id_period = integer(0), - value = coercion_fn(integer(0)) + value = integer(0) ) } data.table::setDT(x, key = c("id_pred", "id_coord", "id_period")) |> - cast_dt_col("value", coercion_fn) |> - cast_dt_col("id_coord", as.integer) + cast_dt_col("value", type) |> + cast_dt_col("id_coord", "int") class_name <- paste0("pred_data_t_", type) new_evoland_table(x, c(class_name, "pred_data_t")) diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index ae324a5..f6ec30f 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -19,8 +19,8 @@ as_trans_preds_t <- function(x) { id_trans = integer(0) ) } - cast_dt_col(x, "id_pred", as.integer) - cast_dt_col(x, "id_trans", as.integer) + cast_dt_col(x, "id_pred", "int") + cast_dt_col(x, "id_trans", "int") new_evoland_table( x, "trans_preds_t", diff --git a/R/util.R b/R/util.R index ceebbcb..7b8e1c0 100644 --- a/R/util.R +++ b/R/util.R @@ -143,11 +143,29 @@ print_rowwise_yaml <- function(df) { } #' @describeIn util Cast a data.table column; invisibly returns x -cast_dt_col <- function(x, colname, castfun) { +cast_dt_col <- function(x, colname, type) { + predicate_fn <- switch( + type, + float = is.numeric, + int = is.integer, + bool = is.logical, + factor = is.factor + ) + if (predicate_fn(x[[colname]])) { + return(invisible(x)) + } + + coercion_fn <- switch( + type, + float = as.numeric, + int = as.integer, + bool = as.logical, + factor = as.factor + ) data.table::set( x = x, j = colname, - value = castfun(x[[colname]]) + value = coercion_fn(x[[colname]]) ) invisible(x) } diff --git a/man/util.Rd b/man/util.Rd index 575d7f8..a83b6fc 100644 --- a/man/util.Rd +++ b/man/util.Rd @@ -26,7 +26,7 @@ ensure_dir(dir) print_rowwise_yaml(df) -cast_dt_col(x, colname, castfun) +cast_dt_col(x, colname, type) } \arguments{ \item{x}{Left-hand side value} From 46ee96aa39c7847721312cb20b56041f14aa768f 
Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Fri, 28 Nov 2025 16:11:45 +0100 Subject: [PATCH 10/27] error if table is missing --- R/evoland_db.R | 47 ++---------------------------- R/trans_preds_t.R | 3 +- inst/tinytest/test_evoland_db.R | 21 +------------ inst/tinytest/test_trans_preds_t.R | 9 ++++-- 4 files changed, 11 insertions(+), 69 deletions(-) diff --git a/R/evoland_db.R b/R/evoland_db.R index 628dfb2..99d498d 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -60,6 +60,8 @@ evoland_db <- R6::R6Class( fetch = function(table_name, where = NULL, limit = NULL) { # Check if this is a view (active binding) if ( + # TODO these should probably not be active bindings, but instead methods with + # predefined query parameters table_name %in% c("lulc_meta_long_v", "pred_sources_v", "transitions_v", "extent", "coords_minimal") ) { @@ -69,8 +71,7 @@ evoland_db <- R6::R6Class( file_info <- private$get_file_path(table_name) if (!file_info$exists) { - # TODO make this an error - return(private$get_empty_table(table_name)) + stop("Table `", table_name, "` does not exist") } super$fetch(table_name, where, limit) @@ -211,47 +212,5 @@ evoland_db <- R6::R6Class( invisible(self) } - ), - - ## Active Bindings ---- - active = list(), - - ## Private Methods ---- - private = list( - # Get empty table with proper structure (evoland-specific) - # - # param table_name Character string table name - # return Empty data.table with correct columns - get_empty_table = function(table_name) { - # Define empty table structures - empty_tables <- list( - reporting_t = suppressWarnings(data.table::data.table( - key = character(0), - value = character(0) - )), - coords_t = as_coords_t(), - periods_t = as_periods_t(), - lulc_meta_t = as_lulc_meta_t(), - lulc_data_t = as_lulc_data_t(), - pred_meta_t = as_pred_meta_t(), - pred_data_t_float = as_pred_data_t(type = "float"), - pred_data_t_int = as_pred_data_t(type = "int"), - pred_data_t_bool = 
as_pred_data_t(type = "bool"), - trans_meta_t = as_trans_meta_t(), - trans_preds_t = as_trans_preds_t(), - intrv_meta_t = as_intrv_meta_t(), - intrv_masks_t = as_intrv_masks_t(), - trans_models_t = as_trans_models_t(), - alloc_params_t = as_alloc_params_t(), - neighbors_t = as_neighbors_t() - ) - - if (table_name %in% names(empty_tables)) { - return(empty_tables[[table_name]]) - } - - # Default: return empty data.table - data.table::data.table() - } ) ) diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index f6ec30f..dc9e10c 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -51,8 +51,7 @@ create_trans_preds_t <- function( viable_trans <- db$trans_meta_t[is_viable == TRUE] pred_meta <- db$pred_meta_t stopifnot( - "No viable transitions found in trans_meta_t" = nrow(viable_trans) > 0L, - "No predictors found in pred_meta_t" = nrow(pred_meta) > 0L + "No viable transitions found in trans_meta_t" = nrow(viable_trans) > 0L ) results_list <- list() diff --git a/inst/tinytest/test_evoland_db.R b/inst/tinytest/test_evoland_db.R index 4a73bc6..c48f2df 100644 --- a/inst/tinytest/test_evoland_db.R +++ b/inst/tinytest/test_evoland_db.R @@ -38,26 +38,7 @@ expect_equal( sort(reporting2$key) ) -# Test 4: Domain-specific empty table structures via active bindings -empty_tables <- c( - "alloc_params_t", - "coords_t", - "intrv_masks_t", - "intrv_meta_t", - "lulc_data_t", - "lulc_meta_t", - "periods_t", - "pred_data_t_bool", - "pred_data_t_float", - "pred_data_t_int", - "pred_meta_t", - "trans_meta_t", - "trans_models_t", - "trans_preds_t" -) -for (table in empty_tables) { - expect_equal(nrow(db[[table]]), 0L) -} +# Test 4 removed: Fetching a missing table now produces an error # Test 5: Create synthetic evoland-specific test data coords_t <- create_coords_t_square( diff --git a/inst/tinytest/test_trans_preds_t.R b/inst/tinytest/test_trans_preds_t.R index 3c5feda..737dc6a 100644 --- a/inst/tinytest/test_trans_preds_t.R +++ b/inst/tinytest/test_trans_preds_t.R @@ -184,7 +184,7 
@@ expect_error( expect_error( create_trans_preds_t(db = db_empty), - "No viable transitions" + "Table `trans_meta_t` does not exist" ) # Test with no predictors @@ -195,10 +195,13 @@ db_no_pred$coords_t <- db_tps$coords_t db_no_pred$periods_t <- db_tps$periods_t db_no_pred$lulc_meta_t <- db_tps$lulc_meta_t db_no_pred$lulc_data_t <- db_tps$lulc_data_t -db_no_pred$trans_meta_t <- db_tps$trans_meta_t +expect_warning( + db_no_pred$trans_meta_t <- db_tps$trans_meta_t, + "Overriding existing IDs" +) expect_error( create_trans_preds_t(db = db_no_pred), - "No predictors found" + "Table `pred_meta_t` does not exist" ) # Test print method From 176da53a59c8db38856c887441dc0d088ad6375f Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Fri, 28 Nov 2025 17:10:08 +0100 Subject: [PATCH 11/27] weights makes no sense at this level --- R/trans_preds_t.R | 2 -- man/trans_preds_t.Rd | 10 +--------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index dc9e10c..4e2dd87 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -34,14 +34,12 @@ as_trans_preds_t <- function(x) { #' @param db An [evoland_db] instance with populated tables #' @param corcut Numeric threshold (0-1) for correlation filtering passed to [covariance_filter()] #' @param rank_fun Optional ranking function passed to [covariance_filter()] -#' @param weights Optional weights passed to [covariance_filter()] #' @param ... Additional arguments passed to rank_fun via [covariance_filter()] #' @export create_trans_preds_t <- function( db, corcut = 0.7, rank_fun = rank_poly_glm, - weights = NULL, ... ) { stopifnot( diff --git a/man/trans_preds_t.Rd b/man/trans_preds_t.Rd index a555322..e2cbebf 100644 --- a/man/trans_preds_t.Rd +++ b/man/trans_preds_t.Rd @@ -9,13 +9,7 @@ \usage{ as_trans_preds_t(x) -create_trans_preds_t( - db, - corcut = 0.7, - rank_fun = rank_poly_glm, - weights = NULL, - ... 
-) +create_trans_preds_t(db, corcut = 0.7, rank_fun = rank_poly_glm, ...) \method{print}{trans_preds_t}(x, nrow = 10, ...) } @@ -26,8 +20,6 @@ create_trans_preds_t( \item{rank_fun}{Optional ranking function passed to \code{\link[=covariance_filter]{covariance_filter()}}} -\item{weights}{Optional weights passed to \code{\link[=covariance_filter]{covariance_filter()}}} - \item{...}{passed to \link[data.table:print.data.table]{data.table::print.data.table}} \item{nrow}{see \link[data.table:print.data.table]{data.table::print.data.table}} From e8f2ef7438032c38dac12df1b52c402ed0d1222e Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Fri, 28 Nov 2025 17:10:58 +0100 Subject: [PATCH 12/27] compute_neighbors -> create_neighbors_t --- NAMESPACE | 2 +- R/evoland_db_neighbors.R | 12 ++--- R/neighbors_t.R | 66 ++++++++++++++++++++++++++++ R/util_terra.R | 78 --------------------------------- inst/tinytest/test_util_terra.R | 20 ++++----- man/evoland_db.Rd | 10 ++--- man/evoland_db_neighbors.Rd | 4 +- man/neighbors_t.Rd | 34 ++++++++++++++ man/util_terra.Rd | 31 ------------- 9 files changed, 124 insertions(+), 133 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index f111398..35a656a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,12 +46,12 @@ export(as_pred_meta_t) export(as_trans_meta_t) export(as_trans_models_t) export(as_trans_preds_t) -export(compute_neighbors) export(covariance_filter) export(create_coords_t_square) export(create_intrv_meta_t) export(create_intrv_meta_t_row) export(create_lulc_meta_t) +export(create_neighbors_t) export(create_periods_t) export(create_pred_meta_t) export(create_trans_meta_t) diff --git a/R/evoland_db_neighbors.R b/R/evoland_db_neighbors.R index 65f29f8..e4d02cc 100644 --- a/R/evoland_db_neighbors.R +++ b/R/evoland_db_neighbors.R @@ -6,7 +6,7 @@ #' #' @section Methods Added: #' -#' - `compute_neighbors(max_distance, distance_breaks, resolution, overwrite)` - +#' - `create_neighbors_t(max_distance, 
distance_breaks, resolution, overwrite)` - #' Computes neighbor relationships between coordinates. #' - `max_distance`: Maximum distance for neighbors (default: 1000) #' - `distance_breaks`: Vector of breaks for distance classes (default: c(0, 100, 500, 1000)) @@ -14,7 +14,7 @@ #' - `overwrite`: Whether to overwrite existing neighbors_t (default: FALSE) #' - `generate_neighbor_predictors()` - Generates predictor variables based on neighbor #' land use counts by distance class. Requires neighbors_t with distance_class column -#' (compute_neighbors with distance_breaks). +#' (create_neighbors_t with distance_breaks). #' #' @name evoland_db_neighbors #' @include evoland_db.R @@ -22,7 +22,7 @@ NULL evoland_db$set( "public", - "compute_neighbors", + "create_neighbors_t", function( max_distance = 1000, distance_breaks = c(0, 100, 500, 1000), @@ -36,7 +36,7 @@ evoland_db$set( coords <- self$coords_t - neighbors <- compute_neighbors( + neighbors <- create_neighbors_t( coords, max_distance = max_distance, distance_breaks = distance_breaks, @@ -55,7 +55,7 @@ evoland_db$set( evoland_db$set("public", "generate_neighbor_predictors", function() { if (self$row_count("neighbors_t") == 0) { - stop("No neighbor data found. Run $compute_neighbors() first.") + stop("No neighbor data found. Run $create_neighbors_t() first.") } if (self$row_count("lulc_meta_t") == 0) { @@ -69,7 +69,7 @@ evoland_db$set("public", "generate_neighbor_predictors", function() { neighbors_sample <- self$fetch("neighbors_t", limit = 1) if (!"distance_class" %in% names(neighbors_sample)) { stop( - "neighbors_t does not have distance_class column. Run $compute_neighbors() with distance_breaks parameter." + "neighbors_t does not have distance_class column. Run $create_neighbors_t() with distance_breaks parameter." 
) } diff --git a/R/neighbors_t.R b/R/neighbors_t.R index d4b142e..f4d09fd 100644 --- a/R/neighbors_t.R +++ b/R/neighbors_t.R @@ -32,6 +32,72 @@ as_neighbors_t <- function(x) { ) } +#' @describeIn neighbors_t Compute neighboring coordinates within specified distances. In +#' order to be computationally feasible, the coordinates' IDs are rasterized before +#' their actual Euclidean distance is calculated. If coordinates are so close that they +#' get rasterized to the same cell, the first one is used and a warning is emitted. If +#' this happens, try again using a lower resolution. +#' @param max_distance Maximum distance to search for neighbors (in same units as +#' coordinates and resolution) +#' @param distance_breaks Optional numeric vector defining distance class boundaries. +#' If NULL, no distance classification is performed. +#' If provided, must have at least 2 elements defining interval breaks. +#' @param resolution Grid cell size for rasterization (default: 100.0, in same units as coordinates) +#' @return A data.table with columns: +#' - id_coord_origin: ID of the origin coordinate +#' - id_coord_neighbor: ID of the neighboring coordinate +#' - distance: Distance between origin and neighbor +#' - distance_class: Factor indicating distance class (if distance_breaks provided) +#' @export +create_neighbors_t <- function( + coords_t, + max_distance, + distance_breaks = NULL, + resolution = 100.0 +) { + # Validate inputs + if (!inherits(coords_t, "coords_t")) { + stop("coords_t must be a coords_t object") + } + + if (!is.numeric(max_distance) || length(max_distance) != 1 || max_distance <= 0) { + stop("max_distance must be a positive scalar numeric") + } + + if (!is.null(distance_breaks)) { + if (!is.numeric(distance_breaks) || length(distance_breaks) < 2) { + stop("distance_breaks must be NULL or a numeric vector with at least 2 elements") + } + } + + # Call C++ function + dt <- distance_neighbors_cpp( + coords_t = coords_t, + max_distance = max_distance, + 
resolution = resolution + ) + + data.table::setalloccol(dt) + + # Rename distance_approx to distance + data.table::setnames(dt, "distance_approx", "distance") + + # Add distance class if breaks provided + if (!is.null(distance_breaks)) { + dt[, + distance_class := cut( + distance, + breaks = distance_breaks, + right = FALSE, + include.lowest = TRUE + ) + ] + } + + as_neighbors_t(dt) +} + + #' @describeIn neighbors_t Validate a neighbors_t object #' @export validate.neighbors_t <- function(x, ...) { diff --git a/R/util_terra.R b/R/util_terra.R index 42bd2f5..463c772 100644 --- a/R/util_terra.R +++ b/R/util_terra.R @@ -91,81 +91,3 @@ extract_using_coords_t.SpatVector <- function(x, coords_t, na_omit = TRUE) { # TODO move this to a neighbors_t.R file that includes a formal class definition # for the return table, including validation, print, coercion -#' @describeIn util_terra Compute neighboring coordinates within specified distances. In -#' order to be computationally feasible, the coordinates' IDs are rasterized before -#' their actual Euclidean distance is calculated. If coordinates are so close that they -#' get rasterized to the same cell, the first one is used and a warning is emitted. If -#' this happens, try again using a lower resolution. -#' @param max_distance Maximum distance to search for neighbors (in same units as -#' coordinates and resolution) -#' @param distance_breaks Optional numeric vector defining distance class boundaries. -#' If NULL, no distance classification is performed. -#' If provided, must have at least 2 elements defining interval breaks. 
-#' @param resolution Grid cell size for rasterization (default: 100.0, in same units as coordinates) -#' @return A data.table with columns: -#' - id_coord_origin: ID of the origin coordinate -#' - id_coord_neighbor: ID of the neighboring coordinate -#' - distance: Distance between origin and neighbor -#' - distance_class: Factor indicating distance class (if distance_breaks provided) -#' @export -compute_neighbors <- function( - coords_t, - max_distance, - distance_breaks = NULL, - resolution = 100.0 -) { - # Validate inputs - if (!inherits(coords_t, "coords_t")) { - stop("coords_t must be a coords_t object") - } - - if (!is.numeric(max_distance) || length(max_distance) != 1 || max_distance <= 0) { - stop("max_distance must be a positive scalar numeric") - } - - if (!is.null(distance_breaks)) { - if (!is.numeric(distance_breaks) || length(distance_breaks) < 2) { - stop("distance_breaks must be NULL or a numeric vector with at least 2 elements") - } - } - - # Call C++ function - dt <- distance_neighbors_cpp( - coords_t = coords_t, - max_distance = max_distance, - resolution = resolution - ) - - # Set data.table allocation - dt <- data.table::setalloccol(dt) - - # Rename distance_approx to distance - data.table::setnames(dt, "distance_approx", "distance") - - # Add distance class if breaks provided - if (!is.null(distance_breaks)) { - dt[, - distance_class := cut( - distance, - breaks = distance_breaks, - right = FALSE, - include.lowest = TRUE - ) - ] - } - - # Reorder columns - if (!is.null(distance_breaks)) { - data.table::setcolorder( - dt, - c("id_coord_origin", "id_coord_neighbor", "distance_class", "distance") - ) - } else { - data.table::setcolorder( - dt, - c("id_coord_origin", "id_coord_neighbor", "distance") - ) - } - - return(dt) -} diff --git a/inst/tinytest/test_util_terra.R b/inst/tinytest/test_util_terra.R index b3089d3..efb0dac 100644 --- a/inst/tinytest/test_util_terra.R +++ b/inst/tinytest/test_util_terra.R @@ -140,7 +140,7 @@ 
expect_true(!anyNA(vector_na_omit_true$value)) vector_na_omit_false <- extract_using_coords_t(vect_with_na, coords_t, na_omit = FALSE) expect_true(anyNA(vector_na_omit_false$value)) -# Test compute_neighbors +# Test create_neighbors_t # Create a simple test coords_t with known distances test_coords <- data.table::data.table( id_coord = 1L:5L, @@ -152,7 +152,7 @@ test_coords <- data.table::data.table( test_coords <- as_coords_t(test_coords) # Test basic neighbor computation with max_distance = 150 -neighbors <- compute_neighbors(test_coords, max_distance = 150) +neighbors <- create_neighbors_t(test_coords, max_distance = 150) # Check structure expect_true(data.table::is.data.table(neighbors)) @@ -204,7 +204,7 @@ expect_equal( ) # Test with distance_breaks -neighbors_classified <- compute_neighbors( +neighbors_classified <- create_neighbors_t( test_coords, max_distance = 150, distance_breaks = c(0, 100, 150) @@ -234,7 +234,7 @@ expect_equal( ) # Test with smaller max_distance -neighbors_small <- compute_neighbors(test_coords, max_distance = 110) +neighbors_small <- create_neighbors_t(test_coords, max_distance = 110) # With max_distance = 110, point 1 should only have neighbors 2 and 4 (distance 100) # but not 5 (distance ~141.4) @@ -246,27 +246,27 @@ expect_false(5L %in% neighbors_from_1_small$id_coord_neighbor) # Test error handling expect_error( - compute_neighbors("not_coords_t", max_distance = 100), + create_neighbors_t("not_coords_t", max_distance = 100), "coords_t must be a coords_t object" ) expect_error( - compute_neighbors(test_coords, max_distance = -10), + create_neighbors_t(test_coords, max_distance = -10), "max_distance must be a positive scalar numeric" ) expect_error( - compute_neighbors(test_coords, max_distance = 100, distance_breaks = c(1)), + create_neighbors_t(test_coords, max_distance = 100, distance_breaks = c(1)), "distance_breaks must be NULL or a numeric vector with at least 2 elements" ) expect_error( - compute_neighbors(test_coords, 
max_distance = 100, distance_breaks = "invalid"), + create_neighbors_t(test_coords, max_distance = 100, distance_breaks = "invalid"), "distance_breaks must be NULL or a numeric vector with at least 2 elements" ) # Test with real coords_t from earlier in the test file -real_neighbors <- compute_neighbors(coords_t, max_distance = 300) +real_neighbors <- create_neighbors_t(coords_t, max_distance = 300) # Each point in a regular 100m grid should have neighbors # Interior points should have 8 neighbors within 300m (8-connectivity) @@ -291,7 +291,7 @@ for (i in seq_len( } # Test with distance classification on real data -real_neighbors_class <- compute_neighbors( +real_neighbors_class <- create_neighbors_t( coords_t, max_distance = 300, distance_breaks = c(0, 150, 300) diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index 3e005d7..fbdc608 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -25,7 +25,7 @@ Additional methods and active bindings are added to this class in separate files \section{Methods}{ \subsection{Public methods}{ \itemize{ -\item \href{#method-evoland_db-compute_neighbors}{\code{evoland_db$compute_neighbors()}} +\item \href{#method-evoland_db-create_neighbors_t}{\code{evoland_db$create_neighbors_t()}} \item \href{#method-evoland_db-generate_neighbor_predictors}{\code{evoland_db$generate_neighbor_predictors()}} \item \href{#method-evoland_db-trans_pred_data_v}{\code{evoland_db$trans_pred_data_v()}} \item \href{#method-evoland_db-new}{\code{evoland_db$new()}} @@ -56,11 +56,11 @@ Additional methods and active bindings are added to this class in separate files }} \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-compute_neighbors}{}}} -\subsection{Method \code{compute_neighbors()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-create_neighbors_t}{}}} +\subsection{Method \code{create_neighbors_t()}}{ \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$compute_neighbors( +\if{html}{\out{
}}\preformatted{evoland_db$create_neighbors_t( max_distance = 1000, distance_breaks = c(0, 100, 500, 1000), resolution = 100, diff --git a/man/evoland_db_neighbors.Rd b/man/evoland_db_neighbors.Rd index 09e3e93..91db904 100644 --- a/man/evoland_db_neighbors.Rd +++ b/man/evoland_db_neighbors.Rd @@ -10,7 +10,7 @@ These methods compute neighbor relationships and generate neighbor-based predict \section{Methods Added}{ \itemize{ -\item \code{compute_neighbors(max_distance, distance_breaks, resolution, overwrite)} - +\item \code{create_neighbors_t(max_distance, distance_breaks, resolution, overwrite)} - Computes neighbor relationships between coordinates. \itemize{ \item \code{max_distance}: Maximum distance for neighbors (default: 1000) @@ -20,7 +20,7 @@ Computes neighbor relationships between coordinates. } \item \code{generate_neighbor_predictors()} - Generates predictor variables based on neighbor land use counts by distance class. Requires neighbors_t with distance_class column -(compute_neighbors with distance_breaks). +(create_neighbors_t with distance_breaks). } } diff --git a/man/neighbors_t.Rd b/man/neighbors_t.Rd index bc95b6a..2376f02 100644 --- a/man/neighbors_t.Rd +++ b/man/neighbors_t.Rd @@ -3,12 +3,20 @@ \name{neighbors_t} \alias{neighbors_t} \alias{as_neighbors_t} +\alias{create_neighbors_t} \alias{validate.neighbors_t} \alias{print.neighbors_t} \title{Create Neighbors Table} \usage{ as_neighbors_t(x) +create_neighbors_t( + coords_t, + max_distance, + distance_breaks = NULL, + resolution = 100 +) + \method{validate}{neighbors_t}(x, ...) \method{print}{neighbors_t}(x, nrow = 10, ...) @@ -16,6 +24,15 @@ as_neighbors_t(x) \arguments{ \item{x}{An object that can be passed to \code{\link[data.table:setDT]{data.table::setDT()}}} +\item{max_distance}{Maximum distance to search for neighbors (in same units as +coordinates and resolution)} + +\item{distance_breaks}{Optional numeric vector defining distance class boundaries. 
+If NULL, no distance classification is performed. +If provided, must have at least 2 elements defining interval breaks.} + +\item{resolution}{Grid cell size for rasterization (default: 100.0, in same units as coordinates)} + \item{...}{Passed to \link[data.table:print.data.table]{data.table::print.data.table}} \item{nrow}{Maximum number of rows to print. See \link[data.table:print.data.table]{data.table::print.data.table}} @@ -28,6 +45,14 @@ A data.table of class "neighbors_t" with columns: \item \code{distance}: Numeric distance between coordinates \item \code{distance_class}: Optional factor representing distance intervals } + +A data.table with columns: +\itemize{ +\item id_coord_origin: ID of the origin coordinate +\item id_coord_neighbor: ID of the neighboring coordinate +\item distance: Distance between origin and neighbor +\item distance_class: Factor indicating distance class (if distance_breaks provided) +} } \description{ Creates a \code{neighbors_t} table and validates that it matches the schema. @@ -39,3 +64,12 @@ Creates a \code{neighbors_t} table and validates that it matches the schema. \item \code{print(neighbors_t)}: Print a neighbors_t object }} +\section{Functions}{ +\itemize{ +\item \code{create_neighbors_t()}: Compute neighboring coordinates within specified distances. In +order to be computationally feasible, the coordinates' IDs are rasterized before +their actual Euclidean distance is calculated. If coordinates are so close that they +get rasterized to the same cell, the first one is used and a warning is emitted. If +this happens, try again using a lower resolution. 
+ +}} diff --git a/man/util_terra.Rd b/man/util_terra.Rd index d46c499..e429d0a 100644 --- a/man/util_terra.Rd +++ b/man/util_terra.Rd @@ -3,43 +3,18 @@ \name{util_terra} \alias{util_terra} \alias{extract_using_coords_t} -\alias{compute_neighbors} \title{evoland utility functions to work with terra objects} \usage{ extract_using_coords_t(x, coords_t, na_omit = TRUE) - -compute_neighbors( - coords_t, - max_distance, - distance_breaks = NULL, - resolution = 100 -) } \arguments{ \item{x}{The object to extract from; use "simple" extraction for rasters, i.e. no resampling is done.} \item{coords_t}{A coords_t object containing coordinate points} - -\item{max_distance}{Maximum distance to search for neighbors (in same units as -coordinates and resolution)} - -\item{distance_breaks}{Optional numeric vector defining distance class boundaries. -If NULL, no distance classification is performed. -If provided, must have at least 2 elements defining interval breaks.} - -\item{resolution}{Grid cell size for rasterization (default: 100.0, in same units as coordinates)} } \value{ A long data.table with \code{id_coord}, a \code{layer}/\code{attribute} column, and a \code{value} column. NAs are omitted - -A data.table with columns: -\itemize{ -\item id_coord_origin: ID of the origin coordinate -\item id_coord_neighbor: ID of the neighboring coordinate -\item distance: Distance between origin and neighbor -\item distance_class: Factor indicating distance class (if distance_breaks provided) -} } \description{ Useful to coax terra raster and vector data into evoland tabular form. @@ -48,10 +23,4 @@ Useful to coax terra raster and vector data into evoland tabular form. \itemize{ \item \code{extract_using_coords_t()}: Extract values from a SpatRaster or SpatVector object using a (minimal) \code{coords_t} -\item \code{compute_neighbors()}: Compute neighboring coordinates within specified distances. 
In -order to be computationally feasible, the coordinates' IDs are rasterized before -their actual Euclidean distance is calculated. If coordinates are so close that they -get rasterized to the same cell, the first one is used and a warning is emitted. If -this happens, try again using a lower resolution. - }} From a0a6256833cf0a598b23f57e42332a8f070d6b36 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Fri, 28 Nov 2025 17:31:24 +0100 Subject: [PATCH 13/27] make transition results bool --- R/covariance_filter.R | 13 ++++++------- R/evoland_db_views.R | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/R/covariance_filter.R b/R/covariance_filter.R index 6860798..fb06530 100644 --- a/R/covariance_filter.R +++ b/R/covariance_filter.R @@ -56,7 +56,6 @@ covariance_filter <- function( # Validate binary outcome stopifnot( - "result_col must be binary (0/1)" = length(unique(data[[result_col]])) == 2, "corcut must be between 0 and 1" = corcut >= 0 && corcut <= 1 ) @@ -119,8 +118,8 @@ rank_poly_glm <- function(x, y, weights = NULL, ...) 
{ #' @keywords internal compute_balanced_weights <- function(trans_result, legacy = FALSE) { n_total <- length(trans_result) - n_trans <- sum(trans_result == 1) - n_non_trans <- sum(trans_result == 0) + n_trans <- sum(trans_result) + n_non_trans <- sum(!trans_result) # Compute inverse frequency weights weights <- numeric(n_total) @@ -129,16 +128,16 @@ compute_balanced_weights <- function(trans_result, legacy = FALSE) { # I found this weighting in evoland-plus-legacy, but the models wouldn't converge # https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.splitforcovselection.r # This is actually just setting the underrepresented class to the rounded imbalance ratio - weights[trans_result == 0] <- 1 - weights[trans_result == 1] <- round(n_non_trans / n_trans) + weights[!trans_result] <- 1 + weights[trans_result] <- round(n_non_trans / n_trans) return(weights) } # This is the heuristic in scikit-learn, n_samples / (n_classes * np.bincount(y)) # https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html #nolint # This weighting maintains the exact imbalance ratio - weights[trans_result == 1] <- n_total / (2 * n_trans) - weights[trans_result == 0] <- n_total / (2 * n_non_trans) + weights[trans_result] <- n_total / (2 * n_trans) + weights[!trans_result] <- n_total / (2 * n_non_trans) weights } diff --git a/R/evoland_db_views.R b/R/evoland_db_views.R index 3201935..42429a3 100644 --- a/R/evoland_db_views.R +++ b/R/evoland_db_views.R @@ -136,8 +136,8 @@ evoland_db$set("public", "trans_pred_data_v", function(id_trans) { curr.id_coord, curr.id_period, CASE - WHEN prev.id_lulc = {id_lulc_ant} AND curr.id_lulc = {id_lulc_post} THEN 1 - WHEN prev.id_lulc = {id_lulc_ant} AND curr.id_lulc != {id_lulc_post} THEN 0 + WHEN prev.id_lulc = {id_lulc_ant} AND curr.id_lulc = {id_lulc_post} THEN TRUE + WHEN prev.id_lulc = {id_lulc_ant} AND curr.id_lulc != {id_lulc_post} THEN FALSE ELSE NULL END AS result FROM lulc_data_t AS curr 
From 3cc787ee75856574da8c573aaef96ed1c68d6da5 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Sat, 29 Nov 2025 14:38:43 +0100 Subject: [PATCH 14/27] more informative / reflective printing method --- R/evoland_db.R | 37 ------------------- R/parquet_duckdb.R | 88 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 84 insertions(+), 41 deletions(-) diff --git a/R/evoland_db.R b/R/evoland_db.R index 99d498d..4279ad0 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -174,43 +174,6 @@ evoland_db <- R6::R6Class( as_pred_data_t(pred_data, pred_type), paste0("pred_data_t_", pred_type) ) - }, - - #' @description - #' Print method for evoland_db - #' @param ... Not used - #' @return self (invisibly) - print = function(...) { - cat(" (extends parquet_duckdb)\n") - cat(sprintf("Database path: %s\n", self$path)) - cat(sprintf("Default format: %s\n\n", self$default_format)) - - cat("Database methods (inherited):\n") - cat(" Commit: commit_overwrite(x, table_name, autoincrement_cols, map_cols),\n") - cat(" commit_append(x, table_name, autoincrement_cols, map_cols),\n") - cat(" commit_upsert(x, table_name, key_cols, autoincrement_cols, map_cols)\n") - cat(" Fetch: fetch(table_name, where, limit)\n") - cat(" Delete: delete_from(table_name, where)\n") - cat(" Other: list_tables(), execute(statement), get_query(statement),\n") - cat(" attach_table(table_name, columns), detach_table(table_name),\n") - cat(" row_count(table_name)\n\n") - - cat("Set / Add methods:\n") - cat(" set_report(...), set_coords(type, ...),\n") - cat(" set_periods(period_length_str, start_observed, end_observed, end_extrapolated),\n") - cat(" add_predictor(pred_spec, pred_data, pred_type)\n\n") - - cat("Active bindings (read-write):\n") - cat(" coords_t, periods_t, lulc_meta_t, lulc_data_t, pred_data_t_float,\n") - cat(" pred_data_t_int, pred_data_t_bool, pred_meta_t, trans_meta_t, trans_preds_t,\n") - cat(" intrv_meta_t, intrv_masks_t, trans_models_t, 
alloc_params_t\n\n") - - cat("Active bindings (read-only):\n") - cat(" extent, coords_minimal, lulc_meta_long_v, pred_sources_v\n\n") - - cat(sprintf("Tables: %d\n", length(self$list_tables()))) - - invisible(self) } ) ) diff --git a/R/parquet_duckdb.R b/R/parquet_duckdb.R index 11aa465..e84f13e 100644 --- a/R/parquet_duckdb.R +++ b/R/parquet_duckdb.R @@ -436,10 +436,90 @@ parquet_duckdb <- R6::R6Class( #' @param ... Not used #' @return self (invisibly) print = function(...) { - cat("\n") - cat(sprintf("Database path: %s\n", self$path)) - cat(sprintf("Default format: %s\n", self$default_format)) - cat(sprintf("Tables: %d\n", length(self$list_tables()))) + # gather data to be printed + classes <- class(self) + classes <- classes[classes != "R6"] + + all_names <- names(self) + methods <- character(0) + active_bindings <- character(0) + + names(self$.__enclos_env__$private) + if (!is.null(self$.__enclos_env__$super)) { + # exclude private super names + super_names <- setdiff( + ls(self$.__enclos_env__$super), + ls(self$.__enclos_env__$super$.__enclos_env__$private) + ) + } else { + super_names <- character(0) + } + nonsuper_names <- setdiff(all_names, super_names) + + for (name in nonsuper_names) { + # Check if it's an active binding first; subset2 would evaluate it + if (bindingIsActive(name, self$.__enclos_env__$self)) { + active_bindings <- c(active_bindings, name) + } else { + obj <- .subset2(self, name) + if (is.function(obj) && !name %in% c("initialize", "print", "clone")) { + methods <- c(methods, name) + } + } + } + + methods <- sort(methods) + active_bindings <- + active_bindings[!grepl("_t($|_)", active_bindings)] |> + sort() + + # actually start printing + if (length(classes) == 1) { + cat("<", classes[1], "> Object\n") + } else { + cat(classes[1], "Object. 
Inherits from", toString(classes[-1]), "\n") + } + + compression <- if (grepl("compression\\s+(\\w+)", self$writeopts)) { + sub(".*compression\\s+(\\w+).*", "\\1", self$writeopts) + } else { + "none" + } + + # Database info on one line + cat( + glue::glue( + "Database: {self$path} | Format: {self$default_format} | Compression: {compression}" + ), + "\n\n" + ) + + tables <- self$list_tables() + if (length(tables) > 0) { + cat("Tables present:\n ") + cat(strwrap(toString(tables), width = 80), sep = "\n ") + cat("\n") + } else { + cat("Tables present: (none)\n\n") + } + + if (length(super_names) > 0) { + cat("DB Methods:\n ") + cat(strwrap(toString(super_names), width = 80), sep = "\n ") + cat("\n") + } + + if (length(methods) > 0) { + cat("Public methods:\n ") + cat(strwrap(toString(methods), width = 80), sep = "\n ") + cat("\n") + } + + if (length(active_bindings) > 0) { + cat("Active bindings:\n ") + cat(strwrap(toString(active_bindings), width = 80), sep = "\n ") + } + invisible(self) } ), From 51efad09b550b960f8bd21f66ab1c8141c1d701f Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Sat, 29 Nov 2025 14:39:51 +0100 Subject: [PATCH 15/27] set_neighbors: more idiomatic --- R/evoland_db_neighbors.R | 5 +++-- man/evoland_db.Rd | 32 ++++++-------------------------- 2 files changed, 9 insertions(+), 28 deletions(-) diff --git a/R/evoland_db_neighbors.R b/R/evoland_db_neighbors.R index e4d02cc..9b5896a 100644 --- a/R/evoland_db_neighbors.R +++ b/R/evoland_db_neighbors.R @@ -22,7 +22,7 @@ NULL evoland_db$set( "public", - "create_neighbors_t", + "set_neighbors", function( max_distance = 1000, distance_breaks = c(0, 100, 500, 1000), @@ -55,7 +55,7 @@ evoland_db$set( evoland_db$set("public", "generate_neighbor_predictors", function() { if (self$row_count("neighbors_t") == 0) { - stop("No neighbor data found. Run $create_neighbors_t() first.") + stop("No neighbor data found. 
Run $set_neighbors() first.") } if (self$row_count("lulc_meta_t") == 0) { @@ -95,6 +95,7 @@ evoland_db$set("public", "generate_neighbor_predictors", function() { "(select 0 as max_pred)" } + # TODO this could possibly be done using the upsert, since it now takes care of the IDs self$execute(glue::glue( r"{ create temp table pred_meta_neighbors_t as diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index fbdc608..676de84 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -25,7 +25,7 @@ Additional methods and active bindings are added to this class in separate files \section{Methods}{ \subsection{Public methods}{ \itemize{ -\item \href{#method-evoland_db-create_neighbors_t}{\code{evoland_db$create_neighbors_t()}} +\item \href{#method-evoland_db-set_neighbors}{\code{evoland_db$set_neighbors()}} \item \href{#method-evoland_db-generate_neighbor_predictors}{\code{evoland_db$generate_neighbor_predictors()}} \item \href{#method-evoland_db-trans_pred_data_v}{\code{evoland_db$trans_pred_data_v()}} \item \href{#method-evoland_db-new}{\code{evoland_db$new()}} @@ -34,7 +34,6 @@ Additional methods and active bindings are added to this class in separate files \item \href{#method-evoland_db-set_coords}{\code{evoland_db$set_coords()}} \item \href{#method-evoland_db-set_periods}{\code{evoland_db$set_periods()}} \item \href{#method-evoland_db-add_predictor}{\code{evoland_db$add_predictor()}} -\item \href{#method-evoland_db-print}{\code{evoland_db$print()}} \item \href{#method-evoland_db-clone}{\code{evoland_db$clone()}} } } @@ -50,17 +49,18 @@ Additional methods and active bindings are added to this class in separate files
  • evoland::parquet_duckdb$execute()
  • evoland::parquet_duckdb$get_query()
  • evoland::parquet_duckdb$list_tables()
  • +
  • evoland::parquet_duckdb$print()
  • evoland::parquet_duckdb$row_count()
  • evoland::parquet_duckdb$with_tables()
  • }} \if{html}{\out{
    }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-create_neighbors_t}{}}} -\subsection{Method \code{create_neighbors_t()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-set_neighbors}{}}} +\subsection{Method \code{set_neighbors()}}{ \subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{evoland_db$create_neighbors_t( +\if{html}{\out{
    }}\preformatted{evoland_db$set_neighbors( max_distance = 1000, distance_breaks = c(0, 100, 500, 1000), resolution = 100, @@ -225,26 +225,6 @@ Add a predictor to the database } } \if{html}{\out{
    }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-print}{}}} -\subsection{Method \code{print()}}{ -Print method for evoland_db -\subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{evoland_db$print(...)}\if{html}{\out{
    }} -} - -\subsection{Arguments}{ -\if{html}{\out{
    }} -\describe{ -\item{\code{...}}{Not used} -} -\if{html}{\out{
    }} -} -\subsection{Returns}{ -self (invisibly) -} -} -\if{html}{\out{
    }} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-clone}{}}} \subsection{Method \code{clone()}}{ From f3223dfcc4488c866ab227106ac58c36333c971a Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Sun, 30 Nov 2025 13:57:39 +0100 Subject: [PATCH 16/27] changed print method --- inst/tinytest/test_parquet_duckdb.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inst/tinytest/test_parquet_duckdb.R b/inst/tinytest/test_parquet_duckdb.R index fe9b188..ad58bc8 100644 --- a/inst/tinytest/test_parquet_duckdb.R +++ b/inst/tinytest/test_parquet_duckdb.R @@ -343,7 +343,7 @@ db$commit_upsert(test_no_keys, "no_keys_test", key_cols = character(0)) expect_equal(db$row_count("no_keys_test"), 4L) # Should append # Test 35: Print method -output <- capture.output(print(db)) -expect_true(any(grepl("parquet_duckdb", output))) -expect_true(any(grepl("Database path", output))) -expect_true(any(grepl("Default format", output))) +expect_stdout( + print(db), + "Public methods:|Active bindings:|Format|Compression" +) From ea5ed53668aa531ad6d587dc53035f3862b5ff60 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Sun, 30 Nov 2025 13:58:04 +0100 Subject: [PATCH 17/27] cast for safety --- R/evoland_db.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/evoland_db.R b/R/evoland_db.R index 4279ad0..310e317 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -167,7 +167,7 @@ evoland_db <- R6::R6Class( where = glue::glue("name = '{names(pred_spec)}'") ) - data.table::set(pred_data, j = "id_pred", value = existing_meta[["id_pred"]]) + data.table::set(pred_data, j = "id_pred", value = as.integer(existing_meta[["id_pred"]])) data.table::setcolorder(pred_data, c("id_pred", "id_coord", "id_period", "value")) self$commit_upsert( From 0feb2d7a7ef11bbb33ce49ab2747ba16e875182a Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Sun, 30 Nov 
2025 13:58:38 +0100 Subject: [PATCH 18/27] enable committing from existing in-memory tables --- R/parquet_duckdb.R | 64 +++++++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/R/parquet_duckdb.R b/R/parquet_duckdb.R index e84f13e..6307d87 100644 --- a/R/parquet_duckdb.R +++ b/R/parquet_duckdb.R @@ -246,7 +246,7 @@ parquet_duckdb <- R6::R6Class( #' @description #' Commit data in overwrite mode - #' @param x Data frame to commit + #' @param x Data frame to commit. If character, in-duckdb-memory table. #' @param table_name Character string table name #' @param autoincrement_cols Character vector of column names to auto-increment #' @param map_cols Character vector of columns to convert to MAP format @@ -260,10 +260,11 @@ parquet_duckdb <- R6::R6Class( file_path <- file.path(self$path, paste0(table_name, ".", self$default_format)) private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) + on.exit(private$cleanup_new_data_v(), add = TRUE) + new_data_names <- private$new_data_names() # Warn if overriding existing IDs - if (length(intersect(autoincrement_cols, names(x))) > 0) { + if (length(intersect(autoincrement_cols, new_data_names)) > 0) { warning(glue::glue( "Overriding existing IDs ({toString(autoincrement_cols)}) with row numbers;\n", "Assign these IDs manually and do not pass any autoincrement_cols to avoid this warning" @@ -271,7 +272,7 @@ parquet_duckdb <- R6::R6Class( } # Build SELECT expression - ordinary_cols <- setdiff(names(x), autoincrement_cols) + ordinary_cols <- setdiff(new_data_names, autoincrement_cols) select_expr <- glue::glue_collapse( c(glue::glue("row_number() over () as {autoincrement_cols}"), ordinary_cols), sep = ",\n " @@ -291,7 +292,7 @@ parquet_duckdb <- R6::R6Class( #' @description #' Commit data in append mode - #' @param x Data frame to commit + #' @param x Data frame to commit. If character, in-duckdb-memory table. 
#' @param table_name Character string table name #' @param autoincrement_cols Character vector of column names to auto-increment #' @param map_cols Character vector of columns to convert to MAP format @@ -313,9 +314,10 @@ parquet_duckdb <- R6::R6Class( private$set_autoincrement_vars(table_name, autoincrement_cols) private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) + on.exit(private$cleanup_new_data_v(), add = TRUE) + new_data_names <- private$get_new_data_names() - ordinary_cols <- setdiff(names(x), autoincrement_cols) + ordinary_cols <- setdiff(new_data_names, autoincrement_cols) select_new <- glue::glue_collapse( c( glue::glue( @@ -344,16 +346,17 @@ parquet_duckdb <- R6::R6Class( #' @description #' Commit data in upsert mode (update existing, insert new) - #' @param x Data frame to commit + #' @param x Data frame to commit. If character, in-duckdb-memory table. #' @param table_name Character string table name - #' @param key_cols Character vector of columns that define uniqueness + #' @param key_cols Character vector of columns that define uniqueness. 
If missing, + #' use all columns starting with `id_` #' @param autoincrement_cols Character vector of column names to auto-increment #' @param map_cols Character vector of columns to convert to MAP format #' @return Invisible NULL (called for side effects) commit_upsert = function( x, table_name, - key_cols = grep("^id_", names(x), value = TRUE), + key_cols, autoincrement_cols = character(0), map_cols = character(0) ) { @@ -361,8 +364,6 @@ parquet_duckdb <- R6::R6Class( if (!file_info$exists) { return(self$commit_overwrite(x, table_name, autoincrement_cols, map_cols)) - } else if (length(key_cols) == 0) { - return(self$commit_append(x, table_name, autoincrement_cols, map_cols)) } self$attach_table(table_name) @@ -371,10 +372,17 @@ parquet_duckdb <- R6::R6Class( private$set_autoincrement_vars(table_name, autoincrement_cols) private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) + on.exit(private$cleanup_new_data_v(), add = TRUE) + new_data_names <- private$new_data_names() + if (missing(key_cols)) { + key_cols <- grep("^id_", new_data_names, value = TRUE) + } + if (length(key_cols) == 0) { + return(self$commit_append(x, table_name, autoincrement_cols, map_cols)) + } # Update existing data - ordinary_cols <- setdiff(names(x), union(key_cols, autoincrement_cols)) + ordinary_cols <- setdiff(new_data_names, union(key_cols, autoincrement_cols)) update_select_expr <- glue::glue_collapse( glue::glue("{ordinary_cols} = new_data_v.{ordinary_cols}"), sep = ",\n " @@ -400,7 +408,7 @@ parquet_duckdb <- R6::R6Class( glue::glue( "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" ), - glue::glue("new_data_v.{setdiff(names(x), autoincrement_cols)}") + glue::glue("new_data_v.{setdiff(new_data_names, autoincrement_cols)}") ), sep = ",\n " ) @@ -567,6 +575,12 @@ parquet_duckdb <- R6::R6Class( # @param map_cols Character vector of columns to convert to MAP format # @return NULL (called for side effects) 
register_new_data_v = function(x, map_cols = character(0)) { + if (is.character(x)) { + # TODO add tests + self$execute(glue::glue("create view new_data_v as from {x}")) + return(invisible(NULL)) + } + if (length(map_cols) == 0) { # No MAP conversion needed - register directly duckdb::duckdb_register(self$connection, "new_data_v", x) @@ -598,20 +612,22 @@ parquet_duckdb <- R6::R6Class( # # @param map_cols Character vector indicating if MAP conversion was used # @return NULL (called for side effects) - cleanup_new_data_v = function(map_cols = character(0)) { - if (length(map_cols) == 0) { - duckdb::duckdb_unregister(self$connection, "new_data_v") - } else { - self$execute( - "drop table if exists new_data_v; + cleanup_new_data_v = function() { + duckdb::duckdb_unregister(self$connection, "new_data_v") + duckdb::duckdb_unregister(self$connection, "new_data_raw") + self$execute( + "drop table if exists new_data_v; drop view if exists new_data_v" - ) - duckdb::duckdb_unregister(self$connection, "new_data_raw") - } + ) invisible(NULL) }, + # Get the names of the current new_data_v + new_data_names = function() { + self$get_query("select column_name from (describe new_data_v)")[[1]] + }, + # Set DuckDB variables max_{colname} for autoincrement columns # # @param table_name Character string table name From a9fea629b798d7a77e8017472184969da15b6f2a Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Sun, 30 Nov 2025 18:40:12 +0100 Subject: [PATCH 19/27] corrected implementation of neighbour predictors --- R/evoland_db_neighbors.R | 67 ++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 40 deletions(-) diff --git a/R/evoland_db_neighbors.R b/R/evoland_db_neighbors.R index 9b5896a..b07c749 100644 --- a/R/evoland_db_neighbors.R +++ b/R/evoland_db_neighbors.R @@ -66,46 +66,33 @@ evoland_db$set("public", "generate_neighbor_predictors", function() { stop("No LULC data found. 
Add lulc_data_t before generating neighbor predictors.") } - neighbors_sample <- self$fetch("neighbors_t", limit = 1) + neighbors_sample <- self$fetch("neighbors_t", limit = 0L) if (!"distance_class" %in% names(neighbors_sample)) { stop( "neighbors_t does not have distance_class column. Run $create_neighbors_t() with distance_breaks parameter." ) } - self$attach_table("neighbors_t") + self$attach_table( + "neighbors_t", + columns = c("id_coord_origin", "id_coord_neighbor", "distance_class") + ) self$attach_table("lulc_data_t") self$attach_table("lulc_meta_t") - has_pred_meta <- self$row_count("pred_meta_t") > 0 - if (has_pred_meta) { - self$attach_table("pred_meta_t") - } - on.exit({ self$detach_table("neighbors_t") self$detach_table("lulc_data_t") self$detach_table("lulc_meta_t") - if (has_pred_meta) self$detach_table("pred_meta_t") }) - max_id_query <- if (has_pred_meta) { - "(select coalesce(max(id_pred), 0) as max_pred from pred_meta_t)" - } else { - "(select 0 as max_pred)" - } - - # TODO this could possibly be done using the upsert, since it now takes care of the IDs - self$execute(glue::glue( + n_predictors <- self$execute( r"{ - create temp table pred_meta_neighbors_t as + create or replace temp table pred_meta_neighbors_t as with - all_distance_classes as (select distinct distance_class from neighbors_t), - max_id as }", - max_id_query, - r"{ + all_distance_classes as (select distinct distance_class from neighbors_t) select - row_number() over () + (select max_pred from max_id) as id_pred, + NULL as id_pred, concat('id_lulc_', l.id_lulc, '_dist_', c.distance_class) as name, concat('Count of ', l.pretty_name, ' within distance class ', c.distance_class) as pretty_name, 'Number of neighbors by land use class and distance interval' as description, @@ -120,19 +107,25 @@ evoland_db$set("public", "generate_neighbor_predictors", function() { cross join all_distance_classes c }" - )) - - pred_meta_neighbors <- self$get_query( - "select id_pred, name, pretty_name, 
description, orig_format, unit + ) + self$execute( + "create or replace view pred_meta_upsert_v as + select name, pretty_name, description, orig_format, sources, unit, factor_levels from pred_meta_neighbors_t" ) - pred_meta_neighbors$sources <- lapply(1:nrow(pred_meta_neighbors), function(i) list()) - pred_meta_neighbors$factor_levels <- lapply(1:nrow(pred_meta_neighbors), function(i) list()) - - self$pred_meta_t <- as_pred_meta_t(pred_meta_neighbors) - + self$attach_table("pred_meta_t") + on.exit(self$detach_table("pred_meta_t"), add = TRUE) self$execute( + r"{ + update pred_meta_neighbors_t + set id_pred = pred_meta_t.id_pred + from pred_meta_t + where pred_meta_neighbors_t.name = pred_meta_t.name + }" + ) + + n_data_points <- self$execute( r"{ create temp table pred_neighbors_t as select @@ -157,17 +150,11 @@ evoland_db$set("public", "generate_neighbor_predictors", function() { }" ) - pred_neighbors <- self$get_query("select * from pred_neighbors_t") - pred_neighbors$id_pred <- as.integer(pred_neighbors$id_pred) - pred_neighbors$id_coord <- as.integer(pred_neighbors$id_coord) - pred_neighbors$id_period <- as.integer(pred_neighbors$id_period) - pred_neighbors$value <- as.integer(pred_neighbors$value) - - self$pred_data_t_int <- as_pred_data_t(pred_neighbors, type = "int") + self$commit_upsert("pred_neighbors_t", "pred_neighbors_t_int") message(glue::glue( - "Generated {nrow(pred_meta_neighbors)} neighbor predictor variables with ", - "{nrow(pred_neighbors)} data points" + "Generated {n_predictors} neighbor predictor variables with ", + "{n_data_points} data points" )) invisible(self) From 6bd2277539dd6533ade0862faaca6c515846849c Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Mon, 1 Dec 2025 15:03:45 +0100 Subject: [PATCH 20/27] add progress bar to neighbourhood calc --- R/RcppExports.R | 4 ++-- inst/tinytest/test_util_terra.R | 23 ++++++++++++++++------- src/RcppExports.cpp | 9 +++++---- src/distances.cpp | 33 
+++++++++++++++++++++++++++------ 4 files changed, 50 insertions(+), 19 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 3a54496..395824b 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,7 +1,7 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -distance_neighbors_cpp <- function(coords_t, max_distance, resolution = 100.0) { - .Call(`_evoland_distance_neighbors_cpp`, coords_t, max_distance, resolution) +distance_neighbors_cpp <- function(coords_t, max_distance, resolution = 100.0, quiet = FALSE) { + .Call(`_evoland_distance_neighbors_cpp`, coords_t, max_distance, resolution, quiet) } diff --git a/inst/tinytest/test_util_terra.R b/inst/tinytest/test_util_terra.R index efb0dac..82f998a 100644 --- a/inst/tinytest/test_util_terra.R +++ b/inst/tinytest/test_util_terra.R @@ -152,7 +152,10 @@ test_coords <- data.table::data.table( test_coords <- as_coords_t(test_coords) # Test basic neighbor computation with max_distance = 150 -neighbors <- create_neighbors_t(test_coords, max_distance = 150) +expect_stdout( + neighbors <- create_neighbors_t(test_coords, max_distance = 150), + "Progress" +) # Check structure expect_true(data.table::is.data.table(neighbors)) @@ -204,10 +207,13 @@ expect_equal( ) # Test with distance_breaks -neighbors_classified <- create_neighbors_t( - test_coords, - max_distance = 150, - distance_breaks = c(0, 100, 150) +expect_stdout( + neighbors_classified <- create_neighbors_t( + test_coords, + max_distance = 150, + distance_breaks = c(0, 100, 150) + ), + "Progress" ) # Check that distance_class is populated @@ -234,7 +240,10 @@ expect_equal( ) # Test with smaller max_distance -neighbors_small <- create_neighbors_t(test_coords, max_distance = 110) +expect_stdout( + neighbors_small <- create_neighbors_t(test_coords, max_distance = 110), + "Progress" +) # With max_distance = 110, point 1 should only have neighbors 2 and 4 (distance 100) # but not 5 
(distance ~141.4) @@ -266,7 +275,7 @@ expect_error( ) # Test with real coords_t from earlier in the test file -real_neighbors <- create_neighbors_t(coords_t, max_distance = 300) +expect_stdout(real_neighbors <- create_neighbors_t(coords_t, max_distance = 300), "Progress") # Each point in a regular 100m grid should have neighbors # Interior points should have 8 neighbors within 300m (8-connectivity) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index c2f3be0..13f1a2b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -11,21 +11,22 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // distance_neighbors_cpp -List distance_neighbors_cpp(DataFrame coords_t, double max_distance, double resolution); -RcppExport SEXP _evoland_distance_neighbors_cpp(SEXP coords_tSEXP, SEXP max_distanceSEXP, SEXP resolutionSEXP) { +List distance_neighbors_cpp(DataFrame coords_t, double max_distance, double resolution, bool quiet); +RcppExport SEXP _evoland_distance_neighbors_cpp(SEXP coords_tSEXP, SEXP max_distanceSEXP, SEXP resolutionSEXP, SEXP quietSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< DataFrame >::type coords_t(coords_tSEXP); Rcpp::traits::input_parameter< double >::type max_distance(max_distanceSEXP); Rcpp::traits::input_parameter< double >::type resolution(resolutionSEXP); - rcpp_result_gen = Rcpp::wrap(distance_neighbors_cpp(coords_t, max_distance, resolution)); + Rcpp::traits::input_parameter< bool >::type quiet(quietSEXP); + rcpp_result_gen = Rcpp::wrap(distance_neighbors_cpp(coords_t, max_distance, resolution, quiet)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_evoland_distance_neighbors_cpp", (DL_FUNC) &_evoland_distance_neighbors_cpp, 3}, + {"_evoland_distance_neighbors_cpp", (DL_FUNC) &_evoland_distance_neighbors_cpp, 4}, {NULL, NULL, 0} }; diff --git a/src/distances.cpp b/src/distances.cpp index cd41ffb..2e23a21 100644 --- 
a/src/distances.cpp +++ b/src/distances.cpp @@ -24,17 +24,16 @@ using namespace Rcpp; * - id_coord_neighbor: ID of the neighboring coordinate * - distance_approx: Approximate distance between origin and neighbor * - * @note The function uses an approximate distance calculation based on grid cells. - * If multiple points fall into the same cell, a warning is issued and only the first - * point is retained for that cell. + * @note The function uses an approximate distance calculation based on grid + * cells. If multiple points fall into the same cell, a warning is issued and + * only the first point is retained for that cell. * * @warning Distance calculations are approximate and based on Euclidean distance in * coordinate space. For geographic coordinates, this may not reflect true geodesic distance. */ // [[Rcpp::export]] -List distance_neighbors_cpp(DataFrame coords_t, - double max_distance, - double resolution = 100.0) { +List distance_neighbors_cpp(DataFrame coords_t, double max_distance, + double resolution = 100.0, bool quiet = false) { // Extract columns IntegerVector id_coord = coords_t["id_coord"]; @@ -112,7 +111,25 @@ List distance_neighbors_cpp(DataFrame coords_t, // 5. 
Find neighbors within max_distance + // Progress reporting setup + int progress_interval = std::max(1000, n_points / 20); // Report every 5% + + if (!quiet) { + Rcpp::Rcout << "\rProgress: 0% (0/" << n_points << " points)" << std::flush; + } + for (int pt_idx = 0; pt_idx < n_points; pt_idx++) { + // Check for user interrupt periodically + if (pt_idx % 1000 == 0) { + Rcpp::checkUserInterrupt(); + } + + // Report progress + if (!quiet && pt_idx > 0 && pt_idx % progress_interval == 0) { + int pct = (int)(100.0 * pt_idx / n_points); + Rcpp::Rcout << "\rProgress: " << pct << "% (" << pt_idx << "/" << n_points + << " points)" << std::flush; + } int origin_id = id_coord[pt_idx]; int origin_row = row_indices[pt_idx]; int origin_col = col_indices[pt_idx]; @@ -160,6 +177,10 @@ List distance_neighbors_cpp(DataFrame coords_t, Named("id_coord_neighbor") = neighbor_ids, Named("distance_approx") = distances_approx ); + if (!quiet) { + Rcpp::Rcout << "\rProgress: 100% (" << n_points << "/" << n_points + << " points)" << std::endl; + } res.attr("class") = CharacterVector::create("data.table", "data.frame"); From 7d6c453e473fb1b22d16112e20be50509fde49a8 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Mon, 1 Dec 2025 15:10:58 +0100 Subject: [PATCH 21/27] commit: again a single public method; needed to manage table attachment --- R/evoland_db.R | 14 +- R/evoland_db_neighbors.R | 7 +- R/evoland_db_tables.R | 5 +- R/parquet_duckdb.R | 370 ++++++++++++++-------------- inst/tinytest/test_parquet_duckdb.R | 103 +++++--- man/parquet_duckdb.Rd | 105 +++----- vignettes/evoland.qmd | 4 +- 7 files changed, 310 insertions(+), 298 deletions(-) diff --git a/R/evoland_db.R b/R/evoland_db.R index 310e317..9aebd16 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -98,13 +98,14 @@ evoland_db <- R6::R6Class( params[["last_opened"]] <- format(Sys.time(), "%Y-%m-%dT%H:%M:%SZ", tz = "UTC") params[["last_opened_username"]] <- Sys.getenv("USER", unset = "unknown") - 
self$commit_upsert( + self$commit( data.table::as.data.table(list( key = names(params), # cannot name a column "key" in data.table() value = unlist(params) )), table_name = "reporting_t", - key_cols = "key" + key_cols = "key", + method = "upsert" ) }, @@ -123,7 +124,7 @@ evoland_db <- R6::R6Class( function(x) stop("Unsupported coordinate type specified.") ) - self$commit_overwrite(as_coords_t(create_fun(...)), "coords_t") + self$commit(as_coords_t(create_fun(...)), "coords_t", method = "overwrite") }, #' @description @@ -144,7 +145,7 @@ evoland_db <- R6::R6Class( return(invisible(NULL)) } - self$commit_append( + self$commit( do.call(create_periods_t, as.list(environment())), "periods_t" ) @@ -170,9 +171,10 @@ evoland_db <- R6::R6Class( data.table::set(pred_data, j = "id_pred", value = as.integer(existing_meta[["id_pred"]])) data.table::setcolorder(pred_data, c("id_pred", "id_coord", "id_period", "value")) - self$commit_upsert( + self$commit( as_pred_data_t(pred_data, pred_type), - paste0("pred_data_t_", pred_type) + paste0("pred_data_t_", pred_type), + method = "upsert" ) } ) diff --git a/R/evoland_db_neighbors.R b/R/evoland_db_neighbors.R index b07c749..f8f6d20 100644 --- a/R/evoland_db_neighbors.R +++ b/R/evoland_db_neighbors.R @@ -43,9 +43,10 @@ evoland_db$set( resolution = resolution ) - self$commit_overwrite( + self$commit( as_neighbors_t(neighbors), - table_name = "neighbors_t" + table_name = "neighbors_t", + method = "overwrite" ) message(glue::glue("Computed {nrow(neighbors)} neighbor relationships")) @@ -150,7 +151,7 @@ evoland_db$set("public", "generate_neighbor_predictors", function() { }" ) - self$commit_upsert("pred_neighbors_t", "pred_neighbors_t_int") + self$commit("pred_neighbors_t", "pred_neighbors_t_int", method = "upsert") message(glue::glue( "Generated {n_predictors} neighbor predictor variables with ", diff --git a/R/evoland_db_tables.R b/R/evoland_db_tables.R index d477ab3..1ff2043 100644 --- a/R/evoland_db_tables.R +++ b/R/evoland_db_tables.R 
@@ -50,12 +50,13 @@ create_table_binding <- function( stopifnot(inherits(x, table_name)) - self$commit_upsert( + self$commit( x, table_name = table_name, key_cols = key_cols, autoincrement_cols = autoincrement_cols, - map_cols = map_cols + map_cols = map_cols, + method = "upsert" ) } } diff --git a/R/parquet_duckdb.R b/R/parquet_duckdb.R index 6307d87..e7c5538 100644 --- a/R/parquet_duckdb.R +++ b/R/parquet_duckdb.R @@ -104,7 +104,6 @@ parquet_duckdb <- R6::R6Class( sql <- glue::glue("{sql} where {where}") } - # Execute SQL self$execute(sql) invisible(NULL) }, @@ -242,201 +241,72 @@ parquet_duckdb <- R6::R6Class( return(count_before - count_after) }, - ### Commit Methods ---- - #' @description - #' Commit data in overwrite mode - #' @param x Data frame to commit. If character, in-duckdb-memory table. - #' @param table_name Character string table name - #' @param autoincrement_cols Character vector of column names to auto-increment - #' @param map_cols Character vector of columns to convert to MAP format - #' @return Invisible NULL (called for side effects) - commit_overwrite = function( - x, - table_name, - autoincrement_cols = character(0), - map_cols = character(0) - ) { - file_path <- file.path(self$path, paste0(table_name, ".", self$default_format)) - - private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(), add = TRUE) - new_data_names <- private$new_data_names() - - # Warn if overriding existing IDs - if (length(intersect(autoincrement_cols, new_data_names)) > 0) { - warning(glue::glue( - "Overriding existing IDs ({toString(autoincrement_cols)}) with row numbers;\n", - "Assign these IDs manually and do not pass any autoincrement_cols to avoid this warning" - )) - } - - # Build SELECT expression - ordinary_cols <- setdiff(new_data_names, autoincrement_cols) - select_expr <- glue::glue_collapse( - c(glue::glue("row_number() over () as {autoincrement_cols}"), ordinary_cols), - sep = ",\n " - ) - - self$execute(glue::glue( - r"{ - copy ( 
- select {select_expr} - from new_data_v - ) to '{file_path}' ({self$writeopts}) - }" - )) - - invisible(NULL) - }, - - #' @description - #' Commit data in append mode + #' Commit data using overwrite, append, or upsert modes. Handles autoincrement, key + #' identity columns, and list-to-MAP conversion. #' @param x Data frame to commit. If character, in-duckdb-memory table. #' @param table_name Character string table name + #' @param key_cols Character vector of columns that define uniqueness. If missing, + #' use all columns starting with `id_` #' @param autoincrement_cols Character vector of column names to auto-increment #' @param map_cols Character vector of columns to convert to MAP format + #' @param method Character, one of "overwrite", "append", "upsert" (upsert being an + #' update for existing rows, and insert for new rows) #' @return Invisible NULL (called for side effects) - commit_append = function( + commit = function( x, table_name, + key_cols, autoincrement_cols = character(0), - map_cols = character(0) + map_cols = character(0), + method = c("overwrite", "append", "upsert") ) { - file_info <- private$get_file_path(table_name) - - if (!file_info$exists) { - return(self$commit_overwrite(x, table_name, autoincrement_cols, map_cols)) - } - - self$attach_table(table_name) - on.exit(self$detach_table(table_name), add = TRUE) - private$set_autoincrement_vars(table_name, autoincrement_cols) + method <- match.arg(method) private$register_new_data_v(x, map_cols) on.exit(private$cleanup_new_data_v(), add = TRUE) - new_data_names <- private$get_new_data_names() - - ordinary_cols <- setdiff(new_data_names, autoincrement_cols) - select_new <- glue::glue_collapse( - c( - glue::glue( - "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" - ), - ordinary_cols - ), - sep = ",\n " - ) - - # Concatenation using UNION ALL; "by name" handles missing columns - self$execute(glue::glue( - r"{ - copy ( - select * from {table_name} - union 
all by name - select {select_new} - from new_data_v - ) - to '{file_info$path}' ({self$writeopts}) - }" - )) - - invisible(NULL) - }, + all_cols <- self$get_query( + "select column_name from (describe new_data_v)" + )[[1]] - #' @description - #' Commit data in upsert mode (update existing, insert new) - #' @param x Data frame to commit. If character, in-duckdb-memory table. - #' @param table_name Character string table name - #' @param key_cols Character vector of columns that define uniqueness. If missing, - #' use all columns starting with `id_` - #' @param autoincrement_cols Character vector of column names to auto-increment - #' @param map_cols Character vector of columns to convert to MAP format - #' @return Invisible NULL (called for side effects) - commit_upsert = function( - x, - table_name, - key_cols, - autoincrement_cols = character(0), - map_cols = character(0) - ) { file_info <- private$get_file_path(table_name) - if (!file_info$exists) { - return(self$commit_overwrite(x, table_name, autoincrement_cols, map_cols)) + if (method == "overwrite" || !file_info$exists) { + # in case overwrite explicitly required, or no previously existing data to + # append or upsert to; rest of logic can be skipped + return(private$commit_overwrite( + table_name = table_name, + all_cols = all_cols, + autoincrement_cols = autoincrement_cols, + file_info = file_info + )) } self$attach_table(table_name) on.exit(self$detach_table(table_name), add = TRUE) - private$set_autoincrement_vars(table_name, autoincrement_cols) - private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(), add = TRUE) - new_data_names <- private$new_data_names() if (missing(key_cols)) { - key_cols <- grep("^id_", new_data_names, value = TRUE) - } - if (length(key_cols) == 0) { - return(self$commit_append(x, table_name, autoincrement_cols, map_cols)) + key_cols <- grep("^id_", all_cols, value = TRUE) } - # Update existing data - ordinary_cols <- setdiff(new_data_names, union(key_cols, 
autoincrement_cols)) - update_select_expr <- glue::glue_collapse( - glue::glue("{ordinary_cols} = new_data_v.{ordinary_cols}"), - sep = ",\n " - ) - update_join_condition <- glue::glue_collapse( - glue::glue("{table_name}.{key_cols} = new_data_v.{key_cols}"), - sep = "\nand " - ) - - self$execute(glue::glue( - r"{ - update {table_name} set - {update_select_expr} - from new_data_v - where - {update_join_condition}; - }" - )) - - # Insert new data - insert_select_expr <- glue::glue_collapse( - c( - glue::glue( - "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" - ), - glue::glue("new_data_v.{setdiff(new_data_names, autoincrement_cols)}") - ), - sep = ",\n " - ) - null_condition <- glue::glue_collapse( - glue::glue("{table_name}.{key_cols} is null"), - sep = "\nand " - ) - - self$execute(glue::glue( - r"{ - insert into {table_name} - select - {insert_select_expr} - from - new_data_v - left join - {table_name} - on - {update_join_condition} - where - {null_condition} - ; - }" - )) - - self$execute(glue::glue("copy {table_name} to '{file_info$path}' ({self$writeopts})")) - - invisible(NULL) + if (method == "append" || length(key_cols) == 0L) { + # if there are no key columns to join on, upsert becomes append + private$commit_append( + table_name = table_name, + all_cols = all_cols, + autoincrement_cols = autoincrement_cols, + file_info = file_info + ) + } else { + private$commit_upsert( + table_name = table_name, + all_cols = all_cols, + key_cols = key_cols, + autoincrement_cols = autoincrement_cols, + file_info = file_info + ) + } }, #' @description @@ -542,6 +412,149 @@ parquet_duckdb <- R6::R6Class( } }, + ### Commit Methods ---- + #' param x Data frame to commit. If character, in-duckdb-memory table. 
+    #' param table_name Character string table name
+    #' param all_cols Character vector of column names present in new_data_v
+    #' param autoincrement_cols Character vector of column names to auto-increment
+    #' param file_info File path info list from private$get_file_path(table_name)
+    commit_overwrite = function(
+      table_name,
+      all_cols,
+      autoincrement_cols = character(0),
+      file_info
+    ) {
+      # Warn if overriding existing IDs
+      if (length(intersect(autoincrement_cols, all_cols)) > 0) {
+        warning(glue::glue(
+          "Overriding existing IDs ({toString(autoincrement_cols)}) with row numbers;\n",
+          "Assign these IDs manually and do not pass any autoincrement_cols to avoid this warning"
+        ))
+      }
+
+      # Build SELECT expression
+      ordinary_cols <- setdiff(all_cols, autoincrement_cols)
+      select_expr <- glue::glue_collapse(
+        c(glue::glue("row_number() over () as {autoincrement_cols}"), ordinary_cols),
+        sep = ",\n  "
+      )
+
+      self$execute(glue::glue(
+        r"{
+        copy (
+          select {select_expr}
+          from new_data_v
+        ) to '{file_info$path}' ({self$writeopts})
+        }"
+      ))
+    },
+
+    #' Append rows from new_data_v to an existing table file
+    #' param table_name Character string table name
+    #' param all_cols Character vector of column names present in new_data_v
+    #' param autoincrement_cols Character vector of column names to auto-increment
+    #' param file_info File path info list from private$get_file_path(table_name)
+    commit_append = function(
+      table_name,
+      all_cols,
+      autoincrement_cols = character(0),
+      file_info
+    ) {
+      ordinary_cols <- setdiff(all_cols, autoincrement_cols)
+      select_new <- glue::glue_collapse(
+        c(
+          glue::glue(
+            "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}"
+          ),
+          ordinary_cols
+        ),
+        sep = ",\n  "
+      )
+
+      # Concatenation using UNION ALL; "by name" handles missing columns
+      self$execute(glue::glue(
+        r"{
+        copy (
+          select * from {table_name}
+          union all by name
+          select {select_new}
+          from new_data_v
+        )
+        to '{file_info$path}' ({self$writeopts})
+        }"
+      ))
+    },
+
+    #' Update matching rows and insert new ones from new_data_v (upsert)
+    #' param table_name Character string table name
+    #' param key_cols Character vector of columns that define uniqueness. 
If missing, + #' use all columns starting with `id_` + #' param autoincrement_cols Character vector of column names to auto-increment + #' param map_cols Character vector of columns to convert to MAP format + #' return Invisible NULL (called for side effects) + commit_upsert = function( + table_name, + all_cols, + key_cols, + autoincrement_cols = character(0), + file_info + ) { + # Update existing data + ordinary_cols <- setdiff(all_cols, union(key_cols, autoincrement_cols)) + update_select_expr <- glue::glue_collapse( + glue::glue("{ordinary_cols} = new_data_v.{ordinary_cols}"), + sep = ",\n " + ) + update_join_condition <- glue::glue_collapse( + glue::glue("{table_name}.{key_cols} = new_data_v.{key_cols}"), + sep = "\nand " + ) + + self$execute(glue::glue( + r"{ + update {table_name} set + {update_select_expr} + from new_data_v + where + {update_join_condition}; + }" + )) + + # Insert new data + insert_select_expr <- glue::glue_collapse( + c( + glue::glue( + "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" + ), + glue::glue("new_data_v.{setdiff(all_cols, autoincrement_cols)}") + ), + sep = ",\n " + ) + null_condition <- glue::glue_collapse( + glue::glue("{table_name}.{key_cols} is null"), + sep = "\nand " + ) + + self$execute(glue::glue( + r"{ + insert into {table_name} + select + {insert_select_expr} + from + new_data_v + left join + {table_name} + on + {update_join_condition} + where + {null_condition} + ; + }" + )) + + self$execute(glue::glue("copy {table_name} to '{file_info$path}' ({self$writeopts})")) + }, + # Get file path and format for a table # # @param table_name Character string table name @@ -613,21 +626,14 @@ parquet_duckdb <- R6::R6Class( # @param map_cols Character vector indicating if MAP conversion was used # @return NULL (called for side effects) cleanup_new_data_v = function() { - duckdb::duckdb_unregister(self$connection, "new_data_v") - duckdb::duckdb_unregister(self$connection, "new_data_raw") - 
self$execute( - "drop table if exists new_data_v; - drop view if exists new_data_v" - ) + try(duckdb::duckdb_unregister(self$connection, "new_data_v"), silent = TRUE) + try(duckdb::duckdb_unregister(self$connection, "new_data_raw"), silent = TRUE) + try(self$execute("drop table if exists new_data_v"), silent = TRUE) + try(self$execute("drop view if exists new_data_v"), silent = TRUE) invisible(NULL) }, - # Get the names of the current new_data_v - new_data_names = function() { - self$get_query("select column_name from (describe new_data_v)")[[1]] - }, - # Set DuckDB variables max_{colname} for autoincrement columns # # @param table_name Character string table name diff --git a/inst/tinytest/test_parquet_duckdb.R b/inst/tinytest/test_parquet_duckdb.R index ad58bc8..5f1c3db 100644 --- a/inst/tinytest/test_parquet_duckdb.R +++ b/inst/tinytest/test_parquet_duckdb.R @@ -29,14 +29,18 @@ expect_equal(nrow(result), 0L) # Test 4: Row count for non-existent table expect_equal(db$row_count("nonexistent_table"), 0L) -# Test 5: commit_overwrite creates new table +# Test 5: commit overwrite creates new table test_data_1 <- data.table::data.table( id = 1:5, name = letters[1:5], value = c(10.1, 20.2, 30.3, 40.4, 50.5) ) expect_silent( - db$commit_overwrite(test_data_1, "test_table_1") + db$commit( + method = "overwrite", + test_data_1, + "test_table_1" + ) ) expect_true("test_table_1" %in% db$list_tables()) expect_equal(db$row_count("test_table_1"), 5L) @@ -45,60 +49,64 @@ expect_equal(db$row_count("test_table_1"), 5L) retrieved <- db$fetch("test_table_1") expect_equal(retrieved, test_data_1) -# Test 7: commit_overwrite replaces existing data +# Test 7: commit overwrite replaces existing data test_data_1b <- data.table::data.table( id = 10:12, name = letters[24:26], value = c(100.1, 200.2, 300.3) ) expect_silent( - db$commit_overwrite(test_data_1b, "test_table_1") + db$commit( + method = "overwrite", + test_data_1b, + "test_table_1" + ) ) expect_equal(db$row_count("test_table_1"), 
3L) retrieved <- db$fetch("test_table_1") expect_equal(retrieved, test_data_1b) -# Test 8: commit_append adds to existing data +# Test 8: commit append adds to existing data test_data_1c <- data.table::data.table( id = 13:15, name = letters[1:3], value = c(111.1, 222.2, 333.3) ) expect_silent( - db$commit_append(test_data_1c, "test_table_1") + db$commit(test_data_1c, "test_table_1", method = "append") ) expect_equal(db$row_count("test_table_1"), 6L) retrieved <- db$fetch("test_table_1") expect_equal(nrow(retrieved), 6L) expect_true(all(c(10:15) %in% retrieved$id)) -# Test 9: commit_append on non-existent table creates it +# Test 9: commit append on non-existent table creates it expect_silent( - db$commit_append(test_data_1, "test_table_2") + db$commit(test_data_1, "test_table_2", method = "append") ) expect_true("test_table_2" %in% db$list_tables()) expect_equal(db$row_count("test_table_2"), 5L) -# Test 10: commit_upsert on non-existent table creates it +# Test 10: commit w/ upsert on non-existent table creates it test_data_3 <- data.table::data.table( id_key = 1:3, name = c("a", "b", "c"), value = c(1.1, 2.2, 3.3) ) expect_silent( - db$commit_upsert(test_data_3, "test_table_3", key_cols = "id_key") + db$commit(test_data_3, "test_table_3", key_cols = "id_key", method = "upsert") ) expect_true("test_table_3" %in% db$list_tables()) expect_equal(db$row_count("test_table_3"), 3L) -# Test 11: commit_upsert updates existing rows and inserts new ones +# Test 11: commit w/ upsert updates existing rows and inserts new ones test_data_3b <- data.table::data.table( id_key = c(2L, 3L, 4L), name = c("b_updated", "c_updated", "d"), value = c(22.2, 33.3, 44.4) ) expect_silent( - db$commit_upsert(test_data_3b, "test_table_3", key_cols = "id_key") + db$commit(test_data_3b, "test_table_3", key_cols = "id_key", method = "upsert") ) expect_equal(db$row_count("test_table_3"), 4L) retrieved <- db$fetch("test_table_3") @@ -130,7 +138,11 @@ retrieved <- db$fetch("test_table_3") 
expect_false(1L %in% retrieved$id_key) # Test 16: delete_from with complex WHERE -db$commit_overwrite(test_data_1, "test_table_4") +db$commit( + method = "overwrite", + test_data_1, + "test_table_4" +) deleted_count <- db$delete_from("test_table_4", where = "id < 3") expect_equal(deleted_count, 2L) retrieved <- db$fetch("test_table_4") @@ -147,7 +159,11 @@ deleted_count <- db$delete_from("nonexistent", where = "id = 1") expect_equal(deleted_count, 0L) # Test 19: delete_from with WHERE that matches nothing -db$commit_overwrite(test_data_1, "test_table_5") +db$commit( + method = "overwrite", + test_data_1, + "test_table_5" +) initial_count <- db$row_count("test_table_5") deleted_count <- db$delete_from("test_table_5", where = "id = 999") expect_equal(deleted_count, 0L) @@ -158,7 +174,8 @@ test_autoinc_1 <- data.table::data.table( name = c("item_a", "item_b", "item_c"), value = c(10, 20, 30) ) -db$commit_overwrite( +db$commit( + method = "overwrite", test_autoinc_1, "test_autoinc_1", autoincrement_cols = "id" @@ -172,10 +189,11 @@ test_autoinc_2 <- data.table::data.table( name = c("item_d", "item_e"), value = c(40, 50) ) -db$commit_append( +db$commit( test_autoinc_2, "test_autoinc_1", - autoincrement_cols = "id" + autoincrement_cols = "id", + method = "append" ) result <- db$fetch("test_autoinc_1") expect_equal(nrow(result), 5L) @@ -187,10 +205,11 @@ test_autoinc_3 <- data.table::data.table( name = c("item_f", "item_g"), value = c(60, 70) ) -db$commit_upsert( +db$commit( test_autoinc_3, "test_autoinc_1", - autoincrement_cols = "id" + autoincrement_cols = "id", + method = "upsert" ) result <- db$fetch("test_autoinc_1") expect_equal(nrow(result), 7L) @@ -203,7 +222,8 @@ test_autoinc_with_ids <- data.table::data.table( value = c(1, 2, 3) ) expect_warning( - db$commit_overwrite( + db$commit( + method = "overwrite", test_autoinc_with_ids, "test_autoinc_2", autoincrement_cols = "id" @@ -218,7 +238,8 @@ test_multi_autoinc <- data.table::data.table( name = c("item1", 
"item2"), value = c(10, 20) ) -db$commit_overwrite( +db$commit( + method = "overwrite", test_multi_autoinc, "test_multi_autoinc", autoincrement_cols = c("id_a", "id_b") @@ -232,15 +253,20 @@ test_continue_a <- data.table::data.table( id_seq = c(5L, 10L, 15L), value = c(100, 200, 300) ) -db$commit_overwrite(test_continue_a, "test_continue") +db$commit( + method = "overwrite", + test_continue_a, + "test_continue" +) test_continue_b <- data.table::data.table( value = c(400, 500) ) -db$commit_append( +db$commit( test_continue_b, "test_continue", - autoincrement_cols = "id_seq" + autoincrement_cols = "id_seq", + method = "append" ) result <- db$fetch("test_continue") expect_equal(nrow(result), 5L) @@ -248,7 +274,11 @@ expect_equal(result$id_seq[4:5], c(16L, 17L)) expect_equal(result$value[4:5], c(400, 500)) # Test 26: attach_table and detach_table -db$commit_overwrite(test_data_1, "test_attach") +db$commit( + method = "overwrite", + test_data_1, + "test_attach" +) expect_silent(db$attach_table("test_attach")) # Verify table is attached by querying it directly result <- db$get_query("SELECT COUNT(*) as n FROM test_attach") @@ -302,7 +332,7 @@ test_csv_data <- data.table::data.table( id = 1:3, name = c("a", "b", "c") ) -db_csv$commit_overwrite(test_csv_data, "csv_table") +db_csv$commit(test_csv_data, "csv_table", method = "overwrite") expect_true("csv_table" %in% db_csv$list_tables()) retrieved <- db_csv$fetch("csv_table") expect_equal(retrieved, test_csv_data) @@ -321,7 +351,11 @@ expect_silent( ) # Test 33: Persistence across connections -db$commit_overwrite(test_data_1, "persist_test") +db$commit( + method = "overwrite", + test_data_1, + "persist_test" +) rm(db) gc() @@ -331,15 +365,24 @@ expect_true("persist_test" %in% db$list_tables()) retrieved <- db$fetch("persist_test") expect_equal(retrieved, test_data_1) -# Test 34: commit_upsert with no key_cols defaults to append +# Test 34: commit with no key_cols defaults to append test_no_keys <- data.table::data.table( 
name = c("x", "y"), value = c(1, 2) ) -db$commit_overwrite(test_no_keys, "no_keys_test") +db$commit( + method = "overwrite", + test_no_keys, + "no_keys_test" +) expect_equal(db$row_count("no_keys_test"), 2L) -db$commit_upsert(test_no_keys, "no_keys_test", key_cols = character(0)) +db$commit( + test_no_keys, + "no_keys_test", + key_cols = character(0), + method = "upsert" +) expect_equal(db$row_count("no_keys_test"), 4L) # Should append # Test 35: Print method diff --git a/man/parquet_duckdb.Rd b/man/parquet_duckdb.Rd index 964cd90..cb41afa 100644 --- a/man/parquet_duckdb.Rd +++ b/man/parquet_duckdb.Rd @@ -35,9 +35,7 @@ domain-specific database classes. \item \href{#method-parquet_duckdb-with_tables}{\code{parquet_duckdb$with_tables()}} \item \href{#method-parquet_duckdb-fetch}{\code{parquet_duckdb$fetch()}} \item \href{#method-parquet_duckdb-delete_from}{\code{parquet_duckdb$delete_from()}} -\item \href{#method-parquet_duckdb-commit_overwrite}{\code{parquet_duckdb$commit_overwrite()}} -\item \href{#method-parquet_duckdb-commit_append}{\code{parquet_duckdb$commit_append()}} -\item \href{#method-parquet_duckdb-commit_upsert}{\code{parquet_duckdb$commit_upsert()}} +\item \href{#method-parquet_duckdb-commit}{\code{parquet_duckdb$commit()}} \item \href{#method-parquet_duckdb-print}{\code{parquet_duckdb$print()}} \item \href{#method-parquet_duckdb-clone}{\code{parquet_duckdb$clone()}} } @@ -260,94 +258,38 @@ Number of rows deleted } } \if{html}{\out{
    }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-parquet_duckdb-commit_overwrite}{}}} -\subsection{Method \code{commit_overwrite()}}{ -Commit data in overwrite mode +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-commit}{}}} +\subsection{Method \code{commit()}}{ +Commit data using overwrite, append, or upsert modes. Handles autoincrement, key +identity columns, and list-to-MAP conversion. \subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{parquet_duckdb$commit_overwrite( +\if{html}{\out{
    }}\preformatted{parquet_duckdb$commit( x, table_name, + key_cols, autoincrement_cols = character(0), - map_cols = character(0) + map_cols = character(0), + method = c("overwrite", "append", "upsert") )}\if{html}{\out{
    }} } \subsection{Arguments}{ \if{html}{\out{
    }} \describe{ -\item{\code{x}}{Data frame to commit} +\item{\code{x}}{Data frame to commit. If character, in-duckdb-memory table.} \item{\code{table_name}}{Character string table name} -\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} - -\item{\code{map_cols}}{Character vector of columns to convert to MAP format} -} -\if{html}{\out{
    }} -} -\subsection{Returns}{ -Invisible NULL (called for side effects) -} -} -\if{html}{\out{
    }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-parquet_duckdb-commit_append}{}}} -\subsection{Method \code{commit_append()}}{ -Commit data in append mode -\subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{parquet_duckdb$commit_append( - x, - table_name, - autoincrement_cols = character(0), - map_cols = character(0) -)}\if{html}{\out{
    }} -} - -\subsection{Arguments}{ -\if{html}{\out{
    }} -\describe{ -\item{\code{x}}{Data frame to commit} - -\item{\code{table_name}}{Character string table name} +\item{\code{key_cols}}{Character vector of columns that define uniqueness. If missing, +use all columns starting with \code{id_}} \item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} \item{\code{map_cols}}{Character vector of columns to convert to MAP format} -} -\if{html}{\out{
    }} -} -\subsection{Returns}{ -Invisible NULL (called for side effects) -} -} -\if{html}{\out{
    }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-parquet_duckdb-commit_upsert}{}}} -\subsection{Method \code{commit_upsert()}}{ -Commit data in upsert mode (update existing, insert new) -\subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{parquet_duckdb$commit_upsert( - x, - table_name, - key_cols = grep("^id_", names(x), value = TRUE), - autoincrement_cols = character(0), - map_cols = character(0) -)}\if{html}{\out{
    }} -} - -\subsection{Arguments}{ -\if{html}{\out{
    }} -\describe{ -\item{\code{x}}{Data frame to commit} -\item{\code{table_name}}{Character string table name} - -\item{\code{key_cols}}{Character vector of columns that define uniqueness} - -\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} - -\item{\code{map_cols}}{Character vector of columns to convert to MAP format} +\item{\code{method}}{Character, one of "overwrite", "append", "upsert" (upsert being an +update for existing rows, and insert for new rows)} } \if{html}{\out{
    }} } @@ -373,6 +315,23 @@ Print method for parquet_duckdb } \subsection{Returns}{ self (invisibly) +param x Data frame to commit. If character, in-duckdb-memory table. +param table_name Character string table name +param autoincrement_cols Character vector of column names to auto-increment +param map_cols Character vector of columns to convert to MAP format +return Invisible NULL (called for side effects) +param x Data frame to commit. If character, in-duckdb-memory table. +param table_name Character string table name +param autoincrement_cols Character vector of column names to auto-increment +param map_cols Character vector of columns to convert to MAP format +return Invisible NULL (called for side effects) +param x Data frame to commit. If character, in-duckdb-memory table. +param table_name Character string table name +param key_cols Character vector of columns that define uniqueness. If missing, +use all columns starting with \code{id_} +param autoincrement_cols Character vector of column names to auto-increment +param map_cols Character vector of columns to convert to MAP format +return Invisible NULL (called for side effects) } } \if{html}{\out{
    }} diff --git a/vignettes/evoland.qmd b/vignettes/evoland.qmd index f06bef8..37da973 100644 --- a/vignettes/evoland.qmd +++ b/vignettes/evoland.qmd @@ -108,7 +108,7 @@ db$commit( value = c(lulc_files$url, lulc_files$md5sum, "BFS Arealstatistik") ), table_name = "reporting_t", - mode = "append" + method = "append" ) zippath <- file.path( @@ -322,7 +322,7 @@ id_coord_keep <- lulc_data_t[, id_coord] db$commit( x = db$coords_t[id_coord %in% id_coord_keep], table_name = "coords_t", - mode = "overwrite" + method = "overwrite" ) ``` From 8525327fd55c50c595c79d90b2320bfdf6e7b2e9 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Mon, 1 Dec 2025 16:18:13 +0100 Subject: [PATCH 22/27] enable filling trans_pred_data_v with 0s for NAs / simpler passing of id preds --- R/covariance_filter.R | 17 ++++++----------- R/evoland_db_views.R | 12 +++++++++++- R/trans_preds_t.R | 16 ++++++++-------- man/covariance_filter.Rd | 3 +-- man/evoland_db.Rd | 6 ++---- man/trans_preds_t.Rd | 10 +++++++++- 6 files changed, 37 insertions(+), 27 deletions(-) diff --git a/R/covariance_filter.R b/R/covariance_filter.R index fb06530..e6a813f 100644 --- a/R/covariance_filter.R +++ b/R/covariance_filter.R @@ -23,8 +23,7 @@ #' coefficients above this threshold will be filtered out. Default is 0 (no filtering). #' @param ... Additional arguments passed to rank_fun. #' -#' @return A filtered data.table containing only the selected covariates after ranking -#' by the specified method and filtering based on correlation threshold. 
+#' @return A set of column names (covariates) to retain #' #' @details #' The function first ranks covariates using the provided ranking function (default: @@ -70,22 +69,18 @@ covariance_filter <- function( ) # Sort by scores (lower = better/more significant) - ranked_order <- order(scores) - data_ranked <- data[, ..ranked_order] + ranked_order <- names(sort(scores)) - # If no correlation filtering needed, return ranked data + # If no correlation filtering needed, return ranked predictors if (corcut == 1) { - return(data_ranked) + return(ranked_order) } # Compute correlation matrix once - cor_mat <- abs(cor(data_ranked, use = "pairwise.complete.obs")) + cor_mat <- abs(cor(data[, ..ranked_order], use = "pairwise.complete.obs")) # Iteratively select covariates based on correlation threshold - selected <- select_by_correlation(cor_mat, corcut) - - # Return selected covariates - data_ranked[, ..selected, drop = FALSE] + select_by_correlation(cor_mat, corcut) } diff --git a/R/evoland_db_views.R b/R/evoland_db_views.R index 42429a3..3acc060 100644 --- a/R/evoland_db_views.R +++ b/R/evoland_db_views.R @@ -101,7 +101,10 @@ evoland_db$set("active", "coords_minimal", function() { }) }) -evoland_db$set("public", "trans_pred_data_v", function(id_trans) { +# get transitions along with their predictor data in a wide data.table +# id_trans - integer transition ID +# na_value - if not NA, replace all NULL/NA predictor values with this value +evoland_db$set("public", "trans_pred_data_v", function(id_trans, na_value = NA) { stopifnot( "id_trans must be a single integer" = length(id_trans) == 1L && is.numeric(id_trans) ) @@ -251,6 +254,13 @@ evoland_db$set("public", "trans_pred_data_v", function(id_trans) { } data.table::setnames(result, old_names, new_names) + if (!is.na(na_value)) { + pred_cols <- setdiff(names(result), "result") + for (col in pred_cols) { + data.table::set(result, i = which(is.na(result[[col]])), j = col, value = na_value) + } + } + result } ) diff --git 
a/R/trans_preds_t.R b/R/trans_preds_t.R index 4e2dd87..091e9da 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -34,12 +34,14 @@ as_trans_preds_t <- function(x) { #' @param db An [evoland_db] instance with populated tables #' @param corcut Numeric threshold (0-1) for correlation filtering passed to [covariance_filter()] #' @param rank_fun Optional ranking function passed to [covariance_filter()] +#' @param na_value Passed to db$trans_pred_data_v - if not NA, replace all NA predictor values with this value #' @param ... Additional arguments passed to rank_fun via [covariance_filter()] #' @export create_trans_preds_t <- function( db, corcut = 0.7, rank_fun = rank_poly_glm, + na_value = NA, ... ) { stopifnot( @@ -68,7 +70,7 @@ create_trans_preds_t <- function( # Get wide transition-predictor data tryCatch( { - trans_pred_data <- db$trans_pred_data_v(id_trans) + trans_pred_data <- db$trans_pred_data_v(id_trans, na_value) # Check if we have any data if (nrow(trans_pred_data) == 0L) { @@ -87,7 +89,8 @@ create_trans_preds_t <- function( next } - filtered_data <- covariance_filter( + # Return ranked + filtered predictor names as id_pred_{n} + filtered_preds <- covariance_filter( data = trans_pred_data, result_col = "result", rank_fun = rank_fun, @@ -95,15 +98,12 @@ create_trans_preds_t <- function( ... 
) - # Extract selected predictor IDs from column names - selected_cols <- setdiff(names(filtered_data), "result") - - if (length(selected_cols) > 0L) { + if (length(filtered_preds) > 0L) { # Parse id_pred values from column names (e.g., "id_pred_1" -> 1) - selected_ids <- as.integer(sub("^id_pred_", "", selected_cols)) + selected_ids <- as.integer(sub("^id_pred_", "", filtered_preds)) # Create result rows - results_list[[length(results_list) + 1]] <- data.table::data.table( + results_list[[id_trans]] <- data.table::data.table( id_pred = selected_ids, id_trans = id_trans ) diff --git a/man/covariance_filter.Rd b/man/covariance_filter.Rd index ffa87fc..0693652 100644 --- a/man/covariance_filter.Rd +++ b/man/covariance_filter.Rd @@ -50,8 +50,7 @@ Should take arguments (x, y, weights, ...) and return a single numeric value \item{cor_mat}{Absolute correlation matrix} } \value{ -A filtered data.table containing only the selected covariates after ranking -by the specified method and filtering based on correlation threshold. +A set of column names (covariates) to retain } \description{ The \code{covariance_filter} returns a set of covariates for land use land cover change diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index 676de84..bee1130 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -41,9 +41,7 @@ Additional methods and active bindings are added to this class in separate files
    Inherited methods
    • evoland::parquet_duckdb$attach_table()
    • -
    • evoland::parquet_duckdb$commit_append()
    • -
    • evoland::parquet_duckdb$commit_overwrite()
    • -
    • evoland::parquet_duckdb$commit_upsert()
    • +
    • evoland::parquet_duckdb$commit()
    • evoland::parquet_duckdb$delete_from()
    • evoland::parquet_duckdb$detach_table()
    • evoland::parquet_duckdb$execute()
    • @@ -83,7 +81,7 @@ Additional methods and active bindings are added to this class in separate files \if{latex}{\out{\hypertarget{method-evoland_db-trans_pred_data_v}{}}} \subsection{Method \code{trans_pred_data_v()}}{ \subsection{Usage}{ -\if{html}{\out{
      }}\preformatted{evoland_db$trans_pred_data_v(id_trans)}\if{html}{\out{
      }} +\if{html}{\out{
      }}\preformatted{evoland_db$trans_pred_data_v(id_trans, na_value = NA)}\if{html}{\out{
      }} } } diff --git a/man/trans_preds_t.Rd b/man/trans_preds_t.Rd index e2cbebf..e999d9f 100644 --- a/man/trans_preds_t.Rd +++ b/man/trans_preds_t.Rd @@ -9,7 +9,13 @@ \usage{ as_trans_preds_t(x) -create_trans_preds_t(db, corcut = 0.7, rank_fun = rank_poly_glm, ...) +create_trans_preds_t( + db, + corcut = 0.7, + rank_fun = rank_poly_glm, + na_value = NA, + ... +) \method{print}{trans_preds_t}(x, nrow = 10, ...) } @@ -20,6 +26,8 @@ create_trans_preds_t(db, corcut = 0.7, rank_fun = rank_poly_glm, ...) \item{rank_fun}{Optional ranking function passed to \code{\link[=covariance_filter]{covariance_filter()}}} +\item{na_value}{Passed to db$trans_pred_data_v - if not NA, replace all NA predictor values with this value} + \item{...}{passed to \link[data.table:print.data.table]{data.table::print.data.table}} \item{nrow}{see \link[data.table:print.data.table]{data.table::print.data.table}} From be909cc16f4d39c21d08a8ab76fc12ceff5b0d27 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Mon, 1 Dec 2025 16:56:10 +0100 Subject: [PATCH 23/27] trans_pred_data_v: enable filtering on id_pred --- R/evoland_db_views.R | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/R/evoland_db_views.R b/R/evoland_db_views.R index 3acc060..06325cb 100644 --- a/R/evoland_db_views.R +++ b/R/evoland_db_views.R @@ -103,17 +103,19 @@ evoland_db$set("active", "coords_minimal", function() { # get transitions along with their predictor data in a wide data.table # id_trans - integer transition ID +# id_pred - optional integer vector of predictor IDs to include (NULL = all predictors) # na_value - if not NA, replace all NULL/NA predictor values with this value -evoland_db$set("public", "trans_pred_data_v", function(id_trans, na_value = NA) { +evoland_db$set("public", "trans_pred_data_v", function(id_trans, id_pred = NULL, na_value = NA) { stopifnot( - "id_trans must be a single integer" = length(id_trans) == 1L && 
is.numeric(id_trans) + "id_trans must be a single integer" = length(id_trans) == 1L && is.numeric(id_trans), + "id_pred must be NULL or a numeric vector" = is.null(id_pred) || is.numeric(id_pred) ) all_tables <- self$list_tables() pred_tables <- c("pred_data_t_float", "pred_data_t_int", "pred_data_t_bool") existing_pred_tables <- intersect(pred_tables, all_tables) - tables_to_attach <- c("trans_meta_t", "lulc_data_t", "pred_meta_t", existing_pred_tables) + tables_to_attach <- c("trans_meta_t", "lulc_data_t", existing_pred_tables) self$with_tables( tables_to_attach, @@ -151,17 +153,26 @@ evoland_db$set("public", "trans_pred_data_v", function(id_trans, na_value = NA) )" ) + pred_filter <- "" + if (!is.null(id_pred)) { + pred_filter <- glue::glue(" AND id_pred IN ({toString(id_pred)})") + } + if ("pred_data_t_float" %in% existing_pred_tables) { - ctes$pred_float_combined <- "pred_float_combined AS ( + ctes$pred_float_combined <- glue::glue( + "pred_float_combined AS ( SELECT id_coord, id_period, id_pred, value FROM pred_data_t_float WHERE id_period >= 1 + {pred_filter} UNION ALL SELECT p0.id_coord, periods.id_period, p0.id_pred, p0.value FROM pred_data_t_float AS p0 CROSS JOIN (SELECT DISTINCT id_period FROM trans_result WHERE id_period >= 1) AS periods WHERE p0.id_period = 0 + {pred_filter} )" + ) ctes$pred_float_wide <- "pred_float_wide AS ( PIVOT pred_float_combined ON id_pred USING FIRST(value) GROUP BY id_coord, id_period @@ -169,16 +180,20 @@ evoland_db$set("public", "trans_pred_data_v", function(id_trans, na_value = NA) } if ("pred_data_t_int" %in% existing_pred_tables) { - ctes$pred_int_combined <- "pred_int_combined AS ( + ctes$pred_int_combined <- glue::glue( + "pred_int_combined AS ( SELECT id_coord, id_period, id_pred, value FROM pred_data_t_int WHERE id_period >= 1 + {pred_filter} UNION ALL SELECT p0.id_coord, periods.id_period, p0.id_pred, p0.value FROM pred_data_t_int AS p0 CROSS JOIN (SELECT DISTINCT id_period FROM trans_result WHERE id_period >= 1) AS 
periods WHERE p0.id_period = 0 + {pred_filter} )" + ) ctes$pred_int_wide <- "pred_int_wide AS ( PIVOT pred_int_combined ON id_pred USING FIRST(value) GROUP BY id_coord, id_period @@ -186,16 +201,20 @@ evoland_db$set("public", "trans_pred_data_v", function(id_trans, na_value = NA) } if ("pred_data_t_bool" %in% existing_pred_tables) { - ctes$pred_bool_combined <- "pred_bool_combined AS ( + ctes$pred_bool_combined <- glue::glue( + "pred_bool_combined AS ( SELECT id_coord, id_period, id_pred, value FROM pred_data_t_bool WHERE id_period >= 1 + {pred_filter} UNION ALL SELECT p0.id_coord, periods.id_period, p0.id_pred, p0.value FROM pred_data_t_bool AS p0 CROSS JOIN (SELECT DISTINCT id_period FROM trans_result WHERE id_period >= 1) AS periods WHERE p0.id_period = 0 + {pred_filter} )" + ) ctes$pred_bool_wide <- "pred_bool_wide AS ( PIVOT pred_bool_combined ON id_pred USING FIRST(value) GROUP BY id_coord, id_period @@ -217,23 +236,23 @@ evoland_db$set("public", "trans_pred_data_v", function(id_trans, na_value = NA) if ("pred_data_t_float" %in% existing_pred_tables) { joins <- paste0( joins, - "\n LEFT JOIN pred_float_wide AS pf ON tr.id_coord = pf.id_coord AND tr.id_period = pf.id_period" + "\n LEFT JOIN pred_float_wide AS pf ON tr.id_coord = pf.id_coord AND tr.id_period = pf.id_period" ) } if ("pred_data_t_int" %in% existing_pred_tables) { joins <- paste0( joins, - "\n LEFT JOIN pred_int_wide AS pi ON tr.id_coord = pi.id_coord AND tr.id_period = pi.id_period" + "\n LEFT JOIN pred_int_wide AS pi ON tr.id_coord = pi.id_coord AND tr.id_period = pi.id_period" ) } if ("pred_data_t_bool" %in% existing_pred_tables) { joins <- paste0( joins, - "\n LEFT JOIN pred_bool_wide AS pb ON tr.id_coord = pb.id_coord AND tr.id_period = pb.id_period" + "\n LEFT JOIN pred_bool_wide AS pb ON tr.id_coord = pb.id_coord AND tr.id_period = pb.id_period" ) } - cte_string <- paste(unlist(ctes), collapse = ",\n\n ") + cte_string <- paste(unlist(ctes), collapse = ",\n\n ") query <- glue::glue( "WITH 
{cte_string} From 1e8d2ec6fc655f6977105907a27fb893d80d2727 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Mon, 1 Dec 2025 17:45:51 +0100 Subject: [PATCH 24/27] make the predictor selection a pruning of a potentially existing trans_preds_t --- NAMESPACE | 1 - R/trans_preds_t.R | 217 ++++++++++++++++++++++++------------------- man/evoland_db.Rd | 27 +++++- man/trans_preds_t.Rd | 26 +----- 4 files changed, 149 insertions(+), 122 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 35a656a..3c5e473 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -55,7 +55,6 @@ export(create_neighbors_t) export(create_periods_t) export(create_pred_meta_t) export(create_trans_meta_t) -export(create_trans_preds_t) export(download_and_verify) export(evoland_db) export(extract_using_coords_t) diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index 091e9da..812bf7c 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -5,6 +5,7 @@ #' modelling each transition type. #' #' @name trans_preds_t +#' @include evoland_db.R #' #' @param db An [evoland_db] instance with populated trans_meta_t and pred_meta_t tables #' @@ -28,113 +29,137 @@ as_trans_preds_t <- function(x) { ) } -#' @describeIn trans_preds_t Create a transition-predictor relation, i.e. records the +# set an initial full set of transition / predictor relations +evoland_db$set( + "public", + "set_full_trans_preds", + function(overwrite = FALSE) { + if (self$row_count("trans_preds_t") > 0 && !overwrite) { + stop("Set overwrite to TRUE to overwrite existing trans_preds_t") + } + p <- self$pred_meta_t + t <- self$trans_meta_t[is_viable == TRUE] + + full <- expand.grid(id_pred = p[["id_pred"]], id_trans = t[["id_trans"]]) + self$trans_preds_t <- as_trans_preds_t(full) + } +) + +#' describeIn trans_preds_t Create a transition-predictor relation, i.e. records the #' result of a predictor selection step. Runs covariance filtering for each viable #' transition and stores the selected predictors. 
-#' @param db An [evoland_db] instance with populated tables -#' @param corcut Numeric threshold (0-1) for correlation filtering passed to [covariance_filter()] -#' @param rank_fun Optional ranking function passed to [covariance_filter()] -#' @param na_value Passed to db$trans_pred_data_v - if not NA, replace all NA predictor values with this value -#' @param ... Additional arguments passed to rank_fun via [covariance_filter()] -#' @export -create_trans_preds_t <- function( - db, - corcut = 0.7, - rank_fun = rank_poly_glm, - na_value = NA, - ... -) { - stopifnot( - "db must be an evoland_db instance" = inherits(db, "evoland_db") - ) - - viable_trans <- db$trans_meta_t[is_viable == TRUE] - pred_meta <- db$pred_meta_t - stopifnot( - "No viable transitions found in trans_meta_t" = nrow(viable_trans) > 0L - ) - - results_list <- list() - - # Iterate over transitions (anterior/posterior pairs) - for (i in seq_len(nrow(viable_trans))) { - id_trans <- viable_trans$id_trans[i] - id_lulc_ant <- viable_trans$id_lulc_anterior[i] - id_lulc_post <- viable_trans$id_lulc_posterior[i] - - message(glue::glue( - "Processing transition {i}/{nrow(viable_trans)}: ", - "id_trans={id_trans} ({id_lulc_ant} -> {id_lulc_post})" - )) - - # Get wide transition-predictor data - tryCatch( - { - trans_pred_data <- db$trans_pred_data_v(id_trans, na_value) +#' param corcut Numeric threshold (0-1) for correlation filtering passed to [covariance_filter()] +#' param rank_fun Optional ranking function passed to [covariance_filter()] +#' param na_value Passed to db$trans_pred_data_v - if not NA, replace all NA predictor values with this value +#' param ... Additional arguments passed to rank_fun via [covariance_filter()] +evoland_db$set( + "public", + "prune_trans_preds_two_stage_covar", + function( + corcut = 0.7, + rank_fun = rank_poly_glm, + na_value = NA, + ... 
+ ) { + viable_trans <- self$trans_meta_t[is_viable == TRUE] + pred_meta <- self$pred_meta_t + stopifnot( + "No viable transitions found in trans_meta_t" = nrow(viable_trans) > 0L + ) + if (self$row_count("trans_preds_t") == 0) { + self$set_full_trans_preds() + } + trans_preds_pre <- self$trans_preds_t + + results_list <- list() + + # Iterate over transitions (anterior/posterior pairs) + for (i in seq_len(nrow(viable_trans))) { + id_trans <- viable_trans$id_trans[i] + id_lulc_ant <- viable_trans$id_lulc_anterior[i] + id_lulc_post <- viable_trans$id_lulc_posterior[i] + id_preds <- trans_preds_pre$id_pred[ + trans_preds_pre$id_trans == id_trans + ] + + message(glue::glue( + "Processing transition {i}/{nrow(viable_trans)}: ", + "id_trans={id_trans} ({id_lulc_ant} -> {id_lulc_post})" + )) + + if (length(id_preds) == 0L) { + next + } - # Check if we have any data - if (nrow(trans_pred_data) == 0L) { - warning(glue::glue( - "No data for transition {id_trans}, skipping" - )) - next - } + # Get wide transition-predictor data + tryCatch( + { + trans_pred_data <- self$trans_pred_data_v(id_trans, id_preds, na_value) + + # Check if we have any data + if (nrow(trans_pred_data) == 0L) { + warning(glue::glue( + "No data for transition {id_trans}, skipping" + )) + next + } + + # Check if we have any predictor columns + pred_cols <- grep("^id_pred_", names(trans_pred_data), value = TRUE) + if (length(pred_cols) == 0L) { + warning(glue::glue( + "No predictor columns for transition {id_trans}, skipping" + )) + next + } + + # Return ranked + filtered predictor names as id_pred_{n} + filtered_preds <- covariance_filter( + data = trans_pred_data, + result_col = "result", + rank_fun = rank_fun, + corcut = corcut, + ... 
+ ) - # Check if we have any predictor columns - pred_cols <- grep("^id_pred_", names(trans_pred_data), value = TRUE) - if (length(pred_cols) == 0L) { + if (length(filtered_preds) > 0L) { + # Parse id_pred values from column names (e.g., "id_pred_1" -> 1) + selected_ids <- as.integer(sub("^id_pred_", "", filtered_preds)) + + # Create result rows + results_list[[id_trans]] <- data.table::data.table( + id_pred = selected_ids, + id_trans = id_trans + ) + + message(glue::glue( + " Selected {length(selected_ids)} predictor(s) for transition {id_trans}" + )) + } else { + message(glue::glue( + " No predictors selected for transition {id_trans}" + )) + } + }, + error = function(e) { warning(glue::glue( - "No predictor columns for transition {id_trans}, skipping" + "Error processing transition {id_trans}: {e$message}" )) - next } + ) + } - # Return ranked + filtered predictor names as id_pred_{n} - filtered_preds <- covariance_filter( - data = trans_pred_data, - result_col = "result", - rank_fun = rank_fun, - corcut = corcut, - ... 
- ) - - if (length(filtered_preds) > 0L) { - # Parse id_pred values from column names (e.g., "id_pred_1" -> 1) - selected_ids <- as.integer(sub("^id_pred_", "", filtered_preds)) - - # Create result rows - results_list[[id_trans]] <- data.table::data.table( - id_pred = selected_ids, - id_trans = id_trans - ) + # Combine all results + if (length(results_list) == 0L) { + warning("No predictors selected for any transition") + return(invisible(NULL)) + } - message(glue::glue( - " Selected {length(selected_ids)} predictor(s) for transition {id_trans}" - )) - } else { - message(glue::glue( - " No predictors selected for transition {id_trans}" - )) - } - }, - error = function(e) { - warning(glue::glue( - "Error processing transition {id_trans}: {e$message}" - )) - } - ) - } + result <- data.table::rbindlist(results_list) - # Combine all results - if (length(results_list) == 0L) { - warning("No predictors selected for any transition") - return(as_trans_preds_t()) + self$commit(as_trans_preds_t(result), "trans_preds_t", method = "overwrite") } - - result <- data.table::rbindlist(results_list) - - as_trans_preds_t(result) -} +) #' @export validate.trans_preds_t <- function(x, ...) 
{ diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index bee1130..531b09e 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -34,6 +34,8 @@ Additional methods and active bindings are added to this class in separate files \item \href{#method-evoland_db-set_coords}{\code{evoland_db$set_coords()}} \item \href{#method-evoland_db-set_periods}{\code{evoland_db$set_periods()}} \item \href{#method-evoland_db-add_predictor}{\code{evoland_db$add_predictor()}} +\item \href{#method-evoland_db-set_full_trans_preds}{\code{evoland_db$set_full_trans_preds()}} +\item \href{#method-evoland_db-prune_trans_preds_two_stage_covar}{\code{evoland_db$prune_trans_preds_two_stage_covar()}} \item \href{#method-evoland_db-clone}{\code{evoland_db$clone()}} } } @@ -81,7 +83,7 @@ Additional methods and active bindings are added to this class in separate files \if{latex}{\out{\hypertarget{method-evoland_db-trans_pred_data_v}{}}} \subsection{Method \code{trans_pred_data_v()}}{ \subsection{Usage}{ -\if{html}{\out{
      }}\preformatted{evoland_db$trans_pred_data_v(id_trans, na_value = NA)}\if{html}{\out{
      }} +\if{html}{\out{
      }}\preformatted{evoland_db$trans_pred_data_v(id_trans, id_pred = NULL, na_value = NA)}\if{html}{\out{
      }} } } @@ -221,6 +223,29 @@ Add a predictor to the database } \if{html}{\out{
    }} } +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-set_full_trans_preds}{}}} +\subsection{Method \code{set_full_trans_preds()}}{ +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{evoland_db$set_full_trans_preds(overwrite = FALSE)}\if{html}{\out{
    }} +} + +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-prune_trans_preds_two_stage_covar}{}}} +\subsection{Method \code{prune_trans_preds_two_stage_covar()}}{ +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{evoland_db$prune_trans_preds_two_stage_covar( + corcut = 0.7, + rank_fun = rank_poly_glm, + na_value = NA, + ... +)}\if{html}{\out{
    }} +} + } \if{html}{\out{
    }} \if{html}{\out{}} diff --git a/man/trans_preds_t.Rd b/man/trans_preds_t.Rd index e999d9f..202028e 100644 --- a/man/trans_preds_t.Rd +++ b/man/trans_preds_t.Rd @@ -3,34 +3,19 @@ \name{trans_preds_t} \alias{trans_preds_t} \alias{as_trans_preds_t} -\alias{create_trans_preds_t} \alias{print.trans_preds_t} \title{Create Transition-Predictor Relationship Table} \usage{ as_trans_preds_t(x) -create_trans_preds_t( - db, - corcut = 0.7, - rank_fun = rank_poly_glm, - na_value = NA, - ... -) - \method{print}{trans_preds_t}(x, nrow = 10, ...) } \arguments{ -\item{db}{An \link{evoland_db} instance with populated tables} - -\item{corcut}{Numeric threshold (0-1) for correlation filtering passed to \code{\link[=covariance_filter]{covariance_filter()}}} - -\item{rank_fun}{Optional ranking function passed to \code{\link[=covariance_filter]{covariance_filter()}}} - -\item{na_value}{Passed to db$trans_pred_data_v - if not NA, replace all NA predictor values with this value} +\item{nrow}{see \link[data.table:print.data.table]{data.table::print.data.table}} \item{...}{passed to \link[data.table:print.data.table]{data.table::print.data.table}} -\item{nrow}{see \link[data.table:print.data.table]{data.table::print.data.table}} +\item{db}{An \link{evoland_db} instance with populated trans_meta_t and pred_meta_t tables} } \value{ A data.table of class "trans_preds_t" with columns: @@ -49,10 +34,3 @@ modelling each transition type. \item \code{print(trans_preds_t)}: Print a trans_preds_t object, passing params to data.table print }} -\section{Functions}{ -\itemize{ -\item \code{create_trans_preds_t()}: Create a transition-predictor relation, i.e. records the -result of a predictor selection step. Runs covariance filtering for each viable -transition and stores the selected predictors. 
- -}} From bd16b400327309b897f8c7ee93fd2b230ce5e206 Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Tue, 2 Dec 2025 10:54:02 +0100 Subject: [PATCH 25/27] shut up distance progress --- R/neighbors_t.R | 6 ++++-- inst/tinytest/test_util_terra.R | 11 +++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/R/neighbors_t.R b/R/neighbors_t.R index f4d09fd..4a50301 100644 --- a/R/neighbors_t.R +++ b/R/neighbors_t.R @@ -53,7 +53,8 @@ create_neighbors_t <- function( coords_t, max_distance, distance_breaks = NULL, - resolution = 100.0 + resolution = 100.0, + quiet = FALSE ) { # Validate inputs if (!inherits(coords_t, "coords_t")) { @@ -74,7 +75,8 @@ create_neighbors_t <- function( dt <- distance_neighbors_cpp( coords_t = coords_t, max_distance = max_distance, - resolution = resolution + resolution = resolution, + quiet = quiet ) data.table::setalloccol(dt) diff --git a/inst/tinytest/test_util_terra.R b/inst/tinytest/test_util_terra.R index 82f998a..35a0bab 100644 --- a/inst/tinytest/test_util_terra.R +++ b/inst/tinytest/test_util_terra.R @@ -300,10 +300,13 @@ for (i in seq_len( } # Test with distance classification on real data -real_neighbors_class <- create_neighbors_t( - coords_t, - max_distance = 300, - distance_breaks = c(0, 150, 300) +expect_silent( + real_neighbors_class <- create_neighbors_t( + coords_t, + max_distance = 300, + distance_breaks = c(0, 150, 300), + quiet = TRUE + ) ) expect_true(all(!is.na(real_neighbors_class$distance_class))) From d7e63034081be8cbe101261e6131d105850c373e Mon Sep 17 00:00:00 2001 From: mmyrte <24587121+mmyrte@users.noreply.github.com> Date: Tue, 2 Dec 2025 10:54:46 +0100 Subject: [PATCH 26/27] add grrf filter --- DESCRIPTION | 8 +- NAMESPACE | 1 + R/covariance_filter.R | 4 +- R/grrf_filter.r | 136 +++++++++++++++++++++++++++++ R/trans_preds_t.R | 13 ++- inst/tinytest/test_trans_preds_t.R | 40 ++++----- man/evoland_db.Rd | 13 ++- man/grrf_filter.Rd | 70 +++++++++++++++ 
man/neighbors_t.Rd | 3 +- 9 files changed, 245 insertions(+), 43 deletions(-) create mode 100644 R/grrf_filter.r create mode 100644 man/grrf_filter.Rd diff --git a/DESCRIPTION b/DESCRIPTION index c21e25f..7b3e541 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,12 +28,13 @@ Imports: terra Suggests: tinytest, - quarto + quarto, + ranger VignetteBuilder: quarto Config/testthat/edition: 3 -LinkingTo: +LinkingTo: Rcpp -Collate: +Collate: 'RcppExports.R' 'alloc_params_t.R' 'coords_t.R' @@ -43,6 +44,7 @@ Collate: 'evoland_db_neighbors.R' 'evoland_db_tables.R' 'evoland_db_views.R' + 'grrf_filter.r' 'init.R' 'intrv_masks_t.R' 'intrv_meta_t.R' diff --git a/NAMESPACE b/NAMESPACE index 3c5e473..ee973a4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -58,6 +58,7 @@ export(create_trans_meta_t) export(download_and_verify) export(evoland_db) export(extract_using_coords_t) +export(grrf_filter) export(parquet_duckdb) export(print_rowwise_yaml) export(validate) diff --git a/R/covariance_filter.R b/R/covariance_filter.R index e6a813f..bb9fc0c 100644 --- a/R/covariance_filter.R +++ b/R/covariance_filter.R @@ -17,8 +17,8 @@ #' @param rank_fun Optional function to compute ranking scores for each covariate. #' Should take arguments (x, y, weights, ...) and return a single numeric value #' (lower = better). Defaults to polynomial GLM p-value ranking. -#' @param weights Optional vector of weights to be used in the ranking function. -#' If NULL and rank_fun uses default, class-balanced weights are computed automatically. +#' @param weights Optional vector of weights to be used in the ranking function. Defaults to +#' class-balanced weights #' @param corcut Numeric threshold (0-1) for correlation filtering. Covariates with correlation #' coefficients above this threshold will be filtered out. Default is 0 (no filtering). #' @param ... Additional arguments passed to rank_fun. 
diff --git a/R/grrf_filter.r b/R/grrf_filter.r new file mode 100644 index 0000000..28ce993 --- /dev/null +++ b/R/grrf_filter.r @@ -0,0 +1,136 @@ +#' Guided Regularized Random Forest Feature Selection +#' +#' The `grrf_filter` returns a set of covariates for land use land cover change (LULCC) models based +#' on feature selection with Guided Regularized Random Forests. This is a two-stage random forest +#' approach: a first unregularized random forest estimates variable importance scores. These scores +#' are then used to guide a second regularized random forest that penalizes less important features, +#' resulting in a more parsimonious feature set. +#' +#' @param data A data.table of target variable and candidate covariates to be filtered; wide format +#' with one predictor per column. +#' @param result_col Name of the column representing the transition results (0: no trans, 1: trans) +#' @param weights Optional named vector of class weights. If NULL, class-balanced weights +#' are computed automatically using compute_balanced_weights(). +#' @param gamma Numeric between 0-1 controlling the weight of the normalized importance +#' score (the "importance coefficient"). When gamma = 0, we perform unguided +#' regularized random forest (no guiding effect). When gamma = 1, we apply the +#' strongest guiding effect, leading to the most penalization of redundant +#' features and the most concise feature sets. Default is 0.5. +#' @param num.trees Number of trees to grow in each random forest. Default is 500. +#' @param ... Additional arguments passed to ranger::ranger(). +#' +#' @return A character vector of column names (covariates) to retain, ordered by +#' importance (most important first) +#' +#' @details +#' The Guided Regularized Random Forest (GRRF) algorithm works as follows: +#' 1. Fit an initial unregularized random forest to obtain variable importance scores +#' 2.
Normalize these importance scores and use them to compute regularization +#' coefficients: coefReg = (1 - gamma) + gamma * normalized_importance +#' 3. Fit a regularized random forest using these coefficients to penalize splits +#' on less important variables +#' 4. Return variables with positive importance in the regularized model +#' +#' Class weights are used to handle class imbalance. Variables in terminal nodes are +#' weighted by class, and splits are evaluated using weighted Gini impurity. +#' +#' The ranger implementation uses the `split.select.weights` parameter to apply +#' regularization penalties, approximating the RRF regularization approach. +#' +#' @references +#' Deng, H., & Runger, G. (2013). Gene selection with guided regularized random forest. +#' Pattern Recognition, 46(12), 3483-3489. https://arxiv.org/pdf/1306.0237.pdf +#' +#' Original implementation by Antoine Adde, edited by Ben Black and adapted for +#' ranger by the evoland-plus team. +#' +#' @name grrf_filter +#' +#' @export + +grrf_filter <- function( + data, + result_col = "result", + weights = compute_balanced_weights(data[[result_col]]), + gamma = 0.5, + num.trees = 500, + max.depth = 100, + ... +) { + # Check if ranger is available + if (!requireNamespace("ranger", quietly = TRUE)) { + stop( + "Package 'ranger' is required for grrf_filter but is not installed.\n", + "Please install it with: install.packages('ranger')", + call. 
= FALSE + ) + } + + data.table::setDT(data) + + # Validate inputs + stopifnot( + "gamma must be between 0 and 1" = gamma >= 0 && gamma <= 1, + "result_col must exist in data" = result_col %in% names(data), + "data must have at least one predictor column" = ncol(data) > 1 + ) + + # Prepare data: separate predictors from response + predictor_cols <- setdiff(names(data), result_col) + y <- as.factor(data[[result_col]]) + x <- data[, ..predictor_cols] + + # Step 1: Run initial unregularized random forest to get importance scores + rf_initial <- ranger::ranger( + x = x, + y = y, + num.trees = num.trees, + importance = "impurity", + case.weights = weights, + max.depth = max.depth, + ... + ) + + # Extract and normalize importance scores + imp_initial <- rf_initial$variable.importance + imp_normalized <- imp_initial / max(imp_initial) + + # Step 2: Calculate regularization coefficients (penalty weights) + # Higher importance -> higher coefficient -> less penalty + coef_reg <- (1 - gamma) + gamma * imp_normalized + + # Step 3: Run guided regularized random forest + # Use split.select.weights to penalize variables with lower importance + # Higher weight = more likely to be selected for splitting + rf_grrf <- ranger::ranger( + x = x, + y = y, + num.trees = num.trees, + importance = "impurity", + case.weights = weights, + max.depth = max.depth, + split.select.weights = coef_reg, + verbose = FALSE, # easiest way to get rid of cpp warning about split selected weights + ... + ) + + # Extract final importance scores + imp_final <- rf_grrf$variable.importance + + # Select variables with positive importance + selected_vars <- names(imp_final[imp_final > 0]) + + if (length(selected_vars) == 0) { + warning("No variables with positive importance found. 
Returning all variables.") + selected_vars <- predictor_cols + } + + # Order by importance (descending) + selected_vars <- selected_vars[order(imp_final[selected_vars], decreasing = TRUE)] + + message(glue::glue( + "Selected {length(selected_vars)}/{length(predictor_cols)} predictors" + )) + + return(selected_vars) +} diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R index 812bf7c..75949b8 100644 --- a/R/trans_preds_t.R +++ b/R/trans_preds_t.R @@ -41,7 +41,7 @@ evoland_db$set( t <- self$trans_meta_t[is_viable == TRUE] full <- expand.grid(id_pred = p[["id_pred"]], id_trans = t[["id_trans"]]) - self$trans_preds_t <- as_trans_preds_t(full) + self$commit(as_trans_preds_t(full), "trans_preds_t", method = "overwrite") } ) @@ -49,15 +49,14 @@ evoland_db$set( #' result of a predictor selection step. Runs covariance filtering for each viable #' transition and stores the selected predictors. #' param corcut Numeric threshold (0-1) for correlation filtering passed to [covariance_filter()] -#' param rank_fun Optional ranking function passed to [covariance_filter()] +#' param filter_fun Defaults to [covariance_filter()], but can be any function that returns #' param na_value Passed to db$trans_pred_data_v - if not NA, replace all NA predictor values with this value #' param ... Additional arguments passed to rank_fun via [covariance_filter()] evoland_db$set( "public", - "prune_trans_preds_two_stage_covar", + "prune_trans_preds", function( - corcut = 0.7, - rank_fun = rank_poly_glm, + filter_fun = covariance_filter, na_value = NA, ... ) { @@ -114,11 +113,9 @@ evoland_db$set( } # Return ranked + filtered predictor names as id_pred_{n} - filtered_preds <- covariance_filter( + filtered_preds <- filter_fun( data = trans_pred_data, result_col = "result", - rank_fun = rank_fun, - corcut = corcut, ... 
) diff --git a/inst/tinytest/test_trans_preds_t.R b/inst/tinytest/test_trans_preds_t.R index 737dc6a..d9c9da2 100644 --- a/inst/tinytest/test_trans_preds_t.R +++ b/inst/tinytest/test_trans_preds_t.R @@ -147,30 +147,31 @@ pred_data_int <- data.table::data.table( ) db_tps$pred_data_t_int <- as_pred_data_t(pred_data_int, type = "int") -# Test create_trans_preds_t +# Test pruning expect_message( - trans_preds_result <- - create_trans_preds_t( - db = db_tps, - corcut = 0.7 - ), + db_tps$prune_trans_preds( + corcut = 0.2 + ), "Processing transition 1/2" ) - +trans_preds_result <- db_tps$trans_preds_t expect_true(inherits(trans_preds_result, "trans_preds_t")) -expect_equal(nrow(trans_preds_result), 8L) - -# Verify structure -expect_true(all(c("id_pred", "id_trans") %in% names(trans_preds_result))) -expect_true(is.integer(trans_preds_result$id_pred)) -expect_true(is.integer(trans_preds_result$id_trans)) +expect_equal(nrow(trans_preds_result), 4L) # Verify that all id_trans in result are viable viable_trans_ids <- db_tps$trans_meta_t[is_viable == TRUE]$id_trans expect_true(all(trans_preds_result$id_trans %in% viable_trans_ids)) -# Verify that all id_pred in result exist in pred_meta_t -expect_true(all(trans_preds_result$id_pred %in% db_tps$pred_meta_t$id_pred)) +# reset to full set of trans - preds +expect_silent(db_tps$set_full_trans_preds(overwrite = TRUE)) +expect_message( + db_tps$prune_trans_preds( + filter_fun = grrf_filter, + num.trees = 10, + gamma = 0.9 + ), + "Selected 5 predictor\\(s\\) for transition" +) # Test error handling - empty database test_dir_empty <- tempfile("evoland_empty_") @@ -178,12 +179,7 @@ on.exit(unlink(test_dir_empty, recursive = TRUE), add = TRUE) db_empty <- evoland_db$new(test_dir_empty) expect_error( - create_trans_preds_t(db = "not_a_db"), - "must be an evoland_db" -) - -expect_error( - create_trans_preds_t(db = db_empty), + db_empty$prune_trans_preds(), "Table `trans_meta_t` does not exist" ) @@ -200,7 +196,7 @@ expect_warning( 
"Overriding existing IDs" ) expect_error( - create_trans_preds_t(db = db_no_pred), + db_no_pred$prune_trans_preds(), "Table `pred_meta_t` does not exist" ) diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index 531b09e..c3775f3 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -35,7 +35,7 @@ Additional methods and active bindings are added to this class in separate files \item \href{#method-evoland_db-set_periods}{\code{evoland_db$set_periods()}} \item \href{#method-evoland_db-add_predictor}{\code{evoland_db$add_predictor()}} \item \href{#method-evoland_db-set_full_trans_preds}{\code{evoland_db$set_full_trans_preds()}} -\item \href{#method-evoland_db-prune_trans_preds_two_stage_covar}{\code{evoland_db$prune_trans_preds_two_stage_covar()}} +\item \href{#method-evoland_db-prune_trans_preds}{\code{evoland_db$prune_trans_preds()}} \item \href{#method-evoland_db-clone}{\code{evoland_db$clone()}} } } @@ -234,13 +234,12 @@ Add a predictor to the database } \if{html}{\out{
    }} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-prune_trans_preds_two_stage_covar}{}}} -\subsection{Method \code{prune_trans_preds_two_stage_covar()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-prune_trans_preds}{}}} +\subsection{Method \code{prune_trans_preds()}}{ \subsection{Usage}{ -\if{html}{\out{
    }}\preformatted{evoland_db$prune_trans_preds_two_stage_covar( - corcut = 0.7, - rank_fun = rank_poly_glm, +\if{html}{\out{
    }}\preformatted{evoland_db$prune_trans_preds( + filter_fun = covariance_filter, na_value = NA, ... )}\if{html}{\out{
    }} diff --git a/man/grrf_filter.Rd b/man/grrf_filter.Rd new file mode 100644 index 0000000..a3bbb93 --- /dev/null +++ b/man/grrf_filter.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/grrf_filter.r +\name{grrf_filter} +\alias{grrf_filter} +\title{Guided Regularized Random Forest Feature Selection} +\usage{ +grrf_filter( + data, + result_col = "result", + weights = compute_balanced_weights(data[[result_col]]), + gamma = 0.5, + num.trees = 500, + max.depth = 100, + ... +) +} +\arguments{ +\item{data}{A data.table of target variable and candidate covariates to be filtered; wide format +with one predictor per column.} + +\item{result_col}{Name of the column representing the transition results (0: no trans, 1: trans)} + +\item{weights}{Optional named vector of class weights. If NULL, class-balanced weights +are computed automatically using compute_grrf_weights().} + +\item{gamma}{Numeric between 0-1 controlling the weight of the normalized importance +score (the "importance coefficient"). When gamma = 0, we perform unguided +regularized random forest (no guiding effect). When gamma = 1, we apply the +strongest guiding effect, leading to the most penalization of redundant +features and the most concise feature sets. Default is 0.5.} + +\item{num.trees}{Number of trees to grow in each random forest. Default is 500.} + +\item{...}{Additional arguments passed to ranger::ranger().} +} +\value{ +A character vector of column names (covariates) to retain, ordered by +importance (most important first) +} +\description{ +The \code{grrf_filter} returns a set of covariates for land use land cover change (LULCC) models based +on feature selection with Guided Regularized Random Forests. This is a two-stage random forest +approach: a first unregularized random forest estimates variable importance scores. 
These scores +are then used to guide a second regularized random forest that penalizes less important features, +resulting in a more parsimonious feature set. +} +\details{ +The Guided Regularized Random Forest (GRRF) algorithm works as follows: +\enumerate{ +\item Fit an initial unregularized random forest to obtain variable importance scores +\item Normalize these importance scores and use them to compute regularization +coefficients: coefReg = (1 - gamma) + gamma * normalized_importance +\item Fit a regularized random forest using these coefficients to penalize splits +on less important variables +\item Return variables with positive importance in the regularized model +} + +Class weights are used to handle class imbalance. Variables in terminal nodes are +weighted by class, and splits are evaluated using weighted Gini impurity. + +The ranger implementation uses the \code{split.select.weights} parameter to apply +regularization penalties, approximating the RRF regularization approach. +} +\references{ +Deng, H., & Runger, G. (2013). Gene selection with guided regularized random forest. +Pattern Recognition, 46(12), 3483-3489. https://arxiv.org/pdf/1306.0237.pdf + +Original implementation by Antoine Adde, edited by Ben Black and adapted for +ranger by the evoland-plus team. +} diff --git a/man/neighbors_t.Rd b/man/neighbors_t.Rd index 2376f02..55d0260 100644 --- a/man/neighbors_t.Rd +++ b/man/neighbors_t.Rd @@ -14,7 +14,8 @@ create_neighbors_t( coords_t, max_distance, distance_breaks = NULL, - resolution = 100 + resolution = 100, + quiet = FALSE ) \method{validate}{neighbors_t}(x, ...) 
From e6c8e5c72c2684ca1275bfd69f066e53ecc99bbe Mon Sep 17 00:00:00 2001
From: mmyrte <24587121+mmyrte@users.noreply.github.com>
Date: Tue, 2 Dec 2025 12:26:39 +0100
Subject: [PATCH 27/27] make prune robust / normalize importance to [0,1]

---
 R/grrf_filter.r   | 7 +++++--
 R/trans_preds_t.R | 5 +++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/R/grrf_filter.r b/R/grrf_filter.r
index 28ce993..dc07e5a 100644
--- a/R/grrf_filter.r
+++ b/R/grrf_filter.r
@@ -93,7 +93,11 @@ grrf_filter <- function(
 
   # Extract and normalize importance scores
   imp_initial <- rf_initial$variable.importance
-  imp_normalized <- imp_initial / max(imp_initial)
+  # Normalize to [0,1] (importance values may be negative)
+  imp_normalized <- {
+    (imp_initial - min(imp_initial)) /
+      (max(imp_initial) - min(imp_initial))
+  }
 
   # Step 2: Calculate regularization coefficients (penalty weights)
   # Higher importance -> higher coefficient -> less penalty
@@ -110,7 +114,6 @@
     case.weights = weights,
     max.depth = max.depth,
     split.select.weights = coef_reg,
-    verbose = FALSE,
     # easiest way to get rid of cpp warning about split selected weights
     ...
   )
diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R
index 75949b8..4fc51c9 100644
--- a/R/trans_preds_t.R
+++ b/R/trans_preds_t.R
@@ -139,6 +139,11 @@ evoland_db$set(
       }
     }, error = function(e) {
+      # do not prune on error
+      results_list[[id_trans]] <<- data.table::data.table(
+        id_pred = id_preds,
+        id_trans = id_trans
+      )
       warning(glue::glue(
         "Error processing transition {id_trans}: {e$message}"
       ))