diff --git a/DESCRIPTION b/DESCRIPTION
index cb31905..7b3e541 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -28,8 +28,35 @@ Imports:
     terra
 Suggests:
     tinytest,
-    quarto
+    quarto,
+    ranger
 VignetteBuilder: quarto
 Config/testthat/edition: 3
-LinkingTo:
+LinkingTo: Rcpp
+Collate:
+    'RcppExports.R'
+    'alloc_params_t.R'
+    'coords_t.R'
+    'covariance_filter.R'
+    'parquet_duckdb.R'
+    'evoland_db.R'
+    'evoland_db_neighbors.R'
+    'evoland_db_tables.R'
+    'evoland_db_views.R'
+    'grrf_filter.r'
+    'init.R'
+    'intrv_masks_t.R'
+    'intrv_meta_t.R'
+    'lulc_data_t.R'
+    'lulc_meta_t.R'
+    'neighbors_t.R'
+    'periods_t.R'
+    'pred_data_t.R'
+    'pred_meta_t.R'
+    'trans_meta_t.R'
+    'trans_models_t.R'
+    'trans_preds_t.R'
+    'util.R'
+    'util_download.R'
+    'util_terra.R'
diff --git a/NAMESPACE b/NAMESPACE
index 77baa3d..ee973a4 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,6 +8,7 @@ S3method(print,intrv_masks_t)
 S3method(print,intrv_meta_t)
 S3method(print,lulc_data_t)
 S3method(print,lulc_meta_t)
+S3method(print,neighbors_t)
 S3method(print,periods_t)
 S3method(print,pred_data_t)
 S3method(print,pred_meta_t)
@@ -22,6 +23,7 @@ S3method(validate,intrv_masks_t)
 S3method(validate,intrv_meta_t)
 S3method(validate,lulc_data_t)
 S3method(validate,lulc_meta_t)
+S3method(validate,neighbors_t)
 S3method(validate,periods_t)
 S3method(validate,pred_data_t)
 S3method(validate,pred_data_t_bool)
@@ -37,24 +39,27 @@ export(as_intrv_masks_t)
 export(as_intrv_meta_t)
 export(as_lulc_data_t)
 export(as_lulc_meta_t)
+export(as_neighbors_t)
 export(as_periods_t)
 export(as_pred_data_t)
 export(as_pred_meta_t)
 export(as_trans_meta_t)
 export(as_trans_models_t)
 export(as_trans_preds_t)
-export(compute_neighbors)
+export(covariance_filter)
 export(create_coords_t_square)
 export(create_intrv_meta_t)
 export(create_intrv_meta_t_row)
 export(create_lulc_meta_t)
+export(create_neighbors_t)
 export(create_periods_t)
 export(create_pred_meta_t)
 export(create_trans_meta_t)
-export(create_trans_preds_t)
 export(download_and_verify)
 export(evoland_db)
 export(extract_using_coords_t)
+export(grrf_filter)
+export(parquet_duckdb)
 export(print_rowwise_yaml)
 export(validate)
 importFrom(Rcpp,sourceCpp)
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 3a54496..395824b 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -1,7 +1,7 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

-distance_neighbors_cpp <- function(coords_t, max_distance, resolution = 100.0) {
-  .Call(`_evoland_distance_neighbors_cpp`, coords_t, max_distance, resolution)
+distance_neighbors_cpp <- function(coords_t, max_distance, resolution = 100.0, quiet = FALSE) {
+  .Call(`_evoland_distance_neighbors_cpp`, coords_t, max_distance, resolution, quiet)
 }
diff --git a/R/coords_t.R b/R/coords_t.R
index e4c77f2..e99609a 100644
--- a/R/coords_t.R
+++ b/R/coords_t.R
@@ -34,9 +34,9 @@ as_coords_t <- function(x) {
       geom_polygon = list()
     )
   }
-  cast_dt_col(x, "id_coord", as.integer)
+  cast_dt_col(x, "id_coord", "int")
   if (!is.null(x[["region"]])) {
-    cast_dt_col(x, "region", as.factor)
+    cast_dt_col(x, "region", "factor")
   }
   new_evoland_table(
     x,
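Aside: the coords_t.R hunk above (and matching hunks in intrv_masks_t.R, lulc_data_t.R, and evoland_db_views.R below) switches `cast_dt_col()` from taking a casting function to a type keyword. The helper itself is defined elsewhere in the package and is not part of this diff, so the following is only a hedged sketch of the shape such a helper could take, assuming in-place data.table mutation and a pipe-friendly invisible return; only the "int" and "factor" keywords are attested in this diff, and the actual implementation may differ:

# Hypothetical sketch only -- the real cast_dt_col() is not shown in this diff
cast_dt_col <- function(x, col, type = c("int", "factor")) {
  type <- match.arg(type)
  cast_fn <- switch(type, int = as.integer, factor = as.factor)
  # mutate the column in place, keeping data.table reference semantics
  data.table::set(x, j = col, value = cast_fn(x[[col]]))
  invisible(x)
}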
diff --git a/R/covariance_filter.R b/R/covariance_filter.R
new file mode 100644
index 0000000..bb9fc0c
--- /dev/null
+++ b/R/covariance_filter.R
@@ -0,0 +1,168 @@
+#' Two-stage covariate filtering
+#'
+#' The `covariance_filter` returns a set of covariates for land use land cover change
+#' (LULCC) models based on a two-stage variable selection: a first statistical fit
+#' estimates each covariate's quality for a given prediction task. A second step keeps
+#' only variables below a given correlation threshold: we iterate over a correlation
+#' matrix whose columns are ordered by the first-stage ranking. Starting with the
+#' leftmost column, all rows (i.e. candidates) with a correlation greater than the
+#' given threshold are dropped from the full set of candidates. This candidate
+#' selection is retained and used to select the next column, until no further columns
+#' are left to investigate. The columns that were iterated over are returned as a
+#' character vector of selected variable names.
+#'
+#' @param data A data.table of target variable and candidate covariates to be filtered;
+#'   wide format with one predictor per column.
+#' @param result_col Name of the column representing the transition results (0: no
+#'   trans, 1: trans)
+#' @param rank_fun Optional function to compute ranking scores for each covariate.
+#'   Should take arguments (x, y, weights, ...) and return a single numeric value
+#'   (lower = better). Defaults to polynomial GLM p-value ranking.
+#' @param weights Optional vector of weights to be used in the ranking function.
+#'   Defaults to class-balanced weights.
+#' @param corcut Numeric threshold (0-1) for correlation filtering. Covariates with an
+#'   absolute Pearson correlation above this threshold are filtered out. Default is
+#'   0.7; a value of 1 disables correlation filtering.
+#' @param ... Additional arguments passed to rank_fun.
+#'
+#' @return A character vector of column names (covariates) to retain
+#'
+#' @details
+#' The function first ranks covariates using the provided ranking function (default:
+#' quasibinomial polynomial GLM). Then, it iteratively removes highly (Pearson)
+#' correlated variables based on the correlation cutoff threshold, preserving variables
+#' in order of their ranking. See
+#' for
+#' where the concept came from. The original author was Antoine Adde, with edits by
+#' Benjamin Black. A similar mechanism is found in .
+#'
+#' @name covariance_filter
+#'
+#' @export

covariance_filter <- function(
  data,
  result_col = "result",
  rank_fun = rank_poly_glm,
  weights = compute_balanced_weights(data[[result_col]]),
  corcut = 0.7,
  ...
) {
  # Early return when there are no candidate covariates besides the result column
  if (ncol(data) == 1) {
    return(character(0))
  }

  data.table::setDT(data)

  # Validate the correlation cutoff
  stopifnot(
    "corcut must be between 0 and 1" = corcut >= 0 && corcut <= 1
  )

  # Compute ranking scores for all covariates (vectorized where possible)
  scores <- vapply(
    data[, -..result_col],
    rank_fun,
    FUN.VALUE = numeric(1),
    y = data[[result_col]],
    weights = weights,
    ...
  )

  # Sort by scores (lower = better/more significant)
  ranked_order <- names(sort(scores))

  # corcut == 1 disables correlation filtering: return all ranked predictors
  if (corcut == 1) {
    return(ranked_order)
  }

  # Compute correlation matrix once
  cor_mat <- abs(cor(data[, ..ranked_order], use = "pairwise.complete.obs"))

  # Iteratively select covariates based on correlation threshold
  select_by_correlation(cor_mat, corcut)
}
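# Usage sketch (illustrative only; simulated data, not part of the package tests).
# `x3` is built to be nearly collinear with `x1`, so with the default corcut = 0.7
# only one of the pair should survive the correlation stage:
#
#   set.seed(1)
#   n <- 500
#   x1 <- rnorm(n)
#   x2 <- rnorm(n)
#   x3 <- x1 + rnorm(n, sd = 0.1)
#   dt <- data.table::data.table(
#     result = rbinom(n, 1, stats::plogis(x1 - x2)),
#     x1 = x1, x2 = x2, x3 = x3
#   )
#   covariance_filter(dt, result_col = "result", corcut = 0.7)
#   #> e.g. c("x1", "x2") -- x3 is dropped for its high correlation with x1


+#' @describeIn covariance_filter Default ranking function using polynomial GLM. Returns
+#'   the lowest p-value among the polynomial terms
+#' @param x A numeric vector representing a single covariate
+#' @param y A binary outcome vector (0/1)
+#' @param weights Optional weights vector
+#' @keywords internal
rank_poly_glm <- function(x, y, weights = NULL, ...)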
{ + fit <- glm.fit( + x = cbind(1, poly(x, degree = 2, simple = TRUE)), + y = y, + family = quasibinomial(), + weights = weights + ) + + # Get p-values for linear and quadratic terms + coef_summary <- summary.glm(fit)$coefficients + + # Return minimum p-value (most significant term) + min(coef_summary[2:3, 4], na.rm = TRUE) +} + + +#' @describeIn covariance_filter Compute class-balanced weights for imbalanced binary +#' outcomes; returns a numeric vector +#' @param trans_result Binary outcome vector (0/1) +#' @param legacy Bool, use legacy weighting? +#' @keywords internal +compute_balanced_weights <- function(trans_result, legacy = FALSE) { + n_total <- length(trans_result) + n_trans <- sum(trans_result) + n_non_trans <- sum(!trans_result) + + # Compute inverse frequency weights + weights <- numeric(n_total) + + if (legacy) { + # I found this weighting in evoland-plus-legacy, but the models wouldn't converge + # https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.splitforcovselection.r + # This is actually just setting the underrepresented class to the rounded imbalance ratio + weights[!trans_result] <- 1 + weights[trans_result] <- round(n_non_trans / n_trans) + return(weights) + } + + # This is the heuristic in scikit-learn, n_samples / (n_classes * np.bincount(y)) + # https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html #nolint + # This weighting maintains the exact imbalance ratio + weights[trans_result] <- n_total / (2 * n_trans) + weights[!trans_result] <- n_total / (2 * n_non_trans) + + weights +} + + +#' @describeIn covariance_filter Implements the iterative selection procedure. +#' @param cor_mat Absolute correlation matrix +#' @param corcut Correlation cutoff threshold +#' @keywords internal +select_by_correlation <- function(cor_mat, corcut) { + var_names <- colnames(cor_mat) + + # Early return if all correlations are below threshold + if (all(cor_mat[lower.tri(cor_mat)] < corcut)) { + return(var_names) + } + + selected <- character(0) + remaining_idx <- seq_along(var_names) + + while (length(remaining_idx) > 0) { + # Select the first remaining variable (highest ranked) + current_var <- remaining_idx[1] + selected <- c(selected, var_names[current_var]) + + # Find variables with correlation <= corcut with current variable + # (excluding the variable itself) + keep_idx <- which(cor_mat[remaining_idx, current_var] <= corcut) + remaining_idx <- remaining_idx[keep_idx] + } + + selected +} diff --git a/R/evoland_db.R b/R/evoland_db.R index 5179ffd..9aebd16 100644 --- a/R/evoland_db.R +++ b/R/evoland_db.R @@ -6,25 +6,24 @@ #' This class uses DuckDB for in-memory SQL operations while persisting data #' to disk in parquet format for better compression. #' +#' Inherits from [parquet_duckdb] for generic database operations. +#' +#' @seealso +#' Additional methods and active bindings are added to this class in separate files: +#' +#' - [evoland_db_tables] - Table active bindings (coords_t, lulc_data_t, etc.) +#' - [evoland_db_views] - View active bindings (lulc_meta_long_v, etc.) 
and methods +#' - [evoland_db_neighbors] - Neighbor analysis methods +#' +#' @include parquet_duckdb.R #' @export evoland_db <- R6::R6Class( classname = "evoland_db", + inherit = parquet_duckdb, ## Public Methods ---- public = list( - #' @field connection DBI connection object to an in-memory DuckDB database - connection = NULL, - - #' @field path Character string path to the data folder - path = NULL, - - #' @field default_format Default file format for new tables - default_format = NULL, - - #' @field writeopts Default write options for DuckDB, see - writeopts = NULL, - #' @description #' Initialize a new evoland_db object #' @param path Character string. Path to the data folder. @@ -38,363 +37,44 @@ evoland_db <- R6::R6Class( default_format = c("parquet", "csv"), ... ) { - # Create folder if it doesn't exist - self$path <- ensure_dir(path) - - # set format / writeopts - self$default_format <- match.arg(default_format) - self$writeopts <- switch( - self$default_format, - parquet = "FORMAT parquet, COMPRESSION zstd", - csv = "FORMAT csv", - stop(glue::glue("Unsupported format: {format}")) + # Initialize parent class with spatial extension + super$initialize( + path = path, + default_format = default_format, + extensions = "spatial" ) - # Create in-memory connection for SQL operations - self$connection <- DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:") - self$execute("INSTALL spatial; LOAD spatial;") - + # Set evoland-specific reporting metadata self$set_report(...) invisible(self) }, - ### DB methods ---- - # TODO the database methods should be spun out into a parent class "parquet_db" that is - # independent of domain specific logic. the domain logic would then live in evoland_db. - #' @description - #' Commit data in overwrite mode - #' @param x Data frame to commit - #' @param table_name Character string table name - #' @param autoincrement_cols Character vector of column names to auto-increment - #' @param map_cols Character vector of columns to convert to MAP format - commit_overwrite = function( - x, - table_name, - autoincrement_cols = character(0), - map_cols = character(0) - ) { - file_path <- file.path(self$path, paste0(table_name, ".", self$default_format)) - - private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) - - # if there are any of these, first get existing max values - if (length(intersect(autoincrement_cols, names(x))) > 0) { - warning(glue::glue( - "Overriding existing IDs ({toString(autoincrement_cols)}) with row numbers;\n", - "Assign these IDs manually and do not pass any autoincrement_cols to avoid this warning" - )) - } - - # autoincrement = row number; all other cols treated as ordinary - ordinary_cols <- setdiff(names(x), autoincrement_cols) - select_expr <- glue::glue_collapse( - c(glue::glue("row_number() over () as {autoincrement_cols}"), ordinary_cols), - sep = ",\n " - ) - - self$execute(glue::glue( - r"{ - copy ( - select {select_expr} - from new_data_v - ) to '{file_path}' ({self$writeopts}) - }" - )) - }, - - #' @description - #' Commit data in append mode - #' @param x Data frame to commit - #' @param table_name Character string table name - #' @param autoincrement_cols Character vector of column names to auto-increment - #' @param map_cols Character vector of columns to convert to MAP format - commit_append = function( - x, - table_name, - autoincrement_cols = character(0), - map_cols = character(0) - ) { - file_info <- private$get_file_path(table_name) - - if (!file_info$exists) { - 
self$commit_overwrite(x, table_name, autoincrement_cols, map_cols) - } - - self$attach_table(table_name) - on.exit(self$detach_table(table_name)) - private$set_autoincrement_vars(table_name, autoincrement_cols) - - private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) - - ordinary_cols <- setdiff(names(x), autoincrement_cols) - select_new <- glue::glue_collapse( - c( - glue::glue( - "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" - ), - ordinary_cols - ), - sep = ",\n " - ) - - # concatenation using union all; the "by name" option inserts NULLs if a column is - # missing in one of the queries. also makes it robust against differing col orders. - self$execute(glue::glue( - r"{ - copy ( - select - * - from - {table_name} - union all by name - select - {select_new} - from - new_data_v - ) - to '{file_info$path}' ({self$writeopts}) - }" - )) - }, - #' @description - #' Commit data in upsert mode - #' @param x Data frame to commit - #' @param table_name Character string table name - #' @param key_cols Identify unique columns - heuristic: if prefixed with - #' id_, the set of all columns designates a uniqueness condition - #' @param autoincrement_cols Character vector of column names to auto-increment - #' @param map_cols Character vector of columns to convert to MAP format - commit_upsert = function( - x, - table_name, - key_cols = grep("^id_", names(x), value = TRUE), - autoincrement_cols = character(0), - map_cols = character(0) - ) { - file_info <- private$get_file_path(table_name) - - if (!file_info$exists) { - return(self$commit_overwrite(x, table_name, autoincrement_cols, map_cols)) - } else if (length(key_cols) == 0) { - return(self$commit_append(x, table_name, autoincrement_cols, map_cols)) - } - - self$attach_table(table_name) - on.exit(self$detach_table(table_name)) - - private$set_autoincrement_vars(table_name, autoincrement_cols) - - private$register_new_data_v(x, map_cols) - on.exit(private$cleanup_new_data_v(map_cols), add = TRUE) - - # Update existing data without touching key_cols or autoincrement_cols - ordinary_cols <- setdiff(names(x), union(key_cols, autoincrement_cols)) - update_select_expr <- glue::glue_collapse( - glue::glue("{ordinary_cols} = new_data_v.{ordinary_cols}"), - sep = ",\n " - ) - update_join_condition <- glue::glue_collapse( - glue::glue("{table_name}.{key_cols} = new_data_v.{key_cols}"), - sep = "\nand " - ) - - self$execute(glue::glue( - r"{ - update {table_name} set - {update_select_expr} - from new_data_v - where - {update_join_condition}; - }" - )) - - # Insert new data with incrementing autoincrement_cols - insert_select_expr <- glue::glue_collapse( - c( - glue::glue( - "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" - ), - glue::glue("new_data_v.{setdiff(names(x), autoincrement_cols)}") - ), - sep = ",\n " - ) - null_condition <- glue::glue_collapse( - glue::glue("{table_name}.{key_cols} is null"), - sep = "\nand " - ) - - self$execute(glue::glue( - r"{ - insert into {table_name} - select - {insert_select_expr} - from - new_data_v - left join - {table_name} - on - {update_join_condition} - where - {null_condition} - ; - }" - )) - - self$execute(glue::glue("copy {table_name} to '{file_info$path}' ({self$writeopts})")) - }, - - #' @description - #' Fetch data from storage + #' Fetch data from storage with evoland-specific view support #' @param table_name Character string. Name of the table to query. 
#' @param where Character string. Optional WHERE clause for the SQL query. #' @param limit integerish, limit the amount of rows to return #' #' @return A data.table fetch = function(table_name, where = NULL, limit = NULL) { - # Check if this is a view that needs special handling - view_result <- switch( - table_name, - lulc_meta_long_v = make_lulc_meta_long_v(self, private, where), - pred_sources_v = make_pred_sources_v(self, private, where), - transitions_v = make_transitions_v(self, private, where), - NULL - ) - - if (!is.null(view_result)) { - return(view_result) + # Check if this is a view (active binding) + if ( + # TODO these should probably not be active bindings, but instead methods with + # predefined query parameters + table_name %in% + c("lulc_meta_long_v", "pred_sources_v", "transitions_v", "extent", "coords_minimal") + ) { + return(self[[table_name]]) } file_info <- private$get_file_path(table_name) if (!file_info$exists) { - return(private$get_empty_table(table_name)) + stop("Table `", table_name, "` does not exist") } - # Build SQL query - sql <- glue::glue("SELECT * FROM read_{file_info$format}('{file_info$path}')") - - if (!is.null(where)) { - sql <- glue::glue("{sql} WHERE {where}") - } - if (!is.null(limit)) { - sql <- glue::glue("{sql} LIMIT {limit}") - } - - self$get_query(sql) - }, - - #' @description - #' List all tables (files) in storage - #' @return Character vector of table names - list_tables = function() { - list.files(self$path, pattern = "\\.(parquet|csv)$", full.names = FALSE) |> - tools::file_path_sans_ext() |> - unique() |> - sort() - }, - - #' @description - #' Execute statement - #' @param statement A SQL statement - #' @return No. of rows affected by statement - execute = function(statement) { - DBI::dbExecute(self$connection, statement) - }, - - #' @description - #' Get Query - #' @param statement A SQL statement - #' @return No. of rows affected by statement - get_query = function(statement) { - DBI::dbGetQuery(self$connection, statement) |> - data.table::as.data.table() - }, - - #' @description - #' Attach a single table as a temporary table in DuckDB memory. This is - #' useful for working with multiple tables in SQL queries instead of loading - #' them into R objects. - #' @param table_name Character vector. Names of table to attach. - #' @param columns Character vector. Optional sql column selection, defaults to "*" - attach_table = function(table_name, columns = "*", where = NULL) { - # TODO the attach/detach combo should be available in a DB method "with - # table"; it should do nothing if the table is already attached - file_info <- private$get_file_path(table_name) - - # Build SQL query - sql <- glue::glue( - "CREATE TEMP TABLE {table_name} AS ", - "SELECT {paste(columns, collapse = ', ')} ", - "FROM read_{file_info$format}('{file_info$path}')" - ) - - if (!is.null(where)) { - sql <- glue::glue("{sql} WHERE {where}") - } - - # Execute SQL - self$execute(sql) - }, - - #' @description - #' Detach one or more tables from the database. - #' @param table_name Character. Name of table to drop. - detach_table = function(table_name) { - self$execute(paste0("drop table ", table_name, ";")) - }, - - #' @description - #' Get table row count - #' @param table_name Character string. Name of the table to query. - #' @return No. 
of rows - row_count = function(table_name) { - file_info <- private$get_file_path(table_name) - - if (!file_info$exists) { - return(0L) - } - - self$get_query( - glue::glue("SELECT COUNT(*) as n FROM read_{file_info$format}('{file_info$path}')") - )[[1]] - }, - - #' @description - #' Delete rows from a table - #' @param table_name Character string. Name of the table to delete from. - #' @param where Character string, defaults to NULL: delete everything in table. - #' @return No. of rows affected - delete_from = function(table_name, where = NULL) { - file_info <- private$get_file_path(table_name) - - if (!file_info$exists) { - return(0L) - } - - count_before <- self$row_count(table_name) - - if (is.null(where)) { - file.remove(file_info$path) - return(count_before) - } - - self$execute(glue::glue( - r"{ - COPY ( - SELECT * FROM read_{file_info$format}('{file_info$path}') - WHERE NOT ({where}) - ) - TO '{file_info$path}' ({self$writeopts}) - }" - )) - - count_after <- self$row_count(table_name) - - return(count_before - count_after) + super$fetch(table_name, where, limit) }, ### Setter methods ---- @@ -405,21 +85,27 @@ evoland_db <- R6::R6Class( set_report = function(...) { params <- list(...) if (self$row_count("reporting_t") == 0L) { - params[["report_name"]] <- "evoland_scenario" - params[["report_name_pretty"]] <- "Default Evoland Scenario" - params[["report_include_date"]] <- "TRUE" - params[["creator_username"]] <- Sys.getenv("USER", unset = "unknown") + # only upsert if these values are missing upon DB init + params[["report_name"]] <- + params[["report_name"]] %||% "evoland_scenario" + params[["report_name_pretty"]] <- + params[["report_name_pretty"]] %||% "Default Evoland Scenario" + params[["report_include_date"]] <- + params[["report_include_date"]] %||% "TRUE" + params[["creator_username"]] <- + params[["creator_username"]] %||% Sys.getenv("USER", unset = "unknown") } params[["last_opened"]] <- format(Sys.time(), "%Y-%m-%dT%H:%M:%SZ", tz = "UTC") params[["last_opened_username"]] <- Sys.getenv("USER", unset = "unknown") - self$commit_upsert( + self$commit( data.table::as.data.table(list( key = names(params), # cannot name a column "key" in data.table() value = unlist(params) )), table_name = "reporting_t", - key_cols = "key" + key_cols = "key", + method = "upsert" ) }, @@ -438,7 +124,7 @@ evoland_db <- R6::R6Class( function(x) stop("Unsupported coordinate type specified.") ) - self$commit_overwrite(as_coords_t(create_fun(...)), "coords_t") + self$commit(as_coords_t(create_fun(...)), "coords_t", method = "overwrite") }, #' @description @@ -459,7 +145,7 @@ evoland_db <- R6::R6Class( return(invisible(NULL)) } - self$commit_append( + self$commit( do.call(create_periods_t, as.list(environment())), "periods_t" ) @@ -482,485 +168,14 @@ evoland_db <- R6::R6Class( where = glue::glue("name = '{names(pred_spec)}'") ) - data.table::set(pred_data, j = "id_pred", value = existing_meta[["id_pred"]]) + data.table::set(pred_data, j = "id_pred", value = as.integer(existing_meta[["id_pred"]])) data.table::setcolorder(pred_data, c("id_pred", "id_coord", "id_period", "value")) - self$commit_upsert( + self$commit( as_pred_data_t(pred_data, pred_type), - paste0("pred_data_t_", pred_type) - ) - }, - - #' @description - #' Print method for evoland_db - #' @param ... Not used - #' @return self (invisibly) - print = function(...) 
{ - cat("\n") - cat(sprintf("Database path: %s\n\n", self$path)) - - cat("Database methods:\n") - cat(" Commit: commit_overwrite(x, table_name, autoincrement_cols, map_cols),\n") - cat(" commit_append(x, table_name, autoincrement_cols, map_cols),\n") - cat(" commit_upsert(x, table_name, key_cols, autoincrement_cols, map_cols)\n") - cat(" Fetch: fetch(table_name, where, limit)\n") - cat(" Delete: delete_from(table_name, where)\n") - cat(" Other: list_tables(), execute(statement), get_query(statement),\n") - cat(" attach_table(table_name, columns), detach_table(table_name),\n") - cat(" row_count(table_name)\n\n") - - cat("Set / Add methods:\n") - cat(" set_report(...), set_coords(type, ...),\n") - cat(" set_periods(period_length_str, start_observed, end_observed, end_extrapolated),\n") - cat(" add_predictor(pred_spec, pred_data, pred_type)\n\n") - - cat("Active bindings (read-write):\n") - cat(" coords_t, periods_t, lulc_meta_t, lulc_data_t, pred_data_t_float,\n") - cat(" pred_data_t_int, pred_data_t_bool, pred_meta_t, trans_meta_t, trans_preds_t,\n") - cat(" intrv_meta_t, intrv_masks_t, trans_models_t, alloc_params_t\n\n") - - cat("Active bindings (read-only):\n") - cat(" extent, coords_minimal, lulc_meta_long_v, pred_sources_v\n") - - invisible(self) - } - ), - - ## Active Bindings ---- - active = list( - ### Bindings for tables ---- - #' @field coords_t A `coords_t` instance; see [create_coords_t()] for the type of - #' object to assign. Assigning is an upsert operation. - coords_t = function(x) { - create_active_binding(self, "coords_t", as_coords_t)(x) - }, - - #' @field periods_t A `periods_t` instance; see [create_periods_t()] for the type of - #' object to assign. Assigning is an upsert operation. - periods_t = function(x) { - create_active_binding(self, "periods_t", as_periods_t)(x) - }, - - #' @field lulc_meta_t A `lulc_meta_t` instance; see [create_lulc_meta_t()] for the type of - #' object to assign. Assigning is an upsert operation. - lulc_meta_t = function(x) { - create_active_binding(self, "lulc_meta_t", as_lulc_meta_t)(x) - }, - - #' @field lulc_meta_long_v Return a `lulc_meta_long_v` instance, i.e. unrolled `lulc_meta_t`. - lulc_meta_long_v = function() { - make_lulc_meta_long_v(self, private) - }, - - #' @field lulc_data_t A `lulc_data_t` instance; see [as_lulc_data_t()] for the type of - #' object to assign. Assigning is an upsert operation. - lulc_data_t = function(x) { - create_active_binding(self, "lulc_data_t", as_lulc_data_t)(x) - }, - - #' @field transitions_v Get the transitions from `lulc_data_t`. - transitions_v = function() { - make_transitions_v(self, private) - }, - - #' @field pred_data_t_float A `pred_data_t_float` instance; see - #' [create_pred_data_t()] for the type of object to assign. Assigning is an - #' upsert operation. - pred_data_t_float = function(x) { - create_active_binding( - self, - "pred_data_t_float", - as_pred_data_t, - type = "float" - )(x) - }, - - #' @field pred_data_t_int A `pred_data_t_int` instance; see - #' [create_pred_data_t()] for the type of object to assign. Assigning is an - #' upsert operation. - pred_data_t_int = function(x) { - create_active_binding( - self, - "pred_data_t_int", - as_pred_data_t, - type = "int" - )(x) - }, - - #' @field pred_data_t_bool A `pred_data_t_bool` instance; see - #' [create_pred_data_t()] for the type of object to assign. Assigning is an - #' upsert operation. 
- pred_data_t_bool = function(x) { - create_active_binding( - self, - "pred_data_t_bool", - as_pred_data_t, - type = "bool" - )(x) - }, - - ### Bindings for descriptions, views, etc. ---- - #' @field extent Return a terra SpatExtent based on coords_t - extent = function() { - make_extent_db(self, private) - }, - - #' @field coords_minimal data.table with only (id_coord, lon, lat) - coords_minimal = function() { - make_coords_minimal(self, private) - }, - - #' @field pred_meta_t A `pred_meta_t` instance; see [create_pred_meta_t()] for the type of - #' object to assign. Assigning is an upsert operation. - pred_meta_t = function(x) { - if (missing(x)) { - x <- self$fetch("pred_meta_t") - return(as_pred_meta_t(x)) - } - stopifnot(inherits(x, "pred_meta_t")) - self$commit_upsert( - x, - table_name = "pred_meta_t", - key_cols = "name", - autoincrement_cols = "id_pred" - ) - }, - - #' @field pred_sources_v Retrieve a table of distinct predictor urls and their - #' md5sum - pred_sources_v = function() { - make_pred_sources_v(self, private) - }, - - #' @field trans_meta_t A `trans_meta_t` instance; see [create_trans_meta_t()] for the type of - #' object to assign. Assigning is an upsert operation. - trans_meta_t = function(x) { - if (missing(x)) { - x <- self$fetch("trans_meta_t") - return(as_trans_meta_t(x)) - } - stopifnot(inherits(x, "trans_meta_t")) - self$commit_upsert( - x, - table_name = "trans_meta_t", - key_cols = c("id_lulc_anterior", "id_lulc_posterior"), - autoincrement_cols = "id_trans" - ) - }, - - #' @field trans_preds_t A `trans_preds_t` instance; see [create_trans_preds_t()] for the type of - #' object to assign. Assigning is an upsert operation. - trans_preds_t = function(x) { - create_active_binding(self, "trans_preds_t", as_trans_preds_t)(x) - }, - - #' @field intrv_meta_t A `intrv_meta_t` instance; see [create_intrv_meta_t()] for the type of - #' object to assign. Assigning is an upsert operation. - intrv_meta_t = function(x) { - if (missing(x)) { - return(as_intrv_meta_t( - convert_list_cols(self$fetch("intrv_meta_t"), "params", kv_df_to_list) - )) - } - stopifnot(inherits(x, "intrv_meta_t")) - - self$commit_upsert(x, "intrv_meta_t", key_cols = "id_intrv", map_cols = "params") - }, - - #' @field intrv_masks_t A `intrv_masks_t` instance; see [as_intrv_masks_t()] for the type of - #' object to assign. Assigning is an upsert operation. - intrv_masks_t = function(x) { - create_active_binding(self, "intrv_masks_t", as_intrv_masks_t)(x) - }, - - #' @field trans_models_t A `trans_models_t` instance; see [create_trans_models_t()] for the type - #' of object to assign. Assigning is an upsert operation. - trans_models_t = function(x) { - if (missing(x)) { - return(as_trans_models_t( - convert_list_cols( - self$fetch("trans_models_t"), - c("model_params", "goodness_of_fit"), - kv_df_to_list - ) - )) - } - stopifnot(inherits(x, "trans_models_t")) - - self$commit_upsert( - x, - "trans_models_t", - key_cols = c("id_trans", "id_period"), - map_cols = c("model_params", "goodness_of_fit") - ) - }, - - #' @field alloc_params_t A `alloc_params_t` instance; see [as_alloc_params_t()] for the type - #' of object to assign. Assigning is an upsert operation. 
- alloc_params_t = function(x) { - if (missing(x)) { - return(as_alloc_params_t( - convert_list_cols( - self$fetch("alloc_params_t"), - c("alloc_params", "goodness_of_fit"), - kv_df_to_list - ) - )) - } - stopifnot(inherits(x, "alloc_params_t")) - - self$commit_upsert( - x, - "alloc_params_t", - key_cols = c("id_trans", "id_period"), - map_cols = c("alloc_params", "goodness_of_fit") + paste0("pred_data_t_", pred_type), + method = "upsert" ) } - ), - - ## Private Methods ---- - private = list( - # r6 hook called on gc(); Close the database connection - # - # return NULL (called for side effects) - finalize = function() { - if (!is.null(self$connection)) { - DBI::dbDisconnect(self$connection) - self$connection <- NULL - } - }, - - # Get file path and format for a table - # - # param table_name Character string table name - # return List with path, format, and exists flag - get_file_path = function(table_name) { - # Check for parquet first, then csv - parquet_path <- file.path(self$path, paste0(table_name, ".parquet")) - csv_path <- file.path(self$path, paste0(table_name, ".csv")) - - if (file.exists(parquet_path)) { - return(list(path = parquet_path, format = "parquet", exists = TRUE)) - } else if (file.exists(csv_path)) { - return(list(path = csv_path, format = "csv", exists = TRUE)) - } else { - # Return default format for new file - default_path <- file.path( - self$path, - paste0(table_name, ".", self$default_format) - ) - return(list( - path = default_path, - format = self$default_format, - exists = FALSE - )) - } - }, - - # Register new_data_v table, optionally converting MAP columns - # - # param x Data to register or character name of existing table/view - # param map_cols Character vector of columns to convert to MAP format - # return NULL (called for side effects) - register_new_data_v = function(x, map_cols = character(0)) { - # If x is a table name, create alias to new_data_v - if (is.character(x) && length(x) == 1) { - self$execute(glue::glue("create view new_data_v as select * from {x}")) - return(invisible(NULL)) - } - - if (length(map_cols) == 0) { - # No MAP conversion needed - register directly - duckdb::duckdb_register(self$connection, "new_data_v", x) - } else { - # Convert list columns to key-value dataframes - x <- - data.table::copy(x) |> - convert_list_cols(map_cols, list_to_kv_df) - - # Register as intermediate table - duckdb::duckdb_register(self$connection, "new_data_raw", x) - - # Build SELECT expression with map_from_entries for MAP columns - map_exprs <- glue::glue("map_from_entries({map_cols}) as {map_cols}") - other_cols <- setdiff(names(x), map_cols) - all_exprs <- c(other_cols, map_exprs) - select_expr <- glue::glue_collapse(all_exprs, sep = ", ") - - # Create new_data_v from new_data_raw - self$execute(glue::glue( - "create temp table new_data_v as select {select_expr} from new_data_raw" - )) - } - - invisible(NULL) - }, - - # Cleanup new_data_v and related tables - # - # param map_cols Character vector indicating if MAP conversion was used - # return NULL (called for side effects) - cleanup_new_data_v = function(map_cols = character(0)) { - if (length(map_cols) == 0) { - duckdb::duckdb_unregister(self$connection, "new_data_v") - } else { - self$execute( - "drop table if exists new_data_v; - drop view if exists new_data_v" - ) - duckdb::duckdb_unregister(self$connection, "new_data_raw") - } - - invisible(NULL) - }, - - # Write data to file - # - # param x Data frame to write - # param table_name Character string table name - write_file = function(x, 
table_name) { - file_path <- file.path(self$path, paste0(table_name, ".", self$default_format)) - - duckdb::duckdb_register(self$connection, "temp_write_table", x) - on.exit(duckdb::duckdb_unregister(self$connection, "temp_write_table")) - - sql <- glue::glue("COPY temp_write_table TO '{file_path}' ({self$writeopts})") - DBI::dbExecute(self$connection, sql) - }, - - # Set one duckdb variable name max_{colname} to the maximum found for each - # autoincrement_col in table_name. if column is missing from table_name, set to 0. - set_autoincrement_vars = function(table_name, autoincrement_cols) { - if (length(autoincrement_cols) == 0L) { - return(NULL) - } - if (!DBI::dbExistsTable(self$connection, table_name)) { - # attach / detach unless it was already there - self$attach_table(table_name) - on.exit(self$detach_table(table_name)) - } - - existing_cols <- - glue::glue("select column_name from (describe {table_name})") |> - self$get_query() |> - (\(x) x[[1]])() |> - intersect(autoincrement_cols) - - missing_cols <- setdiff(autoincrement_cols, existing_cols) - - set_exprs <- glue::glue_collapse( - c( - glue::glue( - "set variable max_{existing_cols} = (select coalesce(max({existing_cols}), 0) from {table_name});" - ), - glue::glue( - "set variable max_{missing_cols} = 0;" - ) - ), - sep = "\n" - ) - - self$execute(set_exprs) - }, - - # Get empty table with proper structure - # - # param table_name Character string table name - # return Empty data.table with correct columns - get_empty_table = function(table_name) { - # Define empty table structures - empty_tables <- list( - reporting_t = suppressWarnings(data.table::data.table( - key = character(0), - value = character(0) - )), - coords_t = as_coords_t(), - periods_t = as_periods_t(), - lulc_meta_t = as_lulc_meta_t(), - lulc_data_t = as_lulc_data_t(), - pred_meta_t = as_pred_meta_t(), - pred_data_t_float = as_pred_data_t(type = "float"), - pred_data_t_int = as_pred_data_t(type = "int"), - pred_data_t_bool = as_pred_data_t(type = "bool"), - trans_meta_t = as_trans_meta_t(), - trans_preds_t = as_trans_preds_t(), - intrv_meta_t = as_intrv_meta_t(), - intrv_masks_t = as_intrv_masks_t(), - trans_models_t = as_trans_models_t(), - alloc_params_t = as_alloc_params_t() - ) - - if (table_name %in% names(empty_tables)) { - return(empty_tables[[table_name]]) - } - - # Default: return empty data.table - data.table::data.table() - } ) ) - -# Helper function to create simple Bindings with standard fetch/commit pattern -# param self The R6 instance (self). -# param table_name Character string. Name of the table. -# param as_fn Function to convert fetched data to the appropriate type. -# param ... Additional arguments passed to as_fn. -# return A function suitable for use as an active binding. -create_active_binding <- function(self, table_name, as_fn, ...) { - extra_args <- list(...) 
-  function(x) {
-    if (missing(x)) {
-      x <- self$fetch(table_name)
-      return(do.call(as_fn, c(list(x), extra_args)))
-    }
-    stopifnot(inherits(x, table_name))
-    self$commit_upsert(x, table_name)
-  }
-}
-
-
-# Helper functions for converting between list and data.frame formats for DuckDB MAPs
-convert_list_cols <- function(x, cols, fn) {
-  for (col in cols) {
-    x[[col]] <- lapply(x[[col]], fn)
-  }
-  x
-}
-
-list_to_kv_df <- function(x) {
-  if (is.null(x) || length(x) == 0) {
-    return(data.frame(
-      key = character(0),
-      value = character(0),
-      stringsAsFactors = FALSE
-    ))
-  }
-  data.frame(
-    key = names(x),
-    value = as.character(unlist(x)),
-    stringsAsFactors = FALSE
-  )
-}
-
-kv_df_to_list <- function(x) {
-  if (is.null(x) || nrow(x) == 0) {
-    return(NULL)
-  }
-
-  out <- list()
-
-  for (row in seq_len(nrow(x))) {
-    key <- x$key[row]
-    val <- x$value[row]
-
-    # Try numeric conversion
-    num_val <- suppressWarnings(as.numeric(val))
-    if (!is.na(num_val)) {
-      out[[key]] <- num_val
-    } else {
-      out[[key]] <- val
-    }
-  }
-
-  out
-}
diff --git a/R/evoland_db_neighbors.R b/R/evoland_db_neighbors.R
new file mode 100644
index 0000000..f8f6d20
--- /dev/null
+++ b/R/evoland_db_neighbors.R
@@ -0,0 +1,162 @@
+#' Neighbor analysis methods for evoland_db
+#'
+#' @description
+#' This file adds neighbor analysis methods to the `evoland_db` class using R6's `$set()` method.
+#' These methods compute neighbor relationships and generate neighbor-based predictors.
+#'
+#' @section Methods Added:
+#'
+#' - `set_neighbors(max_distance, distance_breaks, resolution, overwrite)` -
+#'   Computes neighbor relationships between coordinates.
+#'   - `max_distance`: Maximum distance for neighbors (default: 1000)
+#'   - `distance_breaks`: Vector of breaks for distance classes (default: c(0, 100, 500, 1000))
+#'   - `resolution`: Grid resolution for distance calculations (default: 100)
+#'   - `overwrite`: Whether to overwrite existing neighbors_t (default: FALSE)
+#' - `generate_neighbor_predictors()` - Generates predictor variables based on neighbor
+#'   land use counts by distance class. Requires neighbors_t with a distance_class column
+#'   (i.e. `set_neighbors()` called with `distance_breaks`).
+#'
+#' @name evoland_db_neighbors
+#' @include evoland_db.R
NULL

evoland_db$set(
  "public",
  "set_neighbors",
  function(
    max_distance = 1000,
    distance_breaks = c(0, 100, 500, 1000),
    resolution = 100,
    overwrite = FALSE
  ) {
    if (!overwrite && self$row_count("neighbors_t") > 0) {
      message("neighbors_t already exists. Use overwrite = TRUE to recompute.")
      return(invisible(self))
    }

    coords <- self$coords_t

    neighbors <- create_neighbors_t(
      coords,
      max_distance = max_distance,
      distance_breaks = distance_breaks,
      resolution = resolution
    )

    self$commit(
      as_neighbors_t(neighbors),
      table_name = "neighbors_t",
      method = "overwrite"
    )

    message(glue::glue("Computed {nrow(neighbors)} neighbor relationships"))
    invisible(self)
  }
)
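# Usage sketch (illustrative, not package code): assuming a populated database at a
# hypothetical path "data/evoland", the two methods added in this file chain together;
# `commit(method = ...)` comes from the new parquet_duckdb parent class:
#
#   db <- evoland_db$new("data/evoland")
#   db$set_neighbors(
#     max_distance = 1000,
#     distance_breaks = c(0, 100, 500, 1000),
#     resolution = 100
#   )
#   db$generate_neighbor_predictors()
#
# set_neighbors() persists the pairwise neighbor relationships once; the generator
# below then counts neighbors per LULC class and distance class into integer
# predictors.

evoland_db$set("public", "generate_neighbor_predictors", function() {
  if (self$row_count("neighbors_t") == 0) {
    stop("No neighbor data found. Run $set_neighbors() first.")
  }

  if (self$row_count("lulc_meta_t") == 0) {
    stop("No LULC metadata found. Add lulc_meta_t before generating neighbor predictors.")
  }

  if (self$row_count("lulc_data_t") == 0) {
    stop("No LULC data found.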
Add lulc_data_t before generating neighbor predictors.")
  }

  neighbors_sample <- self$fetch("neighbors_t", limit = 0L)
  if (!"distance_class" %in% names(neighbors_sample)) {
    stop(
      "neighbors_t does not have a distance_class column. Run $set_neighbors() with the distance_breaks parameter."
    )
  }

  self$attach_table(
    "neighbors_t",
    columns = c("id_coord_origin", "id_coord_neighbor", "distance_class")
  )
  self$attach_table("lulc_data_t")
  self$attach_table("lulc_meta_t")

  on.exit({
    self$detach_table("neighbors_t")
    self$detach_table("lulc_data_t")
    self$detach_table("lulc_meta_t")
  })

  n_predictors <- self$execute(
    r"{
    create or replace temp table pred_meta_neighbors_t as
    with
    all_distance_classes as (select distinct distance_class from neighbors_t)
    select
      NULL as id_pred,
      concat('id_lulc_', l.id_lulc, '_dist_', c.distance_class) as name,
      concat('Count of ', l.pretty_name, ' within distance class ', c.distance_class) as pretty_name,
      'Number of neighbors by land use class and distance interval' as description,
      'land use coordinate data' as orig_format,
      NULL as sources,
      'number of neighbors' as unit,
      NULL as factor_levels,
      c.distance_class,
      l.id_lulc
    from
      lulc_meta_t l
    cross join
      all_distance_classes c
    }"
  )
  self$execute(
    "create or replace view pred_meta_upsert_v as
     select name, pretty_name, description, orig_format, sources, unit, factor_levels
     from pred_meta_neighbors_t"
  )

  self$attach_table("pred_meta_t")
  on.exit(self$detach_table("pred_meta_t"), add = TRUE)
  self$execute(
    r"{
    update pred_meta_neighbors_t
    set id_pred = pred_meta_t.id_pred
    from pred_meta_t
    where pred_meta_neighbors_t.name = pred_meta_t.name
    }"
  )

  n_data_points <- self$execute(
    r"{
    create temp table pred_neighbors_t as
    select
      p.id_pred,
      n.id_coord_origin as id_coord,
      t.id_period,
      count(n.id_coord_neighbor) as value
    from
      neighbors_t n,
      lulc_data_t t,
      pred_meta_neighbors_t p
    where
      n.id_coord_neighbor = t.id_coord
      and p.id_lulc = t.id_lulc
      and p.distance_class = n.distance_class
    group by
      n.id_coord_origin,
      n.distance_class,
      t.id_period,
      t.id_lulc,
      p.id_pred
    }"
  )

  self$commit("pred_neighbors_t", "pred_neighbors_t_int", method = "upsert")

  message(glue::glue(
    "Generated {n_predictors} neighbor predictor variables with ",
    "{n_data_points} data points"
  ))

  invisible(self)
})
diff --git a/R/evoland_db_tables.R b/R/evoland_db_tables.R
new file mode 100644
index 0000000..1ff2043
--- /dev/null
+++ b/R/evoland_db_tables.R
@@ -0,0 +1,207 @@
+#' Table active bindings for evoland_db
+#'
+#' @description
+#' This file adds table active bindings to the `evoland_db` class using R6's `$set()` method.
+#' These bindings provide read/write access to database tables with automatic validation.
+#'
+#' @section Active Bindings Added:
+#'
+#' - `coords_t` - Coordinates table. See [as_coords_t()]
+#' - `periods_t` - Time periods table. See [as_periods_t()]
+#' - `lulc_meta_t` - Land use/land cover metadata. See [as_lulc_meta_t()]
+#' - `lulc_data_t` - Land use/land cover data. See [as_lulc_data_t()]
+#' - `pred_data_t_float` - Float predictor data. See [as_pred_data_t()]
+#' - `pred_data_t_int` - Integer predictor data. See [as_pred_data_t()]
+#' - `pred_data_t_bool` - Boolean predictor data. See [as_pred_data_t()]
+#' - `pred_meta_t` - Predictor metadata. See [as_pred_meta_t()]
+#' - `trans_meta_t` - Transition metadata.
See [as_trans_meta_t()] +#' - `trans_preds_t` - Transition-predictor relationships. See [as_trans_preds_t()] +#' - `intrv_meta_t` - Intervention metadata. See [as_intrv_meta_t()] +#' - `intrv_masks_t` - Intervention masks. See [as_intrv_masks_t()] +#' - `trans_models_t` - Transition models. See [as_trans_models_t()] +#' - `alloc_params_t` - Allocation parameters. See [as_alloc_params_t()] +#' - `neighbors_t` - Neighbor relationships. See [as_neighbors_t()] +#' +#' @name evoland_db_tables +#' @include evoland_db.R +NULL + +create_table_binding <- function( + self, + table_name, + as_fn, + key_cols = NULL, + autoincrement_cols = NULL, + map_cols = NULL, + ... +) { + extra_args <- list(...) + + function(x) { + if (missing(x)) { + fetched <- self$fetch(table_name) + + if (!is.null(map_cols) && nrow(fetched) > 0) { + fetched <- convert_list_cols(fetched, map_cols, kv_df_to_list) + } + + return(do.call(as_fn, c(list(fetched), extra_args))) + } + + stopifnot(inherits(x, table_name)) + + self$commit( + x, + table_name = table_name, + key_cols = key_cols, + autoincrement_cols = autoincrement_cols, + map_cols = map_cols, + method = "upsert" + ) + } +} + +evoland_db$set("active", "coords_t", function(x) { + create_table_binding( + self, + "coords_t", + as_coords_t, + key_cols = "id_coord" + )(x) +}) + +evoland_db$set("active", "periods_t", function(x) { + create_table_binding( + self, + "periods_t", + as_periods_t, + key_cols = "id_period" + )(x) +}) + +evoland_db$set("active", "lulc_meta_t", function(x) { + create_table_binding( + self, + "lulc_meta_t", + as_lulc_meta_t, + key_cols = "id_lulc" + )(x) +}) + +evoland_db$set("active", "lulc_data_t", function(x) { + create_table_binding( + self, + "lulc_data_t", + as_lulc_data_t, + key_cols = c("id_coord", "id_period") + )( + x + ) +}) + +evoland_db$set("active", "pred_data_t_float", function(x) { + create_table_binding( + self, + "pred_data_t_float", + as_pred_data_t, + key_cols = c("id_pred", "id_coord", "id_period"), + type = "float" + )(x) +}) + +evoland_db$set("active", "pred_data_t_int", function(x) { + create_table_binding( + self, + "pred_data_t_int", + as_pred_data_t, + key_cols = c("id_pred", "id_coord", "id_period"), + type = "int" + )(x) +}) + +evoland_db$set("active", "pred_data_t_bool", function(x) { + create_table_binding( + self, + "pred_data_t_bool", + as_pred_data_t, + key_cols = c("id_pred", "id_coord", "id_period"), + type = "bool" + )(x) +}) + +evoland_db$set("active", "pred_meta_t", function(x) { + create_table_binding( + self, + "pred_meta_t", + as_pred_meta_t, + key_cols = "name", + autoincrement_cols = "id_pred" + )(x) +}) + +evoland_db$set("active", "trans_meta_t", function(x) { + create_table_binding( + self, + "trans_meta_t", + as_trans_meta_t, + key_cols = c("id_lulc_anterior", "id_lulc_posterior"), + autoincrement_cols = "id_trans" + )(x) +}) + +evoland_db$set("active", "trans_preds_t", function(x) { + create_table_binding( + self, + "trans_preds_t", + as_trans_preds_t, + key_cols = c("id_trans", "id_pred") + )(x) +}) + +evoland_db$set("active", "intrv_meta_t", function(x) { + create_table_binding( + self, + "intrv_meta_t", + as_intrv_meta_t, + key_cols = "id_intrv", + map_cols = "params" + )(x) +}) + +evoland_db$set("active", "intrv_masks_t", function(x) { + create_table_binding( + self, + "intrv_masks_t", + as_intrv_masks_t, + key_cols = c("id_coord", "id_intrv") + )(x) +}) + +evoland_db$set("active", "trans_models_t", function(x) { + create_table_binding( + self, + "trans_models_t", + as_trans_models_t, + key_cols = 
c("id_trans", "id_period"), + map_cols = c("model_params", "goodness_of_fit") + )(x) +}) + +evoland_db$set("active", "alloc_params_t", function(x) { + create_table_binding( + self, + "alloc_params_t", + as_alloc_params_t, + key_cols = c("id_trans", "id_period"), + map_cols = c("alloc_params", "goodness_of_fit") + )(x) +}) + +evoland_db$set("active", "neighbors_t", function(x) { + create_table_binding( + self, + "neighbors_t", + as_neighbors_t, + key_cols = c("id_coord_origin", "id_coord_neighbor") + )(x) +}) diff --git a/R/evoland_db_views.R b/R/evoland_db_views.R index f17c8df..06325cb 100644 --- a/R/evoland_db_views.R +++ b/R/evoland_db_views.R @@ -1,111 +1,286 @@ #' Views on the evoland-plus data model #' #' @description -#' Functions to generate views on the database +#' This file adds view active bindings and methods to the `evoland_db` class using R6's `$set()` method. +#' These provide computed views on the database without storing additional data. +#' +#' @section Active Bindings Added: +#' +#' - `lulc_meta_long_v` - Unrolled LULC metadata with one row per source class +#' - `pred_sources_v` - Distinct predictor URLs and their MD5 checksums +#' - `transitions_v` - Land use transitions derived from lulc_data_t +#' - `extent` - Spatial extent of coords_t as terra::SpatExtent +#' - `coords_minimal` - Minimal coordinate representation (id_coord, lon, lat) +#' +#' @section Methods Added: +#' +#' - `trans_pred_data_v(id_trans)` - Returns wide table of transition results and predictor data for a specific transition. Used as input to covariance filtering. #' #' @name evoland_db_views +#' @include evoland_db.R NULL -#' @describeIn evoland_db_views Retrieve a table of distinct predictor urls and their -#' md5sum -make_pred_sources_v <- function(self, private, where = NULL) { - self$attach_table("pred_meta_t") - on.exit(self$detach_table("pred_meta_t")) - - where_clause <- if (!is.null(where)) paste("AND", where) else "" - - self$get_query(glue::glue( - r"{ - select distinct - unnest(sources).url as url, - unnest(sources).md5sum as md5sum - from pred_meta_t - where sources is not null {where_clause} - }" - )) -} - - -#' @describeIn evoland_db_views Return a `lulc_meta_long_v` instance, i.e. unrolled `lulc_meta_t`. 
-make_lulc_meta_long_v <- function(self, private, where = NULL) { - self$attach_table("lulc_meta_t") - on.exit(self$detach_table("lulc_meta_t")) - - where_clause <- if (!is.null(where)) paste("WHERE", where) else "" - - self$get_query(glue::glue( - r"{ - select - id_lulc, - name, - unnest(src_classes) as src_class - from - lulc_meta_t - {where_clause} - }" - )) -} - -#' @describeIn evoland_db_views Minimal coordinate representation (id_coord, lon, lat) -make_coords_minimal <- function(self, private, where = NULL) { - self$attach_table("coords_t", c("id_coord", "lon", "lat")) - on.exit(self$detach_table("coords_t")) - - where_clause <- if (!is.null(where)) paste("WHERE", where) else "" - - self$get_query(glue::glue( - r"{ - select id_coord, lon, lat - from coords_t - {where_clause} - }" - )) |> - cast_dt_col("id_coord", as.integer) |> - data.table::setkeyv("id_coord") -} - -#' @describeIn evoland_db_views Returns the extent of the coords_t as terra::SpatExtent -make_extent_db <- function(self, private) { - self$attach_table("coords_t", c("lon", "lat")) - on.exit(self$detach_table("coords_t")) - - self$get_query(glue::glue( - r"{ - SELECT - min(lon) as xmin, - max(lon) as xmax, - min(lat) as ymin, - max(lat) as ymax - FROM - coords_t - }" - )) |> - unlist() |> - terra::ext() -} - -#' @describeIn evoland_db_views -make_transitions_v <- function(self, private, where = NULL) { - self$attach_table("lulc_data_t") - on.exit(self$detach_table("lulc_data_t")) - - where_clause <- if (!is.null(where)) paste("WHERE", where) else "" - - self$get_query(glue::glue( - r"{ - SELECT - curr.id_period, - prev.id_lulc as id_lulc_anterior, - curr.id_lulc as id_lulc_posterior, - curr.id_coord - FROM - lulc_data_t as curr - INNER JOIN - lulc_data_t as prev - ON - curr.id_coord = prev.id_coord - AND curr.id_period = prev.id_period + 1 - {where_clause} - }" - )) -} +evoland_db$set("active", "lulc_meta_long_v", function() { + self$with_tables("lulc_meta_t", function() { + self$get_query(glue::glue( + r"{ + select + id_lulc, + name, + unnest(src_classes) as src_class + from + lulc_meta_t + }" + )) + }) +}) + +evoland_db$set("active", "pred_sources_v", function() { + self$with_tables("pred_meta_t", function() { + self$get_query(glue::glue( + r"{ + select distinct + unnest(sources).url as url, + unnest(sources).md5sum as md5sum + from pred_meta_t + where sources is not null + }" + )) + }) +}) + +evoland_db$set("active", "transitions_v", function() { + self$with_tables("lulc_data_t", function() { + self$get_query(glue::glue( + r"{ + SELECT + curr.id_period, + prev.id_lulc as id_lulc_anterior, + curr.id_lulc as id_lulc_posterior, + curr.id_coord + FROM + lulc_data_t as curr + INNER JOIN + lulc_data_t as prev + ON + curr.id_coord = prev.id_coord + AND curr.id_period = prev.id_period + 1 + }" + )) + }) +}) + +evoland_db$set("active", "extent", function() { + self$with_tables("coords_t", function() { + self$get_query(glue::glue( + r"{ + SELECT + min(lon) as xmin, + max(lon) as xmax, + min(lat) as ymin, + max(lat) as ymax + FROM + coords_t + }" + )) |> + unlist() |> + terra::ext() + }) +}) + +evoland_db$set("active", "coords_minimal", function() { + self$with_tables("coords_t", function() { + self$get_query(glue::glue( + r"{ + select id_coord, lon, lat + from coords_t + }" + )) |> + cast_dt_col("id_coord", "int") |> + data.table::setkeyv("id_coord") + }) +}) + +# get transitions along with their predictor data in a wide data.table +# id_trans - integer transition ID +# id_pred - optional integer vector of predictor IDs to 
include (NULL = all predictors) +# na_value - if not NA, replace all NULL/NA predictor values with this value +evoland_db$set("public", "trans_pred_data_v", function(id_trans, id_pred = NULL, na_value = NA) { + stopifnot( + "id_trans must be a single integer" = length(id_trans) == 1L && is.numeric(id_trans), + "id_pred must be NULL or a numeric vector" = is.null(id_pred) || is.numeric(id_pred) + ) + + all_tables <- self$list_tables() + pred_tables <- c("pred_data_t_float", "pred_data_t_int", "pred_data_t_bool") + existing_pred_tables <- intersect(pred_tables, all_tables) + + tables_to_attach <- c("trans_meta_t", "lulc_data_t", existing_pred_tables) + + self$with_tables( + tables_to_attach, + function() { + trans_info <- self$get_query(glue::glue( + "SELECT id_lulc_anterior, id_lulc_posterior + FROM trans_meta_t + WHERE id_trans = {id_trans}" + )) + + if (nrow(trans_info) == 0L) { + stop(glue::glue("Transition id_trans = {id_trans} not found in trans_meta_t")) + } + + id_lulc_ant <- trans_info$id_lulc_anterior + id_lulc_post <- trans_info$id_lulc_posterior + + ctes <- list() + + ctes$trans_result <- glue::glue( + "trans_result AS ( + SELECT + curr.id_coord, + curr.id_period, + CASE + WHEN prev.id_lulc = {id_lulc_ant} AND curr.id_lulc = {id_lulc_post} THEN TRUE + WHEN prev.id_lulc = {id_lulc_ant} AND curr.id_lulc != {id_lulc_post} THEN FALSE + ELSE NULL + END AS result + FROM lulc_data_t AS curr + INNER JOIN lulc_data_t AS prev + ON curr.id_coord = prev.id_coord + AND curr.id_period = prev.id_period + 1 + WHERE prev.id_lulc = {id_lulc_ant} + )" + ) + + pred_filter <- "" + if (!is.null(id_pred)) { + pred_filter <- glue::glue(" AND id_pred IN ({toString(id_pred)})") + } + + if ("pred_data_t_float" %in% existing_pred_tables) { + ctes$pred_float_combined <- glue::glue( + "pred_float_combined AS ( + SELECT id_coord, id_period, id_pred, value + FROM pred_data_t_float + WHERE id_period >= 1 + {pred_filter} + UNION ALL + SELECT p0.id_coord, periods.id_period, p0.id_pred, p0.value + FROM pred_data_t_float AS p0 + CROSS JOIN (SELECT DISTINCT id_period FROM trans_result WHERE id_period >= 1) AS periods + WHERE p0.id_period = 0 + {pred_filter} + )" + ) + + ctes$pred_float_wide <- "pred_float_wide AS ( + PIVOT pred_float_combined ON id_pred USING FIRST(value) GROUP BY id_coord, id_period + )" + } + + if ("pred_data_t_int" %in% existing_pred_tables) { + ctes$pred_int_combined <- glue::glue( + "pred_int_combined AS ( + SELECT id_coord, id_period, id_pred, value + FROM pred_data_t_int + WHERE id_period >= 1 + {pred_filter} + UNION ALL + SELECT p0.id_coord, periods.id_period, p0.id_pred, p0.value + FROM pred_data_t_int AS p0 + CROSS JOIN (SELECT DISTINCT id_period FROM trans_result WHERE id_period >= 1) AS periods + WHERE p0.id_period = 0 + {pred_filter} + )" + ) + + ctes$pred_int_wide <- "pred_int_wide AS ( + PIVOT pred_int_combined ON id_pred USING FIRST(value) GROUP BY id_coord, id_period + )" + } + + if ("pred_data_t_bool" %in% existing_pred_tables) { + ctes$pred_bool_combined <- glue::glue( + "pred_bool_combined AS ( + SELECT id_coord, id_period, id_pred, value + FROM pred_data_t_bool + WHERE id_period >= 1 + {pred_filter} + UNION ALL + SELECT p0.id_coord, periods.id_period, p0.id_pred, p0.value + FROM pred_data_t_bool AS p0 + CROSS JOIN (SELECT DISTINCT id_period FROM trans_result WHERE id_period >= 1) AS periods + WHERE p0.id_period = 0 + {pred_filter} + )" + ) + + ctes$pred_bool_wide <- "pred_bool_wide AS ( + PIVOT pred_bool_combined ON id_pred USING FIRST(value) GROUP BY id_coord, id_period + )" + } 
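      # For orientation: with only pred_data_t_float present and, say, predictors 3
      # and 7 requested, the assembled query has roughly this shape (a sketch of the
      # builder's output below, not its verbatim result):
      #
      #   WITH trans_result AS (...),
      #        pred_float_combined AS (...),
      #        pred_float_wide AS (
      #          PIVOT pred_float_combined ON id_pred USING FIRST(value)
      #          GROUP BY id_coord, id_period
      #        )
      #   SELECT tr.result, pf.* EXCLUDE (id_coord, id_period)
      #   FROM trans_result AS tr
      #   LEFT JOIN pred_float_wide AS pf
      #     ON tr.id_coord = pf.id_coord AND tr.id_period = pf.id_period
      #   WHERE tr.result IS NOT NULL
      #
      # i.e. one wide column per id_pred, later renamed to id_pred_<n>.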

      select_cols <- "tr.result"
      if ("pred_data_t_float" %in% existing_pred_tables) {
        select_cols <- paste0(select_cols, ", pf.* EXCLUDE (id_coord, id_period)")
      }
      if ("pred_data_t_int" %in% existing_pred_tables) {
        select_cols <- paste0(select_cols, ", pi.* EXCLUDE (id_coord, id_period)")
      }
      if ("pred_data_t_bool" %in% existing_pred_tables) {
        select_cols <- paste0(select_cols, ", pb.* EXCLUDE (id_coord, id_period)")
      }

      joins <- ""
      if ("pred_data_t_float" %in% existing_pred_tables) {
        joins <- paste0(
          joins,
          "\n      LEFT JOIN pred_float_wide AS pf ON tr.id_coord = pf.id_coord AND tr.id_period = pf.id_period"
        )
      }
      if ("pred_data_t_int" %in% existing_pred_tables) {
        joins <- paste0(
          joins,
          "\n      LEFT JOIN pred_int_wide AS pi ON tr.id_coord = pi.id_coord AND tr.id_period = pi.id_period"
        )
      }
      if ("pred_data_t_bool" %in% existing_pred_tables) {
        joins <- paste0(
          joins,
          "\n      LEFT JOIN pred_bool_wide AS pb ON tr.id_coord = pb.id_coord AND tr.id_period = pb.id_period"
        )
      }

      cte_string <- paste(unlist(ctes), collapse = ",\n\n      ")

      query <- glue::glue(
        "WITH {cte_string}

        SELECT {select_cols}
        FROM trans_result AS tr{joins}
        WHERE tr.result IS NOT NULL"
      )

      result <- self$get_query(query)

      old_names <- names(result)
      new_names <- old_names
      for (i in seq_along(old_names)) {
        if (old_names[i] != "result" && grepl("^\\d+$", old_names[i])) {
          new_names[i] <- paste0("id_pred_", old_names[i])
        }
      }
      data.table::setnames(result, old_names, new_names)

      if (!is.na(na_value)) {
        pred_cols <- setdiff(names(result), "result")
        for (col in pred_cols) {
          data.table::set(result, i = which(is.na(result[[col]])), j = col, value = na_value)
        }
      }

      result
    }
  )
})
diff --git a/R/grrf_filter.r b/R/grrf_filter.r
new file mode 100644
index 0000000..dc07e5a
--- /dev/null
+++ b/R/grrf_filter.r
@@ -0,0 +1,139 @@
+#' Guided Regularized Random Forest Feature Selection
+#'
+#' The `grrf_filter` returns a set of covariates for land use land cover change (LULCC) models based
+#' on feature selection with Guided Regularized Random Forests. This is a two-stage random forest
+#' approach: a first unregularized random forest estimates variable importance scores. These scores
+#' are then used to guide a second regularized random forest that penalizes less important features,
+#' resulting in a more parsimonious feature set.
+#'
+#' @param data A data.table of target variable and candidate covariates to be filtered; wide format
+#'   with one predictor per column.
+#' @param result_col Name of the column representing the transition results (0: no trans, 1: trans)
+#' @param weights Optional vector of case weights. Defaults to class-balanced weights
+#'   computed with compute_balanced_weights().
+#' @param gamma Numeric between 0-1 controlling the weight of the normalized importance
+#'   score (the "importance coefficient"). When gamma = 0, we perform unguided
+#'   regularized random forest (no guiding effect). When gamma = 1, we apply the
+#'   strongest guiding effect, leading to the most penalization of redundant
+#'   features and the most concise feature sets. Default is 0.5.
+#' @param num.trees Number of trees to grow in each random forest. Default is 500.
+#' @param max.depth Maximal tree depth passed to ranger::ranger(). Default is 100.
+#' @param ... Additional arguments passed to ranger::ranger().
+#'
+#' @return A character vector of column names (covariates) to retain, ordered by
+#'   importance (most important first)
+#'
+#' @details
+#' The Guided Regularized Random Forest (GRRF) algorithm works as follows:
+#' 1.
Fit an initial unregularized random forest to obtain variable importance scores +#' 2. Normalize these importance scores and use them to compute regularization +#' coefficients: coefReg = (1 - gamma) + gamma * normalized_importance +#' 3. Fit a regularized random forest using these coefficients to penalize splits +#' on less important variables +#' 4. Return variables with positive importance in the regularized model +#' +#' Class weights are used to handle class imbalance. Variables in terminal nodes are +#' weighted by class, and splits are evaluated using weighted Gini impurity. +#' +#' The ranger implementation uses the `split.select.weights` parameter to apply +#' regularization penalties, approximating the RRF regularization approach. +#' +#' @references +#' Deng, H., & Runger, G. (2013). Gene selection with guided regularized random forest. +#' Pattern Recognition, 46(12), 3483-3489. https://arxiv.org/pdf/1306.0237.pdf +#' +#' Original implementation by Antoine Adde, edited by Ben Black and adapted for +#' ranger by the evoland-plus team. +#' +#' @name grrf_filter +#' +#' @export + +grrf_filter <- function( + data, + result_col = "result", + weights = compute_balanced_weights(data[[result_col]]), + gamma = 0.5, + num.trees = 500, + max.depth = 100, + ... +) { + # Check if ranger is available + if (!requireNamespace("ranger", quietly = TRUE)) { + stop( + "Package 'ranger' is required for grrf_filter but is not installed.\n", + "Please install it with: install.packages('ranger')", + call. = FALSE + ) + } + + data.table::setDT(data) + + # Validate inputs + stopifnot( + "gamma must be between 0 and 1" = gamma >= 0 && gamma <= 1, + "result_col must exist in data" = result_col %in% names(data), + "data must have at least one predictor column" = ncol(data) > 1 + ) + + # Prepare data: separate predictors from response + predictor_cols <- setdiff(names(data), result_col) + y <- as.factor(data[[result_col]]) + x <- data[, ..predictor_cols] + + # Step 1: Run initial unregularized random forest to get importance scores + rf_initial <- ranger::ranger( + x = x, + y = y, + num.trees = num.trees, + importance = "impurity", + case.weights = weights, + max.depth = max.depth, + ... + ) + + # Extract and normalize importance scores + imp_initial <- rf_initial$variable.importance + # Normalize to [0,1] (importance values may be negative) + imp_normalized <- { + (imp_initial - min(imp_initial)) / + (max(imp_initial) - min(imp_initial)) + } + + # Step 2: Calculate regularization coefficients (penalty weights) + # Higher importance -> higher coefficient -> less penalty + coef_reg <- (1 - gamma) + gamma * imp_normalized + + # Step 3: Run guided regularized random forest + # Use split.select.weights to penalize variables with lower importance + # Higher weight = more likely to be selected for splitting + rf_grrf <- ranger::ranger( + x = x, + y = y, + num.trees = num.trees, + importance = "impurity", + case.weights = weights, + max.depth = max.depth, + split.select.weights = coef_reg, + ... + ) + + # Extract final importance scores + imp_final <- rf_grrf$variable.importance + + # Select variables with positive importance + selected_vars <- names(imp_final[imp_final > 0]) + + if (length(selected_vars) == 0) { + warning("No variables with positive importance found. 
Returning all variables.") + selected_vars <- predictor_cols + } + + # Order by importance (descending) + selected_vars <- selected_vars[order(imp_final[selected_vars], decreasing = TRUE)] + + message(glue::glue( + "Selected {length(selected_vars)}/{length(predictor_cols)} predictors" + )) + + return(selected_vars) +} diff --git a/R/intrv_masks_t.R b/R/intrv_masks_t.R index 501daf7..8aee5f3 100644 --- a/R/intrv_masks_t.R +++ b/R/intrv_masks_t.R @@ -18,8 +18,8 @@ as_intrv_masks_t <- function(x) { id_coord = integer(0) ) } - cast_dt_col(x, "id_coord", as.integer) - cast_dt_col(x, "id_coord", as.integer) + cast_dt_col(x, "id_coord", "int") + cast_dt_col(x, "id_coord", "int") new_evoland_table( x, "intrv_masks_t", diff --git a/R/lulc_data_t.R b/R/lulc_data_t.R index 91c4bcc..817df05 100644 --- a/R/lulc_data_t.R +++ b/R/lulc_data_t.R @@ -19,7 +19,7 @@ as_lulc_data_t <- function(x) { id_period = integer(0) ) } - cast_dt_col(x, "id_coord", as.integer) + cast_dt_col(x, "id_coord", "int") new_evoland_table( x, "lulc_data_t", diff --git a/R/neighbors_t.R b/R/neighbors_t.R new file mode 100644 index 0000000..4a50301 --- /dev/null +++ b/R/neighbors_t.R @@ -0,0 +1,151 @@ +#' Create Neighbors Table +#' +#' Creates a `neighbors_t` table and validates that it matches the schema. +#' +#' @name neighbors_t +#' +#' @param x An object that can be passed to [data.table::setDT()] +#' +#' @return A data.table of class "neighbors_t" with columns: +#' - `id_coord_origin`: Foreign key to coords_t (origin coordinate) +#' - `id_coord_neighbor`: Foreign key to coords_t (neighbor coordinate) +#' - `distance`: Numeric distance between coordinates +#' - `distance_class`: Optional factor representing distance intervals +#' @export +as_neighbors_t <- function(x) { + if (missing(x)) { + x <- data.table::data.table( + id_coord_origin = integer(0), + id_coord_neighbor = integer(0), + distance = numeric(0) + ) + } + cast_dt_col(x, "id_coord_origin", "int") + cast_dt_col(x, "id_coord_neighbor", "int") + if ("distance_class" %in% names(x)) { + cast_dt_col(x, "distance_class", "factor") + } + new_evoland_table( + x, + "neighbors_t", + c("id_coord_origin", "id_coord_neighbor") + ) +} + +#' @describeIn neighbors_t Compute neighboring coordinates within specified distances. In +#' order to be computationally feasible, the coordinates' IDs are rasterized before +#' their actual Euclidean distance is calculated. If coordinates are so close that they +#' get rasterized to the same cell, the first one is used and a warning is emitted. If +#' this happens, try again using a lower resolution. +#' @param max_distance Maximum distance to search for neighbors (in same units as +#' coordinates and resolution) +#' @param distance_breaks Optional numeric vector defining distance class boundaries. +#' If NULL, no distance classification is performed. +#' If provided, must have at least 2 elements defining interval breaks. 
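+#'   Breaks are passed to [cut()] with `right = FALSE` and `include.lowest = TRUE`,
+#'   i.e. classes are left-closed intervals, with the outermost break included.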
+#' @param resolution Grid cell size for rasterization (default: 100.0, in same units as coordinates) +#' @return A data.table with columns: +#' - id_coord_origin: ID of the origin coordinate +#' - id_coord_neighbor: ID of the neighboring coordinate +#' - distance: Distance between origin and neighbor +#' - distance_class: Factor indicating distance class (if distance_breaks provided) +#' @export +create_neighbors_t <- function( + coords_t, + max_distance, + distance_breaks = NULL, + resolution = 100.0, + quiet = FALSE +) { + # Validate inputs + if (!inherits(coords_t, "coords_t")) { + stop("coords_t must be a coords_t object") + } + + if (!is.numeric(max_distance) || length(max_distance) != 1 || max_distance <= 0) { + stop("max_distance must be a positive scalar numeric") + } + + if (!is.null(distance_breaks)) { + if (!is.numeric(distance_breaks) || length(distance_breaks) < 2) { + stop("distance_breaks must be NULL or a numeric vector with at least 2 elements") + } + } + + # Call C++ function + dt <- distance_neighbors_cpp( + coords_t = coords_t, + max_distance = max_distance, + resolution = resolution, + quiet = quiet + ) + + data.table::setalloccol(dt) + + # Rename distance_approx to distance + data.table::setnames(dt, "distance_approx", "distance") + + # Add distance class if breaks provided + if (!is.null(distance_breaks)) { + dt[, + distance_class := cut( + distance, + breaks = distance_breaks, + right = FALSE, + include.lowest = TRUE + ) + ] + } + + as_neighbors_t(dt) +} + + +#' @describeIn neighbors_t Validate a neighbors_t object +#' @export +validate.neighbors_t <- function(x, ...) { + NextMethod() + + required_cols <- c("id_coord_origin", "id_coord_neighbor", "distance") + + data.table::setcolorder(x, required_cols) + + stopifnot( + is.integer(x[["id_coord_origin"]]), + is.integer(x[["id_coord_neighbor"]]), + is.numeric(x[["distance"]]) + ) + + return(x) +} + +#' @describeIn neighbors_t Print a neighbors_t object +#' @param nrow Maximum number of rows to print. See [data.table::print.data.table] +#' @param ... Passed to [data.table::print.data.table] +#' @export +print.neighbors_t <- function(x, nrow = 10, ...) { + if (nrow(x) > 0) { + total_pairs <- format( + nrow(x), + big.mark = "_", + scientific = FALSE + ) + + extra_info <- "" + if ("distance_class" %in% names(x)) { + extra_info <- paste( + "Distance classes:", + paste(levels(x[["distance_class"]]), collapse = ", ") + ) + } + + cat(glue::glue( + "Neighbors Table\n", + "Neighbor pairs: {total_pairs}\n", + "{extra_info}\n\n" + )) + } else { + cat("Neighbors Table (empty)\n") + } + NextMethod(nrow = nrow, ...) + invisible(x) +} diff --git a/R/parquet_duckdb.R b/R/parquet_duckdb.R new file mode 100644 index 0000000..e7c5538 --- /dev/null +++ b/R/parquet_duckdb.R @@ -0,0 +1,725 @@ +#' R6 Base Class for Parquet-Backed DuckDB Storage +#' +#' @description +#' A domain-agnostic R6 class that provides an interface to a folder-based data +#' storage system using DuckDB for in-memory SQL operations and parquet files +#' for efficient on-disk persistence. This class can be inherited by +#' domain-specific database classes. 
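+#'
+#' Each table is persisted as a single file (`<table_name>.parquet` or `.csv`)
+#' under `path`; data is read through DuckDB's `read_parquet`/`read_csv` and
+#' written back with `COPY ... TO`, so tables only occupy memory while attached.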
+#' +#' @export + +parquet_duckdb <- R6::R6Class( + classname = "parquet_duckdb", + + ## Public Methods ---- + public = list( + #' @field connection DBI connection object to an in-memory DuckDB database + connection = NULL, + + #' @field path Character string path to the data folder + path = NULL, + + #' @field default_format Default file format for new tables + default_format = NULL, + + #' @field writeopts Default write options for DuckDB + writeopts = NULL, + + #' @description + #' Initialize a new parquet_duckdb object + #' @param path Character string. Path to the data folder. + #' @param default_format Character. Default file format ("parquet" or "csv"). + #' Default is "parquet". + #' @param extensions Character vector of DuckDB extensions to load (e.g., "spatial") + #' + #' @return A new `parquet_duckdb` object + initialize = function( + path, + default_format = c("parquet", "csv"), + extensions = character(0) + ) { + # Create folder if it doesn't exist + self$path <- ensure_dir(path) + + # Set format / writeopts + self$default_format <- match.arg(default_format) + self$writeopts <- switch( + self$default_format, + parquet = "format parquet, compression zstd", + csv = "format csv", + stop(glue::glue("Unsupported format: {self$default_format}")) + ) + + # Create in-memory connection for SQL operations + self$connection <- DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:") + + # load extensions + for (ext in extensions) { + self$execute(glue::glue("install {ext}; load {ext};")) + } + + invisible(self) + }, + + ### Core Database Methods ---- + + #' @description + #' Execute a SQL statement + #' @param statement A SQL statement + #' @return Number of rows affected by statement + execute = function(statement) { + DBI::dbExecute(self$connection, statement) + }, + + #' @description + #' Execute a SQL query and return results + #' @param statement A SQL query statement + #' @return A data.table with query results + get_query = function(statement) { + DBI::dbGetQuery(self$connection, statement) |> + data.table::as.data.table() + }, + + #' @description + #' Attach a table from parquet/CSV file as a temporary table in DuckDB + #' @param table_name Character. Name of table to attach. + #' @param columns Character vector. Optional SQL column selection, defaults to "*" + #' @param where Character. Optional SQL WHERE clause to subset the table. + #' @return Invisible NULL (called for side effects) + attach_table = function(table_name, columns = "*", where = NULL) { + file_info <- private$get_file_path(table_name) + + if (!file_info$exists) { + stop(glue::glue("Table '{table_name}' does not exist at path: {self$path}")) + } + + # Build SQL query + sql <- glue::glue( + "create temp table {table_name} as ", + "select {paste(columns, collapse = ', ')} ", + "from read_{file_info$format}('{file_info$path}')" + ) + + if (!is.null(where)) { + sql <- glue::glue("{sql} where {where}") + } + + self$execute(sql) + invisible(NULL) + }, + + #' @description + #' Detach a table from the in-memory database + #' @param table_name Character. Name of table to drop. + #' @return Invisible NULL (called for side effects) + detach_table = function(table_name) { + self$execute(paste0("drop table if exists ", table_name, ";")) + invisible(NULL) + }, + + #' @description + #' Get row count for a table + #' @param table_name Character string. Name of the table to query. 
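+    #'   A missing table yields a count of 0 rather than an error; existing
+    #'   tables are counted by scanning the backing file directly.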
+ #' @return Integer number of rows + row_count = function(table_name) { + file_info <- private$get_file_path(table_name) + + if (!file_info$exists) { + return(0L) + } + + self$get_query( + glue::glue("select count(*) as n from read_{file_info$format}('{file_info$path}')") + )[[1]] + }, + + #' @description + #' List all tables (files) in storage + #' @return Character vector of table names + list_tables = function() { + list.files(self$path, pattern = "\\.(parquet|csv)$", full.names = FALSE) |> + tools::file_path_sans_ext() |> + unique() |> + sort() + }, + + #' @description + #' Execute a function with specified tables attached, handling attach/detach automatically. + #' If a table is already attached in the DuckDB instance, it won't be re-attached or detached. + #' + #' @param tables Character vector of table names to attach + #' @param func Function to execute with tables attached + #' @param ... Additional arguments passed to func + #' @return Result of func + with_tables = function(tables, func, ...) { + # Track which tables we attach (so we know which to detach) + attached_tables <- character(0) + + # Check which tables are already attached + existing_tables <- DBI::dbListTables(self$connection) + + # Attach tables that aren't already present + for (table in tables) { + if (!table %in% existing_tables) { + self$attach_table(table) + attached_tables <- c(attached_tables, table) + } + } + + # Ensure cleanup on exit + on.exit( + { + for (table in attached_tables) { + self$detach_table(table) + } + }, + add = TRUE + ) + + # Execute the function + func(...) + }, + + #' @description + #' Fetch data from a table + #' @param table_name Character string. Name of the table to query. + #' @param where Character string. Optional WHERE clause for the SQL query. + #' @param limit Integer. Optional limit on number of rows to return. + #' + #' @return A data.table + fetch = function(table_name, where = NULL, limit = NULL) { + file_info <- private$get_file_path(table_name) + + if (!file_info$exists) { + return(data.table::data.table()) + } + + # build sql query + sql <- glue::glue("select * from read_{file_info$format}('{file_info$path}')") + + if (!is.null(where)) { + sql <- glue::glue("{sql} where {where}") + } + if (!is.null(limit)) { + sql <- glue::glue("{sql} limit {limit}") + } + + self$get_query(sql) + }, + + #' @description + #' Delete rows from a table + #' @param table_name Character string. Name of the table to delete from. + #' @param where Character string. Optional WHERE clause; if NULL, deletes all rows. + #' @return Number of rows deleted + delete_from = function(table_name, where = NULL) { + file_info <- private$get_file_path(table_name) + + if (!file_info$exists) { + return(0L) + } + + count_before <- self$row_count(table_name) + + if (is.null(where)) { + file.remove(file_info$path) + return(count_before) + } + + self$execute(glue::glue( + r"{ + copy ( + select * from read_{file_info$format}('{file_info$path}') + where not ({where}) + ) + to '{file_info$path}' ({self$writeopts}) + }" + )) + + count_after <- self$row_count(table_name) + + return(count_before - count_after) + }, + + #' @description + #' Commit data using overwrite, append, or upsert modes. Handles autoincrement, key + #' identity columns, and list-to-MAP conversion. + #' @param x Data frame to commit. If character, in-duckdb-memory table. + #' @param table_name Character string table name + #' @param key_cols Character vector of columns that define uniqueness. 
If missing, + #' use all columns starting with `id_` + #' @param autoincrement_cols Character vector of column names to auto-increment + #' @param map_cols Character vector of columns to convert to MAP format + #' @param method Character, one of "overwrite", "append", "upsert" (upsert being an + #' update for existing rows, and insert for new rows) + #' @return Invisible NULL (called for side effects) + commit = function( + x, + table_name, + key_cols, + autoincrement_cols = character(0), + map_cols = character(0), + method = c("overwrite", "append", "upsert") + ) { + method <- match.arg(method) + + private$register_new_data_v(x, map_cols) + on.exit(private$cleanup_new_data_v(), add = TRUE) + all_cols <- self$get_query( + "select column_name from (describe new_data_v)" + )[[1]] + + file_info <- private$get_file_path(table_name) + + if (method == "overwrite" || !file_info$exists) { + # in case overwrite explicitly required, or no previously existing data to + # append or upsert to; rest of logic can be skipped + return(private$commit_overwrite( + table_name = table_name, + all_cols = all_cols, + autoincrement_cols = autoincrement_cols, + file_info = file_info + )) + } + + self$attach_table(table_name) + on.exit(self$detach_table(table_name), add = TRUE) + private$set_autoincrement_vars(table_name, autoincrement_cols) + + if (missing(key_cols)) { + key_cols <- grep("^id_", all_cols, value = TRUE) + } + + if (method == "append" || length(key_cols) == 0L) { + # if there are no key columns to join on, upsert becomes append + private$commit_append( + table_name = table_name, + all_cols = all_cols, + autoincrement_cols = autoincrement_cols, + file_info = file_info + ) + } else { + private$commit_upsert( + table_name = table_name, + all_cols = all_cols, + key_cols = key_cols, + autoincrement_cols = autoincrement_cols, + file_info = file_info + ) + } + }, + + #' @description + #' Print method for parquet_duckdb + #' @param ... Not used + #' @return self (invisibly) + print = function(...) { + # gather data to be printed + classes <- class(self) + classes <- classes[classes != "R6"] + + all_names <- names(self) + methods <- character(0) + active_bindings <- character(0) + + names(self$.__enclos_env__$private) + if (!is.null(self$.__enclos_env__$super)) { + # exclude private super names + super_names <- setdiff( + ls(self$.__enclos_env__$super), + ls(self$.__enclos_env__$super$.__enclos_env__$private) + ) + } else { + super_names <- character(0) + } + nonsuper_names <- setdiff(all_names, super_names) + + for (name in nonsuper_names) { + # Check if it's an active binding first; subset2 would evaluate it + if (bindingIsActive(name, self$.__enclos_env__$self)) { + active_bindings <- c(active_bindings, name) + } else { + obj <- .subset2(self, name) + if (is.function(obj) && !name %in% c("initialize", "print", "clone")) { + methods <- c(methods, name) + } + } + } + + methods <- sort(methods) + active_bindings <- + active_bindings[!grepl("_t($|_)", active_bindings)] |> + sort() + + # actually start printing + if (length(classes) == 1) { + cat("<", classes[1], "> Object\n") + } else { + cat(classes[1], "Object. 
Inherits from", toString(classes[-1]), "\n") + } + + compression <- if (grepl("compression\\s+(\\w+)", self$writeopts)) { + sub(".*compression\\s+(\\w+).*", "\\1", self$writeopts) + } else { + "none" + } + + # Database info on one line + cat( + glue::glue( + "Database: {self$path} | Format: {self$default_format} | Compression: {compression}" + ), + "\n\n" + ) + + tables <- self$list_tables() + if (length(tables) > 0) { + cat("Tables present:\n ") + cat(strwrap(toString(tables), width = 80), sep = "\n ") + cat("\n") + } else { + cat("Tables present: (none)\n\n") + } + + if (length(super_names) > 0) { + cat("DB Methods:\n ") + cat(strwrap(toString(super_names), width = 80), sep = "\n ") + cat("\n") + } + + if (length(methods) > 0) { + cat("Public methods:\n ") + cat(strwrap(toString(methods), width = 80), sep = "\n ") + cat("\n") + } + + if (length(active_bindings) > 0) { + cat("Active bindings:\n ") + cat(strwrap(toString(active_bindings), width = 80), sep = "\n ") + } + + invisible(self) + } + ), + + ## Private Methods ---- + private = list( + # R6 hook called on gc(); Close the database connection + finalize = function() { + if (!is.null(self$connection)) { + DBI::dbDisconnect(self$connection) + self$connection <- NULL + } + }, + + ### Commit Methods ---- + #' param x Data frame to commit. If character, in-duckdb-memory table. + #' param table_name Character string table name + #' param autoincrement_cols Character vector of column names to auto-increment + #' param map_cols Character vector of columns to convert to MAP format + #' return Invisible NULL (called for side effects) + commit_overwrite = function( + table_name, + all_cols, + autoincrement_cols = character(0), + file_info + ) { + # Warn if overriding existing IDs + if (length(intersect(autoincrement_cols, all_cols)) > 0) { + warning(glue::glue( + "Overriding existing IDs ({toString(autoincrement_cols)}) with row numbers;\n", + "Assign these IDs manually and do not pass any autoincrement_cols to avoid this warning" + )) + } + + # Build SELECT expression + ordinary_cols <- setdiff(all_cols, autoincrement_cols) + select_expr <- glue::glue_collapse( + c(glue::glue("row_number() over () as {autoincrement_cols}"), ordinary_cols), + sep = ",\n " + ) + + self$execute(glue::glue( + r"{ + copy ( + select {select_expr} + from new_data_v + ) to '{file_info$path}' ({self$writeopts}) + }" + )) + }, + + #' param x Data frame to commit. If character, in-duckdb-memory table. + #' param table_name Character string table name + #' param autoincrement_cols Character vector of column names to auto-increment + #' param map_cols Character vector of columns to convert to MAP format + #' return Invisible NULL (called for side effects) + commit_append = function( + table_name, + all_cols, + autoincrement_cols = character(0), + file_info + ) { + ordinary_cols <- setdiff(all_cols, autoincrement_cols) + select_new <- glue::glue_collapse( + c( + glue::glue( + "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" + ), + ordinary_cols + ), + sep = ",\n " + ) + + # Concatenation using UNION ALL; "by name" handles missing columns + self$execute(glue::glue( + r"{ + copy ( + select * from {table_name} + union all by name + select {select_new} + from new_data_v + ) + to '{file_info$path}' ({self$writeopts}) + }" + )) + }, + + #' param x Data frame to commit. If character, in-duckdb-memory table. + #' param table_name Character string table name + #' param key_cols Character vector of columns that define uniqueness. 
If missing, + #' use all columns starting with `id_` + #' param autoincrement_cols Character vector of column names to auto-increment + #' param map_cols Character vector of columns to convert to MAP format + #' return Invisible NULL (called for side effects) + commit_upsert = function( + table_name, + all_cols, + key_cols, + autoincrement_cols = character(0), + file_info + ) { + # Update existing data + ordinary_cols <- setdiff(all_cols, union(key_cols, autoincrement_cols)) + update_select_expr <- glue::glue_collapse( + glue::glue("{ordinary_cols} = new_data_v.{ordinary_cols}"), + sep = ",\n " + ) + update_join_condition <- glue::glue_collapse( + glue::glue("{table_name}.{key_cols} = new_data_v.{key_cols}"), + sep = "\nand " + ) + + self$execute(glue::glue( + r"{ + update {table_name} set + {update_select_expr} + from new_data_v + where + {update_join_condition}; + }" + )) + + # Insert new data + insert_select_expr <- glue::glue_collapse( + c( + glue::glue( + "row_number() over () + getvariable('max_{autoincrement_cols}') as {autoincrement_cols}" + ), + glue::glue("new_data_v.{setdiff(all_cols, autoincrement_cols)}") + ), + sep = ",\n " + ) + null_condition <- glue::glue_collapse( + glue::glue("{table_name}.{key_cols} is null"), + sep = "\nand " + ) + + self$execute(glue::glue( + r"{ + insert into {table_name} + select + {insert_select_expr} + from + new_data_v + left join + {table_name} + on + {update_join_condition} + where + {null_condition} + ; + }" + )) + + self$execute(glue::glue("copy {table_name} to '{file_info$path}' ({self$writeopts})")) + }, + + # Get file path and format for a table + # + # @param table_name Character string table name + # @return List with path, format, and exists flag + get_file_path = function(table_name) { + # Check for parquet first, then csv + parquet_path <- file.path(self$path, paste0(table_name, ".parquet")) + csv_path <- file.path(self$path, paste0(table_name, ".csv")) + + if (file.exists(parquet_path)) { + return(list(path = parquet_path, format = "parquet", exists = TRUE)) + } else if (file.exists(csv_path)) { + return(list(path = csv_path, format = "csv", exists = TRUE)) + } else { + # Return default format for new file + default_path <- file.path( + self$path, + paste0(table_name, ".", self$default_format) + ) + return(list( + path = default_path, + format = self$default_format, + exists = FALSE + )) + } + }, + + # Register new_data_v table, optionally converting MAP columns + # + # @param x Data to register + # @param map_cols Character vector of columns to convert to MAP format + # @return NULL (called for side effects) + register_new_data_v = function(x, map_cols = character(0)) { + if (is.character(x)) { + # TODO add tests + self$execute(glue::glue("create view new_data_v as from {x}")) + return(invisible(NULL)) + } + + if (length(map_cols) == 0) { + # No MAP conversion needed - register directly + duckdb::duckdb_register(self$connection, "new_data_v", x) + } else { + # Convert list columns to key-value dataframes + x <- + data.table::copy(x) |> + convert_list_cols(map_cols, list_to_kv_df) + + # Register as intermediate table + duckdb::duckdb_register(self$connection, "new_data_raw", x) + + # Build SELECT expression with map_from_entries for MAP columns + map_exprs <- glue::glue("map_from_entries({map_cols}) as {map_cols}") + other_cols <- setdiff(names(x), map_cols) + all_exprs <- c(other_cols, map_exprs) + select_expr <- glue::glue_collapse(all_exprs, sep = ", ") + + # Create new_data_v from new_data_raw + self$execute(glue::glue( + "create 
temp table new_data_v as select {select_expr} from new_data_raw" + )) + } + + invisible(NULL) + }, + + # Cleanup new_data_v and related tables + # + # @param map_cols Character vector indicating if MAP conversion was used + # @return NULL (called for side effects) + cleanup_new_data_v = function() { + try(duckdb::duckdb_unregister(self$connection, "new_data_v"), silent = TRUE) + try(duckdb::duckdb_unregister(self$connection, "new_data_raw"), silent = TRUE) + try(self$execute("drop table if exists new_data_v"), silent = TRUE) + try(self$execute("drop view if exists new_data_v"), silent = TRUE) + + invisible(NULL) + }, + + # Set DuckDB variables max_{colname} for autoincrement columns + # + # @param table_name Character string table name + # @param autoincrement_cols Character vector of column names + # @return NULL (called for side effects) + set_autoincrement_vars = function(table_name, autoincrement_cols) { + if (length(autoincrement_cols) == 0L) { + return(NULL) + } + + if (!DBI::dbExistsTable(self$connection, table_name)) { + # Attach/detach if not already attached + self$attach_table(table_name) + on.exit(self$detach_table(table_name)) + } + + existing_cols <- + glue::glue("select column_name from (describe {table_name})") |> + self$get_query() |> + (\(x) x[[1]])() |> + intersect(autoincrement_cols) + + missing_cols <- setdiff(autoincrement_cols, existing_cols) + + set_exprs <- glue::glue_collapse( + c( + glue::glue( + "set variable max_{existing_cols} = + (select coalesce(max({existing_cols}), 0) from {table_name});" + ), + glue::glue( + "set variable max_{missing_cols} = 0;" + ) + ), + sep = "\n" + ) + + self$execute(set_exprs) + invisible(NULL) + } + ) +) + + +# Helper functions for converting between list and data.frame formats for DuckDB MAPs +convert_list_cols <- function(x, cols, fn) { + for (col in cols) { + x[[col]] <- lapply(x[[col]], fn) + } + x +} + +list_to_kv_df <- function(x) { + if (is.null(x) || length(x) == 0) { + return(data.frame( + key = character(0), + value = character(0), + stringsAsFactors = FALSE + )) + } + data.frame( + key = names(x), + value = as.character(unlist(x)), + stringsAsFactors = FALSE + ) +} + +kv_df_to_list <- function(x) { + if (is.null(x) || nrow(x) == 0) { + return(NULL) + } + + out <- list() + + for (row in seq_len(nrow(x))) { + key <- x$key[row] + val <- x$value[row] + + # Try numeric conversion + num_val <- suppressWarnings(as.numeric(val)) + if (!is.na(num_val)) { + out[[key]] <- num_val + } else { + out[[key]] <- val + } + } + + out +} diff --git a/R/pred_data_t.R b/R/pred_data_t.R index d3c7972..45ee49e 100644 --- a/R/pred_data_t.R +++ b/R/pred_data_t.R @@ -18,26 +18,18 @@ as_pred_data_t <- function(x, type) { stopifnot(type %in% c("float", "int", "bool")) - # Create empty table with proper value type - coercion_fn <- switch( - type, - float = as.numeric, - int = as.integer, - bool = as.logical - ) - if (missing(x)) { x <- data.table::data.table( id_pred = integer(0), id_coord = integer(0), id_period = integer(0), - value = coercion_fn(integer(0)) + value = integer(0) ) } data.table::setDT(x, key = c("id_pred", "id_coord", "id_period")) |> - cast_dt_col("value", coercion_fn) |> - cast_dt_col("id_coord", as.integer) + cast_dt_col("value", type) |> + cast_dt_col("id_coord", "int") class_name <- paste0("pred_data_t_", type) new_evoland_table(x, c(class_name, "pred_data_t")) diff --git a/R/trans_meta_t.R b/R/trans_meta_t.R index 8613add..db81f8e 100644 --- a/R/trans_meta_t.R +++ b/R/trans_meta_t.R @@ -41,7 +41,8 @@ as_trans_meta_t <- 
function(x) {
   )
 }
 
-#' @describeIn trans_meta_t Calculate the transition metadata and mark for modelling feasibility
+#' @describeIn trans_meta_t Calculate the transition metadata and mark for modelling
+#'   feasibility. Does not attribute `id_trans`; this only makes sense as part of a DB.
 #' @export
 create_trans_meta_t <- function(
   transitions,
@@ -108,9 +109,6 @@ create_trans_meta_t <- function(
     }
   }
 
-  # Add id_trans
-  trans_summary[, id_trans := .I]
-
   as_trans_meta_t(trans_summary)
 }
 
diff --git a/R/trans_preds_t.R b/R/trans_preds_t.R
index 6d7c5dc..4fc51c9 100644
--- a/R/trans_preds_t.R
+++ b/R/trans_preds_t.R
@@ -5,6 +5,7 @@
 #' modelling each transition type.
 #'
 #' @name trans_preds_t
+#' @include evoland_db.R
 #'
 #' @param db An [evoland_db] instance with populated trans_meta_t and pred_meta_t tables
 #'
@@ -19,6 +20,8 @@ as_trans_preds_t <- function(x) {
       id_trans = integer(0)
     )
   }
+  cast_dt_col(x, "id_pred", "int")
+  cast_dt_col(x, "id_trans", "int")
   new_evoland_table(
     x,
     "trans_preds_t",
@@ -26,12 +29,139 @@ as_trans_preds_t <- function(x) {
   )
 }
 
-#' @describeIn trans_preds_t Create a transition-predictor relation, i.e. records the
-#'   result of a predictor selection step.
-#' @export
-create_trans_preds_t <- function() {
-  as_trans_preds_t()
-}
+# set an initial full set of transition / predictor relations
+evoland_db$set(
+  "public",
+  "set_full_trans_preds",
+  function(overwrite = FALSE) {
+    if (self$row_count("trans_preds_t") > 0 && !overwrite) {
+      stop("Set overwrite to TRUE to overwrite existing trans_preds_t")
+    }
+    p <- self$pred_meta_t
+    t <- self$trans_meta_t[is_viable == TRUE]
+
+    full <- expand.grid(id_pred = p[["id_pred"]], id_trans = t[["id_trans"]])
+    self$commit(as_trans_preds_t(full), "trans_preds_t", method = "overwrite")
+  }
+)
+
+#' describeIn trans_preds_t Create a transition-predictor relation, i.e. records the
+#'   result of a predictor selection step. Runs the filter function for each viable
+#'   transition and stores the selected predictors.
+#' param filter_fun Defaults to [covariance_filter()], but can be any function that
+#'   returns a character vector of selected predictor column names when called with
+#'   `data` and `result_col`, e.g. [grrf_filter()]
+#' param na_value Passed to db$trans_pred_data_v - if not NA, replace all NA predictor
+#'   values with this value
+#' param ... Additional arguments passed to filter_fun, e.g. `corcut` (the 0-1
+#'   correlation threshold) or rank_fun arguments for [covariance_filter()]
+evoland_db$set(
+  "public",
+  "prune_trans_preds",
+  function(
+    filter_fun = covariance_filter,
+    na_value = NA,
+    ...
+ ) { + viable_trans <- self$trans_meta_t[is_viable == TRUE] + pred_meta <- self$pred_meta_t + stopifnot( + "No viable transitions found in trans_meta_t" = nrow(viable_trans) > 0L + ) + if (self$row_count("trans_preds_t") == 0) { + self$set_full_trans_preds() + } + trans_preds_pre <- self$trans_preds_t + + results_list <- list() + + # Iterate over transitions (anterior/posterior pairs) + for (i in seq_len(nrow(viable_trans))) { + id_trans <- viable_trans$id_trans[i] + id_lulc_ant <- viable_trans$id_lulc_anterior[i] + id_lulc_post <- viable_trans$id_lulc_posterior[i] + id_preds <- trans_preds_pre$id_pred[ + trans_preds_pre$id_trans == id_trans + ] + + message(glue::glue( + "Processing transition {i}/{nrow(viable_trans)}: ", + "id_trans={id_trans} ({id_lulc_ant} -> {id_lulc_post})" + )) + + if (length(id_preds) == 0L) { + next + } + + # Get wide transition-predictor data + tryCatch( + { + trans_pred_data <- self$trans_pred_data_v(id_trans, id_preds, na_value) + + # Check if we have any data + if (nrow(trans_pred_data) == 0L) { + warning(glue::glue( + "No data for transition {id_trans}, skipping" + )) + next + } + + # Check if we have any predictor columns + pred_cols <- grep("^id_pred_", names(trans_pred_data), value = TRUE) + if (length(pred_cols) == 0L) { + warning(glue::glue( + "No predictor columns for transition {id_trans}, skipping" + )) + next + } + + # Return ranked + filtered predictor names as id_pred_{n} + filtered_preds <- filter_fun( + data = trans_pred_data, + result_col = "result", + ... + ) + + if (length(filtered_preds) > 0L) { + # Parse id_pred values from column names (e.g., "id_pred_1" -> 1) + selected_ids <- as.integer(sub("^id_pred_", "", filtered_preds)) + + # Create result rows + results_list[[id_trans]] <- data.table::data.table( + id_pred = selected_ids, + id_trans = id_trans + ) + + message(glue::glue( + " Selected {length(selected_ids)} predictor(s) for transition {id_trans}" + )) + } else { + message(glue::glue( + " No predictors selected for transition {id_trans}" + )) + } + }, + error = function(e) { + # do not prune on error + results_list[[id_trans]] <- data.table::data.table( + id_pred = id_preds, + id_trans = id_trans + ) + warning(glue::glue( + "Error processing transition {id_trans}: {e$message}" + )) + } + ) + } + + # Combine all results + if (length(results_list) == 0L) { + warning("No predictors selected for any transition") + return(invisible(NULL)) + } + + result <- data.table::rbindlist(results_list) + + self$commit(as_trans_preds_t(result), "trans_preds_t", method = "overwrite") + } +) #' @export validate.trans_preds_t <- function(x, ...) 
{ diff --git a/R/util.R b/R/util.R index ceebbcb..7b8e1c0 100644 --- a/R/util.R +++ b/R/util.R @@ -143,11 +143,29 @@ print_rowwise_yaml <- function(df) { } #' @describeIn util Cast a data.table column; invisibly returns x -cast_dt_col <- function(x, colname, castfun) { +cast_dt_col <- function(x, colname, type) { + predicate_fn <- switch( + type, + float = is.numeric, + int = is.integer, + bool = is.logical, + factor = is.factor + ) + if (predicate_fn(x[[colname]])) { + return(invisible(x)) + } + + coercion_fn <- switch( + type, + float = as.numeric, + int = as.integer, + bool = as.logical, + factor = as.factor + ) data.table::set( x = x, j = colname, - value = castfun(x[[colname]]) + value = coercion_fn(x[[colname]]) ) invisible(x) } diff --git a/R/util_terra.R b/R/util_terra.R index 00d8825..463c772 100644 --- a/R/util_terra.R +++ b/R/util_terra.R @@ -89,81 +89,5 @@ extract_using_coords_t.SpatVector <- function(x, coords_t, na_omit = TRUE) { out } -#' @describeIn util_terra Compute neighboring coordinates within specified distances. In -#' order to be computationally feasible, the coordinates' IDs are rasterized before -#' their actual Euclidean distance is calculated. If coordinates are so close that they -#' get rasterized to the same cell, the first one is used and a warning is emitted. If -#' this happens, try again using a lower resolution. -#' @param max_distance Maximum distance to search for neighbors (in same units as -#' coordinates and resolution) -#' @param distance_breaks Optional numeric vector defining distance class boundaries. -#' If NULL, no distance classification is performed. -#' If provided, must have at least 2 elements defining interval breaks. -#' @param resolution Grid cell size for rasterization (default: 100.0, in same units as coordinates) -#' @return A data.table with columns: -#' - id_coord_origin: ID of the origin coordinate -#' - id_coord_neighbor: ID of the neighboring coordinate -#' - distance: Distance between origin and neighbor -#' - distance_class: Factor indicating distance class (if distance_breaks provided) -#' @export -compute_neighbors <- function( - coords_t, - max_distance, - distance_breaks = NULL, - resolution = 100.0 -) { - # Validate inputs - if (!inherits(coords_t, "coords_t")) { - stop("coords_t must be a coords_t object") - } - - if (!is.numeric(max_distance) || length(max_distance) != 1 || max_distance <= 0) { - stop("max_distance must be a positive scalar numeric") - } - - if (!is.null(distance_breaks)) { - if (!is.numeric(distance_breaks) || length(distance_breaks) < 2) { - stop("distance_breaks must be NULL or a numeric vector with at least 2 elements") - } - } - - # Call C++ function - dt <- distance_neighbors_cpp( - coords_t = coords_t, - max_distance = max_distance, - resolution = resolution - ) - - # Set data.table allocation - dt <- data.table::setalloccol(dt) - - # Rename distance_approx to distance - data.table::setnames(dt, "distance_approx", "distance") - - # Add distance class if breaks provided - if (!is.null(distance_breaks)) { - dt[, - distance_class := cut( - distance, - breaks = distance_breaks, - right = FALSE, - include.lowest = TRUE - ) - ] - } - - # Reorder columns - if (!is.null(distance_breaks)) { - data.table::setcolorder( - dt, - c("id_coord_origin", "id_coord_neighbor", "distance_class", "distance") - ) - } else { - data.table::setcolorder( - dt, - c("id_coord_origin", "id_coord_neighbor", "distance") - ) - } - - return(dt) -} +# TODO move this to a neighbors_t.R file that includes a formal class definition 
+# for the return table, including validation, print, coercion diff --git a/inst/tinytest/test_evoland_db.R b/inst/tinytest/test_evoland_db.R index b84f060..c48f2df 100644 --- a/inst/tinytest/test_evoland_db.R +++ b/inst/tinytest/test_evoland_db.R @@ -1,10 +1,12 @@ -# Test that a new database can be set up using evoland_db$new() +# Test evoland_db domain-specific functionality +# Generic parquet_duckdb tests are in test_parquet_duckdb.R library(tinytest) # Create temporary directory for testing test_dir <- tempfile("evoland_test_") on.exit(unlink(test_dir, recursive = TRUE), add = TRUE) +# Test 1: evoland_db initialization with reporting expect_silent( db <- evoland_db$new( path = test_dir, @@ -13,13 +15,15 @@ expect_silent( ) ) expect_true(inherits(db, "evoland_db")) +expect_true(inherits(db, "parquet_duckdb")) # Should inherit from parent -# In folder-based storage, only reporting_t exists initially -# Other tables are created on demand -expected_tables_initial <- c("reporting_t") -expect_identical(db$list_tables(), expected_tables_initial) - +# Test 2: Reporting table is created and populated +expect_identical(db$list_tables(), "reporting_t") reporting1 <- db$fetch("reporting_t") +expect_true("report_name" %in% reporting1$key) +expect_true("tinytest" %in% reporting1$value) + +# Test 3: Persistence of reporting data rm(db) gc() db <- evoland_db$new( @@ -34,30 +38,9 @@ expect_equal( sort(reporting2$key) ) +# Test 4 removed: Fetching a missing table now produces an error -# Check that accessing non-existent tables returns empty data.tables -# (these tables don't appear in list_tables() until they have data) -empty_tables <- c( - "alloc_params_t", - "coords_t", - "intrv_masks_t", - "intrv_meta_t", - "lulc_data_t", - "lulc_meta_t", - "periods_t", - "pred_data_t_bool", - "pred_data_t_float", - "pred_data_t_int", - "pred_meta_t", - "trans_meta_t", - "trans_models_t", - "trans_preds_t" -) -for (table in empty_tables) { - expect_equal(nrow(db[[table]]), 0L) -} - -# Create synthetic, minimal test data +# Test 5: Create synthetic evoland-specific test data coords_t <- create_coords_t_square( epsg = 2056, extent = terra::ext(c( @@ -230,8 +213,7 @@ intrv_masks_t <- as_intrv_masks_t( ) ) -# Test DB roundtrips & integrity checks, repeated assignments (~upserts) -# TODO make this into a testing harness that iterates over a list of sample data +# Test 6: Active bindings - coords_t expect_silent(db$coords_t <- coords_t) expect_silent(db$coords_t <- coords_t) expect_identical(db$coords_t, coords_t) @@ -240,12 +222,15 @@ expect_identical( data.table::as.data.table(coords_t[, 1:3]) ) +# Test 7: Active bindings - lulc_meta_t expect_silent(db$lulc_meta_t <- lulc_meta_t) expect_identical(db$lulc_meta_t, lulc_meta_t) +# Test 8: Active bindings - periods_t expect_silent(db$periods_t <- periods_t) expect_identical(db$periods_t, periods_t) +# Test 9: Active bindings - pred_meta_t with auto-increment expect_silent(db$pred_meta_t <- pred_meta_t[1, ]) expect_silent(db$pred_meta_t <- pred_meta_t) # After committing, id_pred should be assigned (1:2 in this case) @@ -254,20 +239,24 @@ expect_equal(retrieved_pred_meta[["id_pred"]], 1:2) expect_equal(retrieved_pred_meta[["name"]], c("noise", "distance_to_lake")) db$delete_from("pred_meta_t") # clear up, we want to ensure this works with add_predictor +# Test 10: Active bindings - intrv_meta_t (with MAP columns) expect_silent(db$intrv_meta_t <- intrv_meta_t) expect_equal(db$intrv_meta_t, intrv_meta_t) +# Test 11: Active bindings - trans_meta_t expect_silent(db$trans_meta_t 
<- trans_meta_t) expect_equal(db$trans_meta_t[, c(-1)], trans_meta_t) +# Test 12: Active bindings - trans_models_t (with MAP and BLOB columns) expect_silent(db$trans_models_t <- trans_models_t) expect_equal(db$trans_models_t, trans_models_t) -# now that we have the trans_meta we shouldn't be violating the FK anymore +# Test 13: Active bindings - alloc_params_t (with MAP columns) expect_silent(db$alloc_params_t <- alloc_params_t) expect_equal(db$alloc_params_t, alloc_params_t) -# repeated upsert should be idempotent +# Test 14: add_predictor method - first predictor +# Test 15: add_predictor method - second predictor expect_silent( db$add_predictor( pred_spec = pred_spec["noise"], @@ -284,17 +273,19 @@ expect_silent( pred_type = "float" ) ) -# check that add_predictor works for metadata +# Test 16: add_predictor correctly updates metadata retrieved_pred_meta <- db$pred_meta_t expect_equal(retrieved_pred_meta[["id_pred"]], 1:2) expect_equal(retrieved_pred_meta[["name"]], c("noise", "distance_to_lake")) -# check that add_predictor works for data +# Test 17: add_predictor correctly updates data expect_equal(db$row_count("pred_data_t_float"), 48L) +# Test 18: Active bindings - pred_data_t_float upsert is idempotent expect_silent(db$pred_data_t_float <- pred_data_t) expect_equal(db$row_count("pred_data_t_float"), 48L) expect_silent(db$pred_data_t_float <- pred_data_t) expect_equal(db$row_count("pred_data_t_float"), 48L) +# Test 19: Active bindings enforce type validation expect_error( # should only be able to insert the correct class db$pred_data_t_float <- data.table::data.table( @@ -306,6 +297,7 @@ expect_error( r"(^inherits.* is not TRUE$)" ) +# Test 20: lulc_data_t requires proper type expect_error( db$lulc_data_t <- lulc_data_dt <- data.table::data.table( id_coord = c(1L, 2L, 1L), @@ -318,146 +310,77 @@ expect_silent( db$lulc_data_t <- as_lulc_data_t(lulc_data_dt) ) -# Test delete_from functionality -# Setup: we already have pred_data_t_float with 48 rows (id_pred 1:2, id_coord 3:50, id_period 1) -expect_equal(db$row_count("pred_data_t_float"), 48L) - -# Test 1: Delete with WHERE clause - delete specific predictor -deleted_count <- db$delete_from("pred_data_t_float", where = "id_pred = 1") -expect_equal(deleted_count, 24L) -expect_equal(db$row_count("pred_data_t_float"), 24L) - -# Verify only id_pred = 2 remains -remaining <- db$pred_data_t_float -expect_equal(unique(remaining$id_pred), 2L) -expect_equal(nrow(remaining), 24L) - -# Test 2: Delete with complex WHERE clause -# Add back some data first -db$pred_data_t_float <- pred_data_t -expect_equal(db$row_count("pred_data_t_float"), 48L) +# Test 21: Domain-specific view - coords_minimal +coords_minimal <- db$coords_minimal +expect_true(inherits(coords_minimal, "data.table")) +expect_equal(ncol(coords_minimal), 3L) +expect_true(all(c("id_coord", "lon", "lat") %in% names(coords_minimal))) +expect_equal(nrow(coords_minimal), nrow(coords_t)) + +# Test 22: Domain-specific view - extent +# Set up coords first +db$coords_t <- coords_t +extent <- db$extent +expect_true(inherits(extent, "SpatExtent")) + +# Test 23: Domain-specific view - lulc_meta_long_v +db$lulc_meta_t <- lulc_meta_t +lulc_long <- db$lulc_meta_long_v +expect_true(inherits(lulc_long, "data.table")) +expect_true("src_class" %in% names(lulc_long)) +# Should have one row per src_class +expect_true(nrow(lulc_long) > nrow(lulc_meta_t)) + +# Test 24: Domain-specific view - pred_sources_v +db$pred_meta_t <- pred_meta_t +sources <- db$pred_sources_v +expect_true(inherits(sources, 
"data.table")) +expect_true(all(c("url", "md5sum") %in% names(sources))) +expect_true(nrow(sources) > 0L) + +# Test 25: set_coords method +test_dir_coords <- tempfile("evoland_coords_") +on.exit(unlink(test_dir_coords, recursive = TRUE), add = TRUE) +db_coords <- evoland_db$new(test_dir_coords) -# Delete only specific coordinates -deleted_count <- db$delete_from("pred_data_t_float", where = "id_coord < 10") -expect_true(deleted_count > 0L) -remaining <- db$pred_data_t_float -expect_true(all(remaining$id_coord >= 10)) - -# Test 3: Delete all rows (NULL where clause) -count_before_delete <- db$row_count("pred_data_t_float") -deleted_count <- db$delete_from("pred_data_t_float", where = NULL) -expect_equal(deleted_count, count_before_delete) -expect_equal(db$row_count("pred_data_t_float"), 0L) - -# Test 4: Delete from non-existent table returns 0 -deleted_count <- db$delete_from("nonexistent_table", where = "id = 1") -expect_equal(deleted_count, 0L) - -# Test 5: Delete with WHERE that matches nothing -db$pred_data_t_float <- pred_data_t -initial_count <- db$row_count("pred_data_t_float") -deleted_count <- db$delete_from("pred_data_t_float", where = "id_pred = 999") -expect_equal(deleted_count, 0L) -expect_equal(db$row_count("pred_data_t_float"), initial_count) - -# Test auto-increment functionality -# Create a new test database for auto-increment tests -test_dir_autoinc <- tempfile("evoland_autoinc_") -on.exit(unlink(test_dir_autoinc, recursive = TRUE), add = TRUE) -db_autoinc <- evoland_db$new(test_dir_autoinc) - -# Test 1: Auto-increment on overwrite mode (new table) -test_data_1 <- data.table::data.table( - name = c("predictor_a", "predictor_b", "predictor_c"), - unit = c("m", "kg", "s") -) -db_autoinc$commit_overwrite( - test_data_1, - "test_autoinc_t", - autoincrement_cols = "id_test" -) -result_1 <- db_autoinc$fetch("test_autoinc_t") -expect_equal(result_1$id_test, 1:3) -expect_equal(result_1$name, c("predictor_a", "predictor_b", "predictor_c")) - -# Test 2: Auto-increment on append mode -test_data_2 <- data.table::data.table( - name = c("predictor_d", "predictor_e"), - unit = c("A", "V") -) -db_autoinc$commit_append( - test_data_2, - "test_autoinc_t", - autoincrement_cols = "id_test" -) -result_2 <- db_autoinc$fetch("test_autoinc_t") -expect_equal(nrow(result_2), 5L) -expect_equal(result_2$id_test, 1:5) -expect_equal(result_2$name[4:5], c("predictor_d", "predictor_e")) - -# Test 3: Auto-increment on upsert mode with new rows -test_data_3 <- data.table::data.table( - name = c("predictor_f", "predictor_g"), - unit = c("W", "J") -) -db_autoinc$commit_upsert( - test_data_3, - "test_autoinc_t", - autoincrement_cols = "id_test" -) -result_3 <- db_autoinc$fetch("test_autoinc_t") -expect_equal(nrow(result_3), 7L) -expect_equal(result_3$id_test, 1:7) - -# Test 4: Auto-increment preserves existing IDs in data -test_data_4 <- data.table::data.table( - id_test = c(NA, 100L, NA), - name = c("new_a", "existing", "new_b"), - unit = c("x", "y", "z") +expect_silent( + db_coords$set_coords( + type = "square", + epsg = 2056, + extent = terra::ext(c(xmin = 2697000, xmax = 2698000, ymin = 1252000, ymax = 1253000)), + resolution = 100 + ) ) +expect_true(db_coords$row_count("coords_t") > 0L) + +# Should refuse to overwrite expect_warning( - db_autoinc$commit_overwrite( - test_data_4, - "test_autoinc2_t", - autoincrement_cols = "id_test" + db_coords$set_coords( + type = "square", + epsg = 2056, + extent = terra::ext(c(xmin = 2697000, xmax = 2698000, ymin = 1252000, ymax = 1253000)), + resolution = 100 ), - 
"Overriding existing IDs" + "not empty" ) -result_4 <- db_autoinc$fetch("test_autoinc2_t") -expect_equal(result_4$id_test, 1:3) -# Test 5: Multiple auto-increment columns -test_data_5 <- data.table::data.table( - name = c("item1", "item2"), - value = c(10, 20) -) -db_autoinc$commit_overwrite( - test_data_5, - "test_multi_autoinc_t", - autoincrement_cols = c("id_a", "id_b") -) -result_5 <- db_autoinc$fetch("test_multi_autoinc_t") -expect_equal(result_5$id_a, 1:2) -expect_equal(result_5$id_b, 1:2) - -# Test 6: Auto-increment continues from max in append -test_data_6a <- data.table::data.table( - id_seq = c(5L, 10L, 15L), - value = c(100, 200, 300) -) -db_autoinc$commit_overwrite( - test_data_6a, - "test_continue_t" -) -test_data_6b <- data.table::data.table( - value = c(400, 500) +# Test 26: set_periods method +test_dir_periods <- tempfile("evoland_periods_") +on.exit(unlink(test_dir_periods, recursive = TRUE), add = TRUE) +db_periods <- evoland_db$new(test_dir_periods) + +expect_silent( + db_periods$set_periods( + period_length_str = "P10Y", + start_observed = "1985-01-01", + end_observed = "2020-01-01", + end_extrapolated = "2060-01-01" + ) ) -db_autoinc$commit_append( - test_data_6b, - "test_continue_t", - autoincrement_cols = "id_seq" +expect_true(db_periods$row_count("periods_t") > 0L) + +# Should refuse to overwrite +expect_warning( + db_periods$set_periods(), + "not empty" ) -result_6 <- db_autoinc$fetch("test_continue_t") -expect_equal(nrow(result_6), 5L) -expect_equal(result_6$id_seq[4:5], c(16L, 17L)) -expect_equal(result_6$value[4:5], c(400, 500)) diff --git a/inst/tinytest/test_parquet_duckdb.R b/inst/tinytest/test_parquet_duckdb.R new file mode 100644 index 0000000..5f1c3db --- /dev/null +++ b/inst/tinytest/test_parquet_duckdb.R @@ -0,0 +1,392 @@ +# Test generic parquet_duckdb functionality +library(tinytest) + +# Create temporary directory for testing +test_dir <- tempfile("parquet_duckdb_test_") +on.exit(unlink(test_dir, recursive = TRUE), add = TRUE) + +# Test 1: Initialization +expect_silent( + db <- parquet_duckdb$new( + path = test_dir, + default_format = "parquet" + ) +) +expect_true(inherits(db, "parquet_duckdb")) +expect_true(dir.exists(test_dir)) +expect_equal(db$default_format, "parquet") +expect_true(!is.null(db$connection)) +expect_true(inherits(db$connection, "duckdb_connection")) + +# Test 2: Initial state - no tables +expect_identical(db$list_tables(), character(0)) + +# Test 3: Fetch from non-existent table returns empty data.table +result <- db$fetch("nonexistent_table") +expect_true(inherits(result, "data.table")) +expect_equal(nrow(result), 0L) + +# Test 4: Row count for non-existent table +expect_equal(db$row_count("nonexistent_table"), 0L) + +# Test 5: commit overwrite creates new table +test_data_1 <- data.table::data.table( + id = 1:5, + name = letters[1:5], + value = c(10.1, 20.2, 30.3, 40.4, 50.5) +) +expect_silent( + db$commit( + method = "overwrite", + test_data_1, + "test_table_1" + ) +) +expect_true("test_table_1" %in% db$list_tables()) +expect_equal(db$row_count("test_table_1"), 5L) + +# Test 6: Fetch retrieves committed data +retrieved <- db$fetch("test_table_1") +expect_equal(retrieved, test_data_1) + +# Test 7: commit overwrite replaces existing data +test_data_1b <- data.table::data.table( + id = 10:12, + name = letters[24:26], + value = c(100.1, 200.2, 300.3) +) +expect_silent( + db$commit( + method = "overwrite", + test_data_1b, + "test_table_1" + ) +) +expect_equal(db$row_count("test_table_1"), 3L) +retrieved <- db$fetch("test_table_1") 
+expect_equal(retrieved, test_data_1b) + +# Test 8: commit append adds to existing data +test_data_1c <- data.table::data.table( + id = 13:15, + name = letters[1:3], + value = c(111.1, 222.2, 333.3) +) +expect_silent( + db$commit(test_data_1c, "test_table_1", method = "append") +) +expect_equal(db$row_count("test_table_1"), 6L) +retrieved <- db$fetch("test_table_1") +expect_equal(nrow(retrieved), 6L) +expect_true(all(c(10:15) %in% retrieved$id)) + +# Test 9: commit append on non-existent table creates it +expect_silent( + db$commit(test_data_1, "test_table_2", method = "append") +) +expect_true("test_table_2" %in% db$list_tables()) +expect_equal(db$row_count("test_table_2"), 5L) + +# Test 10: commit w/ upsert on non-existent table creates it +test_data_3 <- data.table::data.table( + id_key = 1:3, + name = c("a", "b", "c"), + value = c(1.1, 2.2, 3.3) +) +expect_silent( + db$commit(test_data_3, "test_table_3", key_cols = "id_key", method = "upsert") +) +expect_true("test_table_3" %in% db$list_tables()) +expect_equal(db$row_count("test_table_3"), 3L) + +# Test 11: commit w/ upsert updates existing rows and inserts new ones +test_data_3b <- data.table::data.table( + id_key = c(2L, 3L, 4L), + name = c("b_updated", "c_updated", "d"), + value = c(22.2, 33.3, 44.4) +) +expect_silent( + db$commit(test_data_3b, "test_table_3", key_cols = "id_key", method = "upsert") +) +expect_equal(db$row_count("test_table_3"), 4L) +retrieved <- db$fetch("test_table_3") +expect_equal(nrow(retrieved), 4L) +expect_equal(retrieved[id_key == 2]$name, "b_updated") +expect_equal(retrieved[id_key == 3]$value, 33.3) +expect_equal(retrieved[id_key == 4]$name, "d") +expect_equal(retrieved[id_key == 1]$name, "a") # unchanged + +# Test 12: Fetch with WHERE clause +result <- db$fetch("test_table_3", where = "id_key >= 3") +expect_equal(nrow(result), 2L) +expect_true(all(result$id_key >= 3)) + +# Test 13: Fetch with LIMIT +result <- db$fetch("test_table_3", limit = 2) +expect_equal(nrow(result), 2L) + +# Test 14: Fetch with WHERE and LIMIT +result <- db$fetch("test_table_3", where = "value > 20", limit = 1) +expect_equal(nrow(result), 1L) +expect_true(result$value > 20) + +# Test 15: delete_from with WHERE clause +deleted_count <- db$delete_from("test_table_3", where = "id_key = 1") +expect_equal(deleted_count, 1L) +expect_equal(db$row_count("test_table_3"), 3L) +retrieved <- db$fetch("test_table_3") +expect_false(1L %in% retrieved$id_key) + +# Test 16: delete_from with complex WHERE +db$commit( + method = "overwrite", + test_data_1, + "test_table_4" +) +deleted_count <- db$delete_from("test_table_4", where = "id < 3") +expect_equal(deleted_count, 2L) +retrieved <- db$fetch("test_table_4") +expect_true(all(retrieved$id >= 3)) + +# Test 17: delete_from with NULL (delete all) +count_before <- db$row_count("test_table_4") +deleted_count <- db$delete_from("test_table_4", where = NULL) +expect_equal(deleted_count, count_before) +expect_equal(db$row_count("test_table_4"), 0L) + +# Test 18: delete_from on non-existent table +deleted_count <- db$delete_from("nonexistent", where = "id = 1") +expect_equal(deleted_count, 0L) + +# Test 19: delete_from with WHERE that matches nothing +db$commit( + method = "overwrite", + test_data_1, + "test_table_5" +) +initial_count <- db$row_count("test_table_5") +deleted_count <- db$delete_from("test_table_5", where = "id = 999") +expect_equal(deleted_count, 0L) +expect_equal(db$row_count("test_table_5"), initial_count) + +# Test 20: Auto-increment on overwrite (new table) +test_autoinc_1 <- 
data.table::data.table( + name = c("item_a", "item_b", "item_c"), + value = c(10, 20, 30) +) +db$commit( + method = "overwrite", + test_autoinc_1, + "test_autoinc_1", + autoincrement_cols = "id" +) +result <- db$fetch("test_autoinc_1") +expect_equal(result$id, 1:3) +expect_equal(result$name, c("item_a", "item_b", "item_c")) + +# Test 21: Auto-increment on append +test_autoinc_2 <- data.table::data.table( + name = c("item_d", "item_e"), + value = c(40, 50) +) +db$commit( + test_autoinc_2, + "test_autoinc_1", + autoincrement_cols = "id", + method = "append" +) +result <- db$fetch("test_autoinc_1") +expect_equal(nrow(result), 5L) +expect_equal(result$id, 1:5) +expect_equal(result$name[4:5], c("item_d", "item_e")) + +# Test 22: Auto-increment on upsert with new rows +test_autoinc_3 <- data.table::data.table( + name = c("item_f", "item_g"), + value = c(60, 70) +) +db$commit( + test_autoinc_3, + "test_autoinc_1", + autoincrement_cols = "id", + method = "upsert" +) +result <- db$fetch("test_autoinc_1") +expect_equal(nrow(result), 7L) +expect_equal(result$id, 1:7) + +# Test 23: Auto-increment warning when overriding existing IDs +test_autoinc_with_ids <- data.table::data.table( + id = c(NA, 100L, NA), + name = c("new_a", "existing", "new_b"), + value = c(1, 2, 3) +) +expect_warning( + db$commit( + method = "overwrite", + test_autoinc_with_ids, + "test_autoinc_2", + autoincrement_cols = "id" + ), + "Overriding existing IDs" +) +result <- db$fetch("test_autoinc_2") +expect_equal(result$id, 1:3) + +# Test 24: Multiple auto-increment columns +test_multi_autoinc <- data.table::data.table( + name = c("item1", "item2"), + value = c(10, 20) +) +db$commit( + method = "overwrite", + test_multi_autoinc, + "test_multi_autoinc", + autoincrement_cols = c("id_a", "id_b") +) +result <- db$fetch("test_multi_autoinc") +expect_equal(result$id_a, 1:2) +expect_equal(result$id_b, 1:2) + +# Test 25: Auto-increment continues from max +test_continue_a <- data.table::data.table( + id_seq = c(5L, 10L, 15L), + value = c(100, 200, 300) +) +db$commit( + method = "overwrite", + test_continue_a, + "test_continue" +) + +test_continue_b <- data.table::data.table( + value = c(400, 500) +) +db$commit( + test_continue_b, + "test_continue", + autoincrement_cols = "id_seq", + method = "append" +) +result <- db$fetch("test_continue") +expect_equal(nrow(result), 5L) +expect_equal(result$id_seq[4:5], c(16L, 17L)) +expect_equal(result$value[4:5], c(400, 500)) + +# Test 26: attach_table and detach_table +db$commit( + method = "overwrite", + test_data_1, + "test_attach" +) +expect_silent(db$attach_table("test_attach")) +# Verify table is attached by querying it directly +result <- db$get_query("SELECT COUNT(*) as n FROM test_attach") +expect_equal(result$n, 5L) +expect_silent(db$detach_table("test_attach")) +# After detach, table should not be accessible +expect_error( + db$get_query("SELECT COUNT(*) as n FROM test_attach"), + "test_attach" +) + +# Test 27: attach_table with column selection +db$attach_table("test_attach", columns = c("id", "name")) +result <- db$get_query("SELECT * FROM test_attach") +expect_equal(ncol(result), 2L) +expect_true(all(c("id", "name") %in% names(result))) +expect_false("value" %in% names(result)) +db$detach_table("test_attach") + +# Test 28: attach_table with WHERE clause +db$attach_table("test_attach", where = "id > 3") +result <- db$get_query("SELECT * FROM test_attach") +expect_equal(nrow(result), 2L) +expect_true(all(result$id > 3)) +db$detach_table("test_attach") + +# Test 29: execute() method 
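+# execute() wraps DBI::dbExecute and returns the number of rows affected,
+# while get_query() wraps DBI::dbGetQuery and returns a data.table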
+db$attach_table("test_attach") +rows_affected <- db$execute("DELETE FROM test_attach WHERE id = 1") +expect_true(rows_affected >= 0) # DuckDB returns number of affected rows +db$detach_table("test_attach") + +# Test 30: get_query() method +db$attach_table("test_attach") +result <- db$get_query("SELECT MAX(id) as max_id FROM test_attach") +expect_true(inherits(result, "data.table")) +expect_true("max_id" %in% names(result)) +db$detach_table("test_attach") + +# Test 31: CSV format support +test_dir_csv <- tempfile("parquet_duckdb_csv_") +on.exit(unlink(test_dir_csv, recursive = TRUE), add = TRUE) + +db_csv <- parquet_duckdb$new( + path = test_dir_csv, + default_format = "csv" +) +expect_equal(db_csv$default_format, "csv") + +test_csv_data <- data.table::data.table( + id = 1:3, + name = c("a", "b", "c") +) +db_csv$commit(test_csv_data, "csv_table", method = "overwrite") +expect_true("csv_table" %in% db_csv$list_tables()) +retrieved <- db_csv$fetch("csv_table") +expect_equal(retrieved, test_csv_data) + +# Test 32: Extension loading +test_dir_ext <- tempfile("parquet_duckdb_ext_") +on.exit(unlink(test_dir_ext, recursive = TRUE), add = TRUE) + +db_ext <- parquet_duckdb$new( + path = test_dir_ext, + extensions = "spatial" +) +# Verify spatial extension is loaded by using a spatial function +expect_silent( + db_ext$get_query("SELECT ST_Point(0, 0) as geom") +) + +# Test 33: Persistence across connections +db$commit( + method = "overwrite", + test_data_1, + "persist_test" +) +rm(db) +gc() + +# Reconnect to same path +db <- parquet_duckdb$new(path = test_dir) +expect_true("persist_test" %in% db$list_tables()) +retrieved <- db$fetch("persist_test") +expect_equal(retrieved, test_data_1) + +# Test 34: commit with no key_cols defaults to append +test_no_keys <- data.table::data.table( + name = c("x", "y"), + value = c(1, 2) +) +db$commit( + method = "overwrite", + test_no_keys, + "no_keys_test" +) +expect_equal(db$row_count("no_keys_test"), 2L) + +db$commit( + test_no_keys, + "no_keys_test", + key_cols = character(0), + method = "upsert" +) +expect_equal(db$row_count("no_keys_test"), 4L) # Should append + +# Test 35: Print method +expect_stdout( + print(db), + "Public methods:|Active bindings:|Format|Compression" +) diff --git a/inst/tinytest/test_trans_preds_t.R b/inst/tinytest/test_trans_preds_t.R index 866c455..d9c9da2 100644 --- a/inst/tinytest/test_trans_preds_t.R +++ b/inst/tinytest/test_trans_preds_t.R @@ -2,6 +2,206 @@ library(tinytest) # Test creation and validation -trans_preds_t <- create_trans_preds_t() +trans_preds_t <- as_trans_preds_t() expect_silent(print(trans_preds_t)) expect_true(nrow(trans_preds_t) >= 0L) + + +#' Actually running the predictor selection logic is heavily dependent on previous +#' steps. Hence, this is a more heavy-handed test verging on integration testing. 
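The integration test below ultimately exercises `prune_trans_preds()`, whose default `filter_fun` is `covariance_filter()` (documented in `man/covariance_filter.Rd` in this changeset). As a minimal standalone sketch of that filter's contract (toy data and invented column names, not part of the test suite):

```r
library(data.table)
set.seed(1)
toy <- data.table(
  result = rbinom(200, 1, 0.3),          # 0: no transition, 1: transition
  elevation = runif(200, 400, 800),
  slope = runif(200, 0, 30)
)
toy[, slope_noisy := slope + rnorm(200)] # near-duplicate of `slope`

# Candidates are ranked by the default polynomial-GLM p-value, then the
# correlation step drops anything with |Pearson r| > corcut against an
# already-kept covariate, so one of slope/slope_noisy should be discarded.
covariance_filter(toy, result_col = "result", corcut = 0.7)
```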
+test_dir_trans_preds <- tempfile("evoland_trans_preds_") +on.exit(unlink(test_dir_trans_preds, recursive = TRUE), add = TRUE) +db_tps <- evoland_db$new(test_dir_trans_preds) + +# Set up minimal coords and periods +db_tps$coords_t <- create_coords_t_square( + epsg = 2056, + extent = terra::ext(c(xmin = 2697000, xmax = 2697500, ymin = 1252000, ymax = 1252500)), + resolution = 100 +) + +db_tps$periods_t <- create_periods_t( + period_length_str = "P10Y", + start_observed = "1985-01-01", + end_observed = "2005-01-01", + end_extrapolated = "2015-01-01" +) + +# Set up LULC classes +db_tps$lulc_meta_t <- create_lulc_meta_t(list( + forest = list(pretty_name = "Forest", src_classes = 1L), + urban = list(pretty_name = "Urban", src_classes = 2L), + agriculture = list(pretty_name = "Agriculture", src_classes = 3L) +)) + +# Create synthetic LULC data with known transitions +set.seed(42) +n_coords <- nrow(db_tps$coords_t) +lulc_data <- data.table::rbindlist(list( + data.table::data.table( + id_coord = 1:n_coords, + id_lulc = sample(1:2, n_coords, replace = TRUE, prob = c(0.7, 0.3)), + id_period = 1L + ), + data.table::data.table( + id_coord = 1:n_coords, + id_lulc = sample(1:2, n_coords, replace = TRUE, prob = c(0.5, 0.5)), + id_period = 2L + ), + data.table::data.table( + id_coord = 1:n_coords, + id_lulc = sample(1:2, n_coords, replace = TRUE, prob = c(0.4, 0.6)), + id_period = 3L + ) +)) +db_tps$lulc_data_t <- as_lulc_data_t(lulc_data) + +# Create transition metadata +transitions <- db_tps$fetch("transitions_v") +db_tps$trans_meta_t <- create_trans_meta_t( + transitions, + min_cardinality_abs = 5L +) + +# Add predictor metadata with multiple predictors +pred_spec_tps <- list( + elevation = list( + unit = "m", + pretty_name = "Elevation", + description = "Elevation above sea level", + sources = list(list(url = "https://example.com/elevation.tif", md5sum = "abc123")) + ), + slope = list( + unit = "degrees", + pretty_name = "Slope", + description = "Terrain slope", + sources = list(list(url = "https://example.com/slope.tif", md5sum = "def456")) + ), + distance_to_road = list( + unit = "m", + pretty_name = "Distance to road", + description = "Distance to nearest road", + sources = list(list(url = "https://example.com/roads.gpkg", md5sum = "ghi789")) + ), + aspect = list( + unit = "degrees", + pretty_name = "Aspect", + description = "Terrain aspect", + sources = list(list(url = "https://example.com/aspect.tif", md5sum = "jkl012")) + ), + soil_type = list( + pretty_name = "Soil type", + description = "Soil classification", + sources = list(list(url = "https://example.com/soil.tif", md5sum = "mno345")) + ) +) +db_tps$pred_meta_t <- create_pred_meta_t(pred_spec_tps) + +# Add predictor data - mix of static and time-varying +set.seed(43) +pred_data_static <- data.table::rbindlist(list( + # Elevation (static, period 0) + data.table::data.table( + id_pred = 1L, + id_coord = 1:n_coords, + id_period = 0L, + value = runif(n_coords, 400, 800) + ), + # Slope (static, period 0) + data.table::data.table( + id_pred = 2L, + id_coord = 1:n_coords, + id_period = 0L, + value = runif(n_coords, 0, 30) + ), + # Aspect (static, period 0) + data.table::data.table( + id_pred = 4L, + id_coord = 1:n_coords, + id_period = 0L, + value = runif(n_coords, 0, 360) + ) +)) + +# Time-varying predictor +pred_data_varying <- data.table::rbindlist(lapply(1:3, function(period) { + data.table::data.table( + id_pred = 3L, # distance_to_road + id_coord = 1:n_coords, + id_period = period, + value = runif(n_coords, 0, 5000) + ) +})) + 
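All predictor blocks in this fixture, including the integer one added just below, share the long layout (id_pred, id_coord, id_period, value); `trans_pred_data_v()` later pivots this into the wide, one-column-per-predictor table that the filters consume. A rough sketch of that shape, for orientation only (the real view runs in DuckDB and also joins in a binary `result` column):

```r
# One row per (id_coord, id_period), one column per id_pred. Static
# predictors (id_period == 0) come out as NA for periods 1-3 here; the
# actual view presumably carries them over as period-constant values.
wide_sketch <- data.table::dcast(
  rbind(pred_data_static, pred_data_varying),
  id_coord + id_period ~ id_pred,
  value.var = "value"
)
```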
+db_tps$pred_data_t_float <- as_pred_data_t( + rbind(pred_data_static, pred_data_varying), + type = "float" +) + +# Add integer predictor (soil_type) +pred_data_int <- data.table::data.table( + id_pred = 5L, + id_coord = 1:n_coords, + id_period = 0L, + value = sample(1:5, n_coords, replace = TRUE) +) +db_tps$pred_data_t_int <- as_pred_data_t(pred_data_int, type = "int") + +# Test pruning +expect_message( + db_tps$prune_trans_preds( + corcut = 0.2 + ), + "Processing transition 1/2" +) +trans_preds_result <- db_tps$trans_preds_t +expect_true(inherits(trans_preds_result, "trans_preds_t")) +expect_equal(nrow(trans_preds_result), 4L) + +# Verify that all id_trans in result are viable +viable_trans_ids <- db_tps$trans_meta_t[is_viable == TRUE]$id_trans +expect_true(all(trans_preds_result$id_trans %in% viable_trans_ids)) + +# reset to full set of trans - preds +expect_silent(db_tps$set_full_trans_preds(overwrite = TRUE)) +expect_message( + db_tps$prune_trans_preds( + filter_fun = grrf_filter, + num.trees = 10, + gamma = 0.9 + ), + "Selected 5 predictor\\(s\\) for transition" +) + +# Test error handling - empty database +test_dir_empty <- tempfile("evoland_empty_") +on.exit(unlink(test_dir_empty, recursive = TRUE), add = TRUE) +db_empty <- evoland_db$new(test_dir_empty) + +expect_error( + db_empty$prune_trans_preds(), + "Table `trans_meta_t` does not exist" +) + +# Test with no predictors +test_dir_no_pred <- tempfile("evoland_no_pred_") +on.exit(unlink(test_dir_no_pred, recursive = TRUE), add = TRUE) +db_no_pred <- evoland_db$new(test_dir_no_pred) +db_no_pred$coords_t <- db_tps$coords_t +db_no_pred$periods_t <- db_tps$periods_t +db_no_pred$lulc_meta_t <- db_tps$lulc_meta_t +db_no_pred$lulc_data_t <- db_tps$lulc_data_t +expect_warning( + db_no_pred$trans_meta_t <- db_tps$trans_meta_t, + "Overriding existing IDs" +) +expect_error( + db_no_pred$prune_trans_preds(), + "Table `pred_meta_t` does not exist" +) + +# Test print method +expect_stdout( + print(trans_preds_result), + "Transition-Predictor|Total relationships" +) diff --git a/inst/tinytest/test_util_terra.R b/inst/tinytest/test_util_terra.R index b3089d3..35a0bab 100644 --- a/inst/tinytest/test_util_terra.R +++ b/inst/tinytest/test_util_terra.R @@ -140,7 +140,7 @@ expect_true(!anyNA(vector_na_omit_true$value)) vector_na_omit_false <- extract_using_coords_t(vect_with_na, coords_t, na_omit = FALSE) expect_true(anyNA(vector_na_omit_false$value)) -# Test compute_neighbors +# Test create_neighbors_t # Create a simple test coords_t with known distances test_coords <- data.table::data.table( id_coord = 1L:5L, @@ -152,7 +152,10 @@ test_coords <- data.table::data.table( test_coords <- as_coords_t(test_coords) # Test basic neighbor computation with max_distance = 150 -neighbors <- compute_neighbors(test_coords, max_distance = 150) +expect_stdout( + neighbors <- create_neighbors_t(test_coords, max_distance = 150), + "Progress" +) # Check structure expect_true(data.table::is.data.table(neighbors)) @@ -204,10 +207,13 @@ expect_equal( ) # Test with distance_breaks -neighbors_classified <- compute_neighbors( - test_coords, - max_distance = 150, - distance_breaks = c(0, 100, 150) +expect_stdout( + neighbors_classified <- create_neighbors_t( + test_coords, + max_distance = 150, + distance_breaks = c(0, 100, 150) + ), + "Progress" ) # Check that distance_class is populated @@ -234,7 +240,10 @@ expect_equal( ) # Test with smaller max_distance -neighbors_small <- compute_neighbors(test_coords, max_distance = 110) +expect_stdout( + neighbors_small <- 
create_neighbors_t(test_coords, max_distance = 110), + "Progress" +) # With max_distance = 110, point 1 should only have neighbors 2 and 4 (distance 100) # but not 5 (distance ~141.4) @@ -246,27 +255,27 @@ expect_false(5L %in% neighbors_from_1_small$id_coord_neighbor) # Test error handling expect_error( - compute_neighbors("not_coords_t", max_distance = 100), + create_neighbors_t("not_coords_t", max_distance = 100), "coords_t must be a coords_t object" ) expect_error( - compute_neighbors(test_coords, max_distance = -10), + create_neighbors_t(test_coords, max_distance = -10), "max_distance must be a positive scalar numeric" ) expect_error( - compute_neighbors(test_coords, max_distance = 100, distance_breaks = c(1)), + create_neighbors_t(test_coords, max_distance = 100, distance_breaks = c(1)), "distance_breaks must be NULL or a numeric vector with at least 2 elements" ) expect_error( - compute_neighbors(test_coords, max_distance = 100, distance_breaks = "invalid"), + create_neighbors_t(test_coords, max_distance = 100, distance_breaks = "invalid"), "distance_breaks must be NULL or a numeric vector with at least 2 elements" ) # Test with real coords_t from earlier in the test file -real_neighbors <- compute_neighbors(coords_t, max_distance = 300) +expect_stdout(real_neighbors <- create_neighbors_t(coords_t, max_distance = 300), "Progress") # Each point in a regular 100m grid should have neighbors # Interior points should have 8 neighbors within 300m (8-connectivity) @@ -291,10 +300,13 @@ for (i in seq_len( } # Test with distance classification on real data -real_neighbors_class <- compute_neighbors( - coords_t, - max_distance = 300, - distance_breaks = c(0, 150, 300) +expect_silent( + real_neighbors_class <- create_neighbors_t( + coords_t, + max_distance = 300, + distance_breaks = c(0, 150, 300), + quiet = TRUE + ) ) expect_true(all(!is.na(real_neighbors_class$distance_class))) diff --git a/man/covariance_filter.Rd b/man/covariance_filter.Rd new file mode 100644 index 0000000..0693652 --- /dev/null +++ b/man/covariance_filter.Rd @@ -0,0 +1,86 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/covariance_filter.R +\name{covariance_filter} +\alias{covariance_filter} +\alias{rank_poly_glm} +\alias{compute_balanced_weights} +\alias{select_by_correlation} +\title{Two stage covariate filtering} +\usage{ +covariance_filter( + data, + result_col = "result", + rank_fun = rank_poly_glm, + weights = compute_balanced_weights(data[[result_col]]), + corcut = 0.7, + ... +) + +rank_poly_glm(x, y, weights = NULL, ...) + +compute_balanced_weights(trans_result, legacy = FALSE) + +select_by_correlation(cor_mat, corcut) +} +\arguments{ +\item{data}{A data.table of target variable and candidate covariates to be filtered; +wide format with one predictor per column.} + +\item{result_col}{Name of the column representing the transition results (0: no +trans, 1: trans)} + +\item{rank_fun}{Optional function to compute ranking scores for each covariate. +Should take arguments (x, y, weights, ...) and return a single numeric value +(lower = better). 
Defaults to polynomial GLM p-value ranking.} + +\item{weights}{Optional weights vector} + +\item{corcut}{Correlation cutoff threshold} + +\item{...}{Additional arguments passed to rank_fun.} + +\item{x}{A numeric vector representing a single covariate} + +\item{y}{A binary outcome vector (0/1)} + +\item{trans_result}{Binary outcome vector (0/1)} + +\item{legacy}{Bool, use legacy weighting?} + +\item{cor_mat}{Absolute correlation matrix} +} +\value{ +A set of column names (covariates) to retain +} +\description{ +The \code{covariance_filter} returns a set of covariates for land use land cover change +(LULCC) models based on a two-stage variable selection: a first statistical fit +estimates a covariate's quality for a given prediction task. A second step selects +all variables below a given correlation threshold: We iterate over a correlation +matrix ordered in the first step. Starting within the leftmost column, all rows (i.e. +candidates) greater than the given threshold are dropped from the full set of +candidates. This candidate selection is retained and used to select the next column, +until no further columns are left to investigate. The columns that were iterated over +are those returned as a character vector of selected variable names. +} +\details{ +The function first ranks covariates using the provided ranking function (default: +quasibinomial polynomial GLM). Then, it iteratively removes highly (Pearson) +correlated variables based on the correlation cutoff threshold, preserving variables +in order of their ranking. See +\url{https://github.com/ethzplus/evoland-plus-legacy/blob/main/R/lulcc.covfilter.r} for +where the concept came from. The original author was Antoine Adde, with edits by +Benjamin Black. A similar mechanism is found in \url{https://github.com/antadde/covsel/}. +} +\section{Functions}{ +\itemize{ +\item \code{rank_poly_glm()}: Default ranking function using polynomial GLM. Returns +the lower p value for each of the polynomial terms + +\item \code{compute_balanced_weights()}: Compute class-balanced weights for imbalanced binary +outcomes; returns a numeric vector + +\item \code{select_by_correlation()}: Implements the iterative selection procedure. + +}} +\keyword{internal} diff --git a/man/evoland_db.Rd b/man/evoland_db.Rd index e555300..c3775f3 100644 --- a/man/evoland_db.Rd +++ b/man/evoland_db.Rd @@ -8,219 +8,115 @@ An R6 class that provides an interface to a folder-based data storage system for the evoland package. Each table is stored as a parquet (or CSV) file. This class uses DuckDB for in-memory SQL operations while persisting data to disk in parquet format for better compression. -} -\section{Public fields}{ -\if{html}{\out{
}} -\describe{ -\item{\code{connection}}{DBI connection object to an in-memory DuckDB database} - -\item{\code{path}}{Character string path to the data folder} -\item{\code{default_format}}{Default file format for new tables} - -\item{\code{writeopts}}{Default write options for DuckDB, see} +Inherits from \link{parquet_duckdb} for generic database operations. } -\if{html}{\out{
}} +\seealso{ +Additional methods and active bindings are added to this class in separate files: +\itemize{ +\item \link{evoland_db_tables} - Table active bindings (coords_t, lulc_data_t, etc.) +\item \link{evoland_db_views} - View active bindings (lulc_meta_long_v, etc.) and methods +\item \link{evoland_db_neighbors} - Neighbor analysis methods } -\section{Active bindings}{ -\if{html}{\out{
}} -\describe{ -\item{\code{coords_t}}{A \code{coords_t} instance; see \code{\link[=create_coords_t]{create_coords_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{periods_t}}{A \code{periods_t} instance; see \code{\link[=create_periods_t]{create_periods_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{lulc_meta_t}}{A \code{lulc_meta_t} instance; see \code{\link[=create_lulc_meta_t]{create_lulc_meta_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{lulc_meta_long_v}}{Return a \code{lulc_meta_long_v} instance, i.e. unrolled \code{lulc_meta_t}.} - -\item{\code{lulc_data_t}}{A \code{lulc_data_t} instance; see \code{\link[=as_lulc_data_t]{as_lulc_data_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{pred_data_t_float}}{A \code{pred_data_t_float} instance; see -\code{\link[=create_pred_data_t]{create_pred_data_t()}} for the type of object to assign. Assigning is an -upsert operation.} - -\item{\code{pred_data_t_int}}{A \code{pred_data_t_int} instance; see -\code{\link[=create_pred_data_t]{create_pred_data_t()}} for the type of object to assign. Assigning is an -upsert operation.} - -\item{\code{pred_data_t_bool}}{A \code{pred_data_t_bool} instance; see -\code{\link[=create_pred_data_t]{create_pred_data_t()}} for the type of object to assign. Assigning is an -upsert operation.} - -\item{\code{extent}}{Return a terra SpatExtent based on coords_t} - -\item{\code{coords_minimal}}{data.table with only (id_coord, lon, lat)} - -\item{\code{pred_meta_t}}{A \code{pred_meta_t} instance; see \code{\link[=create_pred_meta_t]{create_pred_meta_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{pred_sources_v}}{Retrieve a table of distinct predictor urls and their -md5sum} - -\item{\code{trans_meta_t}}{A \code{trans_meta_t} instance; see \code{\link[=create_trans_meta_t]{create_trans_meta_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{trans_preds_t}}{A \code{trans_preds_t} instance; see \code{\link[=create_trans_preds_t]{create_trans_preds_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{intrv_meta_t}}{A \code{intrv_meta_t} instance; see \code{\link[=create_intrv_meta_t]{create_intrv_meta_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{intrv_masks_t}}{A \code{intrv_masks_t} instance; see \code{\link[=as_intrv_masks_t]{as_intrv_masks_t()}} for the type of -object to assign. Assigning is an upsert operation.} - -\item{\code{trans_models_t}}{A \code{trans_models_t} instance; see \code{\link[=create_trans_models_t]{create_trans_models_t()}} for the type -of object to assign. Assigning is an upsert operation.} - -\item{\code{alloc_params_t}}{A \code{alloc_params_t} instance; see \code{\link[=as_alloc_params_t]{as_alloc_params_t()}} for the type -of object to assign. Assigning is an upsert operation.} } -\if{html}{\out{
}} +\section{Super class}{ +\code{\link[evoland:parquet_duckdb]{evoland::parquet_duckdb}} -> \code{evoland_db} } \section{Methods}{ \subsection{Public methods}{ \itemize{ +\item \href{#method-evoland_db-set_neighbors}{\code{evoland_db$set_neighbors()}} +\item \href{#method-evoland_db-generate_neighbor_predictors}{\code{evoland_db$generate_neighbor_predictors()}} +\item \href{#method-evoland_db-trans_pred_data_v}{\code{evoland_db$trans_pred_data_v()}} \item \href{#method-evoland_db-new}{\code{evoland_db$new()}} -\item \href{#method-evoland_db-commit_overwrite}{\code{evoland_db$commit_overwrite()}} -\item \href{#method-evoland_db-commit_append}{\code{evoland_db$commit_append()}} -\item \href{#method-evoland_db-commit_upsert}{\code{evoland_db$commit_upsert()}} \item \href{#method-evoland_db-fetch}{\code{evoland_db$fetch()}} -\item \href{#method-evoland_db-list_tables}{\code{evoland_db$list_tables()}} -\item \href{#method-evoland_db-execute}{\code{evoland_db$execute()}} -\item \href{#method-evoland_db-get_query}{\code{evoland_db$get_query()}} -\item \href{#method-evoland_db-attach_table}{\code{evoland_db$attach_table()}} -\item \href{#method-evoland_db-detach_table}{\code{evoland_db$detach_table()}} -\item \href{#method-evoland_db-row_count}{\code{evoland_db$row_count()}} -\item \href{#method-evoland_db-delete_from}{\code{evoland_db$delete_from()}} \item \href{#method-evoland_db-set_report}{\code{evoland_db$set_report()}} \item \href{#method-evoland_db-set_coords}{\code{evoland_db$set_coords()}} \item \href{#method-evoland_db-set_periods}{\code{evoland_db$set_periods()}} \item \href{#method-evoland_db-add_predictor}{\code{evoland_db$add_predictor()}} +\item \href{#method-evoland_db-set_full_trans_preds}{\code{evoland_db$set_full_trans_preds()}} +\item \href{#method-evoland_db-prune_trans_preds}{\code{evoland_db$prune_trans_preds()}} \item \href{#method-evoland_db-clone}{\code{evoland_db$clone()}} } } +\if{html}{\out{ +
Inherited methods + +
+}} \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-new}{}}} -\subsection{Method \code{new()}}{ -Initialize a new evoland_db object +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-set_neighbors}{}}} +\subsection{Method \code{set_neighbors()}}{ \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$new(path, default_format = c("parquet", "csv"), ...)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{evoland_db$set_neighbors( + max_distance = 1000, + distance_breaks = c(0, 100, 500, 1000), + resolution = 100, + overwrite = FALSE +)}\if{html}{\out{
}} } -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{path}}{Character string. Path to the data folder.} - -\item{\code{default_format}}{Character. Default file format ("parquet" or "csv"). -Default is "parquet".} - -\item{\code{...}}{passed on to \code{set_report}} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -A new \code{evoland_db} object -} } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-commit_overwrite}{}}} -\subsection{Method \code{commit_overwrite()}}{ -Commit data in overwrite mode +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-generate_neighbor_predictors}{}}} +\subsection{Method \code{generate_neighbor_predictors()}}{ \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$commit_overwrite( - x, - table_name, - autoincrement_cols = character(0), - map_cols = character(0) -)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{evoland_db$generate_neighbor_predictors()}\if{html}{\out{
}} } -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Data frame to commit} - -\item{\code{table_name}}{Character string table name} - -\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} - -\item{\code{map_cols}}{Character vector of columns to convert to MAP format} -} -\if{html}{\out{
}} -} } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-commit_append}{}}} -\subsection{Method \code{commit_append()}}{ -Commit data in append mode +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-trans_pred_data_v}{}}} +\subsection{Method \code{trans_pred_data_v()}}{ \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$commit_append( - x, - table_name, - autoincrement_cols = character(0), - map_cols = character(0) -)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{evoland_db$trans_pred_data_v(id_trans, id_pred = NULL, na_value = NA)}\if{html}{\out{
}} } -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{Data frame to commit} - -\item{\code{table_name}}{Character string table name} - -\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} - -\item{\code{map_cols}}{Character vector of columns to convert to MAP format} -} -\if{html}{\out{
}} -} } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-commit_upsert}{}}} -\subsection{Method \code{commit_upsert()}}{ -Commit data in upsert mode +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-new}{}}} +\subsection{Method \code{new()}}{ +Initialize a new evoland_db object \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$commit_upsert( - x, - table_name, - key_cols = grep("^id_", names(x), value = TRUE), - autoincrement_cols = character(0), - map_cols = character(0) -)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{evoland_db$new(path, default_format = c("parquet", "csv"), ...)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{x}}{Data frame to commit} - -\item{\code{table_name}}{Character string table name} - -\item{\code{key_cols}}{Identify unique columns - heuristic: if prefixed with -id_, the set of all columns designates a uniqueness condition} +\item{\code{path}}{Character string. Path to the data folder.} -\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} +\item{\code{default_format}}{Character. Default file format ("parquet" or "csv"). +Default is "parquet".} -\item{\code{map_cols}}{Character vector of columns to convert to MAP format} +\item{\code{...}}{passed on to \code{set_report}} } \if{html}{\out{
}} } +\subsection{Returns}{ +A new \code{evoland_db} object +} } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-fetch}{}}} \subsection{Method \code{fetch()}}{ -Fetch data from storage +Fetch data from storage with evoland-specific view support \subsection{Usage}{ \if{html}{\out{
}}\preformatted{evoland_db$fetch(table_name, where = NULL, limit = NULL)}\if{html}{\out{
}} } @@ -241,139 +137,6 @@ A data.table } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-list_tables}{}}} -\subsection{Method \code{list_tables()}}{ -List all tables (files) in storage -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$list_tables()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -Character vector of table names -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-execute}{}}} -\subsection{Method \code{execute()}}{ -Execute statement -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$execute(statement)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{statement}}{A SQL statement} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -No. of rows affected by statement -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-get_query}{}}} -\subsection{Method \code{get_query()}}{ -Get Query -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$get_query(statement)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{statement}}{A SQL statement} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -No. of rows affected by statement -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-attach_table}{}}} -\subsection{Method \code{attach_table()}}{ -Attach one or more tables from the database folder as temporary tables in DuckDB. -This is useful for working with multiple tables in SQL queries without loading -them into R memory. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$attach_table(table_name, columns = "*")}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{table_name}}{Character vector. Names of table to attach.} - -\item{\code{columns}}{Character vector. Optional sql column selection, defaults to "*"} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-detach_table}{}}} -\subsection{Method \code{detach_table()}}{ -Detach one or more tables from the database. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$detach_table(table_name)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{table_name}}{Character. Name of table to drop.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-row_count}{}}} -\subsection{Method \code{row_count()}}{ -Get table row count -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$row_count(table_name)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{table_name}}{Character string. Name of the table to query.} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -No. of rows -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-evoland_db-delete_from}{}}} -\subsection{Method \code{delete_from()}}{ -Delete rows from a table -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{evoland_db$delete_from(table_name, where = NULL)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{table_name}}{Character string. Name of the table to delete from.} - -\item{\code{where}}{Character string, defaults to NULL: delete everything in table.} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -No. of rows affected -} -} -\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-evoland_db-set_report}{}}} \subsection{Method \code{set_report()}}{ @@ -460,6 +223,28 @@ Add a predictor to the database } \if{html}{\out{}} } +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-set_full_trans_preds}{}}} +\subsection{Method \code{set_full_trans_preds()}}{ +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{evoland_db$set_full_trans_preds(overwrite = FALSE)}\if{html}{\out{
}} +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-evoland_db-prune_trans_preds}{}}} +\subsection{Method \code{prune_trans_preds()}}{ +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{evoland_db$prune_trans_preds( + filter_fun = covariance_filter, + na_value = NA, + ... +)}\if{html}{\out{
}} +} + } \if{html}{\out{
}}
\if{html}{\out{}}
diff --git a/man/evoland_db_neighbors.Rd b/man/evoland_db_neighbors.Rd
new file mode 100644
index 0000000..91db904
--- /dev/null
+++ b/man/evoland_db_neighbors.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/evoland_db_neighbors.R
+\name{evoland_db_neighbors}
+\alias{evoland_db_neighbors}
+\title{Neighbor analysis methods for evoland_db}
+\description{
+This file adds neighbor analysis methods to the \code{evoland_db} class using R6's \verb{$set()} method.
+These methods compute neighbor relationships and generate neighbor-based predictors.
+}
+\section{Methods Added}{
+
+\itemize{
+\item \code{set_neighbors(max_distance, distance_breaks, resolution, overwrite)} -
+Computes neighbor relationships between coordinates.
+\itemize{
+\item \code{max_distance}: Maximum distance for neighbors (default: 1000)
+\item \code{distance_breaks}: Vector of breaks for distance classes (default: c(0, 100, 500, 1000))
+\item \code{resolution}: Grid resolution for distance calculations (default: 100)
+\item \code{overwrite}: Whether to overwrite existing neighbors_t (default: FALSE)
+}
+\item \code{generate_neighbor_predictors()} - Generates predictor variables based on neighbor
+land use counts by distance class. Requires neighbors_t with distance_class column
+(set_neighbors called with distance_breaks).
+}
+}
+
diff --git a/man/evoland_db_tables.Rd b/man/evoland_db_tables.Rd
new file mode 100644
index 0000000..59cc49d
--- /dev/null
+++ b/man/evoland_db_tables.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/evoland_db_tables.R
+\name{evoland_db_tables}
+\alias{evoland_db_tables}
+\title{Table active bindings for evoland_db}
+\description{
+This file adds table active bindings to the \code{evoland_db} class using R6's \verb{$set()} method.
+These bindings provide read/write access to database tables with automatic validation.
+}
+\section{Active Bindings Added}{
+
+\itemize{
+\item \code{coords_t} - Coordinates table. See \code{\link[=as_coords_t]{as_coords_t()}}
+\item \code{periods_t} - Time periods table. See \code{\link[=as_periods_t]{as_periods_t()}}
+\item \code{lulc_meta_t} - Land use/land cover metadata. See \code{\link[=as_lulc_meta_t]{as_lulc_meta_t()}}
+\item \code{lulc_data_t} - Land use/land cover data. See \code{\link[=as_lulc_data_t]{as_lulc_data_t()}}
+\item \code{pred_data_t_float} - Float predictor data. See \code{\link[=as_pred_data_t]{as_pred_data_t()}}
+\item \code{pred_data_t_int} - Integer predictor data. See \code{\link[=as_pred_data_t]{as_pred_data_t()}}
+\item \code{pred_data_t_bool} - Boolean predictor data. See \code{\link[=as_pred_data_t]{as_pred_data_t()}}
+\item \code{pred_meta_t} - Predictor metadata. See \code{\link[=as_pred_meta_t]{as_pred_meta_t()}}
+\item \code{trans_meta_t} - Transition metadata. See \code{\link[=as_trans_meta_t]{as_trans_meta_t()}}
+\item \code{trans_preds_t} - Transition-predictor relationships. See \code{\link[=as_trans_preds_t]{as_trans_preds_t()}}
+\item \code{intrv_meta_t} - Intervention metadata. See \code{\link[=as_intrv_meta_t]{as_intrv_meta_t()}}
+\item \code{intrv_masks_t} - Intervention masks. See \code{\link[=as_intrv_masks_t]{as_intrv_masks_t()}}
+\item \code{trans_models_t} - Transition models. See \code{\link[=as_trans_models_t]{as_trans_models_t()}}
+\item \code{alloc_params_t} - Allocation parameters. See \code{\link[=as_alloc_params_t]{as_alloc_params_t()}}
+\item \code{neighbors_t} - Neighbor relationships. 
See \code{\link[=as_neighbors_t]{as_neighbors_t()}} +} +} + diff --git a/man/evoland_db_views.Rd b/man/evoland_db_views.Rd index 68d5b66..d0a7c93 100644 --- a/man/evoland_db_views.Rd +++ b/man/evoland_db_views.Rd @@ -2,27 +2,26 @@ % Please edit documentation in R/evoland_db_views.R \name{evoland_db_views} \alias{evoland_db_views} -\alias{make_pred_sources_v} -\alias{make_lulc_meta_long_v} -\alias{make_coords_minimal} \title{Views on the evoland-plus data model} -\usage{ -make_pred_sources_v(self, private) - -make_lulc_meta_long_v(self, private) - -make_coords_minimal(self, private) -} \description{ -Functions to generate views on the database +This file adds view active bindings and methods to the \code{evoland_db} class using R6's \verb{$set()} method. +These provide computed views on the database without storing additional data. } -\section{Functions}{ +\section{Active Bindings Added}{ + \itemize{ -\item \code{make_pred_sources_v()}: Retrieve a table of distinct predictor urls and their -md5sum +\item \code{lulc_meta_long_v} - Unrolled LULC metadata with one row per source class +\item \code{pred_sources_v} - Distinct predictor URLs and their MD5 checksums +\item \code{transitions_v} - Land use transitions derived from lulc_data_t +\item \code{extent} - Spatial extent of coords_t as terra::SpatExtent +\item \code{coords_minimal} - Minimal coordinate representation (id_coord, lon, lat) +} +} -\item \code{make_lulc_meta_long_v()}: Return a \code{lulc_meta_long_v} instance, i.e. unrolled \code{lulc_meta_t}. +\section{Methods Added}{ -\item \code{make_coords_minimal()}: Minimal coordinate representation (id_coord, lon, lat) +\itemize{ +\item \code{trans_pred_data_v(id_trans)} - Returns wide table of transition results and predictor data for a specific transition. Used as input to covariance filtering. +} +} -}} diff --git a/man/grrf_filter.Rd b/man/grrf_filter.Rd new file mode 100644 index 0000000..a3bbb93 --- /dev/null +++ b/man/grrf_filter.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/grrf_filter.r +\name{grrf_filter} +\alias{grrf_filter} +\title{Guided Regularized Random Forest Feature Selection} +\usage{ +grrf_filter( + data, + result_col = "result", + weights = compute_balanced_weights(data[[result_col]]), + gamma = 0.5, + num.trees = 500, + max.depth = 100, + ... +) +} +\arguments{ +\item{data}{A data.table of target variable and candidate covariates to be filtered; wide format +with one predictor per column.} + +\item{result_col}{Name of the column representing the transition results (0: no trans, 1: trans)} + +\item{weights}{Optional named vector of class weights. If NULL, class-balanced weights +are computed automatically using compute_grrf_weights().} + +\item{gamma}{Numeric between 0-1 controlling the weight of the normalized importance +score (the "importance coefficient"). When gamma = 0, we perform unguided +regularized random forest (no guiding effect). When gamma = 1, we apply the +strongest guiding effect, leading to the most penalization of redundant +features and the most concise feature sets. Default is 0.5.} + +\item{num.trees}{Number of trees to grow in each random forest. 
Default is 500.}
+
+\item{max.depth}{Maximum depth of each tree, passed on to \code{ranger::ranger()}. Default is 100.}
+
+\item{...}{Additional arguments passed to ranger::ranger().}
+}
+\value{
+A character vector of column names (covariates) to retain, ordered by
+importance (most important first)
+}
+\description{
+The \code{grrf_filter} returns a set of covariates for land use land cover change (LULCC) models based
+on feature selection with Guided Regularized Random Forests. This is a two-stage random forest
+approach: a first unregularized random forest estimates variable importance scores. These scores
+are then used to guide a second regularized random forest that penalizes less important features,
+resulting in a more parsimonious feature set.
+}
+\details{
+The Guided Regularized Random Forest (GRRF) algorithm works as follows:
+\enumerate{
+\item Fit an initial unregularized random forest to obtain variable importance scores
+\item Normalize these importance scores and use them to compute regularization
+coefficients: coefReg = (1 - gamma) + gamma * normalized_importance
+\item Fit a regularized random forest using these coefficients to penalize splits
+on less important variables
+\item Return variables with positive importance in the regularized model
+}
+
+Class weights are used to handle class imbalance. Variables in terminal nodes are
+weighted by class, and splits are evaluated using weighted Gini impurity.
+
+The ranger implementation uses the \code{split.select.weights} parameter to apply
+regularization penalties, approximating the RRF regularization approach.
+}
+\references{
+Deng, H., & Runger, G. (2013). Gene selection with guided regularized random forest.
+Pattern Recognition, 46(12), 3483-3489. https://arxiv.org/pdf/1306.0237.pdf
+
+Original implementation by Antoine Adde, edited by Ben Black and adapted for
+ranger by the evoland-plus team.
+}
diff --git a/man/neighbors_t.Rd b/man/neighbors_t.Rd
new file mode 100644
index 0000000..55d0260
--- /dev/null
+++ b/man/neighbors_t.Rd
@@ -0,0 +1,80 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/neighbors_t.R
+\name{neighbors_t}
+\alias{neighbors_t}
+\alias{as_neighbors_t}
+\alias{create_neighbors_t}
+\alias{validate.neighbors_t}
+\alias{print.neighbors_t}
+\title{Create Neighbors Table}
+\usage{
+as_neighbors_t(x)
+
+create_neighbors_t(
+  coords_t,
+  max_distance,
+  distance_breaks = NULL,
+  resolution = 100,
+  quiet = FALSE
+)
+
+\method{validate}{neighbors_t}(x, ...)
+
+\method{print}{neighbors_t}(x, nrow = 10, ...)
+}
+\arguments{
+\item{x}{An object that can be passed to \code{\link[data.table:setDT]{data.table::setDT()}}}
+
+\item{coords_t}{A coords_t object containing coordinate points}
+
+\item{max_distance}{Maximum distance to search for neighbors (in same units as
+coordinates and resolution)}
+
+\item{distance_breaks}{Optional numeric vector defining distance class boundaries.
+If NULL, no distance classification is performed.
+If provided, must have at least 2 elements defining interval breaks.}
+
+\item{resolution}{Grid cell size for rasterization (default: 100.0, in same units as coordinates)}
+
+\item{quiet}{Logical; if TRUE, suppress progress output (default: FALSE)}
+
+\item{...}{Passed to \link[data.table:print.data.table]{data.table::print.data.table}}
+
+\item{nrow}{Maximum number of rows to print. 
See \link[data.table:print.data.table]{data.table::print.data.table}} +} +\value{ +A data.table of class "neighbors_t" with columns: +\itemize{ +\item \code{id_coord_origin}: Foreign key to coords_t (origin coordinate) +\item \code{id_coord_neighbor}: Foreign key to coords_t (neighbor coordinate) +\item \code{distance}: Numeric distance between coordinates +\item \code{distance_class}: Optional factor representing distance intervals +} + +A data.table with columns: +\itemize{ +\item id_coord_origin: ID of the origin coordinate +\item id_coord_neighbor: ID of the neighboring coordinate +\item distance: Distance between origin and neighbor +\item distance_class: Factor indicating distance class (if distance_breaks provided) +} +} +\description{ +Creates a \code{neighbors_t} table and validates that it matches the schema. +} +\section{Methods (by generic)}{ +\itemize{ +\item \code{validate(neighbors_t)}: Validate a neighbors_t object + +\item \code{print(neighbors_t)}: Print a neighbors_t object + +}} +\section{Functions}{ +\itemize{ +\item \code{create_neighbors_t()}: Compute neighboring coordinates within specified distances. In +order to be computationally feasible, the coordinates' IDs are rasterized before +their actual Euclidean distance is calculated. If coordinates are so close that they +get rasterized to the same cell, the first one is used and a warning is emitted. If +this happens, try again using a lower resolution. + +}} diff --git a/man/parquet_duckdb.Rd b/man/parquet_duckdb.Rd new file mode 100644 index 0000000..cb41afa --- /dev/null +++ b/man/parquet_duckdb.Rd @@ -0,0 +1,354 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parquet_duckdb.R +\name{parquet_duckdb} +\alias{parquet_duckdb} +\title{R6 Base Class for Parquet-Backed DuckDB Storage} +\description{ +A domain-agnostic R6 class that provides an interface to a folder-based data +storage system using DuckDB for in-memory SQL operations and parquet files +for efficient on-disk persistence. This class can be inherited by +domain-specific database classes. +} +\section{Public fields}{ +\if{html}{\out{
}} +\describe{ +\item{\code{connection}}{DBI connection object to an in-memory DuckDB database} + +\item{\code{path}}{Character string path to the data folder} + +\item{\code{default_format}}{Default file format for new tables} + +\item{\code{writeopts}}{Default write options for DuckDB} +} +\if{html}{\out{
}} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-parquet_duckdb-new}{\code{parquet_duckdb$new()}} +\item \href{#method-parquet_duckdb-execute}{\code{parquet_duckdb$execute()}} +\item \href{#method-parquet_duckdb-get_query}{\code{parquet_duckdb$get_query()}} +\item \href{#method-parquet_duckdb-attach_table}{\code{parquet_duckdb$attach_table()}} +\item \href{#method-parquet_duckdb-detach_table}{\code{parquet_duckdb$detach_table()}} +\item \href{#method-parquet_duckdb-row_count}{\code{parquet_duckdb$row_count()}} +\item \href{#method-parquet_duckdb-list_tables}{\code{parquet_duckdb$list_tables()}} +\item \href{#method-parquet_duckdb-with_tables}{\code{parquet_duckdb$with_tables()}} +\item \href{#method-parquet_duckdb-fetch}{\code{parquet_duckdb$fetch()}} +\item \href{#method-parquet_duckdb-delete_from}{\code{parquet_duckdb$delete_from()}} +\item \href{#method-parquet_duckdb-commit}{\code{parquet_duckdb$commit()}} +\item \href{#method-parquet_duckdb-print}{\code{parquet_duckdb$print()}} +\item \href{#method-parquet_duckdb-clone}{\code{parquet_duckdb$clone()}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-new}{}}} +\subsection{Method \code{new()}}{ +Initialize a new parquet_duckdb object +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$new( + path, + default_format = c("parquet", "csv"), + extensions = character(0) +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{path}}{Character string. Path to the data folder.} + +\item{\code{default_format}}{Character. Default file format ("parquet" or "csv"). +Default is "parquet".} + +\item{\code{extensions}}{Character vector of DuckDB extensions to load (e.g., "spatial")} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A new \code{parquet_duckdb} object +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-execute}{}}} +\subsection{Method \code{execute()}}{ +Execute a SQL statement +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$execute(statement)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{statement}}{A SQL statement} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Number of rows affected by statement +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-get_query}{}}} +\subsection{Method \code{get_query()}}{ +Execute a SQL query and return results +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$get_query(statement)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{statement}}{A SQL query statement} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A data.table with query results +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-attach_table}{}}} +\subsection{Method \code{attach_table()}}{ +Attach a table from parquet/CSV file as a temporary table in DuckDB +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$attach_table(table_name, columns = "*", where = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{table_name}}{Character. Name of table to attach.} + +\item{\code{columns}}{Character vector. Optional SQL column selection, defaults to "*"} + +\item{\code{where}}{Character. Optional SQL WHERE clause to subset the table.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Invisible NULL (called for side effects) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-detach_table}{}}} +\subsection{Method \code{detach_table()}}{ +Detach a table from the in-memory database +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$detach_table(table_name)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{table_name}}{Character. Name of table to drop.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Invisible NULL (called for side effects) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-row_count}{}}} +\subsection{Method \code{row_count()}}{ +Get row count for a table +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$row_count(table_name)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{table_name}}{Character string. Name of the table to query.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Integer number of rows +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-list_tables}{}}} +\subsection{Method \code{list_tables()}}{ +List all tables (files) in storage +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$list_tables()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +Character vector of table names +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-with_tables}{}}} +\subsection{Method \code{with_tables()}}{ +Execute a function with specified tables attached, handling attach/detach automatically. +If a table is already attached in the DuckDB instance, it won't be re-attached or detached. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$with_tables(tables, func, ...)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{tables}}{Character vector of table names to attach} + +\item{\code{func}}{Function to execute with tables attached} + +\item{\code{...}}{Additional arguments passed to func} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Result of func +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-fetch}{}}} +\subsection{Method \code{fetch()}}{ +Fetch data from a table +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$fetch(table_name, where = NULL, limit = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{table_name}}{Character string. Name of the table to query.} + +\item{\code{where}}{Character string. Optional WHERE clause for the SQL query.} + +\item{\code{limit}}{Integer. Optional limit on number of rows to return.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A data.table +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-delete_from}{}}} +\subsection{Method \code{delete_from()}}{ +Delete rows from a table +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$delete_from(table_name, where = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{table_name}}{Character string. Name of the table to delete from.} + +\item{\code{where}}{Character string. Optional WHERE clause; if NULL, deletes all rows.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Number of rows deleted +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-commit}{}}} +\subsection{Method \code{commit()}}{ +Commit data using overwrite, append, or upsert modes. Handles autoincrement, key +identity columns, and list-to-MAP conversion. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$commit( + x, + table_name, + key_cols, + autoincrement_cols = character(0), + map_cols = character(0), + method = c("overwrite", "append", "upsert") +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Data frame to commit. If character, in-duckdb-memory table.} + +\item{\code{table_name}}{Character string table name} + +\item{\code{key_cols}}{Character vector of columns that define uniqueness. If missing, +use all columns starting with \code{id_}} + +\item{\code{autoincrement_cols}}{Character vector of column names to auto-increment} + +\item{\code{map_cols}}{Character vector of columns to convert to MAP format} + +\item{\code{method}}{Character, one of "overwrite", "append", "upsert" (upsert being an +update for existing rows, and insert for new rows)} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Invisible NULL (called for side effects) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-print}{}}} +\subsection{Method \code{print()}}{ +Print method for parquet_duckdb +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$print(...)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{...}}{Not used} +} +\if{html}{\out{
}}
+}
+\subsection{Returns}{
+self (invisibly)
+}
+}
+\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-parquet_duckdb-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{parquet_duckdb$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +} diff --git a/man/trans_meta_t.Rd b/man/trans_meta_t.Rd index 8c1d23b..9577213 100644 --- a/man/trans_meta_t.Rd +++ b/man/trans_meta_t.Rd @@ -10,7 +10,7 @@ as_trans_meta_t(x) create_trans_meta_t( - lulc_data, + transitions, min_cardinality_abs = NULL, min_frequency_rel = NULL, exclude_anterior = NULL, @@ -20,7 +20,8 @@ create_trans_meta_t( \method{print}{trans_meta_t}(x, nrow = 10, ...) } \arguments{ -\item{lulc_data}{A \link{lulc_data_t} table with land use observations} +\item{transitions}{A transitions_v table, with columns id_coord, id_lulc_anterior, +id_lulc_posterior, id_period} \item{min_cardinality_abs}{Minimum absolute number of transitions for viability (optional)} @@ -58,6 +59,7 @@ entries for each viable transition type. }} \section{Functions}{ \itemize{ -\item \code{create_trans_meta_t()}: Calculate the transition metadata and mark for modelling feasibility +\item \code{create_trans_meta_t()}: Calculate the transition metadata and mark for modelling +feasibility. Does not attribute \code{id_trans}; this only makes sense as part of a DB. }} diff --git a/man/trans_preds_t.Rd b/man/trans_preds_t.Rd index 545381f..202028e 100644 --- a/man/trans_preds_t.Rd +++ b/man/trans_preds_t.Rd @@ -3,14 +3,11 @@ \name{trans_preds_t} \alias{trans_preds_t} \alias{as_trans_preds_t} -\alias{create_trans_preds_t} \alias{print.trans_preds_t} \title{Create Transition-Predictor Relationship Table} \usage{ as_trans_preds_t(x) -create_trans_preds_t() - \method{print}{trans_preds_t}(x, nrow = 10, ...) } \arguments{ @@ -37,9 +34,3 @@ modelling each transition type. \item \code{print(trans_preds_t)}: Print a trans_preds_t object, passing params to data.table print }} -\section{Functions}{ -\itemize{ -\item \code{create_trans_preds_t()}: Create a transition-predictor relation, i.e. records the -result of a predictor selection step. - -}} diff --git a/man/util.Rd b/man/util.Rd index 575d7f8..a83b6fc 100644 --- a/man/util.Rd +++ b/man/util.Rd @@ -26,7 +26,7 @@ ensure_dir(dir) print_rowwise_yaml(df) -cast_dt_col(x, colname, castfun) +cast_dt_col(x, colname, type) } \arguments{ \item{x}{Left-hand side value} diff --git a/man/util_terra.Rd b/man/util_terra.Rd index d46c499..e429d0a 100644 --- a/man/util_terra.Rd +++ b/man/util_terra.Rd @@ -3,43 +3,18 @@ \name{util_terra} \alias{util_terra} \alias{extract_using_coords_t} -\alias{compute_neighbors} \title{evoland utility functions to work with terra objects} \usage{ extract_using_coords_t(x, coords_t, na_omit = TRUE) - -compute_neighbors( - coords_t, - max_distance, - distance_breaks = NULL, - resolution = 100 -) } \arguments{ \item{x}{The object to extract from; use "simple" extraction for rasters, i.e. no resampling is done.} \item{coords_t}{A coords_t object containing coordinate points} - -\item{max_distance}{Maximum distance to search for neighbors (in same units as -coordinates and resolution)} - -\item{distance_breaks}{Optional numeric vector defining distance class boundaries. -If NULL, no distance classification is performed. -If provided, must have at least 2 elements defining interval breaks.} - -\item{resolution}{Grid cell size for rasterization (default: 100.0, in same units as coordinates)} } \value{ A long data.table with \code{id_coord}, a \code{layer}/\code{attribute} column, and a \code{value} column. 
NAs are omitted - -A data.table with columns: -\itemize{ -\item id_coord_origin: ID of the origin coordinate -\item id_coord_neighbor: ID of the neighboring coordinate -\item distance: Distance between origin and neighbor -\item distance_class: Factor indicating distance class (if distance_breaks provided) -} } \description{ Useful to coax terra raster and vector data into evoland tabular form. @@ -48,10 +23,4 @@ Useful to coax terra raster and vector data into evoland tabular form. \itemize{ \item \code{extract_using_coords_t()}: Extract values from a SpatRaster or SpatVector object using a (minimal) \code{coords_t} -\item \code{compute_neighbors()}: Compute neighboring coordinates within specified distances. In -order to be computationally feasible, the coordinates' IDs are rasterized before -their actual Euclidean distance is calculated. If coordinates are so close that they -get rasterized to the same cell, the first one is used and a warning is emitted. If -this happens, try again using a lower resolution. - }} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index c2f3be0..13f1a2b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -11,21 +11,22 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // distance_neighbors_cpp -List distance_neighbors_cpp(DataFrame coords_t, double max_distance, double resolution); -RcppExport SEXP _evoland_distance_neighbors_cpp(SEXP coords_tSEXP, SEXP max_distanceSEXP, SEXP resolutionSEXP) { +List distance_neighbors_cpp(DataFrame coords_t, double max_distance, double resolution, bool quiet); +RcppExport SEXP _evoland_distance_neighbors_cpp(SEXP coords_tSEXP, SEXP max_distanceSEXP, SEXP resolutionSEXP, SEXP quietSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< DataFrame >::type coords_t(coords_tSEXP); Rcpp::traits::input_parameter< double >::type max_distance(max_distanceSEXP); Rcpp::traits::input_parameter< double >::type resolution(resolutionSEXP); - rcpp_result_gen = Rcpp::wrap(distance_neighbors_cpp(coords_t, max_distance, resolution)); + Rcpp::traits::input_parameter< bool >::type quiet(quietSEXP); + rcpp_result_gen = Rcpp::wrap(distance_neighbors_cpp(coords_t, max_distance, resolution, quiet)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_evoland_distance_neighbors_cpp", (DL_FUNC) &_evoland_distance_neighbors_cpp, 3}, + {"_evoland_distance_neighbors_cpp", (DL_FUNC) &_evoland_distance_neighbors_cpp, 4}, {NULL, NULL, 0} }; diff --git a/src/distances.cpp b/src/distances.cpp index cd41ffb..2e23a21 100644 --- a/src/distances.cpp +++ b/src/distances.cpp @@ -24,17 +24,16 @@ using namespace Rcpp; * - id_coord_neighbor: ID of the neighboring coordinate * - distance_approx: Approximate distance between origin and neighbor * - * @note The function uses an approximate distance calculation based on grid cells. - * If multiple points fall into the same cell, a warning is issued and only the first - * point is retained for that cell. + * @note The function uses an approximate distance calculation based on grid + * cells. If multiple points fall into the same cell, a warning is issued and + * only the first point is retained for that cell. * * @warning Distance calculations are approximate and based on Euclidean distance in * coordinate space. For geographic coordinates, this may not reflect true geodesic distance. 
*/ // [[Rcpp::export]] -List distance_neighbors_cpp(DataFrame coords_t, - double max_distance, - double resolution = 100.0) { +List distance_neighbors_cpp(DataFrame coords_t, double max_distance, + double resolution = 100.0, bool quiet = false) { // Extract columns IntegerVector id_coord = coords_t["id_coord"]; @@ -112,7 +111,25 @@ List distance_neighbors_cpp(DataFrame coords_t, // 5. Find neighbors within max_distance + // Progress reporting setup + int progress_interval = std::max(1000, n_points / 20); // Report every 5% + + if (!quiet) { + Rcpp::Rcout << "\rProgress: 0% (0/" << n_points << " points)" << std::flush; + } + for (int pt_idx = 0; pt_idx < n_points; pt_idx++) { + // Check for user interrupt periodically + if (pt_idx % 1000 == 0) { + Rcpp::checkUserInterrupt(); + } + + // Report progress + if (!quiet && pt_idx > 0 && pt_idx % progress_interval == 0) { + int pct = (int)(100.0 * pt_idx / n_points); + Rcpp::Rcout << "\rProgress: " << pct << "% (" << pt_idx << "/" << n_points + << " points)" << std::flush; + } int origin_id = id_coord[pt_idx]; int origin_row = row_indices[pt_idx]; int origin_col = col_indices[pt_idx]; @@ -160,6 +177,10 @@ List distance_neighbors_cpp(DataFrame coords_t, Named("id_coord_neighbor") = neighbor_ids, Named("distance_approx") = distances_approx ); + if (!quiet) { + Rcpp::Rcout << "\rProgress: 100% (" << n_points << "/" << n_points + << " points)" << std::endl; + } res.attr("class") = CharacterVector::create("data.table", "data.frame"); diff --git a/vignettes/evoland.qmd b/vignettes/evoland.qmd index f06bef8..37da973 100644 --- a/vignettes/evoland.qmd +++ b/vignettes/evoland.qmd @@ -108,7 +108,7 @@ db$commit( value = c(lulc_files$url, lulc_files$md5sum, "BFS Arealstatistik") ), table_name = "reporting_t", - mode = "append" + method = "append" ) zippath <- file.path( @@ -322,7 +322,7 @@ id_coord_keep <- lulc_data_t[, id_coord] db$commit( x = db$coords_t[id_coord %in% id_coord_keep], table_name = "coords_t", - mode = "overwrite" + method = "overwrite" ) ```
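As a closing reference, the `mode` to `method` renames in the vignette hunks above follow the consolidated `commit()` signature documented in `man/parquet_duckdb.Rd`. A compact sketch of the three methods against a throwaway store (path, table, and column names invented here):

```r
library(data.table)

db <- parquet_duckdb$new(tempfile("pq_demo_"))
db$commit(data.table(id_key = 1:2, v = c("a", "b")),
          "demo", method = "overwrite") # replace the table wholesale
db$commit(data.table(id_key = 3L, v = "c"),
          "demo", method = "append")    # stack new rows onto the file
db$commit(data.table(id_key = c(2L, 4L), v = c("B", "d")),
          "demo", key_cols = "id_key", method = "upsert")
db$fetch("demo", where = "id_key >= 2") # id 2 updated to "B", id 4 inserted
```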