#' @title Plot Bipartite Network Metrics by Age
#' @description
#' Creates a faceted line-and-point plot of bipartite community
#' network metrics (e.g. connectance, nestedness, modularity)
#' across time slices (ages). Each metric is shown in its own
#' panel with a free y-axis scale.
#' @param data_network_metrics
#' A data frame or tibble with columns \code{age} (numeric),
#' \code{metric} (character), and \code{value} (numeric).
#' Typically the output of
#' \code{\link{compute_network_metrics}()} aggregated across
#' time slices and with the age column already parsed to numeric.
#' @param title
#' Optional character string for the plot title.
#' Defaults to \code{NULL} (no title).
#' @param subtitle
#' Optional character string for the plot subtitle.
#' Defaults to \code{NULL} (no subtitle).
#' @return
#' A \code{ggplot} object.
#' @details
#' The x-axis is reversed so that older ages appear on the left.
#' Each metric is shown in a separate facet panel with an
#' independent y-axis scale (\code{scales = "free_y"}) because
#' connectance, nestedness, and modularity occupy different
#' numeric ranges. Each panel shows a coloured line with points.
#' @seealso [compute_network_metrics()]
#' @export
plot_network_metrics_by_age <- function(
    data_network_metrics,
    title = NULL,
    subtitle = NULL) {
  assertthat::assert_that(
    base::is.data.frame(data_network_metrics),
    msg = "'data_network_metrics' must be a data frame."
  )

  assertthat::assert_that(
    base::all(
      c("age", "metric", "value") %in%
        base::names(data_network_metrics)
    ),
    msg = paste0(
      "'data_network_metrics' must have columns",
      " 'age', 'metric', and 'value'."
    )
  )

  assertthat::assert_that(
    base::is.numeric(data_network_metrics[["age"]]),
    msg = "Column 'age' in 'data_network_metrics' must be numeric."
  )

  assertthat::assert_that(
    base::is.numeric(data_network_metrics[["value"]]),
    msg = "Column 'value' in 'data_network_metrics' must be numeric."
  )

  res <-
    data_network_metrics |>
    ggplot2::ggplot(
      mapping = ggplot2::aes(
        x = age,
        y = value,
        colour = metric,
        group = metric
      )
    ) +
    ggplot2::geom_line() +
    ggplot2::geom_point() +
    # scale_x_reverse() replaces the previous
    # scale_x_continuous(trans = "reverse"): the `trans` argument was
    # renamed to `transform` in ggplot2 3.5 and is deprecated, while
    # scale_x_reverse() is stable across versions and identical in
    # behaviour (older ages end up on the left).
    ggplot2::scale_x_reverse() +
    ggplot2::facet_wrap(
      facets = ggplot2::vars(metric),
      scales = "free_y"
    ) +
    ggplot2::labs(
      title = title,
      subtitle = subtitle,
      x = "Age (cal yr BP)",
      y = "Metric value",
      colour = "Metric"
    ) +
    # Facet strips already name each metric, so the colour legend is
    # redundant and suppressed.
    ggplot2::theme(
      legend.position = "none"
    )

  return(res)
}
#' @title Filter Rare Taxa
#' @description
#' Filters out rare taxa from community data based on a minimum proportion
#' threshold. Only taxa meeting or exceeding the threshold are retained.
#' @param data
#' A data frame containing taxon abundance data. Must include a column
#' named 'pollen_prop' with numeric proportions or abundances.
#' @param minimal_proportion
#' Numeric value between 0 and 1 specifying the minimum proportion
#' threshold for retaining taxa. Default is 0.01 (1%).
#' @return
#' A filtered data frame containing only taxa that meet or exceed the
#' minimal proportion threshold. Preserves all original columns.
#' @details
#' The function validates that minimal_proportion is a numeric value
#' between 0 and 1. After filtering, it checks that at least one taxon
#' remains in the dataset. If no taxa meet the threshold, an error is
#' raised suggesting the threshold may be too high.
#' @export
filter_rare_taxa <- function(
    data = NULL,
    minimal_proportion = 0.01) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  # Fail early with a clear message if the documented 'pollen_prop'
  # column is absent; otherwise dplyr::filter() below would raise a
  # cryptic "object 'pollen_prop' not found" error.
  assertthat::assert_that(
    "pollen_prop" %in% colnames(data),
    msg = "data must contain a 'pollen_prop' column"
  )

  assertthat::assert_that(
    is.numeric(minimal_proportion),
    msg = "minimal_proportion must be a number"
  )

  assertthat::assert_that(
    minimal_proportion > 0,
    msg = "minimal_proportion must be greater than 0"
  )

  assertthat::assert_that(
    minimal_proportion <= 1,
    msg = "minimal_proportion must be less than or equal to 1"
  )

  # Keep only rows whose proportion meets the threshold; NA
  # proportions are dropped by dplyr::filter() as a side effect.
  res <-
    data |>
    dplyr::filter(pollen_prop >= minimal_proportion)

  assertthat::assert_that(
    nrow(res) > 0,
    msg = paste(
      "No taxa found in data. Please check the input data.",
      "The minimal_proportion is too high."
    )
  )

  return(res)
}
#' @title Get predictor collinearity
#' @description
#' Analyses collinearity among abiotic predictors in a long-format
#' data frame and returns a `collinear_output` object produced by
#' `collinear::collinear()`. The function pivots `data_source` from
#' long to wide format (one column per variable), removes any `age`
#' column, screens out zero-variance columns, and then performs the
#' collinearity analysis.
#' @param data_source
#' A data frame in long format containing at minimum the columns
#' `abiotic_variable_name` (character, predictor names) and
#' `abiotic_value` (numeric, predictor values). An optional `age`
#' column is silently dropped before analysis.
#' @return
#' A `collinear_output` object as returned by
#' `collinear::collinear()`. The object contains a `result` element
#' with a `selection` character vector of the non-collinear predictor
#' names that were retained.
#' @details
#' The function validates inputs with `assertthat` and performs
#' post-hoc assertions on the output to guarantee structural
#' integrity before returning. Missing values are filled with `NA`
#' when pivoting to wide format. The `age` column is excluded
#' because it is a sampling dimension rather than a predictor.
#' Predictor names are captured before pivoting so that the
#' zero-variance check is scoped to predictor columns only —
#' ID or metadata columns that survive the pivot are never passed
#' to `collinear::collinear()`. Any predictor whose standard
#' deviation is zero (or undefined, i.e. all values `NA`) across all
#' samples is dropped and reported via `cli::cli_warn()`. If no
#' predictor with non-zero variance remains, the function aborts via
#' `cli::cli_abort()`.
#' @seealso
#' [collinear::collinear()] for the underlying collinearity method,
#' [get_abiotic_data()] for producing the expected input format.
#' @export
get_predictor_collinearity <- function(data_source) {
  assertthat::assert_that(
    is.data.frame(data_source),
    msg = "data_source must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("abiotic_variable_name", "abiotic_value") %in%
        colnames(data_source)
    ),
    msg = paste0(
      "data_source must contain columns",
      " 'abiotic_variable_name' and 'abiotic_value'"
    )
  )

  # Capture predictor names before pivoting so the variation check
  # is restricted to predictors only (not ID/metadata columns)
  vec_predictor_names <-
    dplyr::pull(data_source, abiotic_variable_name) |>
    base::unique() |>
    base::setdiff(c("age"))

  data_wide <-
    data_source |>
    tidyr::pivot_wider(
      names_from = abiotic_variable_name,
      values_from = abiotic_value,
      values_fill = list(abiotic_value = NA)
    ) |>
    dplyr::select(
      !dplyr::any_of(c("age"))
    )

  # isTRUE() guards against all-NA predictors: sd(x, na.rm = TRUE)
  # is NA for such a column, and `NA > 0` is NA. Without the guard,
  # the NA would propagate into this logical vector, corrupt both
  # name-subsetting steps below (NA indices yield NA names), and
  # ultimately make dplyr::all_of() fail. With the guard, all-NA
  # columns are treated as zero-variance and dropped with a warning.
  vec_has_variation <-
    data_wide |>
    dplyr::select(
      dplyr::any_of(vec_predictor_names)
    ) |>
    purrr::map_lgl(
      .f = ~ isTRUE(stats::sd(.x, na.rm = TRUE) > 0)
    )

  vec_cols_zero_var <-
    base::names(vec_has_variation)[!vec_has_variation]

  if (
    base::length(vec_cols_zero_var) > 0
  ) {
    cli::cli_warn(
      c(
        "!" = paste0(
          "{base::length(vec_cols_zero_var)} zero-variance column(s) ",
          "dropped before collinearity analysis:"
        ),
        "i" = "{.val {vec_cols_zero_var}}"
      )
    )
  }

  vec_cols_with_variation <-
    base::names(vec_has_variation)[vec_has_variation]

  if (
    base::length(vec_cols_with_variation) == 0L
  ) {
    cli::cli_abort(
      c(
        "x" = paste0(
          "No columns with non-zero variance remain after ",
          "removing constant columns."
        ),
        "i" = paste0(
          "All {base::length(vec_predictor_names)} predictor column(s)",
          " have zero variance."
        )
      )
    )
  }

  res <-
    data_wide |>
    dplyr::select(
      dplyr::all_of(vec_cols_with_variation)
    ) |>
    collinear::collinear(quiet = TRUE)

  # Post-hoc structural assertions: guarantee downstream code can
  # rely on res$result$selection being a non-empty character vector.
  assertthat::assert_that(
    inherits(res, "collinear_output"),
    msg = paste0(
      "Output of collinear::collinear()",
      " should be a collinear_output object"
    )
  )

  assertthat::assert_that(
    "result" %in% names(res),
    msg = paste0(
      "Output of collinear::collinear()",
      " should contain a 'result' element"
    )
  )

  assertthat::assert_that(
    "selection" %in% names(res$result),
    msg = paste0(
      "Output of collinear::collinear()",
      " should contain a 'selection' element in the 'result'"
    )
  )

  assertthat::assert_that(
    is.character(res$result$selection),
    length(res$result$selection) > 0,
    msg = "Selection of predictors should be a non-empty character vector"
  )

  return(res)
}
#' @title Compute Standard Errors for a Fitted sjSDM Model
#' @description
#' Post-hoc calculation of standard errors for a fitted sjSDM
#' model using `sjSDM::getSe()`. This is intentionally separated
#' from `fit_jsdm_model()` so that the GPU-based model fitting
#' step and the CPU-based SE computation step can be run
#' independently, allowing the SE step to exploit all available
#' CPU cores without being constrained by the GPU device setting.
#' @param mod_jsdm
#' An object of class `sjSDM` returned by `fit_jsdm_model()` or
#' `sjSDM::sjSDM()`. Must not be `NULL`.
#' @param parallel
#' Number of CPU cores to use for the data loader during SE
#' computation. Passed to the `parallel` argument of
#' `sjSDM::getSe()`. Default is `0L` (no parallelisation).
#' Use `config.model_fitting$n_cores` from the project
#' configuration to exploit all available cores.
#' @param step_size
#' Batch size for stochastic gradient descent used during SE
#' computation. Passed to `sjSDM::getSe()`. Default is `NULL`,
#' which lets sjSDM choose automatically.
#' @param verbose
#' Logical scalar. If `FALSE` (default), all console output
#' from `sjSDM::getSe()` — including Python/reticulate stdout
#' — is suppressed. Set to `TRUE` to see progress messages,
#' which is recommended when running inside a pipeline target
#' to monitor long-running SE computation.
#' @return
#' The input `mod_jsdm` object with its `$se` field populated
#' with the computed standard errors.
#' @details
#' `sjSDM::getSe()` uses CPU for SE computation regardless of the
#' device used for model fitting. Separating SE computation into
#' its own pipeline target therefore allows the caller to pass
#' the full number of available CPU cores via `parallel` without
#' conflict with the GPU device setting used during fitting.
#'
#' When `verbose = FALSE`, output is captured with
#' `reticulate::py_capture_output()` (Python stdout) and
#' `utils::capture.output()` (R stdout) so the console remains
#' clean. When `verbose = TRUE`, both output streams are printed
#' for monitoring progress.
#' @seealso
#' `sjSDM::getSe`, `fit_jsdm_model`
#' @export
compute_jsdm_se <- function(
    mod_jsdm = NULL,
    parallel = 0L,
    step_size = NULL,
    verbose = FALSE) {
  assertthat::assert_that(
    inherits(mod_jsdm, "sjSDM"),
    msg = paste0(
      "`mod_jsdm` must be an object of class 'sjSDM'.",
      " Use `fit_jsdm_model()` to produce one."
    )
  )

  assertthat::assert_that(
    is.numeric(parallel),
    length(parallel) == 1L,
    parallel >= 0L,
    msg = paste0(
      "`parallel` must be a single non-negative numeric value"
    )
  )

  assertthat::assert_that(
    is.null(step_size) ||
      (is.numeric(step_size) && length(step_size) == 1L &&
        step_size > 0L),
    msg = paste0(
      "`step_size` must be NULL or a single positive numeric value"
    )
  )

  assertthat::assert_that(
    is.logical(verbose),
    length(verbose) == 1L,
    !is.na(verbose),
    msg = paste0(
      "`verbose` must be a single non-NA logical value"
    )
  )

  # Single definition of the SE call so both branches below invoke
  # exactly the same computation; only output handling differs.
  run_get_se <- function() {
    sjSDM::getSe(
      object = mod_jsdm,
      step_size = step_size,
      parallel = as.integer(parallel)
    )
  }

  if (isFALSE(verbose)) {
    # Suppress both R stdout and Python/reticulate stdout.
    # Note: reticulate::py_capture_output() is needed because
    # sjSDM::getSe() routes Python output through reticulate,
    # which bypasses utils::capture.output(). The outer wrapper
    # intercepts Python-side output; the inner wrapper
    # intercepts any R-level stdout from the call.
    reticulate::py_capture_output(
      utils::capture.output(
        {
          mod_with_se <- run_get_se()
        },
        type = "output"
      )
    )
  } else {
    # Verbose mode: let progress messages flow to the console.
    mod_with_se <- run_get_se()
  }

  return(mod_with_se)
}
#' @title Classify Taxonomic Resolution
#' @description
#' Classifies taxa in a data frame to a specified taxonomic resolution
#' using a classification table, and aggregates pollen proportions
#' accordingly. Supported resolutions are `kingdom`, `phylum`, `class`,
#' `order`, `family`, `genus`, and `species`.
#' @param data
#' A data frame containing taxon data with columns including 'taxon',
#' 'dataset_name', 'age', and 'pollen_prop'.
#' @param data_classification_table
#' A data frame mapping 'sel_name' to taxonomic levels. Must contain
#' at least one rank column at or below `taxonomic_resolution`
#' (e.g. 'family', 'genus', 'species').
#' @param taxonomic_resolution
#' A character string specifying the finest taxonomic level to use.
#' Must be one of `'kingdom'`, `'phylum'`, `'class'`, `'order'`,
#' `'family'`, `'genus'`, or `'species'`. Taxa will be classified at
#' this rank if possible, or at the coarsest available rank below it
#' if not (fallback behaviour).
#' @return
#' A data frame with taxa classified to the finest available rank at
#' or below `taxonomic_resolution` and pollen proportions aggregated
#' accordingly. The output preserves all dataset_name and age
#' combinations for true negatives.
#' @details
#' Performs a left join to map taxa to all available rank columns up
#' to and including `taxonomic_resolution`. The finest non-NA rank is
#' then selected via `dplyr::coalesce()` applied from finest to
#' coarsest. This means a taxon known only to family when genus is
#' requested will be assigned to its family name rather than dropped.
#' Taxa with no valid classification at any available rank are removed
#' with a `cli::cli_warn()` warning. Taxa that fall back to a coarser
#' rank are reported via `cli::cli_inform()`. Ranks finer than
#' `taxonomic_resolution` (e.g. species when genus is requested) are
#' never used, even when present in the classification table. The
#' NA-drop step prevents a column literally named NA appearing in the
#' community matrix produced by downstream `pivot_wider()` calls.
#' @seealso [filter_non_plantae_taxa()], [filter_rare_taxa()]
#' @export
classify_taxonomic_resolution <- function(
    data,
    data_classification_table,
    taxonomic_resolution) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("taxon", "dataset_name", "age", "pollen_prop") %in%
        colnames(data)
    ),
    msg = paste(
      "data must contain columns:",
      "taxon, dataset_name, age, and pollen_prop"
    )
  )

  assertthat::assert_that(
    is.data.frame(data_classification_table),
    msg = "data_classification_table must be a data frame"
  )

  assertthat::assert_that(
    is.character(taxonomic_resolution) &&
      length(taxonomic_resolution) == 1,
    msg = "taxonomic_resolution must be a single character string"
  )

  # Canonical rank order from coarsest to finest.
  vec_all_ranks <- c(
    "kingdom", "phylum", "class", "order",
    "family", "genus", "species"
  )

  assertthat::assert_that(
    taxonomic_resolution %in% vec_all_ranks,
    msg = paste(
      "taxonomic_resolution must be one of",
      "'kingdom', 'phylum', 'class', 'order',",
      "'family', 'genus', or 'species'"
    )
  )

  # All ranks from kingdom up to and including the requested level.
  # These are the only ranks eligible for fallback assignment.
  vec_target_ranks <-
    vec_all_ranks[
      base::seq_len(
        base::which(vec_all_ranks == taxonomic_resolution)
      )
    ]

  # Subset to ranks actually present in the classification table.
  vec_available_ranks <-
    base::intersect(
      vec_target_ranks,
      base::colnames(data_classification_table)
    )

  assertthat::assert_that(
    base::length(vec_available_ranks) > 0,
    msg = paste0(
      "data_classification_table must contain at least one ",
      "rank column at or below '",
      taxonomic_resolution,
      "'. Expected one of: ",
      paste(vec_target_ranks, collapse = ", ")
    )
  )

  data_classification_table_sub <-
    data_classification_table |>
    dplyr::select(
      sel_name,
      dplyr::all_of(vec_available_ranks)
    )

  # Join all available rank columns, then coalesce from finest to
  # coarsest so each taxon gets the most-specific non-NA label.
  data_joined <-
    data |>
    dplyr::left_join(
      data_classification_table_sub,
      by = dplyr::join_by("taxon" == "sel_name")
    ) |>
    dplyr::select(-taxon) |>
    dplyr::mutate(
      taxon = dplyr::coalesce(
        !!!rlang::syms(base::rev(vec_available_ranks))
      )
    )

  # Report taxa that fell back to a coarser rank.
  # These are kept in the data but flagged informally.
  if (taxonomic_resolution %in% vec_available_ranks) {
    n_fallback <-
      data_joined |>
      dplyr::filter(
        base::is.na(!!rlang::sym(taxonomic_resolution)),
        !base::is.na(taxon)
      ) |>
      dplyr::distinct(taxon) |>
      base::nrow()

    if (n_fallback > 0) {
      cli::cli_inform(
        c(
          "i" = paste0(
            "{n_fallback} taxon/taxa could not be classified ",
            "to '{taxonomic_resolution}' and ",
            "{?was/were} assigned to a coarser rank."
          )
        )
      )
    }
  }

  data_classified <-
    data_joined |>
    dplyr::select(-dplyr::all_of(vec_available_ranks))

  # Warn and drop taxa with no valid classification at any available
  # rank. Without this filter, the NA taxon flows into pivot_wider()
  # and creates a column literally named NA in the community matrix.
  # NOTE(review): the original taxon names were discarded by the
  # coalesce above, so distinct(taxon) over NA rows yields at most
  # one row -- this count is effectively a 0/1 flag, not the number
  # of distinct unclassified taxa. Confirm whether a per-taxon count
  # is wanted before relying on the reported number.
  vec_na_taxa <-
    data_classified |>
    dplyr::filter(base::is.na(taxon)) |>
    dplyr::distinct(taxon) |>
    base::nrow()

  if (vec_na_taxa > 0) {
    cli::cli_warn(
      c(
        "!" = paste0(
          "{vec_na_taxa} taxon/taxa ",
          "ha{?s/ve} no classification at any available ",
          "rank up to '{taxonomic_resolution}' and ",
          "{?was/were} dropped."
        ),
        "i" = paste0(
          "Check the classification table for missing ",
          "rank values up to '{taxonomic_resolution}'."
        )
      )
    )
  }

  data_classified <-
    data_classified |>
    dplyr::filter(!base::is.na(taxon))

  # make dummy table with all dataset_name and age combinations
  # this is needed to ensure that all combinations are present in the
  # final output to preserve true negative values
  data_dataset_age_cross_ref <-
    data_classified |>
    dplyr::distinct(dataset_name, age, taxon)

  res <-
    data_classified |>
    tidyr::drop_na(pollen_prop) |>
    dplyr::group_by(
      dataset_name, age, taxon
    ) |>
    dplyr::summarise(
      .groups = "drop",
      pollen_prop = sum(pollen_prop)
    ) |>
    dplyr::full_join(
      data_dataset_age_cross_ref,
      by = c("dataset_name", "age", "taxon")
    ) |>
    dplyr::arrange(age, dataset_name, taxon) |>
    # all_of() makes the external character vector an explicit,
    # strict selection (tidyselect best practice): it errors if a
    # column of `data` unexpectedly went missing instead of
    # silently selecting fewer columns.
    dplyr::select(
      dplyr::all_of(names(data))
    )

  return(res)
}
| 1 |
#' @title Fit an sjSDM Model |
|
| 2 |
#' @description |
|
| 3 |
#' Fits a joint Species Distribution Model (jSDM) using the sjSDM |
|
| 4 |
#' package with specified abiotic, spatial, and error family |
|
| 5 |
#' configurations. |
|
| 6 |
#' @param data_to_fit |
|
| 7 |
#' A list containing the data to fit the model. Must include: |
|
| 8 |
#' - `data_community_to_fit`: matrix of community composition |
|
| 9 |
#' (constant-presence taxa already removed by |
|
| 10 |
#' `filter_constant_taxa()`) |
|
| 11 |
#' - `data_abiotic_to_fit`: data frame of abiotic variables, |
|
| 12 |
#' already scaled by `scale_abiotic_for_fit()` |
|
| 13 |
#' - `data_spatial_to_fit`: data frame of spatial predictors, |
|
| 14 |
#' already scaled by `scale_spatial_for_fit()`. Required |
|
| 15 |
#' only when `spatial_method` is not `"none"`. |
|
| 16 |
#' @param sel_abiotic_formula |
|
| 17 |
#' A formula object specifying the abiotic (environmental) predictors |
|
| 18 |
#' @param abiotic_method |
|
| 19 |
#' Method for modeling abiotic effects. One of "linear" (default) or |
|
| 20 |
#' "DNN" (deep neural network) |
|
| 21 |
#' @param sel_spatial_formula |
|
| 22 |
#' A formula object specifying the spatial predictors. Defaults to an |
|
| 23 |
#' interaction between longitude and latitude |
|
| 24 |
#' (`~ 0 + coord_long:coord_lat`). Only used if |
|
| 25 |
#' `spatial_method` is `"linear"` or `"DNN"`. |
|
| 26 |
#' @param spatial_method |
|
| 27 |
#' Method for modeling spatial effects. One of `"linear"` (default), |
|
| 28 |
#' `"DNN"` (deep neural network), or `"none"` (no spatial structure) |
|
| 29 |
#' @param error_family |
|
| 30 |
#' Error family distribution. One of "gaussian" (default) or "binomial". |
|
| 31 |
#' If "binomial", community data is converted to presence/absence and |
|
| 32 |
#' a probit link is used |
|
| 33 |
#' @param device |
|
| 34 |
#' Computing device to use. One of "cpu" (default) or "gpu" |
|
| 35 |
#' @param compute_se |
|
| 36 |
#' Logical indicating whether to compute standard errors inline |
|
| 37 |
#' during model fitting. Default is `FALSE`. Prefer the |
|
| 38 |
#' post-hoc approach via `compute_jsdm_se()` in a separate |
|
| 39 |
#' pipeline target, which allows CPU parallelisation |
|
| 40 |
#' independent of the GPU device setting. |
|
| 41 |
#' @param parallel |
|
| 42 |
#' Number of CPU cores to use for parallel processing. |
|
| 43 |
#' Only applicable if `device = "cpu"`. Default is `0L` |
|
| 44 |
#' (no parallelisation). |
|
| 45 |
#' @param ... |
|
| 46 |
#' Additional arguments passed to `sjSDM::sjSDM()` (e.g. |
|
| 47 |
#' `sampling`, `step_size`, `seed`). Do NOT pass `iter`, |
|
| 48 |
#' `control` or `early_stopping` directly — use the dedicated |
|
| 49 |
#' `iter` and `n_early_stopping` parameters instead. |
|
| 50 |
#' @param iter |
|
| 51 |
#' Positive integer. Number of training epochs. Default is `100L`. |
|
| 52 |
#' @param n_early_stopping |
|
| 53 |
#' Early stopping patience. Controls how many consecutive epochs |
|
| 54 |
#' without improvement are tolerated before training is halted. |
|
| 55 |
#' Three accepted values: |
|
| 56 |
#' - `NULL` (default): auto-compute as `round(iter * 0.20)`, i.e. |
|
| 57 |
#' at least 20 \% of the epoch budget. |
|
| 58 |
#' - `0L` or negative integer: disables early stopping entirely. |
|
| 59 |
#' - Positive integer: uses `max(value, round(iter * 0.20))` to |
|
| 60 |
#' ensure patience is never set below 20 \% of `iter`. Passed |
|
| 61 |
#' as `early_stopping_training` in `sjSDM::sjSDMControl()`. |
|
| 62 |
#' @return |
|
| 63 |
#' An object of class sjSDM containing the fitted model |
|
| 64 |
#' @details |
|
| 65 |
#' This function prepares the data and fits a joint Species Distribution |
|
| 66 |
#' Model using the sjSDM package. The spatial structure is modeled using |
|
| 67 |
#' an interaction term between longitude and latitude coordinates. When |
|
| 68 |
#' binomial error family is specified, the community data is converted |
|
| 69 |
#' to binary presence/absence data. |
|
| 70 |
#' |
|
| 71 |
#' Standard error computation (`compute_se = TRUE`) may fail with certain |
|
| 72 |
#' model configurations, particularly when using DNN methods or complex |
|
| 73 |
#' spatial structures. If SE computation fails, the model will still be |
|
| 74 |
#' returned with a warning. |
|
| 75 |
#' @seealso sjSDM::sjSDM, sjSDM::linear, sjSDM::DNN, compute_jsdm_se |
|
| 76 |
#' @export |
|
| 77 |
fit_jsdm_model <- function(
    data_to_fit = NULL,
    sel_abiotic_formula = NULL,
    abiotic_method = c("linear", "DNN"),
    sel_spatial_formula = as.formula(~ 0 + coord_long:coord_lat),
    spatial_method = c("linear", "DNN", "none"),
    error_family = c("gaussian", "binomial"),
    device = c("cpu", "gpu"),
    parallel = 0L,
    compute_se = FALSE,
    ...,
    iter = 100L,
    n_early_stopping = NULL,
    verbose = FALSE) {
  # Validate `data_to_fit` structure -----
  assertthat::assert_that(
    is.list(data_to_fit),
    msg = "data_to_fit must be a list"
  )

  assertthat::assert_that(
    "data_community_to_fit" %in% names(data_to_fit),
    msg = "`data_to_fit` must be a list containing `data_community_to_fit`"
  )

  assertthat::assert_that(
    "data_abiotic_to_fit" %in% names(data_to_fit),
    msg = "`data_to_fit` must be a list containing `data_abiotic_to_fit`"
  )

  # Extract data components -----
  data_community <-
    data_to_fit |>
    purrr::chuck("data_community_to_fit")

  data_abiotic <-
    data_to_fit |>
    purrr::chuck("data_abiotic_to_fit")

  # Validate extracted data types -----
  assertthat::assert_that(
    is.matrix(data_community),
    msg = "data_community must be a matrix"
  )

  assertthat::assert_that(
    is.data.frame(data_abiotic),
    msg = "data_abiotic must be a data frame"
  )

  # Validate formula arguments -----
  # `inherits()` replaces `class(x) == "formula"`: comparing against
  # `class()` misbehaves when the class attribute has length > 1,
  # while `inherits()` is always scalar-safe.
  assertthat::assert_that(
    inherits(sel_abiotic_formula, "formula"),
    msg = "sel_abiotic_formula must be a formula object"
  )

  assertthat::assert_that(
    inherits(sel_spatial_formula, "formula"),
    msg = "sel_spatial_formula must be a formula object"
  )

  # Validate and match character arguments -----
  # The `any(... %in% ...)` asserts give a friendlier message than the
  # default match.arg() error before narrowing to a single value.
  assertthat::assert_that(
    any(abiotic_method %in% c("linear", "DNN")),
    msg = "abiotic_method must be either 'linear' or 'DNN'"
  )

  abiotic_method <- match.arg(abiotic_method)

  assertthat::assert_that(
    any(spatial_method %in% c("linear", "DNN", "none")),
    msg = "spatial_method must be either 'linear', 'DNN', or 'none'"
  )

  spatial_method <- match.arg(spatial_method)

  # Extract and validate spatial data when needed -----
  # Note: data_spatial_to_fit is pre-scaled by
  # scale_spatial_for_fit(); no additional scaling applied.
  if (
    spatial_method %in% c("linear", "DNN")
  ) {
    assertthat::assert_that(
      "data_spatial_to_fit" %in% names(data_to_fit),
      msg = paste0(
        "`data_to_fit` must contain `data_spatial_to_fit`",
        " when spatial_method is not 'none'"
      )
    )

    data_spatial <-
      data_to_fit |>
      purrr::chuck("data_spatial_to_fit")

    assertthat::assert_that(
      is.data.frame(data_spatial),
      msg = "data_spatial must be a data frame"
    )
  } else {
    data_spatial <- NULL
  }

  assertthat::assert_that(
    any(error_family %in% c("gaussian", "binomial")),
    msg = "error_family must be either 'gaussian' or 'binomial'"
  )

  error_family <- match.arg(error_family)

  assertthat::assert_that(
    any(device %in% c("cpu", "gpu")),
    msg = "device must be either 'cpu' or 'gpu'"
  )

  device <- match.arg(device)

  # Validate numeric and logical arguments -----
  assertthat::assert_that(
    is.numeric(parallel),
    length(parallel) == 1,
    msg = "parallel must be a numeric value of length 1"
  )

  assertthat::assert_that(
    is.logical(compute_se),
    length(compute_se) == 1,
    msg = "compute_se must be a logical value of length 1"
  )

  assertthat::assert_that(
    is.logical(verbose),
    length(verbose) == 1,
    msg = "verbose must be a logical value of length 1"
  )

  assertthat::assert_that(
    is.numeric(iter),
    length(iter) == 1,
    iter > 0,
    msg = paste0(
      "`iter` must be a single positive numeric value of length 1"
    )
  )

  assertthat::assert_that(
    is.null(n_early_stopping) ||
      (is.numeric(n_early_stopping) && length(n_early_stopping) == 1),
    msg = paste0(
      "`n_early_stopping` must be NULL or a single numeric",
      " value of length 1"
    )
  )

  # Handle device/parallel conflict -----
  if (
    device == "gpu" && parallel > 0L
  ) {
    message(
      paste0(
        "Parallel processing is not supported when device = 'gpu'.",
        " Setting parallel to 0L."
      )
    )
    parallel <- 0L
  }

  # Convert community data to presence/absence for binomial -----
  if (
    error_family == "binomial"
  ) {
    data_community <-
      data_community > 0

    error_family <- binomial("probit")
  } else {
    error_family <- gaussian()
  }

  # Build spatial structure -----
  # Note: sjSDM::linear/DNN use match.call() internally and re-evaluate
  # formula symbols in parent.env(environment()) = namespace:sjSDM.
  # Using do.call passes the formula as an already-evaluated object
  # (class "formula"), so the bare-name eval branch is never triggered.
  if (
    spatial_method == "linear"
  ) {
    spatial <-
      do.call(
        sjSDM::linear,
        list(
          data = data_spatial,
          formula = sel_spatial_formula
        )
      )
  } else if (spatial_method == "DNN") {
    spatial <-
      do.call(
        sjSDM::DNN,
        list(
          data = data_spatial,
          formula = sel_spatial_formula
        )
      )
  } else if (spatial_method == "none") {
    spatial <- NULL
  }

  # Build abiotic (environmental) structure -----
  # Note: data_abiotic is already scaled upstream by
  # scale_abiotic_for_fit(); no additional scaling is applied.
  if (
    abiotic_method == "linear"
  ) {
    sel_biotic <-
      do.call(
        sjSDM::linear,
        list(
          data = data_abiotic,
          formula = sel_abiotic_formula
        )
      )
  } else {
    sel_biotic <-
      do.call(
        sjSDM::DNN,
        list(
          data = data_abiotic,
          formula = sel_abiotic_formula
        )
      )
  }

  # Three-tier early stopping patience:
  # "NULL" -> auto: round(iter * 0.20), ensuring >= 20% of budget
  # <= 0 -> 0 (disabled, maps to sjSDMControl's "disabled" value)
  # > 0 -> max(value, round(iter * 0.20)), floor at 20% of iter
  sel_early_stopping <-
    if (
      base::is.null(n_early_stopping)
    ) {
      base::as.integer(base::round(iter * 0.20))
    } else if (
      n_early_stopping <= 0
    ) {
      0L
    } else {
      base::max(
        base::as.integer(n_early_stopping),
        base::as.integer(base::round(iter * 0.20))
      )
    }

  sel_control <-
    sjSDM::sjSDMControl(early_stopping_training = sel_early_stopping)

  # Fix: `parallel` was previously validated and conflict-handled but
  # never forwarded, so sjSDM always ran with its own default. It is a
  # named formal here, so callers cannot have been passing it via `...`.
  mod_sjsdm <-
    sjSDM::sjSDM(
      Y = as.matrix(data_community),
      env = sel_biotic,
      spatial = spatial,
      se = compute_se,
      family = error_family,
      device = device,
      parallel = parallel,
      verbose = verbose,
      control = sel_control,
      iter = iter,
      ...
    )

  return(mod_sjsdm)
}
| 1 |
#' @title Compute Spatiotemporal Moran Eigenvector Maps |
|
| 2 |
#' @description |
|
| 3 |
#' Computes Moran Eigenvector Maps (MEMs) from a 3-D |
|
| 4 |
#' coordinate matrix `(x_km, y_km, age_kyr)` built at the |
|
| 5 |
#' **sample level** (one row per site × time-slice). |
|
| 6 |
#' All three dimensions are z-scored before eigenvector |
|
| 7 |
#' computation so that spatial and temporal extents |
|
| 8 |
#' contribute equally to the Euclidean distance structure. |
|
| 9 |
#' Returns the first `n_mev` eigenvectors as a data frame |
|
| 10 |
#' with row names in `"dataset_name__age"` format, ready |
|
| 11 |
#' for `scale_spatial_for_fit()` and |
|
| 12 |
#' `assemble_data_to_fit()`. |
|
| 13 |
#' @param data_coords_projected |
|
| 14 |
#' A data frame with `dataset_name` as row names and |
|
| 15 |
#' columns `coord_x_km` and `coord_y_km`, as returned by |
|
| 16 |
#' `project_coords_to_metric()`. One row per unique |
|
| 17 |
#' site/core. |
|
| 18 |
#' @param data_sample_ids |
|
| 19 |
#' A data frame with columns `dataset_name` and `age` |
|
| 20 |
#' giving the valid (site × time-slice) pairs to model, |
|
| 21 |
#' as returned by `align_sample_ids()`. |
|
| 22 |
#' @param n_mev |
|
| 23 |
#' A positive integer giving the number of eigenvectors to |
|
| 24 |
#' return. If it exceeds the number of positive Moran |
|
| 25 |
#' eigenvectors produced by `sjSDM::generateSpatialEV()` |
|
| 26 |
#' for the 3-D coordinate matrix, it is automatically |
|
| 27 |
#' clamped down to that count and a `cli::cli_warn()` |
|
| 28 |
#' message is emitted. Default is `20L`. |
|
| 29 |
#' @return |
|
| 30 |
#' A data frame with row names `"<dataset_name>__<age>"` |
|
| 31 |
#' and `n_mev` columns named `mev_1`, `mev_2`, …, |
|
| 32 |
#' `mev_{n_mev}`. The row order follows
|
|
| 33 |
#' `data_sample_ids` sorted by `dataset_name` then `age`, |
|
| 34 |
#' matching the ordering produced by |
|
| 35 |
#' `prepare_abiotic_for_fit()` and |
|
| 36 |
#' `prepare_community_for_fit()`. |
|
| 37 |
#' @details |
|
| 38 |
#' Unlike `compute_spatial_mev()`, which operates on |
|
| 39 |
#' unique site coordinates and must be expanded to sample |
|
| 40 |
#' level via `prepare_spatial_predictors_for_fit()`, this |
|
| 41 |
#' function builds the eigenvectors directly on the |
|
| 42 |
#' sample-level 3-D coordinate matrix. This means each |
|
| 43 |
#' observation (site × age) gets a unique row in the MEV |
|
| 44 |
#' matrix, and within-core temporal autocorrelation is |
|
| 45 |
#' captured alongside between-site spatial autocorrelation. |
|
| 46 |
#' |
|
| 47 |
#' The 3-D coordinate matrix is constructed as follows: |
|
| 48 |
#' 1. Join `data_coords_projected` (x_km, y_km) onto |
|
| 49 |
#' `data_sample_ids` to get one row per sample. |
|
| 50 |
#' 2. Convert age (years BP) to kiloyears: `age_kyr = |
|
| 51 |
#' age / 1000`. |
|
| 52 |
#' 3. Z-score each dimension independently so that the |
|
| 53 |
#' spatial and temporal extents are on the same scale |
|
| 54 |
#' before Euclidean distances are computed. |
|
| 55 |
#' |
|
| 56 |
#' The z-scoring step is critical: if age is left in |
|
| 57 |
#' kiloyears and coordinates in km, the time axis will |
|
| 58 |
#' dominate or be negligible depending on the data range, |
|
| 59 |
#' producing eigenvectors that are either purely temporal |
|
| 60 |
#' or purely spatial. |
|
| 61 |
#' |
|
| 62 |
#' Because the eigenvectors are computed at the sample |
|
| 63 |
#' level, **no further expansion step is required**. |
|
| 64 |
#' Pass the returned data frame directly to |
|
| 65 |
#' `scale_spatial_for_fit()`. |
|
| 66 |
#' @seealso |
|
| 67 |
#' [compute_spatial_mev()], |
|
| 68 |
#' [project_coords_to_metric()], |
|
| 69 |
#' [align_sample_ids()], |
|
| 70 |
#' [scale_spatial_for_fit()], |
|
| 71 |
#' [assemble_data_to_fit()] |
|
| 72 |
#' @export |
|
| 73 |
compute_spatiotemporal_mev <- function(
    data_coords_projected = NULL,
    data_sample_ids = NULL,
    n_mev = 20L) {
  # Validate inputs -----
  assertthat::assert_that(
    is.data.frame(data_coords_projected),
    msg = "data_coords_projected must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("coord_x_km", "coord_y_km") %in%
        base::names(data_coords_projected)
    ),
    msg = base::paste0(
      "data_coords_projected must contain columns",
      " 'coord_x_km' and 'coord_y_km'"
    )
  )

  assertthat::assert_that(
    is.data.frame(data_sample_ids),
    msg = "data_sample_ids must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("dataset_name", "age") %in%
        base::names(data_sample_ids)
    ),
    msg = base::paste0(
      "data_sample_ids must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  # `is.numeric()` is TRUE for both integer and double vectors, so the
  # former redundant `|| is.integer(n_mev)` branch was removed.
  assertthat::assert_that(
    is.numeric(n_mev),
    length(n_mev) == 1,
    n_mev >= 1,
    msg = "n_mev must be a single positive integer"
  )

  n_mev <- base::as.integer(n_mev)

  # 1. Build sample-level 3-D coordinate data frame -----
  # One row per (dataset_name, age) pair, with projected km coordinates
  # joined on, age converted to kiloyears, and a "name__age" row label.
  data_samples_3d <-
    data_sample_ids |>
    dplyr::arrange(dataset_name, age) |>
    dplyr::inner_join(
      data_coords_projected |>
        tibble::rownames_to_column("dataset_name"),
      by = dplyr::join_by(dataset_name)
    ) |>
    dplyr::mutate(
      age_kyr = age / 1000,
      .row_name = base::paste0(dataset_name, "__", age)
    )

  assertthat::assert_that(
    nrow(data_samples_3d) >= 3,
    msg = base::paste0(
      "The joined sample × site data must have at least",
      " 3 rows (required by sjSDM::generateSpatialEV())"
    )
  )

  # 2. Z-score each dimension separately -----
  # Puts spatial (km) and temporal (kyr) axes on the same scale so
  # neither dominates the Euclidean distance structure.
  mat_coords_3d_raw <-
    data_samples_3d |>
    dplyr::select(coord_x_km, coord_y_km, age_kyr) |>
    base::as.matrix()

  mat_coords_3d <-
    base::scale(
      mat_coords_3d_raw,
      center = TRUE,
      scale = TRUE
    )

  # 3. Compute Moran eigenvectors on 3-D matrix -----
  mat_mev_raw <-
    sjSDM::generateSpatialEV(
      coords = mat_coords_3d
    )

  # Force to matrix: sjSDM returns a vector when exactly
  # one positive eigenvalue is found (drops the dimension)
  mat_mev_all <-
    base::as.matrix(mat_mev_raw)

  # 4. Post-call validation: clamp n_mev if needed -----
  n_produced <-
    base::ncol(mat_mev_all)

  if (
    n_mev > n_produced
  ) {
    cli::cli_warn(
      c(
        "{n_mev} MEV(s) requested; only {n_produced} positive.",
        "i" = "Lowering n_mev from {n_mev} to {n_produced}."
      )
    )
    n_mev <- n_produced
  }

  # 5. Select first n_mev columns -----
  mat_mev <-
    mat_mev_all[, base::seq_len(n_mev), drop = FALSE]

  # 6. Coerce to data frame with named columns -----
  vec_col_names <-
    base::paste0("mev_", base::seq_len(n_mev))

  res <-
    base::as.data.frame(mat_mev)

  base::colnames(res) <- vec_col_names
  base::rownames(res) <- data_samples_3d$.row_name

  return(res)
}
| 1 |
#' @title Save Progress Visualisation |
|
| 2 |
#' @description |
|
| 3 |
#' Generates a visualisation of project progress and saves it as HTML and PNG. |
|
| 4 |
#' @param sel_script |
|
| 5 |
#' The script file to be visualised. |
|
| 6 |
#' @param sel_store |
|
| 7 |
#' Path to the targets store directory. Defaults to the value from the active |
|
| 8 |
#' configuration key "target_store". |
|
| 9 |
#' @param output_file |
|
| 10 |
#' The name of the output file (default: "project_status"). |
|
| 11 |
#' @param output_dir |
|
| 12 |
#' Directory where the output files will be saved |
|
| 13 |
#' (default: "Documentation/Progress"). |
|
| 14 |
#' @param background_color |
|
| 15 |
#' Background color for the visualisation |
|
| 16 |
#' (default: `"#141B22"` ā BIODYNAMICS brand Console Panel; one step lighter |
|
| 17 |
#' than the page background, used for cards and panels). |
|
| 18 |
#' @param physics |
|
| 19 |
#' Logical indicating whether to enable physics simulation in the network |
|
| 20 |
#' graph (default: TRUE). |
|
| 21 |
#' @param level_separation |
|
| 22 |
#' Level separation for the visualisation graph (default: 250). |
|
| 23 |
#' @return |
|
| 24 |
#' No return value. Called for side effects: saves HTML and PNG files to a |
|
| 25 |
#' store-specific subdirectory within `output_dir`. |
|
| 26 |
#' @details |
|
| 27 |
#' Uses `targets::tar_visnetwork` to create a network graph and saves two |
|
| 28 |
#' HTML files (full and targets-only) using `visNetwork::visSave`, plus a |
|
| 29 |
#' static PNG via `webshot2::webshot`. If the browser-backed PNG export fails, |
|
| 30 |
#' the function emits a warning and keeps the HTML outputs so progress saving |
|
| 31 |
#' does not abort the surrounding pipeline run. Files are written to |
|
| 32 |
#' `output_dir/<store_name>/` where `<store_name>` is the last path segment |
|
| 33 |
#' of `sel_store` after the `targets/` directory. |
|
| 34 |
#' |
|
| 35 |
#' The targets store is read only once. `targets_only = FALSE` retrieves the |
|
| 36 |
#' full network; the targets-only variant for the static PNG graph is derived |
|
| 37 |
#' by filtering non-target nodes (`type` values `"function"`, `"object"`, |
|
| 38 |
#' `"value"`) from the same object, avoiding a second store read. If the |
|
| 39 |
#' `type` column is absent (targets internal change), the function falls back |
|
| 40 |
#' to a second `tar_visnetwork()` call so correctness is preserved. |
|
| 41 |
#' |
|
| 42 |
#' When `physics = TRUE`, the static graph for the PNG uses a |
|
| 43 |
#' `stabilizationIterationsDone` vis.js event to disable physics as soon as |
|
| 44 |
#' the silent background layout pass finishes. This ensures the screenshot |
|
| 45 |
#' captures the fully settled layout without requiring a long wait or |
|
| 46 |
#' rendering the graph twice. |
|
| 47 |
#' @export |
|
| 48 |
save_progress_visualisation <- function(
    sel_script,
    sel_store = get_active_config("target_store"),
    output_file = "project_status",
    output_dir = here::here(
      "Documentation/Progress"
    ),
    background_color = "#141B22",
    physics = TRUE,
    level_separation = 250) {
  # Apply the BIODYNAMICS brand theme to a visNetwork graph.
  # Node fill colors (status: up-to-date / outdated / etc.) are set
  # internally by targets and cannot be overridden here; only font
  # and edge styling are applied. Extracted as a local helper because
  # the identical piping was previously duplicated for the full and
  # static graphs.
  apply_brand_theme <- function(graph) {
    graph |>
      visNetwork::visNodes(
        font = base::list(
          color = "#E6EDF3", # Bone Text
          face = "IBM Plex Mono, monospace"
        )
      ) |>
      visNetwork::visEdges(
        color = base::list(
          color = "#2A3441", # Slate Border
          highlight = "#8DF59A", # Phosphor Green
          hover = "#48C7B8" # Moss Teal
        ),
        font = base::list(
          color = "#98A6B3", # Muted Mist
          strokeWidth = 0
        )
      )
  }

  # Test to make sure {pandoc} is installed for webshot to work
  if (
    is.null(rmarkdown::find_pandoc()$dir)
  ) {
    pandoc::pandoc_activate()
  }

  # Read the targets store once. targets_only = FALSE gives the full
  # network; the targets-only static variant is derived below by
  # filtering the same object, avoiding a second expensive store read.
  network_graph_raw <-
    targets::tar_visnetwork(
      script = sel_script,
      outdated = FALSE,
      store = sel_store,
      targets_only = FALSE,
      physics = physics,
      level_separation = level_separation
    )

  network_graph <-
    apply_brand_theme(network_graph_raw)

  # Derive the targets-only static base graph from the raw object already
  # in memory. targets sets type = "function" / "object" / "value" for
  # non-target nodes; filtering them mirrors targets_only = TRUE.
  # The fallback re-calls tar_visnetwork() if targets ever drops the
  # type column, so correctness is preserved across package changes.
  if (
    "type" %in% base::names(network_graph_raw$x$nodes)
  ) {
    vec_target_node_ids <-
      network_graph_raw$x$nodes |>
      dplyr::filter(
        !(.data[["type"]] %in% c("function", "object", "value"))
      ) |>
      dplyr::pull(id)

    network_graph_static_base <- network_graph_raw
    network_graph_static_base$x$nodes <-
      dplyr::filter(
        network_graph_raw$x$nodes,
        .data[["id"]] %in% vec_target_node_ids
      )
    network_graph_static_base$x$edges <-
      dplyr::filter(
        network_graph_raw$x$edges,
        .data[["from"]] %in% vec_target_node_ids,
        .data[["to"]] %in% vec_target_node_ids
      )
  } else {
    # Fallback: targets internals changed and type column is absent.
    network_graph_static_base <-
      targets::tar_visnetwork(
        script = sel_script,
        store = sel_store,
        targets_only = TRUE,
        outdated = FALSE,
        physics = physics,
        level_separation = level_separation
      )
  }

  network_graph_static <-
    apply_brand_theme(network_graph_static_base) |>
    # Freeze the graph the moment background stabilization finishes so
    # the webshot captures the settled layout, not a mid-simulation
    # frame. vis.js fires `stabilizationIterationsDone` after the silent
    # pre-layout pass (fast, < 1 s), then starts the animated physics —
    # disabling physics here prevents any further movement before the
    # screenshot.
    visNetwork::visEvents(
      stabilizationIterationsDone = "function() {
        this.setOptions({ physics: false });
      }"
    )

  # Keep only the last path segment after the `targets/` directory;
  # used as the store-specific output subdirectory name.
  sel_store_simple <-
    stringr::str_replace(
      string = sel_store,
      pattern = ".*/targets/",
      replacement = ""
    )

  output_store_dir <-
    file.path(output_dir, sel_store_simple)

  output_html_path <-
    file.path(output_store_dir, paste0(output_file, ".html"))

  output_small_html_path <-
    file.path(output_store_dir, paste0(output_file, "_small.html"))

  output_png_path <-
    file.path(output_store_dir, paste0(output_file, "_static.png"))

  # Create the output directory if it doesn't exist
  if (
    !dir.exists(output_store_dir)
  ) {
    dir.create(output_store_dir, recursive = TRUE)
  }

  visNetwork::visSave(
    graph = network_graph,
    file = output_html_path,
    selfcontained = TRUE,
    background = background_color
  )

  visNetwork::visSave(
    graph = network_graph_static,
    file = output_small_html_path,
    selfcontained = TRUE,
    background = background_color
  )

  # PNG export depends on launching a browser through chromote/webshot2,
  # which can fail in interactive sessions even when the HTML outputs
  # save. `delay = 3` gives the browser time to complete the background
  # stabilization pass before the screenshot is taken; the visEvents
  # handler then keeps the graph frozen so the captured layout is stable.
  tryCatch(
    webshot2::webshot(
      url = output_small_html_path,
      file = output_png_path,
      vwidth = 950,
      vheight = 750,
      delay = 3
    ),
    error = function(err) {
      cli::cli_warn(
        c(
          "Failed to save static PNG progress visualisation.",
          "i" = "HTML progress files were saved successfully.",
          "i" = paste0(
            "Original error: ",
            base::conditionMessage(err)
          )
        )
      )
      invisible(NULL)
    }
  )

  # Documented contract: called for side effects only, no return value.
  invisible(NULL)
}
| 1 |
#' @title Interpolate 2-D Spatial MEVs to Prediction Grid |
|
| 2 |
#' @description |
|
| 3 |
#' Uses Inverse Distance Weighting (IDW, power = 2) to |
|
| 4 |
#' approximate Moran Eigenvector Map (MEM) values from |
|
| 5 |
#' training-site locations to arbitrary prediction locations, |
|
| 6 |
#' then scales the result using training spatial scale |
|
| 7 |
#' attributes so that the interpolated predictors are on the |
|
| 8 |
#' same scale as those seen during model fitting. |
|
| 9 |
#' @param data_coords_projected_train |
|
| 10 |
#' A data frame with `coord_x_km` and `coord_y_km` columns |
|
| 11 |
#' and `dataset_name` as row names, as returned by |
|
| 12 |
#' `project_coords_to_metric()`. One row per unique training |
|
| 13 |
#' site. |
|
| 14 |
#' @param data_mev_core |
|
| 15 |
#' A data frame with unscaled MEV columns (`mev_1`, `mev_2`, |
|
| 16 |
#' ā¦) and `dataset_name` as row names, as returned by |
|
| 17 |
#' `compute_spatial_mev()`. |
|
| 18 |
#' @param data_coords_projected_pred |
|
| 19 |
#' A data frame with `coord_x_km` and `coord_y_km` columns |
|
| 20 |
#' and arbitrary row names identifying prediction locations |
|
| 21 |
#' (e.g. `"grid_1"`, `"grid_2"`), as returned by |
|
| 22 |
#' `project_coords_to_metric()`. |
|
| 23 |
#' @param spatial_scale_attributes |
|
| 24 |
#' A named list of `"scaled:center"` and `"scaled:scale"` |
|
| 25 |
#' attributes per MEV column, as returned by |
|
| 26 |
#' `scale_spatial_for_fit()` in the `spatial_scale_attributes` |
|
| 27 |
#' element. Used to bring interpolated MEV values onto the |
|
| 28 |
#' same scale as the training spatial predictors. |
|
| 29 |
#' @return |
|
| 30 |
#' A data frame with the same row names as |
|
| 31 |
#' `data_coords_projected_pred` and one column per MEV |
|
| 32 |
#' (names matching `data_mev_core`). All columns are scaled |
|
| 33 |
#' to match the training MEV distribution. |
|
| 34 |
#' @details |
|
| 35 |
#' MEMs are eigenvectors of the spatial connectivity matrix |
|
| 36 |
#' at training sites and cannot be analytically evaluated at |
|
| 37 |
#' new locations. IDW (power = 2) with a small epsilon |
|
| 38 |
#' (1e-10) to prevent division-by-zero provides a smooth |
|
| 39 |
#' spatial interpolation. |
|
| 40 |
#' |
|
| 41 |
#' This function handles the **2-D spatial case** only |
|
| 42 |
#' (x_km, y_km). For models fitted with |
|
| 43 |
#' `spatial_mode = "spatiotemporal"` use |
|
| 44 |
#' `interpolate_st_mev_to_grid()` instead. |
|
| 45 |
#' @seealso |
|
| 46 |
#' [compute_spatial_mev()], |
|
| 47 |
#' [interpolate_st_mev_to_grid()], |
|
| 48 |
#' [project_coords_to_metric()], |
|
| 49 |
#' [scale_spatial_for_fit()] |
|
| 50 |
#' @export |
|
| 51 |
interpolate_mev_to_grid <- function(
    data_coords_projected_train = NULL,
    data_mev_core = NULL,
    data_coords_projected_pred = NULL,
    spatial_scale_attributes = NULL) {
  # Validate inputs -----
  assertthat::assert_that(
    is.data.frame(data_coords_projected_train),
    all(
      c("coord_x_km", "coord_y_km") %in%
        base::names(data_coords_projected_train)
    ),
    msg = paste0(
      "data_coords_projected_train must be a data frame",
      " with columns 'coord_x_km' and 'coord_y_km'"
    )
  )

  assertthat::assert_that(
    is.data.frame(data_mev_core),
    nrow(data_mev_core) > 0,
    ncol(data_mev_core) > 0,
    msg = "data_mev_core must be a non-empty data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_coords_projected_pred),
    all(
      c("coord_x_km", "coord_y_km") %in%
        base::names(data_coords_projected_pred)
    ),
    msg = paste0(
      "data_coords_projected_pred must be a data frame",
      " with columns 'coord_x_km' and 'coord_y_km'"
    )
  )

  assertthat::assert_that(
    is.list(spatial_scale_attributes),
    length(spatial_scale_attributes) > 0,
    msg = "spatial_scale_attributes must be a non-empty list"
  )

  # 1. Attach unscaled MEV values to the training km coords -----
  mev_col_names <-
    base::names(data_mev_core)

  data_train_joined <-
    data_coords_projected_train |>
    tibble::rownames_to_column("dataset_name") |>
    dplyr::inner_join(
      data_mev_core |>
        tibble::rownames_to_column("dataset_name"),
      by = dplyr::join_by(dataset_name)
    )

  # 2. / 3. Pairwise 2-D Euclidean distances -----
  # Row i, column j holds the distance from prediction location i to
  # training site j; the two axis differences are built separately.
  mat_diff_x <-
    base::outer(
      data_coords_projected_pred[["coord_x_km"]],
      data_train_joined[["coord_x_km"]],
      `-`
    )

  mat_diff_y <-
    base::outer(
      data_coords_projected_pred[["coord_y_km"]],
      data_train_joined[["coord_y_km"]],
      `-`
    )

  mat_dist_km <-
    base::sqrt(mat_diff_x^2 + mat_diff_y^2)

  # 4. Row-normalised IDW weights (power = 2) -----
  # The 1e-10 epsilon prevents division by zero when a prediction
  # point coincides with a training site.
  mat_weights_raw <-
    1 / (mat_dist_km^2 + 1e-10)

  mat_weights <-
    mat_weights_raw / base::rowSums(mat_weights_raw)

  # 5. Interpolate unscaled MEV values via weighted averaging -----
  mat_mev_train <-
    base::as.matrix(
      data_train_joined[, mev_col_names, drop = FALSE]
    )

  mat_pred_mev_raw <-
    mat_weights %*% mat_mev_train

  # 6. Rescale each MEV column with the training scale attributes -----
  # so the interpolated predictors match the fitting-time distribution.
  list_scaled_cols <-
    base::lapply(
      mev_col_names,
      function(sel_col) {
        sel_center <- base::as.numeric(
          spatial_scale_attributes[[sel_col]][["scaled:center"]]
        )
        sel_scale <- base::as.numeric(
          spatial_scale_attributes[[sel_col]][["scaled:scale"]]
        )
        (mat_pred_mev_raw[, sel_col] - sel_center) / sel_scale
      }
    )

  data_pred_mev_scaled <-
    base::as.data.frame(
      base::do.call(base::cbind, list_scaled_cols)
    )

  base::colnames(data_pred_mev_scaled) <- mev_col_names
  base::rownames(data_pred_mev_scaled) <-
    base::rownames(data_coords_projected_pred)

  return(data_pred_mev_scaled)
}
| 1 |
#' @title Extract Data from VegVault
#' @description
#' Queries the VegVault SQLite database and returns the data matching
#' the supplied geographic, temporal, and dataset-type constraints.
#' @param path_to_vegvault
#' A character string specifying the path to the VegVault SQLite database
#' (default: "Data/Input/VegVault.sqlite").
#' @param x_lim
#' A numeric vector of length 2 specifying the longitude range.
#' @param y_lim
#' A numeric vector of length 2 specifying the latitude range.
#' @param age_lim
#' A numeric vector of length 2 specifying the age range.
#' @param sel_dataset_type
#' A character vector specifying the dataset types to select.
#' @param sel_abiotic_var_name
#' A character vector specifying the abiotic variable names to select.
#' @return
#' A data frame containing the extracted data.
#' @details
#' The function:
#'
#' 1. Validates all input parameters.
#' 2. Checks the presence of the VegVault SQLite database.
#' 3. Assembles the vaultkeepr query plan (lazy SQL). If vaultkeepr
#'    raises an error during plan assembly (e.g. insufficient data for
#'    the specified constraints), the error is caught and re-thrown via
#'    `cli::cli_abort()` with the original message preserved.
#' 4. Retrieves abiotic data and taxa information.
#' 5. Returns the extracted data as a data frame.
#' @export
extract_data_from_vegvault <- function(
    path_to_vegvault = here::here("Data/Input/VegVault.sqlite"),
    x_lim = NULL,
    y_lim = NULL,
    age_lim = NULL,
    sel_dataset_type = NULL,
    sel_abiotic_var_name = NULL) {
  assertthat::assert_that(
    is.character(path_to_vegvault),
    length(path_to_vegvault) == 1,
    msg = "path_to_vegvault must be a single character string"
  )

  # Check if the VegVault file exists
  check_presence_of_vegvault(path_to_vegvault)

  assertthat::assert_that(
    is.numeric(x_lim) && length(x_lim) == 2,
    msg = "x_lim must be a numeric vector of length 2"
  )

  assertthat::assert_that(
    is.numeric(y_lim) && length(y_lim) == 2,
    msg = "y_lim must be a numeric vector of length 2"
  )

  assertthat::assert_that(
    is.numeric(age_lim) && length(age_lim) == 2,
    msg = "age_lim must be a numeric vector of length 2"
  )

  assertthat::assert_that(
    is.character(sel_dataset_type) && length(sel_dataset_type) > 0,
    msg = "sel_dataset_type must be a character vector of length > 0"
  )

  assertthat::assert_that(
    is.character(sel_abiotic_var_name) && length(sel_abiotic_var_name) > 0,
    msg = "sel_abiotic_var_name must be a character vector of length > 0"
  )

  # Assemble the lazy query plan. On failure the condition object is
  # kept so its message can be re-thrown with context below.
  query_plan <-
    tryCatch(
      expr = {
        # Open the VegVault file
        vaultkeepr::open_vault(
          path = path_to_vegvault
        ) |>
          # Add the dataset information
          vaultkeepr::get_datasets() |>
          # Keep only the requested dataset types
          vaultkeepr::select_dataset_by_type(
            sel_dataset_type = sel_dataset_type
          ) |>
          # Restrict datasets to the requested geographic window
          vaultkeepr::select_dataset_by_geo(
            lat_lim = y_lim,
            long_lim = x_lim,
            verbose = FALSE
          ) |>
          # Add samples
          vaultkeepr::get_samples() |>
          # Restrict samples to the requested age range
          vaultkeepr::select_samples_by_age(
            age_lim = age_lim,
            verbose = FALSE
          ) |>
          # Add abiotic data
          vaultkeepr::get_abiotic_data(verbose = FALSE) |>
          # Keep only the requested abiotic variables
          vaultkeepr::select_abiotic_var_by_name(
            sel_var_name = sel_abiotic_var_name
          ) |>
          # Add taxa
          vaultkeepr::get_taxa()
      },
      error = function(e) e
    )

  if (inherits(query_plan, "error") || is.null(query_plan)) {
    # Preserve the original vaultkeepr error message when one exists;
    # a NULL plan without an error aborts with the two fixed bullets.
    plan_message <-
      if (inherits(query_plan, "error")) {
        conditionMessage(query_plan)
      } else {
        NULL
      }

    cli::cli_abort(
      c(
        "Failed to build the vaultkeepr query plan.",
        "i" = "No data available for the specified constraints.",
        "x" = plan_message
      )
    )
  }

  # Materialise the lazy plan into a data frame
  query_plan |>
    vaultkeepr::extract_data(
      return_raw_data = FALSE,
      verbose = FALSE
    )
}
| 1 |
#' @title Verify sjSDM Setup
#'
#' @description
#' Comprehensive verification of sjSDM installation, including Python
#' environment, PyTorch, CUDA support, and sjSDM functionality.
#' This function checks that Radian and sjSDM are using the same
#' Python environment and that all dependencies are properly installed.
#'
#' @param run_test_model Logical. Should a test model be fitted to
#' verify full functionality? Defaults to `interactive()`, so the test
#' model is fitted only in interactive sessions.
#'
#' @return Invisible list with verification results (logical flags
#' `radian_ok`, `python_ok`, `pytorch_ok`, `cuda_available`,
#' `sjsdm_ok`, `test_model_ok`). Called primarily
#' for side effects (printing verification status).
#'
#' @details
#' This function performs the following checks:
#' 1. Radian configuration (correct Python environment)
#' 2. Python version and location
#' 3. PyTorch installation and version
#' 4. CUDA/GPU availability
#' 5. sjSDM package installation
#' 6. sjSDM Python dependencies
#' 7. Test model fitting (optional)
#'
#' All checks print their status with [OK] (success) or [FAIL] (failure).
#' If any critical check fails, the function provides troubleshooting
#' guidance.
#'
#' NOTE(review): several printed fix instructions contain hard-coded
#' Windows paths for a specific user profile, and the `where radian`
#' lookup is Windows-only -- confirm before relying on this function on
#' other machines.
#'
#' @seealso
#' \code{\link[reticulate]{py_config}}
#' \code{\link[sjSDM]{sjSDM}}
#'
#' @export
verify_sjsdm_setup <- function(run_test_model = interactive()) {
  # Detect runtime environment: RStudio sets the RSTUDIO env var to "1"
  in_rstudio <- Sys.getenv("RSTUDIO") == "1"

  # Initialize results list; each flag is flipped to TRUE by the
  # corresponding check below
  results <-
    list(
      radian_ok = FALSE,
      python_ok = FALSE,
      pytorch_ok = FALSE,
      cuda_available = FALSE,
      sjsdm_ok = FALSE,
      test_model_ok = FALSE
    )

  cat("=============================================================\n")
  cat(" sjSDM Setup Verification\n")

  if (in_rstudio) {
    cat(" Running in: RStudio\n")
  } else {
    cat(" Running in: VS Code / Radian\n")
  }

  cat("=============================================================\n")

  #----------------------------------------------------------#
  # 1. Check Radian / Python environment configuration -----
  #----------------------------------------------------------#

  if (in_rstudio) {
    cat("1. Checking RStudio Python Configuration\n")
    cat(" ----------------------------------------\n")

    reticulate_python <- Sys.getenv("RETICULATE_PYTHON")

    # RETICULATE_PYTHON must point at the dedicated r-sjsdm environment
    if (
      nchar(reticulate_python) > 0 &&
        grepl("r-sjsdm", reticulate_python, ignore.case = TRUE)
    ) {
      results$radian_ok <- TRUE
      cat(" [OK] RETICULATE_PYTHON points to r-sjsdm environment\n")
      cat(" Path: ", reticulate_python, "\n")
    } else if (nchar(reticulate_python) > 0) {
      cat(" [WARN] RETICULATE_PYTHON is set but not pointing to r-sjsdm\n")
      cat(" Current: ", reticulate_python, "\n")
      cat(" Fix: Set RETICULATE_PYTHON in project .Renviron:\n")
      cat(" RETICULATE_PYTHON=C:/Users/ondre/AppData/Local/r-miniconda/envs/r-sjsdm/python.exe\n")
    } else {
      cat(" [WARN] RETICULATE_PYTHON not set in .Renviron\n")
      cat(" sjSDM may not find PyTorch\n")
      cat(" Fix: Add to project .Renviron file:\n")
      cat(" RETICULATE_PYTHON=C:/Users/ondre/AppData/Local/r-miniconda/envs/r-sjsdm/python.exe\n")
      cat(" Then restart R\n")
    }
  } else {
    cat("1. Checking Radian Configuration\n")
    cat(" ----------------------------------------\n")

    # Locate the radian binary; 'where' is a Windows command, so the
    # tryCatch falls back to NA when the lookup fails
    radian_path <-
      tryCatch(
        expr = {
          system("where radian", intern = TRUE)[1]
        },
        error = function(e) NA
      )

    expected_path <-
      "C:\\Users\\ondre\\AppData\\Local\\r-miniconda\\envs\\r-sjsdm\\Scripts\\radian.exe"

    if (
      isFALSE(is.na(radian_path)) &&
        grepl("r-sjsdm", radian_path, ignore.case = TRUE)
    ) {
      results$radian_ok <- TRUE
      cat(" [OK] Radian is from r-sjsdm environment\n")
      cat(" Path: ", radian_path, "\n")
    } else {
      cat(" [FAIL] Radian not from r-sjsdm environment\n")
      cat(" Current: ", radian_path, "\n")
      cat(" Expected: ", expected_path, "\n")
      cat("\n Fix: Update VS Code settings:\n")
      cat(' "r.rterm.windows": "', expected_path, '"\n', sep = "")
    }
  }

  cat("\n")

  #----------------------------------------------------------#
  # 2. Check Python Configuration -----
  #----------------------------------------------------------#

  cat("2. Checking Python Configuration\n")
  cat(" ----------------------------------------\n")

  # reticulate is required for all remaining checks; bail out early
  # (still returning the results list invisibly) when it is missing
  if (
    isFALSE(
      requireNamespace("reticulate", quietly = TRUE)
    )
  ) {
    cat(" [FAIL] reticulate package not installed\n")
    cat(" Fix: install.packages('reticulate')\n\n")
    return(invisible(results))
  }

  # py_config() initialises Python; NULL signals a broken configuration
  py_conf <-
    tryCatch(
      expr = {
        reticulate::py_config()
      },
      error = function(e) NULL
    )

  if (
    isFALSE(is.null(py_conf))
  ) {
    results$python_ok <- TRUE
    python_path <- py_conf$python
    python_version <- py_conf$version

    cat(" [OK] Python found\n")
    cat(" Version: ", as.character(python_version), "\n")
    cat(" Path: ", python_path, "\n")

    if (
      grepl("r-sjsdm", python_path, ignore.case = TRUE)
    ) {
      cat(" [OK] Using r-sjsdm environment\n")
    } else {
      cat(" [WARN] Warning: Not using r-sjsdm environment\n")
      cat(" This may cause issues\n")
    }
  } else {
    cat(" [FAIL] Python configuration failed\n")
  }

  cat("\n")

  #----------------------------------------------------------#
  # 3. Check PyTorch Installation -----
  #----------------------------------------------------------#

  cat("3. Checking PyTorch Installation\n")
  cat(" ----------------------------------------\n")

  # Import torch through reticulate; NULL means PyTorch is not available
  torch <-
    tryCatch(
      expr = {
        reticulate::import("torch")
      },
      error = function(e) NULL
    )

  if (
    isFALSE(is.null(torch))
  ) {
    results$pytorch_ok <- TRUE
    pytorch_version <- torch$`__version__`

    cat(" [OK] PyTorch is installed\n")
    cat(" Version: ", pytorch_version, "\n")

    # Check CUDA -- a failure here simply means CPU-only mode
    cuda_available <-
      tryCatch(
        expr = {
          torch$cuda$is_available()
        },
        error = function(e) FALSE
      )

    results$cuda_available <- cuda_available

    if (cuda_available) {
      cat(" [OK] CUDA available (GPU mode)\n")
      cat(" CUDA version: ", torch$version$cuda, "\n")

      device_name <-
        tryCatch(
          expr = {
            torch$cuda$get_device_name(0L)
          },
          error = function(e) "Unknown"
        )

      cat(" GPU: ", device_name, "\n")
    } else {
      cat(" [WARN] CUDA not available (CPU mode)\n")
      cat(" This is normal if you don't have NVIDIA GPU\n")
      cat(" sjSDM will work but slower for large datasets\n")
    }
  } else {
    cat(" [FAIL] PyTorch not found\n")
    cat("\n Fix: Reinstall PyTorch in r-sjsdm environment\n")
    cat(" Run in PowerShell:\n")
    cat(' & "C:\\Users\\ondre\\AppData\\Local\\r-miniconda\\Scripts\\conda.exe" run -n r-sjsdm pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121\n')
  }

  cat("\n")

  #----------------------------------------------------------#
  # 4. Check sjSDM Package -----
  #----------------------------------------------------------#

  cat("4. Checking sjSDM Package\n")
  cat(" ----------------------------------------\n")

  sjsdm_installed <-
    requireNamespace("sjSDM", quietly = TRUE)

  if (sjsdm_installed) {
    cat(" [OK] sjSDM package is installed\n")

    # Try to load sjSDM; a warning during attach is reported but still
    # counted as a successful load, only an error counts as failure
    sjsdm_loaded <-
      tryCatch(
        expr = {
          library(sjSDM)
          TRUE
        },
        error = function(e) FALSE,
        warning = function(w) {
          cat(" [WARN] Warning: ", conditionMessage(w), "\n")
          TRUE
        }
      )

    if (sjsdm_loaded) {
      results$sjsdm_ok <- TRUE
      cat(" [OK] sjSDM loaded successfully\n")
      cat(" All Python dependencies available\n")
    } else {
      cat(" [FAIL] sjSDM failed to load\n")
      cat("\n Fix: Reinstall sjSDM dependencies\n")
      cat(" sjSDM::install_sjSDM()\n")
    }
  } else {
    cat(" [FAIL] sjSDM package not installed\n")
    cat("\n Fix: install.packages('sjSDM')\n")
  }

  cat("\n")

  #----------------------------------------------------------#
  # 5. Run Test Model -----
  #----------------------------------------------------------#

  # Only attempted when requested AND sjSDM loaded successfully
  if (
    run_test_model &&
      results$sjsdm_ok
  ) {
    cat("5. Running Test Model\n")
    cat(" ----------------------------------------\n")

    test_result <-
      tryCatch(
        expr = {
          # Fixed seed so the simulated test community is reproducible
          set.seed(900723)
          community <-
            sjSDM::simulate_SDM(
              sites = 50,
              species = 5,
              env = 3
            )

          model <-
            sjSDM::sjSDM(
              Y = community$response,
              env = sjSDM::linear(
                data = community$env_weights,
                formula = ~ X1 + X2 + X3
              ),
              verbose = FALSE
            )

          cat(" [OK] Test model fitted successfully\n")
          cat(" LogLik: ", model$logLik[[1]], "\n")

          results$test_model_ok <- TRUE
          TRUE
        },
        error = function(e) {
          cat(" [FAIL] Test model failed\n")
          cat(" Error: ", conditionMessage(e), "\n")
          FALSE
        }
      )
  }

  #----------------------------------------------------------#
  # 6. Summary -----
  #----------------------------------------------------------#

  all_critical_ok <-
    results$python_ok &&
      results$pytorch_ok &&
      results$sjsdm_ok

  # In RStudio, radian_ok reflects RETICULATE_PYTHON configuration;
  # it is informational and doesn't block the critical check.

  if (!all_critical_ok) {
    cat("[FAIL] Some critical checks failed\n\n")
    cat("Issues found:\n")

    if (
      isFALSE(results$python_ok)
    ) {
      cat(" - Python configuration issue\n")
    }

    if (
      isFALSE(results$pytorch_ok)
    ) {
      cat(" - PyTorch not available\n")
    }

    if (
      isFALSE(results$sjsdm_ok)
    ) {
      cat(" - sjSDM not working\n")
    }

    cat("\nRefer to the detailed checks above for solutions.\n")
    cat("See: Documentation/Materials/sjSDM_installation_guide.md\n")
  }

  cat("\n=============================================================\n")

  return(
    invisible(results)
  )
}
| 1 |
#' @title Scale Abiotic Data for Model Fitting
#' @description
#' Centres (and, where appropriate, scales) the abiotic predictor
#' variables, records the scaling attributes for later
#' back-transformation, and returns both the scaled data frame and the
#' attributes as a named list.
#'
#' The `age` column is centred only (mean subtracted, no division by
#' SD). Every other variable is centred and additionally divided by
#' its standard deviation whenever more than one sample is present.
#' @param data_abiotic_wide
#' A data frame in wide format as returned by
#' `prepare_abiotic_for_fit()`, containing the columns `dataset_name`,
#' `age`, and one column per abiotic variable.
#' @return
#' A named list with two elements:
#' \describe{
#'   \item{`data_abiotic_scaled`}{A data frame with row names in the
#'   format `"<dataset_name>__<age>"`, an `age` column (centre-only
#'   scaled), and all other abiotic variable columns
#'   (centre-and-scale). Rows with any `NA` are dropped before
#'   scaling.}
#'   \item{`scale_attributes`}{A named list of `center` and `scale`
#'   attributes for each variable (including `age`), which can be used
#'   to back-transform predictions.}
#' }
#' @details
#' Rows with any `NA` across the abiotic variables are silently
#' dropped via `tidyr::drop_na()` before scaling. The returned
#' `scale_attributes` list preserves the same structure as
#' `attributes(scale(x))[-1]` (i.e., `dim` excluded).
#' @seealso [prepare_abiotic_for_fit()], [assemble_data_to_fit()]
#' @export
scale_abiotic_for_fit <- function(data_abiotic_wide = NULL) {
  assertthat::assert_that(
    is.data.frame(data_abiotic_wide),
    msg = "data_abiotic_wide must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "age") %in% names(data_abiotic_wide)),
    msg = paste0(
      "data_abiotic_wide must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  # 1. Drop rows with any NA -----

  data_complete <-
    tidyr::drop_na(data_abiotic_wide)

  # Dividing by the SD only makes sense with more than one sample
  divide_by_sd <-
    nrow(data_complete) > 1

  # 2. Capture scale attributes -----

  # Helper: scaling attributes of one vector, with the leading 'dim'
  # attribute removed (same structure as attributes(scale(x))[-1])
  capture_scale_attrs <- function(values, use_sd) {
    attributes(
      scale(values, center = TRUE, scale = use_sd)
    )[-1]
  }

  # 'age' (centre only) first, then every other abiotic variable
  # (centre and, when possible, scale)
  scale_attributes <-
    c(
      list(
        age = capture_scale_attrs(data_complete$age, FALSE)
      ),
      data_complete |>
        dplyr::select(-dataset_name, -age) |>
        purrr::map(
          \(values) capture_scale_attrs(values, divide_by_sd)
        )
    )

  # 3. Apply scaling and add row names -----

  data_abiotic_scaled <-
    data_complete |>
    dplyr::mutate(
      # Sample identifier becomes the row name below
      .row_name = paste0(dataset_name, "__", age),
      # age: centre only
      age = as.numeric(
        scale(age, center = TRUE, scale = FALSE)
      )
    ) |>
    dplyr::mutate(
      dplyr::across(
        .cols = -c(dataset_name, age, .row_name),
        .fns = \(values) as.numeric(
          scale(values, center = TRUE, scale = divide_by_sd)
        )
      )
    ) |>
    dplyr::select(-dataset_name) |>
    tibble::column_to_rownames(".row_name")

  # 4. Return list -----

  list(
    data_abiotic_scaled = data_abiotic_scaled,
    scale_attributes = scale_attributes
  )
}
| 1 |
#' @title Assemble Final Data List for Model Fitting
#' @description
#' Validates that the community matrix and scaled abiotic data
#' share the same sample rows in the same order, then bundles
#' them -- together with optional scaled spatial predictors --
#' into the named list expected by `fit_jsdm_model()`.
#' @param data_community_filtered
#' A numeric matrix with row names `"<dataset_name>__<age>"`
#' and taxon columns, as returned by `filter_constant_taxa()`.
#' @param data_abiotic_scaled_list
#' A named list with elements `data_abiotic_scaled` and
#' `scale_attributes`, as returned by
#' `scale_abiotic_for_fit()`. `data_abiotic_scaled` must have
#' row names matching those of the community matrix.
#' @param data_spatial_scaled_list
#' Optional. A named list with elements `data_spatial_scaled`
#' and `spatial_scale_attributes`, as returned by
#' `scale_spatial_for_fit()`. `data_spatial_scaled` must have
#' row names matching those of the community matrix. If `NULL`
#' (default), no spatial predictors are included.
#' @return
#' A named list with three mandatory and up to two optional elements:
#' \describe{
#'   \item{`data_community_to_fit`}{The (filtered) community matrix.}
#'   \item{`data_abiotic_to_fit`}{The scaled abiotic data frame.}
#'   \item{`scale_attributes`}{Abiotic scaling attributes for
#'   back-transformation.}
#'   \item{`data_spatial_to_fit`}{Scaled spatial predictor data frame
#'   (only present when `data_spatial_scaled_list` is supplied).}
#'   \item{`spatial_scale_attributes`}{Spatial scaling attributes
#'   (only present when `data_spatial_scaled_list` is supplied).}
#' }
#' @details
#' This function performs only validation and assembly; all data
#' transformations are handled by the preceding pipeline targets.
#' An error is raised if any two inputs differ in row count or row
#' name ordering. Row-name agreement is checked with `identical()`
#' rather than elementwise `==`, so a matrix with `NULL` row names can
#' no longer slip through the alignment check (with `==`, comparing
#' `NULL` against anything yields `logical(0)` and
#' `all(logical(0))` is `TRUE`).
#' @seealso [filter_constant_taxa()], [scale_abiotic_for_fit()],
#' [scale_spatial_for_fit()], [fit_jsdm_model()]
#' @export
assemble_data_to_fit <- function(
    data_community_filtered = NULL,
    data_abiotic_scaled_list = NULL,
    data_spatial_scaled_list = NULL) {
  assertthat::assert_that(
    is.matrix(data_community_filtered),
    msg = "data_community_filtered must be a matrix"
  )

  assertthat::assert_that(
    is.list(data_abiotic_scaled_list),
    all(
      c("data_abiotic_scaled", "scale_attributes") %in%
        names(data_abiotic_scaled_list)
    ),
    msg = paste0(
      "data_abiotic_scaled_list must be a list with elements",
      " 'data_abiotic_scaled' and 'scale_attributes'"
    )
  )

  if (
    !is.null(data_spatial_scaled_list)
  ) {
    assertthat::assert_that(
      is.list(data_spatial_scaled_list),
      all(
        c(
          "data_spatial_scaled",
          "spatial_scale_attributes"
        ) %in% names(data_spatial_scaled_list)
      ),
      msg = paste0(
        "data_spatial_scaled_list must be a list with",
        " elements 'data_spatial_scaled' and",
        " 'spatial_scale_attributes'"
      )
    )
  }

  # chuck() errors loudly if the element is missing
  data_abiotic_scaled <-
    data_abiotic_scaled_list |>
    purrr::chuck("data_abiotic_scaled")

  scale_attributes <-
    data_abiotic_scaled_list |>
    purrr::chuck("scale_attributes")

  # Validate row alignment: community vs abiotic -----

  assertthat::assert_that(
    nrow(data_community_filtered) == nrow(data_abiotic_scaled),
    msg = paste0(
      "Row counts of community and abiotic data",
      " must be identical"
    )
  )

  # identical() (not all(a == b)): catches NULL rownames on the matrix,
  # which elementwise '==' would silently pass as logical(0)
  assertthat::assert_that(
    identical(
      rownames(data_community_filtered),
      rownames(data_abiotic_scaled)
    ),
    msg = paste0(
      "Row names of community and abiotic data must be",
      " identical and in the same order"
    )
  )

  res <-
    list(
      data_community_to_fit = data_community_filtered,
      data_abiotic_to_fit = data_abiotic_scaled,
      scale_attributes = scale_attributes
    )

  if (
    !is.null(data_spatial_scaled_list)
  ) {
    data_spatial_scaled <-
      data_spatial_scaled_list |>
      purrr::chuck("data_spatial_scaled")

    spatial_scale_attributes <-
      data_spatial_scaled_list |>
      purrr::chuck("spatial_scale_attributes")

    assertthat::assert_that(
      is.data.frame(data_spatial_scaled),
      msg = "data_spatial_scaled must be a data frame"
    )

    assertthat::assert_that(
      nrow(data_spatial_scaled) ==
        nrow(data_community_filtered),
      msg = paste0(
        "Row count of spatial data must match",
        " the community matrix"
      )
    )

    # Same strict rowname comparison as for the abiotic data
    assertthat::assert_that(
      identical(
        rownames(data_spatial_scaled),
        rownames(data_community_filtered)
      ),
      msg = paste0(
        "Row names of spatial data must match",
        " those of the community matrix"
      )
    )

    res[["data_spatial_to_fit"]] <- data_spatial_scaled
    res[["spatial_scale_attributes"]] <-
      spatial_scale_attributes
  }

  return(res)
}
| 1 |
#' @title Select non-collinear predictors from abiotic data |
|
| 2 |
#' @description |
|
| 3 |
#' Filters a data frame of abiotic variables, retaining only predictors |
|
| 4 |
#' identified as non-collinear by a collinearity analysis. The selection |
|
| 5 |
#' is taken from the `result$selection` element of a `collinear_output` |
|
| 6 |
#' object (as returned by `get_predictor_collinearity()`). |
|
| 7 |
#' @param data_source |
|
| 8 |
#' A data frame containing abiotic variables. Must include a column |
|
| 9 |
#' named `abiotic_variable_name` whose values are matched against the |
|
| 10 |
#' selected predictors. |
|
| 11 |
#' @param collinearity_res |
|
| 12 |
#' A `collinear_output` object (as returned by |
|
| 13 |
#' `get_predictor_collinearity()`). Must contain a `result$selection` |
|
| 14 |
#' element ā a non-empty character vector of selected predictor names. |
|
| 15 |
#' @return |
|
| 16 |
#' A filtered data frame (same structure as `data_source`) containing |
|
| 17 |
#' only rows whose `abiotic_variable_name` is in the set of selected |
|
| 18 |
#' non-collinear predictors. |
|
| 19 |
#' @details |
|
| 20 |
#' Input validation is performed with `assertthat`. The function |
|
| 21 |
#' requires that the filtering produces at least one row; if no |
|
| 22 |
#' predictor names match, an error is raised suggesting the user check |
|
| 23 |
#' the collinearity results. |
|
| 24 |
#' @seealso [get_predictor_collinearity()] |
|
| 25 |
#' @export |
|
| 26 |
select_non_collinear_predictors <- function(data_source = NULL,
                                            collinearity_res = NULL) {
  assertthat::assert_that(
    is.data.frame(data_source),
    msg = "data_source must be a data frame"
  )

  # The filter below relies on this column; fail fast with a clear
  # message instead of a cryptic tidy-eval error inside
  # dplyr::filter().
  assertthat::assert_that(
    "abiotic_variable_name" %in% names(data_source),
    msg = "data_source must contain an 'abiotic_variable_name' column"
  )

  assertthat::assert_that(
    inherits(collinearity_res, "collinear_output"),
    msg = "collinearity_res must be a collinear_output object"
  )

  assertthat::assert_that(
    "result" %in% names(collinearity_res),
    msg = "collinearity_res should contain a 'result' element"
  )

  assertthat::assert_that(
    "selection" %in% names(collinearity_res$result),
    msg = "collinearity_res$result should contain a 'selection' element"
  )

  assertthat::assert_that(
    is.character(collinearity_res$result$selection),
    length(collinearity_res$result$selection) > 0,
    msg = "Selection of predictors should be a non-empty character vector"
  )

  # Keep only rows whose variable name was selected as
  # non-collinear. The `.data` pronoun (consistent with the rest of
  # the package) avoids an undefined-global NOTE and accidental
  # capture of a caller-environment variable of the same name.
  res <-
    data_source |>
    dplyr::filter(
      .data$abiotic_variable_name %in% collinearity_res$result$selection
    )

  assertthat::assert_that(
    nrow(res) > 0,
    msg = "No predictors selected after filtering. Check collinearity results."
  )

  return(res)
}
| 1 |
#' @title Recalculate ANOVA Components via Shapley-Allocated |
|
| 2 |
#' Percentages |
|
| 3 |
#' @description |
|
| 4 |
#' Converts a long-format ANOVA results table into per-age |
|
| 5 |
#' percentages for the three unique variance-partitioning |
|
| 6 |
#' components (Abiotic, Associations, Spatial) using a |
|
| 7 |
#' Shapley equal-split allocation of intersection terms. |
|
| 8 |
#' Negative Nagelkerke R² values are clamped to zero before |
|
| 9 |
#' any computation. |
|
| 10 |
#' @param data_source |
|
| 11 |
#' A non-empty data frame with columns: |
|
| 12 |
#' \describe{
|
|
| 13 |
#' \item{age}{Numeric. Age (cal yr BP) of the time slice.}
|
|
| 14 |
#' \item{component}{Character. Component label. May include
|
|
| 15 |
#' any of the seven sjSDM variance-partitioning labels: |
|
| 16 |
#' \code{"Abiotic"}, \code{"Associations"},
|
|
| 17 |
#' \code{"Spatial"}, \code{"Abiotic&Associations"},
|
|
| 18 |
#' \code{"Abiotic&Spatial"},
|
|
| 19 |
#' \code{"Associations&Spatial"}, and
|
|
| 20 |
#' \code{"Abiotic&Associations&Spatial"}.
|
|
| 21 |
#' Missing intersection labels are treated as zero.} |
|
| 22 |
#' \item{R2_Nagelkerke}{Numeric. Nagelkerke R² value.
|
|
| 23 |
#' Values below zero are clamped to 0 internally.} |
|
| 24 |
#' } |
|
| 25 |
#' @return |
|
| 26 |
#' A tibble with exactly 3 rows per age (one per unique |
|
| 27 |
#' component) and columns \code{age}, \code{component},
|
|
| 28 |
#' \code{R2_Nagelkerke_adjusted}, and
|
|
| 29 |
#' \code{R2_Nagelkerke_percentage}:
|
|
| 30 |
#' \describe{
|
|
| 31 |
#' \item{R2_Nagelkerke_adjusted}{Numeric.
|
|
| 32 |
#' Shapley-adjusted R² for the component: the component's |
|
| 33 |
#' unique fraction plus an equal share of every |
|
| 34 |
#' intersection term that includes it.} |
|
| 35 |
#' \item{R2_Nagelkerke_percentage}{Numeric. Percentage of
|
|
| 36 |
#' total adjusted R² explained by this component within |
|
| 37 |
#' the age slice: |
|
| 38 |
#' \code{R2_adjusted / sum(R2_adjusted) * 100}.
|
|
| 39 |
#' \code{NA_real_} when the age-slice total is zero.}
|
|
| 40 |
#' } |
|
| 41 |
#' @details |
|
| 42 |
#' \strong{Negative clamping:} All \code{R2_Nagelkerke}
|
|
| 43 |
#' values are clamped to \eqn{[0, \infty)} before
|
|
| 44 |
#' allocation. Negative values arise from suppressor effects |
|
| 45 |
#' and carry no directional attribution. |
|
| 46 |
#' |
|
| 47 |
#' \strong{Shapley equal-split allocation:} Each intersection
|
|
| 48 |
#' term is divided equally among its constituent unique |
|
| 49 |
#' components, irrespective of unique-fraction magnitudes. |
|
| 50 |
#' The three adjusted values therefore sum to the total of |
|
| 51 |
#' all seven (clamped) fractions, preserving total explained |
|
| 52 |
#' variance. Using equal splits rather than proportional |
|
| 53 |
#' splits avoids a feedback loop in which a larger unique |
|
| 54 |
#' fraction accumulates more shared variance, further |
|
| 55 |
#' inflating its apparent importance. |
|
| 56 |
#' |
|
| 57 |
#' Concretely (where missing fractions are treated as 0): |
|
| 58 |
#' \itemize{
|
|
| 59 |
#' \item Abiotic_adj = F_A + F_AB/2 + F_AS/2 + F_ABS/3 |
|
| 60 |
#' \item Assoc_adj = F_B + F_AB/2 + F_BS/2 + F_ABS/3 |
|
| 61 |
#' \item Spatial_adj = F_S + F_AS/2 + F_BS/2 + F_ABS/3 |
|
| 62 |
#' } |
|
| 63 |
#' @seealso [extract_anova_fractions()], |
|
| 64 |
#' [aggregate_anova_components()] |
|
| 65 |
#' @export |
|
| 66 |
recalculate_anova_components <- function(data_source) {
  assertthat::assert_that(
    is.data.frame(data_source),
    nrow(data_source) > 0,
    msg = "'data_source' must be a non-empty data frame."
  )

  assertthat::assert_that(
    all(
      c("age", "component", "R2_Nagelkerke") %in% colnames(data_source)
    ),
    msg = paste0(
      "'data_source' must contain columns",
      " 'age', 'component', and 'R2_Nagelkerke'."
    )
  )

  # Look up the clamped R2 of one component within a per-age slice.
  # Absent component rows count as zero.
  pull_clamped <- function(group_data, label) {
    hits <- which(group_data$component == label)
    if (length(hits) == 0L) {
      return(0)
    }
    group_data$R2_clamped[[hits[[1L]]]]
  }

  # Shapley equal-split allocation for one age group: every
  # intersection fraction is shared equally among the unique
  # components it involves, independent of how large their unique
  # fractions are. This prevents a rich-get-richer loop where a
  # larger unique fraction would attract even more shared variance.
  allocate_slice <- function(group_data, group_key) {
    frac_abiotic <- pull_clamped(group_data, "Abiotic")
    frac_assoc <- pull_clamped(group_data, "Associations")
    frac_spatial <- pull_clamped(group_data, "Spatial")
    frac_ab <- pull_clamped(group_data, "Abiotic&Associations")
    frac_as <- pull_clamped(group_data, "Abiotic&Spatial")
    frac_bs <- pull_clamped(group_data, "Associations&Spatial")
    frac_abs <- pull_clamped(group_data, "Abiotic&Associations&Spatial")

    tibble::tibble(
      component = c("Abiotic", "Associations", "Spatial"),
      R2_Nagelkerke_adjusted = c(
        frac_abiotic + frac_ab / 2 + frac_as / 2 + frac_abs / 3,
        frac_assoc + frac_ab / 2 + frac_bs / 2 + frac_abs / 3,
        frac_spatial + frac_as / 2 + frac_bs / 2 + frac_abs / 3
      )
    )
  }

  res <-
    data_source |>
    # Suppressor effects can produce negative fractions; they carry
    # no directional attribution, so clamp them at zero before any
    # allocation so they cannot reduce a component's share.
    dplyr::mutate(
      R2_clamped = pmax(.data$R2_Nagelkerke, 0)
    ) |>
    dplyr::group_by(.data$age) |>
    dplyr::group_modify(allocate_slice) |>
    dplyr::ungroup() |>
    dplyr::group_by(.data$age) |>
    dplyr::mutate(
      R2_Nagelkerke_percentage = {
        slice_total <- sum(.data$R2_Nagelkerke_adjusted)
        if (slice_total > 0) {
          .data$R2_Nagelkerke_adjusted / slice_total * 100
        } else {
          # Zero total explained variance: percentage undefined.
          rep(NA_real_, length(.data$R2_Nagelkerke_adjusted))
        }
      }
    ) |>
    dplyr::ungroup()

  return(res)
}
| 1 |
#' @title Interpolate 3-D Spatiotemporal MEVs to Prediction Grid |
|
| 2 |
#' @description |
|
| 3 |
#' Uses Inverse Distance Weighting (IDW, power = 2) in a |
|
| 4 |
#' z-scored 3-D space `(x_km, y_km, age_kyr)` to approximate |
|
| 5 |
#' spatiotemporal Moran Eigenvector Map (MEM) values from |
|
| 6 |
#' training samples to prediction locations at a given age. |
|
| 7 |
#' The interpolated values are then scaled using the training |
|
| 8 |
#' spatial scale attributes. |
|
| 9 |
#' @param data_st_mev_samples |
|
| 10 |
#' A data frame with row names in the format |
|
| 11 |
#' `"<dataset_name>__<age>"` and unscaled spatiotemporal MEV |
|
| 12 |
#' columns (`mev_1`, …), as returned by |
|
| 13 |
#' `compute_spatiotemporal_mev()`. |
|
| 14 |
#' @param data_coords_projected_train |
|
| 15 |
#' A data frame with `coord_x_km` and `coord_y_km` columns |
|
| 16 |
#' and `dataset_name` as row names (site level), as returned |
|
| 17 |
#' by `project_coords_to_metric()`. |
|
| 18 |
#' @param data_coords_projected_pred |
|
| 19 |
#' A data frame with `coord_x_km` and `coord_y_km` columns |
|
| 20 |
#' and arbitrary row names identifying prediction locations, |
|
| 21 |
#' as returned by `project_coords_to_metric()`. |
|
| 22 |
#' @param pred_age |
|
| 23 |
#' Numeric or integer scalar. Age in years BP at which |
|
| 24 |
#' predictions are made. Used as the temporal dimension of |
|
| 25 |
#' the prediction 3-D coordinate matrix (`age_kyr = |
|
| 26 |
#' pred_age / 1000`). |
|
| 27 |
#' @param spatial_scale_attributes |
|
| 28 |
#' A named list of `"scaled:center"` and `"scaled:scale"` |
|
| 29 |
#' attributes per ST-MEV column, as returned by |
|
| 30 |
#' `scale_spatial_for_fit()` in the `spatial_scale_attributes` |
|
| 31 |
#' element. |
|
| 32 |
#' @return |
|
| 33 |
#' A data frame with the same row names as |
|
| 34 |
#' `data_coords_projected_pred` and one column per ST-MEV |
|
| 35 |
#' (names matching `data_st_mev_samples`). All columns are |
|
| 36 |
#' scaled to match the training ST-MEV distribution. |
|
| 37 |
#' @details |
|
| 38 |
#' 3-D coordinates `(x_km, y_km, age_kyr)` of the training |
|
| 39 |
#' samples are z-scored using `colMeans` and column-wise |
|
| 40 |
#' standard deviations before computing Euclidean distances. |
|
| 41 |
#' This ensures the spatial and temporal extents contribute |
|
| 42 |
#' equally. The same z-score parameters are applied to the |
|
| 43 |
#' prediction grid coordinates, fixing the temporal dimension |
|
| 44 |
#' at `pred_age / 1000`. |
|
| 45 |
#' |
|
| 46 |
#' This function handles the **3-D spatiotemporal case**. |
|
| 47 |
#' For models fitted with `spatial_mode = "spatial"` use |
|
| 48 |
#' `interpolate_mev_to_grid()` instead. |
|
| 49 |
#' @seealso |
|
| 50 |
#' [compute_spatiotemporal_mev()], |
|
| 51 |
#' [interpolate_mev_to_grid()], |
|
| 52 |
#' [project_coords_to_metric()], |
|
| 53 |
#' [scale_spatial_for_fit()] |
|
| 54 |
#' @export |
|
| 55 |
interpolate_st_mev_to_grid <- function(
    data_st_mev_samples = NULL,
    data_coords_projected_train = NULL,
    data_coords_projected_pred = NULL,
    pred_age = NULL,
    spatial_scale_attributes = NULL) {
  assertthat::assert_that(
    is.data.frame(data_st_mev_samples),
    nrow(data_st_mev_samples) > 0,
    msg = "data_st_mev_samples must be a non-empty data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_coords_projected_train),
    all(
      c("coord_x_km", "coord_y_km") %in%
        base::names(data_coords_projected_train)
    ),
    msg = paste0(
      "data_coords_projected_train must be a data frame",
      " with columns 'coord_x_km' and 'coord_y_km'"
    )
  )

  assertthat::assert_that(
    is.data.frame(data_coords_projected_pred),
    all(
      c("coord_x_km", "coord_y_km") %in%
        base::names(data_coords_projected_pred)
    ),
    msg = paste0(
      "data_coords_projected_pred must be a data frame",
      " with columns 'coord_x_km' and 'coord_y_km'"
    )
  )

  assertthat::assert_that(
    (is.numeric(pred_age) || is.integer(pred_age)) &&
      length(pred_age) == 1L,
    msg = "pred_age must be a single numeric or integer value"
  )

  assertthat::assert_that(
    is.list(spatial_scale_attributes),
    length(spatial_scale_attributes) > 0,
    msg = "spatial_scale_attributes must be a non-empty list"
  )

  # 1. ST-MEV column names -----
  vec_st_mev_cols <-
    base::names(data_st_mev_samples)

  # Every ST-MEV column needs its own scale attributes; a missing
  # entry would otherwise fail deep inside the scaling step with an
  # unhelpful NULL-subscript error.
  assertthat::assert_that(
    all(vec_st_mev_cols %in% base::names(spatial_scale_attributes)),
    msg = paste0(
      "spatial_scale_attributes must contain an entry for every",
      " ST-MEV column of data_st_mev_samples"
    )
  )

  # 2. Reconstruct training 3-D raw matrix -----
  # Row names follow "dataset_name__age" format.
  data_train_3d_raw <-
    tibble::tibble(
      sample_id = base::rownames(data_st_mev_samples)
    ) |>
    tidyr::separate(
      col = sample_id,
      into = c("dataset_name", "age_chr"),
      sep = "__",
      extra = "merge"
    ) |>
    dplyr::mutate(age = base::as.integer(age_chr)) |>
    dplyr::select(-age_chr) |>
    dplyr::inner_join(
      data_coords_projected_train |>
        tibble::rownames_to_column("dataset_name"),
      by = dplyr::join_by(dataset_name)
    ) |>
    dplyr::mutate(age_kyr = age / 1000)

  # The inner join silently drops samples whose dataset_name has no
  # projected coordinates. That would desynchronise the IDW weight
  # matrix (built from joined rows) from the ST-MEV matrix (built
  # from all samples), so fail fast with a clear message instead of
  # a non-conformable-matrix error downstream.
  assertthat::assert_that(
    base::nrow(data_train_3d_raw) ==
      base::nrow(data_st_mev_samples),
    msg = paste0(
      "All dataset names in data_st_mev_samples row names must be",
      " present in data_coords_projected_train row names"
    )
  )

  mat_3d_train_raw <-
    data_train_3d_raw |>
    dplyr::select(coord_x_km, coord_y_km, age_kyr) |>
    base::as.matrix()

  # 3. Z-score parameters from training samples -----
  # Z-scoring ensures spatial and temporal extents
  # contribute equally to the Euclidean distance.
  vec_3d_center <-
    base::colMeans(mat_3d_train_raw)

  vec_3d_scale <-
    base::apply(mat_3d_train_raw, 2, stats::sd)

  # A zero-variance dimension (e.g. all samples from one age or one
  # site) would make scale() divide by zero and poison every
  # distance with NaN; reject such inputs explicitly.
  assertthat::assert_that(
    base::all(vec_3d_scale > 0),
    msg = paste0(
      "Training 3-D coordinates (x, y, age) must each vary across",
      " samples to allow z-scoring"
    )
  )

  mat_3d_train_z <-
    base::scale(
      mat_3d_train_raw,
      center = vec_3d_center,
      scale = vec_3d_scale
    )

  # 4. Prediction 3-D matrix: grid coords + pred_age -----
  mat_3d_pred_raw <-
    data_coords_projected_pred |>
    dplyr::select(coord_x_km, coord_y_km) |>
    dplyr::mutate(age_kyr = pred_age / 1000) |>
    base::as.matrix()

  # Apply same z-score params as training data -----
  mat_3d_pred_z <-
    base::scale(
      mat_3d_pred_raw,
      center = vec_3d_center,
      scale = vec_3d_scale
    )

  # 5. 3-D Euclidean distances (rows = pred, cols = train) -----
  mat_dist_3d <-
    base::sqrt(
      base::outer(
        mat_3d_pred_z[, 1], mat_3d_train_z[, 1], `-`
      )^2 +
        base::outer(
          mat_3d_pred_z[, 2], mat_3d_train_z[, 2], `-`
        )^2 +
        base::outer(
          mat_3d_pred_z[, 3], mat_3d_train_z[, 3], `-`
        )^2
    )

  # 6. IDW weights (power = 2, epsilon avoids div-by-zero) -----
  mat_idw_weights_3d <-
    1 / (mat_dist_3d^2 + 1e-10)

  # Normalise so each prediction row's weights sum to 1.
  mat_idw_weights_3d <-
    mat_idw_weights_3d / base::rowSums(mat_idw_weights_3d)

  # 7. Interpolate unscaled ST-MEV values -----
  mat_train_st_mev <-
    data_st_mev_samples |>
    dplyr::select(dplyr::all_of(vec_st_mev_cols)) |>
    base::as.matrix()

  data_pred_st_mev_raw <-
    base::as.data.frame(
      mat_idw_weights_3d %*% mat_train_st_mev
    )

  base::colnames(data_pred_st_mev_raw) <- vec_st_mev_cols
  base::rownames(data_pred_st_mev_raw) <-
    base::rownames(data_coords_projected_pred)

  # 8. Scale using training spatial scale attributes -----
  # Apply the per-column training center/scale so the interpolated
  # ST-MEVs match the distribution the model was fitted on.
  data_pred_st_mev_scaled <-
    data_pred_st_mev_raw |>
    dplyr::mutate(
      dplyr::across(
        .cols = dplyr::everything(),
        .fns = ~ {
          col_nm <- dplyr::cur_column()
          center <- base::as.numeric(
            spatial_scale_attributes[[col_nm]][["scaled:center"]]
          )
          sc <- base::as.numeric(
            spatial_scale_attributes[[col_nm]][["scaled:scale"]]
          )
          (.x - center) / sc
        }
      )
    )

  return(data_pred_st_mev_scaled)
}
| 1 |
#' @title Filter Constant Taxa from Community Matrix |
|
| 2 |
#' @description |
|
| 3 |
#' Removes taxa that show no variation across all samples. A |
|
| 4 |
#' taxon is considered constant when its standard deviation is |
|
| 5 |
#' zero, meaning every sample has the same value. Constant |
|
| 6 |
#' taxa cannot contribute to any model likelihood and must be |
|
| 7 |
#' excluded before fitting. Filtering is applied regardless of |
|
| 8 |
#' the error family (binomial, Gaussian, Poisson, beta, etc.). |
|
| 9 |
#' @param data_community_matrix |
|
| 10 |
#' A numeric matrix with samples as rows and taxa as columns, |
|
| 11 |
#' as returned by `prepare_community_for_fit()`. |
|
| 12 |
#' @return |
|
| 13 |
#' A numeric matrix of the same structure as the input, with |
|
| 14 |
#' all constant taxa (standard deviation equal to zero) removed. |
|
| 15 |
#' If no taxa are constant the input matrix is returned |
|
| 16 |
#' unchanged. |
|
| 17 |
#' @details |
|
| 18 |
#' Variation is assessed by computing `stats::sd()` for each |
|
| 19 |
#' column via `purrr::map_dbl()`. A column is retained only |
|
| 20 |
#' when its standard deviation is strictly greater than zero. |
|
| 21 |
#' This family-agnostic approach replaces the previous |
|
| 22 |
#' binomial-only binarisation check and is now a tracked |
|
| 23 |
#' pipeline target. |
|
| 24 |
#' @seealso [prepare_community_for_fit()], |
|
| 25 |
#' [assemble_data_to_fit()] |
|
| 26 |
#' @export |
|
| 27 |
filter_constant_taxa <- function(
    data_community_matrix = NULL) {
  assertthat::assert_that(
    is.matrix(data_community_matrix),
    msg = "data_community_matrix must be a matrix"
  )

  # Per-column standard deviation, iterating over column *indices*
  # rather than column names: the previous colnames()-based
  # iteration produced a zero-length result for a matrix without
  # colnames and silently dropped every taxon. vapply() also
  # guarantees a numeric(1) result per column.
  vec_col_sd <-
    vapply(
      seq_len(ncol(data_community_matrix)),
      function(idx) stats::sd(data_community_matrix[, idx]),
      numeric(1)
    )

  # A taxon is kept only when it varies (sd strictly > 0);
  # constant taxa cannot contribute to any model likelihood,
  # regardless of the error family.
  vec_is_variable <-
    vec_col_sd > 0

  data_community_matrix <-
    data_community_matrix[
      ,
      vec_is_variable,
      drop = FALSE
    ]

  return(data_community_matrix)
}
| 1 |
#' @title Get Community Taxa |
|
| 2 |
#' @description |
|
| 3 |
#' Extracts a vector of unique taxa from a community data frame. |
|
| 4 |
#' @param data A data frame containing a 'taxon' column. |
|
| 5 |
#' @return A character vector of unique taxon names present in the data. |
|
| 6 |
#' @details |
|
| 7 |
#' Uses dplyr to select distinct values from the 'taxon' column and returns |
|
| 8 |
#' them as a vector. |
|
| 9 |
#' @export |
|
| 10 |
get_community_taxa <- function(data) {
  # Validate that we received a data frame at all ...
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  # ... and that the column we extract from is present.
  assertthat::assert_that(
    "taxon" %in% colnames(data),
    msg = "data must contain a 'taxon' column"
  )

  # De-duplicate first, then extract the column as a plain vector.
  data_distinct <- dplyr::distinct(data, taxon)
  vec_taxa <- dplyr::pull(data_distinct, taxon)

  return(vec_taxa)
}
| 1 |
#' @title Get Scale ID from Current Targets Store Path |
|
| 2 |
#' @description |
|
| 3 |
#' Determines whether the currently active targets store corresponds |
|
| 4 |
#' to a spatial unit by inspecting the store path and checking it |
|
| 5 |
#' against the project's spatial grid CSV catalogue. Returns the |
|
| 6 |
#' `scale_id` string for spatial pipelines, or `NULL` for |
|
| 7 |
#' non-spatial (named-project) pipelines. |
|
| 8 |
#' @param store |
|
| 9 |
#' A single character string giving the path to the targets data |
|
| 10 |
#' store. Default: `targets::tar_path_store()`, which resolves |
|
| 11 |
#' correctly both inside target commands (including callr worker |
|
| 12 |
#' processes) and when called interactively. |
|
| 13 |
#' @param file |
|
| 14 |
#' Path to the spatial grid CSV catalogue file. |
|
| 15 |
#' Default: `here::here("Data/Input/spatial_grid.csv")`.
|
|
| 16 |
#' @return |
|
| 17 |
#' A single character string with the `scale_id` when the store |
|
| 18 |
#' path corresponds to a spatial unit in the CSV catalogue, or |
|
| 19 |
#' `NULL` otherwise. |
|
| 20 |
#' @details |
|
| 21 |
#' The store path convention for spatial pipelines is: |
|
| 22 |
#' `{target_store}/{scale_id}/{pipeline_name}/`. The function
|
|
| 23 |
#' extracts the second-to-last path component via |
|
| 24 |
#' `basename(dirname(store))` and checks whether it appears in the |
|
| 25 |
#' `scale_id` column of the spatial grid CSV. For non-spatial |
|
| 26 |
#' pipelines (e.g. `Data/targets/project_cz/pipeline_basic/`) the |
|
| 27 |
#' candidate (`"project_cz"`) is not in the CSV, so `NULL` is |
|
| 28 |
#' returned. When the CSV file does not exist the function returns |
|
| 29 |
#' `NULL` gracefully. |
|
| 30 |
#' @seealso get_spatial_window, get_spatial_model_params |
|
| 31 |
#' @export |
|
| 32 |
get_scale_id_from_store <- function(
    store = targets::tar_path_store(),
    file = here::here("Data/Input/spatial_grid.csv")) {
  # class(x) can be a vector (e.g. c("tbl_df", "data.frame"));
  # collapse it so the assertion message stays a single string
  # instead of being silently vectorised by paste0().
  assertthat::assert_that(
    is.character(store) && length(store) == 1,
    msg = paste0(
      "`store` must be a single character string.",
      " Got class: ", paste(class(store), collapse = "/"),
      ", length: ", length(store)
    )
  )

  # Graceful NULL when the catalogue is absent (e.g. isolated test env)
  if (!file.exists(file)) {
    return(NULL)
  }

  # Spatial store convention: {target_store}/{scale_id}/{pipeline_name}
  # so basename(dirname(store)) is the scale_id for spatial pipelines
  # and the project key (e.g. "project_cz") for non-spatial ones.
  potential_id <-
    basename(dirname(store))

  data_grid <-
    readr::read_csv(
      file = file,
      show_col_types = FALSE
    )

  # A catalogue without a 'scale_id' column cannot match anything;
  # treat it like a non-spatial pipeline instead of emitting a
  # tibble unknown-column warning via `$`.
  if (!"scale_id" %in% names(data_grid)) {
    return(NULL)
  }

  if (potential_id %in% data_grid$scale_id) {
    return(potential_id)
  }

  return(NULL)
}
| 1 |
#' @title Compute Moran Eigenvector Maps for Spatial Filtering |
|
| 2 |
#' @description |
|
| 3 |
#' Computes Moran Eigenvector Maps (MEMs) from projected |
|
| 4 |
#' core locations (km) using |
|
| 5 |
#' `sjSDM::generateSpatialEV()` and returns the first |
|
| 6 |
#' `n_mev` eigenvectors as a data frame suitable for |
|
| 7 |
#' `prepare_spatial_predictors_for_fit()`. |
|
| 8 |
#' @param data_coords_projected |
|
| 9 |
#' A data frame with `dataset_name` as row names and |
|
| 10 |
#' columns `coord_x_km` and `coord_y_km`, as returned by |
|
| 11 |
#' `project_coords_to_metric()`. Must have more than 3 rows |
|
| 12 |
#' (required by `sjSDM::generateSpatialEV()`). Each row |
|
| 13 |
#' represents one unique core/site location. |
|
| 14 |
#' @param n_mev |
|
| 15 |
#' A positive integer giving the number of eigenvectors to |
|
| 16 |
#' return. The actual count of positive Moran eigenvectors |
|
| 17 |
#' produced by `sjSDM::generateSpatialEV()` for the |
|
| 18 |
#' supplied coordinates depends on the spatial structure |
|
| 19 |
#' of the sites and is typically small (often 2). If |
|
| 20 |
#' `n_mev` exceeds the available count, it is clamped |
|
| 21 |
#' down automatically and a `cli::cli_warn()` message is |
|
| 22 |
#' emitted. Default is `20L`. |
|
| 23 |
#' @return |
|
| 24 |
#' A data frame with the same row names as |
|
| 25 |
#' `data_coords_projected`, and `n_mev` columns named |
|
| 26 |
#' `mev_1`, `mev_2`, …, `mev_{n_mev}`, containing the
|
|
| 27 |
#' first `n_mev` Moran eigenvectors. This data frame is a |
|
| 28 |
#' drop-in replacement for |
|
| 29 |
#' `dplyr::select(data_coords_projected, |
|
| 30 |
#' coord_x_km, coord_y_km)` as input to |
|
| 31 |
#' `prepare_spatial_predictors_for_fit()`. |
|
| 32 |
#' @details |
|
| 33 |
#' MEMs capture the spatial autocorrelation structure of |
|
| 34 |
#' the sampling locations and are used as spatial |
|
| 35 |
#' predictors in the sjSDM model. Eigenvectors are |
|
| 36 |
#' computed on the unique core locations; the caller is |
|
| 37 |
#' responsible for expanding to sample level via |
|
| 38 |
#' `prepare_spatial_predictors_for_fit()`. |
|
| 39 |
#' |
|
| 40 |
#' `sjSDM::generateSpatialEV()` returns only eigenvectors |
|
| 41 |
#' with positive eigenvalues; the count often equals 2 for |
|
| 42 |
#' 2-D coordinate sets. If `n_mev` exceeds the number of |
|
| 43 |
#' positive eigenvectors actually produced, the function |
|
| 44 |
#' automatically lowers `n_mev` to that count, emits a |
|
| 45 |
#' `cli::cli_warn()` message, and continues normally. |
|
| 46 |
#' @seealso |
|
| 47 |
#' [project_coords_to_metric()], |
|
| 48 |
#' [prepare_spatial_predictors_for_fit()], |
|
| 49 |
#' [scale_spatial_for_fit()] |
|
| 50 |
#' @export |
|
| 51 |
compute_spatial_mev <- function(
    data_coords_projected = NULL,
    n_mev = 20L) {
  assertthat::assert_that(
    is.data.frame(data_coords_projected),
    msg = "data_coords_projected must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("coord_x_km", "coord_y_km") %in%
        names(data_coords_projected)
    ),
    msg = paste0(
      "data_coords_projected must contain columns",
      " 'coord_x_km' and 'coord_y_km'"
    )
  )

  assertthat::assert_that(
    nrow(data_coords_projected) > 3,
    msg = paste0(
      "data_coords_projected must have more than 3 rows",
      " (required by sjSDM::generateSpatialEV())"
    )
  )

  assertthat::assert_that(
    is.numeric(n_mev) || is.integer(n_mev),
    length(n_mev) == 1,
    n_mev >= 1,
    msg = "n_mev must be a single positive integer"
  )

  n_mev <- as.integer(n_mev)

  # Coordinate matrix (km) feeding the eigenvector decomposition.
  mat_km <-
    as.matrix(
      dplyr::select(data_coords_projected, coord_x_km, coord_y_km)
    )

  # Moran eigenvectors; sjSDM keeps only positive-eigenvalue axes.
  # A single positive eigenvalue comes back as a bare vector, so
  # force a matrix to keep column-wise handling uniform.
  mat_eigenvectors <-
    as.matrix(
      sjSDM::generateSpatialEV(coords = mat_km)
    )

  n_produced <- ncol(mat_eigenvectors)

  # Clamp the request when fewer positive eigenvectors exist,
  # warning so the caller knows the effective dimensionality.
  if (n_produced < n_mev) {
    cli::cli_warn(
      c(
        "{n_mev} MEV(s) requested; only {n_produced} positive.",
        "i" = "Lowering n_mev from {n_mev} to {n_produced}."
      )
    )
    n_mev <- n_produced
  }

  # Keep the first n_mev columns, named mev_1 ... mev_{n_mev},
  # with row names carried over from the input sites.
  res <-
    as.data.frame(
      mat_eigenvectors[, seq_len(n_mev), drop = FALSE]
    )

  colnames(res) <- paste0("mev_", seq_len(n_mev))
  rownames(res) <- rownames(data_coords_projected)

  return(res)
}
| 1 |
#' @title Check for missing taxa and stop pipeline if any are found |
|
| 2 |
#' @description |
|
| 3 |
#' Validates that all community taxa have been classified. When |
|
| 4 |
#' missing taxa are found, stops with an informative error message |
|
| 5 |
#' listing the count of unclassified taxa. Returns `TRUE` invisibly |
|
| 6 |
#' when no taxa are missing. |
|
| 7 |
#' @param vec_taxa_without_classification |
|
| 8 |
#' A character vector of taxon names that could not be classified |
|
| 9 |
#' automatically or via the auxiliary table. An empty vector |
|
| 10 |
#' (`character(0)`) signals full coverage and causes the function |
|
| 11 |
#' to return silently. |
|
| 12 |
#' @return |
|
| 13 |
#' `TRUE` invisibly when every taxon is classified. |
|
| 14 |
#' Stops with an error when any taxa are missing. |
|
| 15 |
#' @details |
|
| 16 |
#' The missing taxa are stored as a targets object |
|
| 17 |
#' `data_missing_taxa_template` in the pipeline store. Inspect them |
|
| 18 |
#' with `targets::tar_read("data_missing_taxa_template")`. The
|
|
| 19 |
#' object is a `tibble` with columns `sel_name`, `kingdom`, |
|
| 20 |
#' `phylum`, `class`, `order`, `family`, `genus`, and `species`; |
|
| 21 |
#' rank columns are left as `NA` for manual completion. Fill in the |
|
| 22 |
#' missing classifications and copy or append rows to |
|
| 23 |
#' `Data/Input/aux_classification_table.csv`, then re-run the |
|
| 24 |
#' pipeline. Use |
|
| 25 |
#' `R/03_Supplementary_analyses/Make_auxiliary_classification_table.R` |
|
| 26 |
#' to coalesce templates across all pipeline stores into one CSV. |
|
| 27 |
#' @seealso |
|
| 28 |
#' [get_aux_classification_table()], |
|
| 29 |
#' [combine_classification_tables()], |
|
| 30 |
#' [get_taxa_without_classification()] |
|
| 31 |
#' @export |
|
| 32 |
check_and_report_missing_taxa <- function(
    vec_taxa_without_classification) {
  assertthat::assert_that(
    is.character(vec_taxa_without_classification),
    msg = paste(
      "vec_taxa_without_classification must be",
      "a character vector"
    )
  )

  n_missing <- length(vec_taxa_without_classification)

  # Any unclassified taxon halts the pipeline with guidance on how
  # to complete the auxiliary classification table and re-run.
  if (n_missing > 0) {
    stop(
      paste0(
        n_missing,
        " taxon/taxa could not be classified.\n",
        "Inspect missing taxa with:\n",
        " targets::tar_read('data_missing_taxa_template')\n",
        "Fill in the missing classifications and copy/append to\n",
        " Data/Input/aux_classification_table.csv\n",
        "then re-run the pipeline."
      ),
      call. = FALSE
    )
  }

  # Full coverage: succeed silently.
  invisible(TRUE)
}
#' @title Extract Dataset Name from String
#' @description
#' Pulls the dataset name out of each string in a vector by keeping
#' the characters preceding the double underscore ("__") delimiter.
#' @param vec_names
#' A character vector of names formatted as "dataset__something".
#' @return
#' A character vector of dataset names, whitespace-trimmed.
#' @export
get_dataset_name_from_string <- function(vec_names) {
  assertthat::assert_that(
    is.character(vec_names),
    msg = "Input must be a character vector."
  )

  assertthat::assert_that(
    length(vec_names) > 0,
    msg = "Input vector must not be empty."
  )

  assertthat::assert_that(
    all(stringr::str_detect(vec_names, "__")),
    msg = "Input strings must contain '__' to extract dataset names."
  )

  # Greedy match keeps everything up to the final "__" delimiter,
  # then surrounding whitespace is stripped.
  res <- stringr::str_trim(
    stringr::str_extract(vec_names, ".*(?=__)")
  )

  return(res)
}
#' @title Select N Taxa
#' @description
#' Keeps only the top N taxa ranked by how many groups (by default,
#' datasets) they occur in.
#' @param data
#' A data frame with a "taxon" column and the grouping column named
#' by `per`.
#' @param n_taxa
#' Number of taxa to retain. Default is Inf (keep all).
#' @param per
#' Name of the column to count occurrences across. Default is
#' "dataset_name".
#' @return
#' The input data frame restricted to rows of the selected taxa.
#' @details
#' Occurrence is counted on distinct (taxon, group) pairs, so a
#' taxon appearing many times within one group counts once. Ties at
#' the cut-off are kept (slice_max default). An error is raised if
#' the filtered result is empty.
#' @export
select_n_taxa <- function(
    data = NULL,
    n_taxa = Inf,
    per = "dataset_name") {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    is.character(per) && length(per) == 1,
    msg = "per must be a single character string"
  )

  assertthat::assert_that(
    all(
      c("taxon", per) %in% names(data)
    ),
    msg = paste(
      "data must contain the following columns:",
      paste(c("taxon", per), collapse = ", ")
    )
  )

  assertthat::assert_that(
    is.numeric(n_taxa),
    msg = "n_taxa must be a number"
  )

  assertthat::assert_that(
    n_taxa > 0,
    msg = "n_taxa must be greater than 0"
  )

  # Rank taxa by the number of distinct groups they occur in and
  # keep the n_taxa most widespread ones (ties included).
  vec_selected_taxa <-
    data |>
    dplyr::distinct(taxon, !!rlang::sym(per)) |>
    dplyr::count(taxon, name = "n_datasets") |>
    dplyr::slice_max(n = n_taxa, order_by = n_datasets) |>
    dplyr::pull(taxon)

  res <-
    data |>
    dplyr::filter(taxon %in% vec_selected_taxa)

  assertthat::assert_that(
    nrow(res) > 0,
    msg = paste(
      "No taxa found in data. Please check the input data.",
      "The number of taxa selected is too high."
    )
  )

  return(res)
}
#' @title Prepare Spatial Predictors for Model Fitting
#' @description
#' Expands dataset-level spatial predictors to the sample level by
#' joining against the canonical `(dataset_name, age)` sample index,
#' yielding rows aligned with the community matrix and abiotic data
#' used for model fitting.
#' @param data_spatial
#' Data frame keyed by `dataset_name` row names, holding one or more
#' spatial predictor columns (e.g. `coord_x_km`, `coord_y_km` from
#' `project_coords_to_metric()`).
#' @param data_sample_ids
#' Data frame of valid `(dataset_name, age)` pairs, as produced by
#' `align_sample_ids()`.
#' @return
#' A data frame with `"<dataset_name>__<age>"` row names and the
#' same predictor columns as `data_spatial`, sorted by
#' `dataset_name` then `age`. Rows containing any `NA` predictor
#' value are dropped.
#' @details
#' Spatial predictors live at the dataset level (one row per site),
#' but models need one row per sample (site x time-slice), so each
#' dataset's values are replicated across its valid sample ages.
#' The row-name format matches the community matrix and abiotic
#' data. Unlike `prepare_coords_for_fit`, no assumptions are made
#' about which spatial predictor columns are present.
#' @seealso
#' [project_coords_to_metric()], [align_sample_ids()],
#' [assemble_data_to_fit()], [prepare_coords_for_fit()]
#' @export
prepare_spatial_predictors_for_fit <- function(
    data_spatial = NULL,
    data_sample_ids = NULL) {
  assertthat::assert_that(
    is.data.frame(data_spatial),
    msg = "data_spatial must be a data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_sample_ids),
    msg = "data_sample_ids must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "age") %in% names(data_sample_ids)),
    msg = paste0(
      "data_sample_ids must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  assertthat::assert_that(
    nrow(data_spatial) > 0,
    msg = "data_spatial must have at least one row"
  )

  assertthat::assert_that(
    ncol(data_spatial) > 0,
    msg = "data_spatial must have at least one column"
  )

  # Promote row names to an explicit key column so they can be joined.
  data_spatial_keyed <-
    tibble::rownames_to_column(data_spatial, "dataset_name")

  # One row per valid (dataset, age) sample; the dataset's spatial
  # values are replicated across all of its ages.
  data_expanded <-
    dplyr::inner_join(
      data_sample_ids,
      data_spatial_keyed,
      by = dplyr::join_by(dataset_name)
    )

  # Drop incomplete predictor rows, order to match the community
  # matrix, and encode the sample identity in the row names.
  res <-
    data_expanded |>
    tidyr::drop_na() |>
    dplyr::arrange(dataset_name, age) |>
    dplyr::mutate(
      .row_name = paste0(dataset_name, "__", age)
    ) |>
    dplyr::select(-dataset_name, -age) |>
    tibble::column_to_rownames(".row_name")

  return(res)
}
#' @title Get Community Data
#' @description
#' Extracts the `dataset_name` and `data_community` columns from a
#' nested data frame and unnests the latter.
#' @param data
#' A data frame containing at least the columns `dataset_name` and
#' `data_community` (a list-column of nested data frames).
#' @return
#' A data frame of `dataset_name` alongside the unnested contents of
#' `data_community`.
#' @details
#' Input validation confirms the object is a data frame carrying the
#' two required columns before selecting and unnesting.
#' @export
get_community_data <- function(data = NULL) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "data_community") %in% colnames(data)),
    msg = "data must contain columns 'dataset_name' and 'data_community'"
  )

  res <-
    data |>
    dplyr::select(dataset_name, data_community) |>
    tidyr::unnest(data_community)

  return(res)
}
#' @title Run Pipeline
#' @description
#' Executes a targets pipeline from a specified script and saves progress
#' visualization. Prevents execution if default configuration is active.
#' @param sel_script
#' Path to the pipeline script to execute (relative to project root).
#' @param store_suffix
#' Optional character string appended as a sub-directory between the
#' config-derived target store root and the pipeline name.
#' When `NULL` (default) the path is
#' `{target_store}/{pipeline_name}/` - identical to the original
#' behaviour. When set, the path becomes
#' `{target_store}/{store_suffix}/{pipeline_name}/`.
#' Useful when iterating over many spatial units that share one config
#' but each need an isolated store (e.g. `store_suffix = "eu_r01"`).
#' @param level_separation
#' Numeric value controlling the vertical separation between levels in the
#' progress visualization network graph. Default is 100.
#' @param check_default_config
#' Logical indicating whether to check if the default configuration is
#' active and stop execution if TRUE. Default is TRUE.
#' @param fresh_run
#' Logical indicating whether to destroy the existing target store before
#' running the pipeline, forcing all targets to be re-computed from
#' scratch. When `TRUE`, calls
#' `targets::tar_destroy(destroy = "all", store = ...)` prior to
#' `targets::tar_make()`. Default is `FALSE`.
#' @param plot_progress
#' Logical indicating whether to save a progress visualisation after the
#' pipeline completes. Default is TRUE.
#' @return
#' No return value. Function is called for side effects: executes the
#' targets pipeline and saves progress visualization to the documentation
#' folder.
#' @details
#' The function constructs pipeline-specific target store paths based on
#' the script name and active configuration. When `fresh_run = TRUE`,
#' the target store is destroyed with
#' `targets::tar_destroy(destroy = "all")` before execution so that every
#' target is rebuilt from scratch. It uses `targets::tar_make()` to
#' execute the pipeline and then calls `save_progress_visualisation()` to
#' generate a network visualization of the pipeline status.
#'
#' NOTE(review): only the `store_suffix` branch pipes the store path
#' through `here::here()`; the `NULL` branch uses the raw `paste0()`
#' result. Whether both resolve to the same location depends on the
#' `target_store` config value - confirm this asymmetry is intended.
#' @seealso save_progress_visualisation, targets::tar_make
#' @export
run_pipeline <- function(
    sel_script,
    store_suffix = NULL,
    level_separation = 100,
    check_default_config = TRUE,
    plot_progress = TRUE,
    fresh_run = FALSE) {
  assertthat::assert_that(
    is.character(sel_script),
    length(sel_script) == 1,
    msg = "sel_script must be a single string specifying the path to the pipeline script."
  )
  assertthat::assert_that(
    file.exists(sel_script),
    msg = paste(
      "The specified script does not exist:", sel_script, "\n",
      "Please provide a valid path relative to the project root."
    )
  )

  assertthat::assert_that(
    is.null(store_suffix) ||
      (
        is.character(store_suffix) && length(store_suffix) == 1
      ),
    msg = "store_suffix must be NULL or a single string."
  )

  assertthat::assert_that(
    is.numeric(level_separation) &&
      length(level_separation) == 1 &&
      level_separation >= 0,
    msg = "level_separation must be a non-negative number."
  )

  assertthat::assert_that(
    assertthat::is.flag(check_default_config),
    msg = "check_default_config must be a single logical value (TRUE or FALSE)."
  )

  assertthat::assert_that(
    assertthat::is.flag(plot_progress),
    msg = "plot_progress must be a single logical value (TRUE or FALSE)."
  )

  assertthat::assert_that(
    assertthat::is.flag(fresh_run),
    msg = "fresh_run must be a single logical value (TRUE or FALSE)."
  )

  # Refuse to run against the "default" config, which is assumed to
  # be a placeholder: the user must select a concrete configuration.
  if (
    isTRUE(check_default_config) && config::is_active("default")
  ) {
    stop(
      paste(
        "The default config is active. Please set specific config.", "\n",
        "See `config.yaml` for available options.", "\n",
        "use Sys.setenv(R_CONFIG_ACTIVE = 'XXX') to set the config."
      )
    )
  }

  # Resolve the script path against the project root.
  sel_script_path <-
    here::here(sel_script)

  # Pipeline name = script file name without the .R extension.
  sel_pipeline_name <-
    stringr::str_replace(
      string = basename(sel_script_path),
      pattern = ".R$",
      replacement = ""
    )

  # Build the store path: {target_store}/[{store_suffix}/]{pipeline_name}/
  # NOTE(review): only the suffix branch goes through here::here() -
  # confirm the asymmetry with the NULL branch is intentional.
  sel_store_path <-
    if (
      is.null(store_suffix)
    ) {
      paste0(
        get_active_config("target_store"), "/",
        sel_pipeline_name, "/"
      )
    } else {
      {
        paste0(
          get_active_config("target_store"), "/",
          store_suffix, "/",
          sel_pipeline_name, "/"
        )
      } |>
        here::here()
    }

  # Optionally wipe the store so all targets are rebuilt from scratch.
  # TAR_ASK is set to "false" for the duration of tar_destroy() to
  # suppress the interactive confirmation prompt, allowing agents and
  # unattended scripts to run without user input.
  if (
    isTRUE(fresh_run)
  ) {
    withr::with_envvar(
      new = c(TAR_ASK = "false"),
      code = targets::tar_destroy(
        destroy = "all",
        store = sel_store_path
      )
    )
  }

  # Run the pipeline. Wrapped in try() so that a failed pipeline still
  # allows the progress visualisation below to be produced.
  try(
    targets::tar_make(
      script = sel_script_path,
      store = sel_store_path,
      reporter = "verbose"
    ),
    silent = FALSE
  )

  # Optionally render the pipeline network graph showing target status.
  if (
    isTRUE(plot_progress)
  ) {
    save_progress_visualisation(
      sel_script = sel_script_path,
      sel_store = sel_store_path,
      level_separation = level_separation
    )
  }
}
#' @title Make Environmental Formula
#' @description
#' Creates a formula for environmental variables from abiotic data. If an
#' 'age' column is present and `use_age = TRUE`, creates interaction terms
#' between age and other variables. When `use_age = FALSE`, age is excluded
#' from the formula even if the column exists in `data`.
#' @param data
#' A data frame containing abiotic environmental variables. Must have at
#' least one column and one row.
#' @param use_age
#' Logical scalar. If `TRUE` (default) and an 'age' column is present in
#' `data`, produces an interaction formula `~ (var1 + var2 + ...) * age - age`.
#' If `FALSE`, age is stripped from the formula regardless of its presence
#' in `data`, producing a simple additive formula `~ var1 + var2 + ...`.
#' @return
#' A formula object suitable for modeling. If 'age' is present and
#' `use_age = TRUE`, returns a formula with interaction terms (age *
#' variables). Otherwise, returns a simple additive formula.
#' @details
#' The function constructs different formulas based on the presence of an
#' 'age' column and the `use_age` flag:
#' - With age column and `use_age = TRUE`: ~ (var1 + var2 + ...) * age - age
#' - Without age column, or `use_age = FALSE`: ~ var1 + var2 + ...
#' The `- age` term removes the standalone age main effect, keeping only
#' its interactions with the other predictors.
#' @seealso [check_and_prepare_data_for_fit()]
#' @export
make_env_formula <- function(data, use_age = TRUE) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    ncol(data) > 0,
    msg = "data must have at least one column"
  )

  assertthat::assert_that(
    nrow(data) > 0,
    msg = "data must have at least one row"
  )

  assertthat::assert_that(
    assertthat::is.flag(use_age) && !is.na(use_age),
    msg = "use_age must be a single logical value (TRUE or FALSE)"
  )

  vec_names <- colnames(data)

  # Age interactions are built only when the column exists AND the
  # caller opted in via use_age.
  is_age_present <- "age" %in% vec_names && isTRUE(use_age)

  # The age column itself is never a standalone predictor; strip it
  # once here instead of duplicating the logic in both branches.
  vec_predictors <- vec_names[!vec_names %in% c("age")]

  if (
    isTRUE(is_age_present)
  ) {
    assertthat::assert_that(
      length(vec_predictors) > 0,
      msg = "data must have at least one column other than 'age' when 'age' is present"
    )

    # Expected shape for 2 bio variables: ~ (bio1 + bio12) * age - age
    formula_text <-
      paste0(
        " ~ ",
        " (",
        paste0(vec_predictors, collapse = " + "),
        ") * age - age"
      )
  } else {
    assertthat::assert_that(
      length(vec_predictors) > 0,
      msg = "data must have at least one column other than 'age'"
    )

    formula_text <-
      paste0(
        " ~ ",
        paste0(vec_predictors, collapse = " + ")
      )
  }

  # Namespaced call: bare as.formula() relies on the stats package
  # being attached, which is not guaranteed in package context.
  res <- stats::as.formula(formula_text)

  return(res)
}
#' @title Check Presence of VegVault File
#' @description
#' Checks whether the `VegVault.sqlite` file exists in the specified directory.
#' @param relative_path
#' Relative path to the `VegVault.sqlite` file (default: "Data/Input/VegVault.sqlite").
#' @return
#' `TRUE` when the file exists. Stops with an error if the file is not
#' found.
#' @details
#' Verifies the presence of the `VegVault.sqlite` file under the project
#' root (via `here::here()`). If not found, throws an error with
#' instructions to consult the `Data/Input/README.md` file.
#' @export
check_presence_of_vegvault <- function(relative_path = "Data/Input/VegVault.sqlite") {
  # TRUE when the SQLite database exists at the project-relative path.
  vegvault_present <-
    file.exists(
      here::here(relative_path)
    )

  if (
    isFALSE(vegvault_present)
  ) {
    # Fix: the original had an unreachable return(vegvault_present)
    # immediately after this stop() call (dead code); it is removed.
    stop(
      paste(
        "The VegVault.sqlite file is not present in",
        "the `Data/Input/` directory.",
        "Please read the `Data/Input/README.md` file for more information."
      )
    )
  }

  return(vegvault_present)
}
#' @title Compute Bipartite Network Metrics from Community Data
#' @description
#' Takes the community matrix from a `data_to_fit` list, binarizes it
#' (any value > 0 becomes 1), and computes whole-network metrics with
#' `bipartite::networklevel()`, returning a tidy tibble with one row
#' per metric.
#' @param data_to_fit
#' A named list as produced by `assemble_data_to_fit()`. Must hold the
#' element `data_community_to_fit`, a numeric matrix with samples as
#' rows and taxa as columns.
#' @param vec_indices
#' Character vector of network-level indices, forwarded to the `index`
#' argument of `bipartite::networklevel()`. Default:
#' `c("connectance", "nestedness", "modularity")`.
#' @return
#' A `tibble` with columns `metric` (character, metric name) and
#' `value` (numeric, computed value).
#' @details
#' The community matrix is treated as a lower-level (samples) x
#' upper-level (taxa) bipartite network. Only presence/absence is
#' used: every value > 0 becomes 1 before the bipartite call. An
#' error is raised when the binarized matrix has no positive entries.
#' @seealso [assemble_data_to_fit()],
#' [binarize_community_data()]
#' @export
compute_network_metrics <- function(
    data_to_fit = NULL,
    vec_indices = c("connectance", "nestedness", "modularity")) {
  assertthat::assert_that(
    is.list(data_to_fit),
    msg = "data_to_fit must be a list"
  )

  assertthat::assert_that(
    "data_community_to_fit" %in% names(data_to_fit),
    msg = "data_to_fit must contain an element named 'data_community_to_fit'"
  )

  data_community_matrix <-
    purrr::chuck(data_to_fit, "data_community_to_fit")

  assertthat::assert_that(
    is.matrix(data_community_matrix),
    msg = "data_to_fit$data_community_to_fit must be a matrix"
  )

  assertthat::assert_that(
    is.numeric(data_community_matrix),
    msg = "data_to_fit$data_community_to_fit must be a numeric matrix"
  )

  assertthat::assert_that(
    is.character(vec_indices),
    msg = "vec_indices must be a character vector"
  )

  assertthat::assert_that(
    length(vec_indices) >= 1L,
    msg = "vec_indices must contain at least one index name"
  )

  # Presence/absence incidence matrix: any positive abundance -> 1.
  mat_incidence <- 1L * (data_community_matrix > 0)

  assertthat::assert_that(
    sum(mat_incidence) > 0,
    msg = paste(
      "The binarized community matrix contains no positive",
      "entries; cannot compute network metrics."
    )
  )

  # Whole-network indices; result is a named numeric vector.
  vec_metrics <-
    bipartite::networklevel(
      web = mat_incidence,
      index = vec_indices
    )

  # Tidy long format: one row per metric.
  res <-
    tibble::tibble(
      metric = names(vec_metrics),
      value = as.numeric(vec_metrics)
    )

  return(res)
}
#' @title Filter Community Matrix by Minimum Number of Taxa
#' @description
#' Guard against fitting a joint species distribution model on data
#' with too few taxa. The matrix is passed through unchanged when it
#' has at least `min_n_taxa` columns; otherwise execution stops with
#' an informative error, avoiding wasteful model fits and meaningless
#' species-species associations from near-empty communities.
#' @param data_community_matrix
#' Numeric matrix, samples as rows and taxa as columns, as produced
#' by `filter_constant_taxa()`.
#' @param min_n_taxa
#' Single positive integer: the minimum taxon (column) count needed
#' to proceed with model fitting. Default is 5.
#' @return
#' The unchanged input matrix when
#' `ncol(data_community_matrix) >= min_n_taxa`.
#' @details
#' Columns are counted after all upstream taxon-level filtering
#' (`filter_rare_taxa()`, `filter_community_by_n_cores()`,
#' `filter_by_n_samples()`, `filter_constant_taxa()`). Below the
#' threshold, `cli::cli_abort()` reports both the actual count and
#' the threshold so the user can adjust the configuration or data.
#' @seealso [filter_constant_taxa()], [assemble_data_to_fit()]
#' @export
filter_community_by_n_taxa <- function(
    data_community_matrix = NULL,
    min_n_taxa = 5) {
  assertthat::assert_that(
    is.matrix(data_community_matrix),
    msg = "data_community_matrix must be a matrix"
  )

  assertthat::assert_that(
    is.numeric(min_n_taxa) && length(min_n_taxa) == 1,
    msg = "min_n_taxa must be a single numeric value"
  )

  assertthat::assert_that(
    min_n_taxa >= 1,
    msg = "min_n_taxa must be greater than or equal to 1"
  )

  # n_taxa is interpolated into the abort message below via glue.
  n_taxa <- ncol(data_community_matrix)

  # Guard clause: abort early when the community is too small to model.
  if (n_taxa < min_n_taxa) {
    cli::cli_abort(
      c(
        "Too few taxa remain after filtering to run the model.",
        "i" = "Found {n_taxa} taxa but at least {min_n_taxa} are required.",
        "i" = "Adjust `min_n_taxa` in the configuration or review upstream filtering steps."
      )
    )
  }

  return(data_community_matrix)
}
#' @title Filter Non-Plantae Taxa
#' @description
#' Drops all taxa that are not classified as kingdom Plantae from a
#' community data frame, using a classification table to resolve each
#' taxon's kingdom.
#' @param data
#' Community data frame holding at minimum a 'taxon' column.
#' @param data_classification_table
#' Data frame mapping taxon names to kingdoms via the columns
#' 'sel_name' and 'kingdom'.
#' @return
#' A data frame with the same structure as `data`, minus all rows of
#' non-Plantae taxa. Taxa whose kingdom is `NA` (unclassifiable) are
#' removed as well.
#' @details
#' A left join of `data` onto `data_classification_table`
#' ('taxon' == 'sel_name') attaches each taxon's kingdom. Any taxon
#' whose kingdom is not exactly "Plantae" - including `NA` - is
#' treated as non-plant and removed. When taxa are dropped,
#' `cli::cli_warn()` reports the count and the removed names.
#' Note: the upstream `get_taxa_classification()` already restricts
#' the taxospace lookup to Plantae, so in practice this function
#' mostly catches genuinely unclassifiable taxa (absent from every
#' classification source).
#' @seealso [classify_taxonomic_resolution()], [filter_rare_taxa()]
#' @export
filter_non_plantae_taxa <- function(data, data_classification_table) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    "taxon" %in% colnames(data),
    msg = "data must contain a 'taxon' column"
  )

  assertthat::assert_that(
    is.data.frame(data_classification_table),
    msg = "data_classification_table must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("sel_name", "kingdom") %in%
        colnames(data_classification_table)
    ),
    msg = paste(
      "data_classification_table must contain",
      "columns: 'sel_name' and 'kingdom'"
    )
  )

  # Attach each taxon's kingdom; unmatched taxa get kingdom = NA.
  data_joined <-
    dplyr::left_join(
      data,
      dplyr::select(data_classification_table, sel_name, kingdom),
      by = dplyr::join_by("taxon" == "sel_name")
    )

  # Names interpolated into the cli warning below must keep this name.
  vec_dropped_taxa <-
    data_joined |>
    dplyr::filter(
      is.na(kingdom) | kingdom != "Plantae"
    ) |>
    dplyr::distinct(taxon) |>
    dplyr::pull(taxon)

  if (length(vec_dropped_taxa) > 0) {
    cli::cli_warn(
      c(
        "!" = "{base::length(vec_dropped_taxa)} taxon/taxa {?was/were} removed as non-Plantae or unclassified.",
        "i" = "Removed: {.val {vec_dropped_taxa}}"
      )
    )
  }

  # Keep only confirmed Plantae rows and drop the helper column.
  res <-
    data_joined |>
    dplyr::filter(
      !is.na(kingdom) & kingdom == "Plantae"
    ) |>
    dplyr::select(-kingdom)

  return(res)
}
| 1 |
#' @title Get and Cache a CHELSA-TraCE21k Raster |
|
| 2 |
#' @description |
|
| 3 |
#' Downloads a cropped CHELSA-TraCE21k raster for a given |
|
| 4 |
#' bioclim variable and age slice. If a cached `.tif` already |
|
| 5 |
#' exists in `cache_dir` it is returned immediately without |
|
| 6 |
#' re-downloading. Absolute-temperature variables (`bio1`, |
|
| 7 |
#' `bio6`) are corrected from Kelvin to degrees Celsius |
|
| 8 |
#' (subtract 273.15) before the raster is written to cache. |
|
| 9 |
#' @param chelsa_var |
|
| 10 |
#' Character scalar. Project-level bioclim variable name, |
|
| 11 |
#' e.g. `"bio1"`, `"bio4"`, `"bio12"`. Single-digit numbers |
|
| 12 |
#' are zero-padded internally to match CHELSA file names |
|
| 13 |
#' (`"bio1"` becomes `"bio01"` in the URL). |
|
| 14 |
#' @param age |
|
| 15 |
#' Numeric or integer scalar. Age in years BP used to encode |
|
| 16 |
#' the CHELSA-TraCE21k time step (e.g. `1000` encodes as |
|
| 17 |
#' time step `"-010"`). |
|
| 18 |
#' @param x_lim |
|
| 19 |
#' Numeric vector of length 2. Longitude extent |
|
| 20 |
#' `c(min, max)` for cropping the downloaded raster. |
|
| 21 |
#' @param y_lim |
|
| 22 |
#' Numeric vector of length 2. Latitude extent |
|
| 23 |
#' `c(min, max)` for cropping the downloaded raster. |
|
| 24 |
#' @param cache_dir |
|
| 25 |
#' Character scalar. Path to the directory where the cropped |
|
| 26 |
#' raster is cached as `{chelsa_var}_{age}_{xmin}_{xmax}_{ymin}_{ymax}.tif`.
|
|
| 27 |
#' The extent values are rounded to 2 decimal places so the cache |
|
| 28 |
#' is shared across calls with identical extents. The directory must |
|
| 29 |
#' already exist before calling this function. |
|
| 30 |
#' @return |
|
| 31 |
#' A `terra::SpatRaster` cropped to `x_lim` / `y_lim`, with |
|
| 32 |
#' corrected units: degrees Celsius for `bio1` and `bio6`; |
|
| 33 |
#' original CHELSA units for all other variables. |
|
| 34 |
#' @details |
|
| 35 |
#' CHELSA-TraCE21k time steps: `age = 0` (present) maps to |
|
| 36 |
#' the special step `"0000"`; all other ages use |
|
| 37 |
#' `sprintf("-%03d", age %/% 100)`, so `age = 100` gives
|
|
| 38 |
#' `"-001"` and `age = 1000` gives `"-010"`. |
|
| 39 |
#' |
|
| 40 |
#' The remote raster is accessed via GDAL `/vsicurl/`, so an |
|
| 41 |
#' internet connection is required the first time each |
|
| 42 |
#' `(chelsa_var, age, x_lim, y_lim)` combination is requested. |
|
| 43 |
#' Subsequent calls load from the cached `.tif` and need no |
|
| 44 |
#' connection. The geographic extent (rounded to 2 decimal |
|
| 45 |
#' places) is embedded in the cache filename, so rasters with |
|
| 46 |
#' different extents are always stored separately. |
|
| 47 |
#' |
|
| 48 |
#' Kelvin correction: `bio1` (Mean Annual Temperature) and |
|
| 49 |
#' `bio6` (Min Temperature of Coldest Month) are absolute |
|
| 50 |
#' temperatures stored in Kelvin; 273.15 is subtracted before |
|
| 51 |
#' caching. Range and seasonality variables (`bio4`, `bio7`) |
|
| 52 |
#' are differences or standard deviations — no offset needed. |
|
| 53 |
#' @seealso |
|
| 54 |
#' [interpolate_mev_to_grid()], |
|
| 55 |
#' [interpolate_st_mev_to_grid()], |
|
| 56 |
#' [project_coords_to_metric()] |
|
| 57 |
#' @export |
|
| 58 |
get_chelsa_raster <- function(
    chelsa_var = NULL,
    age = NULL,
    x_lim = NULL,
    y_lim = NULL,
    cache_dir = NULL) {
  # Validate arguments ----
  assertthat::assert_that(
    assertthat::is.string(chelsa_var),
    msg = "chelsa_var must be a single character string"
  )

  assertthat::assert_that(
    (is.numeric(age) || is.integer(age)) && length(age) == 1L,
    msg = "age must be a single numeric or integer value"
  )

  assertthat::assert_that(
    is.numeric(x_lim) && length(x_lim) == 2L,
    msg = "x_lim must be a numeric vector of length 2"
  )

  assertthat::assert_that(
    is.numeric(y_lim) && length(y_lim) == 2L,
    msg = "y_lim must be a numeric vector of length 2"
  )

  assertthat::assert_that(
    assertthat::is.string(cache_dir),
    base::dir.exists(cache_dir),
    msg = "cache_dir must be a string path to an existing directory"
  )

  # 1. Translate the project variable name to CHELSA's naming ----
  # Single-digit bio numbers are zero-padded: "bio1" -> "bio01"
  var_name_padded <-
    stringr::str_replace(chelsa_var, "^bio(\\d)$", "bio0\\1")

  # 2. Assemble the cache file path ----
  # The rounded extent is embedded in the filename so crops of
  # different geographic areas never collide in the cache.
  extent_tag <-
    base::paste(
      base::round(base::min(x_lim), 2L),
      base::round(base::max(x_lim), 2L),
      base::round(base::min(y_lim), 2L),
      base::round(base::max(y_lim), 2L),
      sep = "_"
    )

  cache_file <-
    base::file.path(
      cache_dir,
      base::paste0(
        chelsa_var, "_", age, "_", extent_tag, ".tif"
      )
    )

  # 3. Short-circuit: serve from cache when available ----
  if (base::file.exists(cache_file)) {
    return(terra::rast(cache_file))
  }

  # 4. Fetch and crop the remote CHELSA-TraCE21k raster ----
  url_base <-
    base::paste0(
      "/vsicurl/https://os.zhdk.cloud.switch.ch/",
      "chelsa01/chelsa_trace21k/global/bioclim/"
    )

  # Present day (age 0) uses the special step "0000";
  # all other ages encode as sprintf("-%03d", age %/% 100).
  age_int <- base::as.integer(age)

  time_step <-
    if (age_int == 0L) {
      "0000"
    } else {
      base::sprintf(
        "-%03d",
        age_int %/% 100L
      )
    }

  url_raster <-
    base::paste0(
      url_base,
      var_name_padded, "/",
      "CHELSA_TraCE21k_",
      var_name_padded, "_",
      time_step,
      "_V.1.0.tif"
    )

  crop_extent <-
    terra::ext(
      base::min(x_lim), base::max(x_lim),
      base::min(y_lim), base::max(y_lim)
    )

  rast_cropped <-
    terra::rast(url_raster) |>
    terra::crop(y = crop_extent)

  # 5. Kelvin -> Celsius correction where needed ----
  # bio1 (mean annual temp) and bio6 (min temp coldest month) are
  # absolute temperatures stored in Kelvin; bio4 (temp seasonality)
  # is a std dev — no offset needed.
  vec_kelvin_vars <- c("bio1", "bio6")

  rast_out <-
    if (chelsa_var %in% vec_kelvin_vars) {
      rast_cropped - 273.15
    } else {
      rast_cropped
    }

  # 6. Persist to cache, then return ----
  terra::writeRaster(rast_out, cache_file, overwrite = TRUE)

  return(rast_out)
}
| 1 |
#' @title Get Spatial Model Fitting Parameters from Grid Catalogue |
|
| 2 |
#' @description |
|
| 3 |
#' Retrieves the model fitting parameters for a given spatial unit ID |
|
| 4 |
#' from the project's spatial grid CSV catalogue. |
|
| 5 |
#' @param scale_id |
|
| 6 |
#' A single character string identifying the spatial unit. |
|
| 7 |
#' Must match exactly one row in the catalogue file. |
|
| 8 |
#' @param file |
|
| 9 |
#' Path to the spatial grid CSV file. |
|
| 10 |
#' Default: `here::here("Data/Input/spatial_grid.csv")`.
|
|
| 11 |
#' @return |
|
| 12 |
#' A named list with five elements: |
|
| 13 |
#' \describe{
|
|
| 14 |
#' \item{`n_iter`}{Integer. Number of training iterations.}
|
|
| 15 |
#' \item{`n_step_size`}{
|
|
| 16 |
#' Integer or `NULL`. SGD mini-batch size. |
|
| 17 |
#' `NULL` means auto (10 \% of sites), corresponding to |
|
| 18 |
#' an `NA` value in the CSV. |
|
| 19 |
#' } |
|
| 20 |
#' \item{`n_sampling`}{
|
|
| 21 |
#' Integer. Monte Carlo samples per epoch. |
|
| 22 |
#' } |
|
| 23 |
#' \item{`n_samples_anova`}{
|
|
| 24 |
#' Integer. Monte Carlo samples for ANOVA |
|
| 25 |
#' variation partitioning. |
|
| 26 |
#' } |
|
| 27 |
#' \item{`n_early_stopping`}{
|
|
| 28 |
#' Integer or `NULL`. Early stopping patience — number of epochs |
|
| 29 |
#' without loss improvement before training halts. |
|
| 30 |
#' `NULL` means auto (20 \% of `iter`), corresponding to |
|
| 31 |
#' an `NA` value in the CSV. |
|
| 32 |
#' Passed as the `n_early_stopping` argument of |
|
| 33 |
#' `fit_jsdm_model()`. |
|
| 34 |
#' } |
|
| 35 |
#' } |
|
| 36 |
#' @details |
|
| 37 |
#' Reads the CSV using `readr::read_csv`, filters to the row whose |
|
| 38 |
#' `scale_id` column matches the supplied `scale_id` argument, and |
|
| 39 |
#' constructs the parameter list. Validation ensures the file is |
|
| 40 |
#' readable, has a `.csv` extension, contains the required columns, |
|
| 41 |
#' and that exactly one row matches the requested `scale_id`. |
|
| 42 |
#' `NA` values in the `n_step_size` and `n_early_stopping` columns are |
|
| 43 |
#' converted to `NULL`. |
|
| 44 |
#' @seealso get_spatial_window, get_active_config |
|
| 45 |
#' @export |
|
| 46 |
get_spatial_model_params <- function(
    scale_id,
    file = here::here("Data/Input/spatial_grid.csv")) {
  assertthat::assert_that(
    is.character(scale_id) && length(scale_id) == 1,
    msg = paste0(
      "`scale_id` must be a single character string.",
      " Got length: ", length(scale_id)
    )
  )

  assertthat::assert_that(
    assertthat::is.readable(file) &&
      assertthat::has_extension(file, "csv"),
    msg = "`file` must be a readable CSV file."
  )

  # Load the full spatial grid catalogue
  data_catalogue <-
    readr::read_csv(
      file = file,
      show_col_types = FALSE
    )

  vec_required_cols <-
    c(
      "scale_id",
      "n_iter",
      "n_step_size",
      "n_sampling",
      "n_samples_anova",
      "n_early_stopping"
    )

  assertthat::assert_that(
    base::all(vec_required_cols %in% base::names(data_catalogue)),
    msg = paste0(
      "`file` must contain columns: ",
      base::paste(vec_required_cols, collapse = ", "), "."
    )
  )

  # Keep only the row describing the requested spatial unit
  # (.data/.env disambiguate the column from the function argument)
  data_match <-
    dplyr::filter(
      data_catalogue,
      .data$scale_id == .env$scale_id
    )

  assertthat::assert_that(
    base::nrow(data_match) == 1,
    msg = paste0(
      "Expected exactly 1 row for scale_id '", scale_id, "'.",
      " Found: ", base::nrow(data_match)
    )
  )

  # An NA in the CSV means "auto" downstream, which is encoded as NULL
  convert_na_to_null <- function(value) {
    if (base::is.na(value)) {
      NULL
    } else {
      value
    }
  }

  res <-
    list(
      n_iter = dplyr::pull(data_match, n_iter),
      n_step_size = convert_na_to_null(
        dplyr::pull(data_match, n_step_size)
      ),
      n_sampling = dplyr::pull(data_match, n_sampling),
      n_samples_anova = dplyr::pull(data_match, n_samples_anova),
      n_early_stopping = convert_na_to_null(
        dplyr::pull(data_match, n_early_stopping)
      )
    )

  return(res)
}
| 1 |
#' @title Get Taxa Classification |
|
| 2 |
#' @description |
|
| 3 |
#' Retrieves taxonomic classification for a vector of taxa using the |
|
| 4 |
#' taxospace package. Filters results to include only plant taxa. |
|
| 5 |
#' @param data A character vector of taxon names to classify. |
|
| 6 |
#' @return A data frame with columns for selected name, taxonomic name, |
|
| 7 |
#' rank, and id. |
|
| 8 |
#' @details |
|
| 9 |
#' Uses taxospace::get_classification to retrieve classification. Flags and |
|
| 10 |
#' filters for plant taxa (kingdom Plantae). Returns an empty tibble if no |
|
| 11 |
#' plant taxa found. |
|
| 12 |
#' @export |
|
| 13 |
get_taxa_classification <- function(data) {
  # Fail fast with a clear error if the classification backend is
  # missing. (The previous `require()` call would only warn and
  # return FALSE, deferring the failure to the first taxospace call.)
  assertthat::assert_that(
    base::requireNamespace("taxospace", quietly = TRUE),
    msg = "The 'taxospace' package must be installed"
  )

  assertthat::assert_that(
    is.character(data) && length(data) > 0,
    msg = "data must be a non-empty character vector"
  )

  # Zero-row tibble returned whenever no (plant) classification is
  # found. All columns must be empty: mixing a length-n `sel_name`
  # with zero-length columns is a tibble size error at runtime.
  make_empty_result <- function() {
    tibble::tibble(
      sel_name = character(),
      name = character(),
      rank = character(),
      id = integer()
    )
  }

  res_classification <-
    taxospace::get_classification(
      taxa_vec = data,
      # this is done so that the best match is returned
      # even if the result is not flagged as "exact"
      use_only_exact_match = FALSE
    )

  # No classification column at all -> nothing could be classified
  if (
    !"classification" %in% names(res_classification)
  ) {
    return(make_empty_result())
  }

  res_plant <-
    res_classification %>%
    # flag taxa whose classification lists kingdom Plantae
    dplyr::mutate(
      is_plant = purrr::map_lgl(
        .x = classification,
        .f = ~ .x %>%
          dplyr::filter(rank == "kingdom") %>%
          dplyr::pull(name) %>%
          stringr::str_detect("Plantae") %>%
          any()
      )
    ) %>%
    # filter only plant taxa
    dplyr::filter(is_plant)

  if (
    isTRUE(nrow(res_plant) == 0)
  ) {
    return(make_empty_result())
  }

  # Unnest the per-taxon classification tables into one long tibble
  res_plant %>%
    dplyr::select(sel_name, classification) %>%
    tidyr::unnest(classification)
}
| 1 |
#' @title Project Geographic Coordinates to Metric (km) |
|
| 2 |
#' @description |
|
| 3 |
#' Converts WGS84 lon/lat coordinates to planar metric |
|
| 4 |
#' coordinates in kilometres using a user-specified equal-area |
|
| 5 |
#' projection. The original lon/lat columns are retained |
|
| 6 |
#' alongside the projected `coord_x_km` and `coord_y_km` |
|
| 7 |
#' columns. |
|
| 8 |
#' @param data_coords |
|
| 9 |
#' A data frame with `dataset_name` as row names and columns |
|
| 10 |
#' `coord_long` and `coord_lat` (decimal degrees, WGS84), as |
|
| 11 |
#' returned by `get_coords()`. |
|
| 12 |
#' @param target_crs |
|
| 13 |
#' An integer EPSG code for the target projected coordinate |
|
| 14 |
#' reference system. Must be a metric (planar) equal-area |
|
| 15 |
#' CRS suitable for the geographic extent of the data. |
|
| 16 |
#' Defaults to `3035L` (ETRS89 Lambert Azimuthal Equal-Area, |
|
| 17 |
#' the standard for European analyses). For other regions, |
|
| 18 |
#' choose an appropriate equal-area projection, e.g.: |
|
| 19 |
#' * `6933L` ā EASE-Grid 2.0 (global) |
|
| 20 |
#' * `5070L` ā NAD83 Conus Albers (North America) |
|
| 21 |
#' * `102022L` ā Africa Albers Equal Area Conic (Africa) |
|
| 22 |
#' @return |
|
| 23 |
#' A data frame with the same row names and columns as |
|
| 24 |
#' `data_coords`, plus two additional columns: |
|
| 25 |
#' \describe{
|
|
| 26 |
#' \item{`coord_x_km`}{Easting in kilometres
|
|
| 27 |
#' (target CRS).} |
|
| 28 |
#' \item{`coord_y_km`}{Northing in kilometres
|
|
| 29 |
#' (target CRS).} |
|
| 30 |
#' } |
|
| 31 |
#' @details |
|
| 32 |
#' Using metric coordinates instead of degrees is necessary |
|
| 33 |
#' for distance-based spatial modelling (e.g., Moran |
|
| 34 |
#' eigenvectors) because degree distances are not uniform |
|
| 35 |
#' across latitudes. The `target_crs` default of EPSG:3035 |
|
| 36 |
#' is the standard equal-area projection for European |
|
| 37 |
#' spatial analyses; change it for non-European study |
|
| 38 |
#' regions. |
|
| 39 |
#' |
|
| 40 |
#' The function requires the \pkg{sf} package to perform
|
|
| 41 |
#' coordinate transformation. Dataset names (row names) are |
|
| 42 |
#' preserved unchanged in the output. |
|
| 43 |
#' @seealso [get_coords()], [prepare_spatial_predictors_for_fit()] |
|
| 44 |
#' @export |
|
| 45 |
project_coords_to_metric <- function(
    data_coords = NULL,
    target_crs = 3035L) {
  # Validate inputs ----
  assertthat::assert_that(
    is.data.frame(data_coords),
    msg = "data_coords must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("coord_long", "coord_lat") %in% names(data_coords)
    ),
    msg = paste0(
      "data_coords must contain columns",
      " 'coord_long' and 'coord_lat'"
    )
  )

  assertthat::assert_that(
    nrow(data_coords) > 0,
    msg = "data_coords must have at least one row"
  )

  assertthat::assert_that(
    is.numeric(target_crs) || is.integer(target_crs),
    length(target_crs) == 1,
    !is.na(target_crs),
    target_crs > 0,
    msg = "target_crs must be a single positive integer EPSG code"
  )

  target_crs <- base::as.integer(target_crs)

  # Remember dataset names so the output keeps identical row names
  vec_row_names <-
    rownames(data_coords)

  # Build sf points in WGS84, then reproject to the target CRS
  sf_points <-
    data_coords |>
    tibble::rownames_to_column("dataset_name") |>
    sf::st_as_sf(
      coords = c("coord_long", "coord_lat"),
      crs = 4326
    )

  # Planar coordinates in metres of the target CRS
  coords_matrix <-
    sf_points |>
    sf::st_transform(crs = target_crs) |>
    sf::st_coordinates()

  # Append km-scaled coordinates next to the original lon/lat columns
  res <-
    dplyr::mutate(
      data_coords,
      coord_x_km = coords_matrix[, "X"] / 1000,
      coord_y_km = coords_matrix[, "Y"] / 1000
    )

  rownames(res) <- vec_row_names

  return(res)
}
| 1 |
#' @title Calculate the Total Pollen Count for Each Sample |
|
| 2 |
#' @description |
|
| 3 |
#' This function computes the total pollen count for each sample |
|
| 4 |
#' in the provided dataset. |
|
| 5 |
#' @param data |
|
| 6 |
#' A data frame containing at least two columns: |
|
| 7 |
#' `sample_name` (the name or identifier of the sample) |
|
| 8 |
#' and `pollen_count` (the count of pollen for each observation). |
|
| 9 |
#' @return |
|
| 10 |
#' A data frame with two columns: `sample_name` and `pollen_sum`, |
|
| 11 |
#' where `pollen_sum` is the total pollen count |
|
| 12 |
#' for each sample. |
|
| 13 |
#' @details |
|
| 14 |
#' The function groups the data by `sample_name`, |
|
| 15 |
#' calculates the sum of `pollen_count` for each group, |
|
| 16 |
#' and removes any grouping structure before returning the result. |
|
| 17 |
#' Missing values (`NA`) in `pollen_count` are ignored. |
|
| 18 |
#' @export |
|
| 19 |
get_pollen_sum <- function(data) {
  # Total pollen count per sample; NA counts are ignored in the sum.
  data |>
    dplyr::group_by(sample_name) |>
    dplyr::summarize(
      pollen_sum = sum(pollen_count, na.rm = TRUE),
      .groups = "drop"
    )
}
| 1 |
#' @title Align Sample IDs Across Data Streams |
|
| 2 |
#' @description |
|
| 3 |
#' Computes the intersection of valid `(dataset_name, age)` pairs |
|
| 4 |
#' present in all three data sources: community (long format), |
|
| 5 |
#' abiotic (long format), and coordinate data. The returned table |
|
| 6 |
#' serves as the canonical sample index used by all subsequent |
|
| 7 |
#' data-preparation functions. |
|
| 8 |
#' @param data_community_long |
|
| 9 |
#' A data frame in long format containing at least the columns |
|
| 10 |
#' `dataset_name` and `age`. |
|
| 11 |
#' @param data_abiotic_long |
|
| 12 |
#' A data frame in long format containing at least the columns |
|
| 13 |
#' `dataset_name` and `age`. |
|
| 14 |
#' @param data_coords |
|
| 15 |
#' A data frame of spatial coordinates with `dataset_name` stored |
|
| 16 |
#' as row names and columns `coord_long` and `coord_lat`. |
|
| 17 |
#' @param subset_age |
|
| 18 |
#' Optional numeric vector specifying age(s) to retain. If `NULL` |
|
| 19 |
#' (default) all intersecting ages are kept. |
|
| 20 |
#' @return |
|
| 21 |
#' A data frame with columns `dataset_name` and `age`, containing |
|
| 22 |
#' only the `(dataset_name, age)` pairs present in all three |
|
| 23 |
#' inputs, arranged by `dataset_name` then `age`. When |
|
| 24 |
#' `subset_age` is supplied the result is further filtered to |
|
| 25 |
#' those ages. |
|
| 26 |
#' @details |
|
| 27 |
#' All three-way intersection logic is performed while the data |
|
| 28 |
#' remain in long (tidy) format, so no rowname parsing is needed. |
|
| 29 |
#' Passing a `subset_age` value here — rather than inside a |
|
| 30 |
#' downstream modelling function — makes the subsetting explicit |
|
| 31 |
#' and cacheable as a pipeline target. |
|
| 32 |
#' @seealso [prepare_community_for_fit()], |
|
| 33 |
#' [prepare_abiotic_for_fit()], [prepare_coords_for_fit()] |
|
| 34 |
#' @export |
|
| 35 |
align_sample_ids <- function(
    data_community_long = NULL,
    data_abiotic_long = NULL,
    data_coords = NULL,
    subset_age = NULL) {
  # Validate inputs ----
  assertthat::assert_that(
    is.data.frame(data_community_long),
    msg = "data_community_long must be a data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_abiotic_long),
    msg = "data_abiotic_long must be a data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_coords),
    msg = "data_coords must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "age") %in% names(data_community_long)),
    msg = paste0(
      "data_community_long must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  assertthat::assert_that(
    all(c("dataset_name", "age") %in% names(data_abiotic_long)),
    msg = paste0(
      "data_abiotic_long must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  # 1. Distinct (dataset_name, age) pairs per data source ----
  ids_community <-
    dplyr::distinct(data_community_long, dataset_name, age)

  ids_abiotic <-
    dplyr::distinct(data_abiotic_long, dataset_name, age)

  # Coordinates carry dataset_name as row names, not a column
  ids_coords <-
    data_coords |>
    tibble::rownames_to_column("dataset_name") |>
    dplyr::distinct(dataset_name)

  # 2. Keep only pairs present in all three sources ----
  res <-
    ids_community |>
    dplyr::inner_join(
      ids_abiotic,
      by = dplyr::join_by(dataset_name, age)
    ) |>
    dplyr::inner_join(
      ids_coords,
      by = dplyr::join_by(dataset_name)
    ) |>
    dplyr::distinct() |>
    dplyr::arrange(dataset_name, age)

  # 3. Optional explicit age subset ----
  if (
    !is.null(subset_age)
  ) {
    assertthat::assert_that(
      is.numeric(subset_age),
      msg = "subset_age must be numeric"
    )

    res <-
      dplyr::filter(res, age %in% subset_age)
  }

  return(res)
}
| 1 |
#' @title Extract ANOVA Fraction Components from a Single Object |
|
| 2 |
#' @description |
|
| 3 |
#' Extracts specified variance partitioning fractions from a |
|
| 4 |
#' single sjSDManova object, returning a two-column tibble with |
|
| 5 |
#' fraction codes and their Nagelkerke R² values. |
|
| 6 |
#' @param anova_object |
|
| 7 |
#' A single sjSDManova-like object containing a |
|
| 8 |
#' \code{$results} element with columns \code{models} and
|
|
| 9 |
#' \code{"R2 Nagelkerke"}. Must be a list.
|
|
| 10 |
#' @param vec_anova_fractions |
|
| 11 |
#' A non-empty character vector of fraction codes to retain |
|
| 12 |
#' (e.g. \code{c("F_A", "F_B", "F_S")}).
|
|
| 13 |
#' @param clamp_negative |
|
| 14 |
#' A single logical (default \code{TRUE}). If \code{TRUE},
|
|
| 15 |
#' negative Nagelkerke R² values are clamped to 0. |
|
| 16 |
#' @return |
|
| 17 |
#' A tibble with columns: |
|
| 18 |
#' \describe{
|
|
| 19 |
#' \item{component}{Character. Human-readable component label
|
|
| 20 |
#' (e.g. "Abiotic", "Associations", "Spatial").} |
|
| 21 |
#' \item{R2_Nagelkerke}{Numeric. Nagelkerke R² value for
|
|
| 22 |
#' each fraction, optionally clamped to [0, Inf).} |
|
| 23 |
#' } |
|
| 24 |
#' @details |
|
| 25 |
#' Accesses \code{anova_object$results} via
|
|
| 26 |
#' \code{purrr::chuck()}, filters rows whose \code{models}
|
|
| 27 |
#' value is in \code{vec_anova_fractions}, renames the
|
|
| 28 |
#' columns to \code{component} and \code{R2_Nagelkerke},
|
|
| 29 |
#' translates internal fraction codes to human-readable |
|
| 30 |
#' labels, and (when \code{clamp_negative = TRUE}) clamps
|
|
| 31 |
#' negative R² values to 0: |
|
| 32 |
#' \itemize{
|
|
| 33 |
#' \item F_A -> "Abiotic" |
|
| 34 |
#' \item F_B -> "Associations" |
|
| 35 |
#' \item F_S -> "Spatial" |
|
| 36 |
#' \item F_AB -> "Abiotic&Associations" |
|
| 37 |
#' \item F_AS -> "Abiotic&Spatial" |
|
| 38 |
#' \item F_BS -> "Associations&Spatial" |
|
| 39 |
#' \item F_ABS -> "Abiotic&Associations&Spatial" |
|
| 40 |
#' } |
|
| 41 |
#' @seealso [aggregate_anova_components()] |
|
| 42 |
#' @export |
|
| 43 |
extract_anova_fractions <- function(
    anova_object,
    vec_anova_fractions = c("F_A", "F_B", "F_S", "F_AB", "F_AS", "F_BS", "F_ABS"),
    clamp_negative = TRUE) {
  assertthat::assert_that(
    base::is.list(anova_object),
    msg = "'anova_object' must be a list."
  )

  assertthat::assert_that(
    base::is.character(vec_anova_fractions),
    msg = "'vec_anova_fractions' must be a character vector."
  )

  assertthat::assert_that(
    base::length(vec_anova_fractions) > 0,
    msg = "'vec_anova_fractions' must not be empty."
  )

  assertthat::assert_that(
    assertthat::is.flag(clamp_negative),
    msg = "'clamp_negative' must be a single logical value."
  )

  # Map internal fraction codes to human-readable labels
  # following the convention used in variance partitioning
  # plots
  vec_component_labels <-
    c(
      "F_A" = "Abiotic",
      "F_B" = "Associations",
      "F_S" = "Spatial",
      "F_AB" = "Abiotic&Associations",
      "F_AS" = "Abiotic&Spatial",
      "F_BS" = "Associations&Spatial",
      "F_ABS" = "Abiotic&Associations&Spatial"
    )

  res <-
    anova_object |>
    purrr::chuck("results") |>
    dplyr::filter(
      models %in% vec_anova_fractions
    ) |>
    dplyr::select(
      component = "models",
      R2_Nagelkerke = "R2 Nagelkerke"
    ) |>
    dplyr::mutate(
      # Replace internal codes with human-readable labels
      component = vec_component_labels[component]
    )

  # `clamp_negative` is a scalar flag, so clamp the whole column at
  # once with vectorised pmax() instead of evaluating the condition
  # per element inside purrr::map2_dbl() as the previous version did.
  if (clamp_negative) {
    res <-
      res |>
      dplyr::mutate(
        R2_Nagelkerke = pmax(R2_Nagelkerke, 0)
      )
  }

  return(res)
}
| 1 |
#' @title Filter Taxa by Minimum Number of Spatio-Temporal Samples |
|
| 2 |
#' @description |
|
| 3 |
#' Filters out taxa that are not present in a sufficient number of |
|
| 4 |
#' spatio-temporal samples (distinct dataset-age combinations). Only |
|
| 5 |
#' taxa occurring in at least `min_n_samples` distinct |
|
| 6 |
#' `(dataset_name, age)` combinations are retained. This removes |
|
| 7 |
#' taxa that are present in too few interpolated time steps to |
|
| 8 |
#' provide reliable co-occurrence signal. |
|
| 9 |
#' @param data |
|
| 10 |
#' A data frame containing community data in long format. Must include |
|
| 11 |
#' columns `taxon`, `dataset_name`, and `age`. |
|
| 12 |
#' @param min_n_samples |
|
| 13 |
#' A single positive integer specifying the minimum number of distinct |
|
| 14 |
#' spatio-temporal samples (dataset-age combinations) a taxon must |
|
| 15 |
#' appear in to be retained. Default is 1 (no filtering). |
|
| 16 |
#' @return |
|
| 17 |
#' A filtered data frame containing only taxa that appear in at least |
|
| 18 |
#' `min_n_samples` distinct spatio-temporal samples. All original |
|
| 19 |
#' columns are preserved. |
|
| 20 |
#' @details |
|
| 21 |
#' The function counts distinct `(dataset_name, age)` combinations per |
|
| 22 |
#' `taxon`. Taxa with fewer combinations than `min_n_samples` are |
|
| 23 |
#' removed. An error is raised if no taxa remain after filtering, which |
|
| 24 |
#' may indicate that `min_n_samples` is set too high. |
|
| 25 |
#' @seealso [filter_community_by_n_cores()], [filter_rare_taxa()], |
|
| 26 |
#' [select_n_taxa()] |
|
| 27 |
#' @export |
|
| 28 |
filter_by_n_samples <- function(
    data = NULL,
    min_n_samples = 1) {
  # Validate inputs ----
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(c("taxon", "dataset_name", "age") %in% names(data)),
    msg = paste(
      "data must contain columns:",
      paste(c("taxon", "dataset_name", "age"), collapse = ", ")
    )
  )

  assertthat::assert_that(
    is.numeric(min_n_samples) && length(min_n_samples) == 1,
    msg = "min_n_samples must be a single numeric value"
  )

  assertthat::assert_that(
    min_n_samples >= 1,
    msg = "min_n_samples must be greater than or equal to 1"
  )

  # Count in how many distinct (dataset_name, age) samples each
  # taxon occurs, then keep taxa meeting the threshold
  vec_taxa_to_keep <-
    data |>
    dplyr::distinct(taxon, dataset_name, age) |>
    dplyr::count(taxon, name = "n_samples") |>
    dplyr::filter(n_samples >= min_n_samples) |>
    dplyr::pull(taxon)

  res <-
    dplyr::filter(data, taxon %in% vec_taxa_to_keep)

  # Guard against an over-aggressive threshold wiping out the data
  assertthat::assert_that(
    nrow(res) > 0,
    msg = paste(
      "No taxa remain after filtering.",
      "The min_n_samples is too high."
    )
  )

  return(res)
}
#' @title Check sjSDM Model Convergence
#' @description
#' Assesses whether a fitted sjSDM model has converged by analysing
#' the training loss history stored in the model object.
#' @param mod_jsdm
#' A fitted sjSDM model object. Must be of class 'sjSDM'.
#' @return
#' A list with six elements:
#' - `linear_trend_slope`: Absolute slope of a linear trend fitted
#'     to the final 10% of loss values. Values < 0.01 indicate
#'     convergence.
#' - `median_diff`: Absolute difference between the median of the
#'     first and last 25% of the tail loss. Values < 1 indicate
#'     convergence.
#' - `convergence_plot`: A ggplot2 object showing the full loss
#'     history with a rolling mean smoother (window of up to 20
#'     epochs). A dashed orange vertical line marks the last epoch
#'     run when early stopping was triggered.
#' - `note`: A character string summarising the thresholds.
#' - `epochs_run`: Integer. The number of epochs actually run
#'     before early stopping halted training (or the full budget
#'     when early stopping was not triggered).
#' - `early_stopping_triggered`: Logical. `TRUE` when the model
#'     stopped before exhausting its epoch budget (i.e. trailing
#'     zeros were detected in `mod_jsdm$history`).
#' @details
#' The function uses the `history` component of the fitted sjSDM
#' model, which stores the per-epoch negative log-likelihood values
#' produced during gradient descent.
#'
#' sjSDM pre-allocates `history` to the full epoch budget. When
#' early stopping fires, the remaining trailing entries are zero.
#' This function detects those trailing zeros, truncates the vector
#' to only the epochs that were actually run, and sets
#' `early_stopping_triggered = TRUE`.
#'
#' Two diagnostic metrics are computed on the tail (final 10% of
#' epochs actually run):
#'
#' 1. **Linear trend slope** - A near-zero slope means the loss is
#'    no longer decreasing, indicating convergence. The recommended
#'    threshold is < 0.01.
#' 2. **Median difference** - A robust comparison of the first
#'    versus last quarter of the tail using medians (insensitive to
#'    spikes). The recommended threshold is < 1.
#'
#' The smoothing window for the convergence plot is capped at the
#' number of epochs actually run, so histories with 10-19 epochs
#' are handled without error.
#' @seealso [fit_jsdm_model()], [evaluate_jsdm()]
#' @export
check_convergence_jsdm <- function(mod_jsdm = NULL) {
  assertthat::assert_that(
    inherits(mod_jsdm, "sjSDM"),
    msg = "mod_jsdm must be of class 'sjSDM'"
  )

  loss_history <- mod_jsdm$history

  assertthat::assert_that(
    is.numeric(loss_history),
    msg = paste(
      "mod_jsdm$history must be a numeric vector of length >= 10.",
      "Did you fit the model with enough iterations?"
    )
  )

  # Strip trailing zeros left by early stopping.
  # sjSDM pre-allocates history to the full epoch budget; epochs not
  # reached remain 0. Metrics must be computed on real epochs only.
  n_budget <- base::length(loss_history)
  vec_nonzero_idx <- base::which(loss_history != 0)
  last_epoch <-
    if (base::length(vec_nonzero_idx) == 0L) {
      0L
    } else {
      base::max(vec_nonzero_idx)
    }
  early_stopping_triggered <- last_epoch < n_budget
  loss_history <- loss_history[base::seq_len(last_epoch)]

  assertthat::assert_that(
    length(loss_history) >= 10L,
    msg = paste(
      "mod_jsdm$history must be a numeric vector of length >= 10.",
      "Did you fit the model with enough iterations?"
    )
  )

  n <- length(loss_history)

  tail_loss <- loss_history[round(n * 0.9):n]

  # 1. Linear trend slope on the tail (should be ~ 0 if converged)
  linear_trend_slope <-
    stats::coef(
      stats::lm(tail_loss ~ seq_along(tail_loss))
    )[2] |>
    abs() |>
    as.numeric() |>
    round(2)

  # 2. Compare median of first vs last quarter of the tail
  # (robust to spikes, unlike range)
  q <- length(tail_loss)

  median_diff <-
    (
      median(tail_loss[1:round(q * 0.25)]) -
        median(tail_loss[round(q * 0.75):q])
    ) |>
    abs() |>
    as.numeric() |>
    round(2)

  # 3. Rolling mean smoother to visualise the trend without noise.
  # Cap the window at the number of epochs actually run: stats::filter()
  # errors when the filter is longer than the series, which would
  # happen for histories with 10-19 epochs under a fixed window of 20.
  smooth_window <- min(20L, n)
  loss_smooth <-
    stats::filter(
      loss_history,
      rep(1 / smooth_window, smooth_window),
      sides = 2
    )

  p_convergence <-
    dplyr::tibble(
      epoch = seq_along(loss_history),
      loss = loss_history,
      loss_smooth = as.numeric(loss_smooth)
    ) |>
    ggplot2::ggplot(
      ggplot2::aes(x = epoch)
    ) +
    ggplot2::geom_line(
      ggplot2::aes(y = loss),
      color = "grey80",
      linewidth = 0.5
    ) +
    ggplot2::geom_line(
      ggplot2::aes(y = loss_smooth),
      color = "red",
      linewidth = 1
    ) +
    ggplot2::labs(
      x = "Epoch",
      y = "Loss",
      title = "Model Convergence: Loss History"
    ) +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      plot.title = ggplot2::element_text(hjust = 0.5)
    )

  # Mark the early-stopping point with a dashed orange line
  if (early_stopping_triggered) {
    p_convergence <-
      p_convergence +
      ggplot2::geom_vline(
        xintercept = last_epoch,
        linetype = "dashed",
        color = "orange"
      )
  }

  return(
    list(
      linear_trend_slope = linear_trend_slope,
      median_diff = median_diff,
      convergence_plot = p_convergence,
      note = paste(
        "Linear trend slope on tail should be < 0.01.",
        "Median difference in tail should be < 1."
      ),
      epochs_run = base::as.integer(last_epoch),
      early_stopping_triggered = early_stopping_triggered
    )
  )
}
#' @title Get auxiliary classification table
#' @description
#' Reads the manually curated auxiliary classification table from a
#' CSV file. When the file does not yet exist, an empty tibble with
#' the required columns is returned so the rest of the pipeline can
#' continue and detect any missing taxa.
#' @param file_path
#' A length-1 character string giving the path to the CSV file.
#' Defaults to
#' `here::here("Data/Input/aux_classification_table.csv")`.
#' The file, if present, must contain a `sel_name` column. All
#' seven taxonomic rank columns (`kingdom`, `phylum`, `class`,
#' `order`, `family`, `genus`, `species`) are expected but
#' optional - any that are absent are filled with `NA_character_`.
#' @return
#' A tibble with columns `sel_name`, `kingdom`, `phylum`, `class`,
#' `order`, `family`, `genus`, and `species` (all character).
#' Returns an empty tibble when the file does not exist.
#' @details
#' Manual classifications in this file override automatic
#' classifications produced by `get_taxa_classification()`. When
#' the file exists it is validated to confirm `sel_name` is
#' present. Any of the seven rank columns that are absent are
#' filled with `NA_character_` rather than raising an error, so
#' that partial tables (e.g., those that only specify `family`,
#' `genus`, and `species`) continue to be accepted.
#' @seealso
#' [combine_classification_tables()],
#' [check_and_report_missing_taxa()]
#' @export
get_aux_classification_table <- function(
    file_path = here::here("Data/Input/aux_classification_table.csv")) {
  assertthat::assert_that(
    is.character(file_path) && length(file_path) == 1,
    msg = "file_path must be a single character string"
  )

  res_empty <-
    tibble::tibble(
      sel_name = character(0),
      kingdom = character(0),
      phylum = character(0),
      class = character(0),
      order = character(0),
      family = character(0),
      genus = character(0),
      species = character(0)
    )

  if (!file.exists(file_path)) {
    return(res_empty)
  }

  res_raw <-
    readr::read_csv(
      file_path,
      show_col_types = FALSE
    )

  assertthat::assert_that(
    "sel_name" %in% colnames(res_raw),
    msg = paste(
      "aux_classification_table.csv must contain",
      "a 'sel_name' column"
    )
  )

  vec_expected_cols <- c(
    "kingdom", "phylum", "class", "order",
    "family", "genus", "species"
  )

  # Fill any missing expected columns with NA so combining works.
  vec_missing_cols <-
    setdiff(vec_expected_cols, colnames(res_raw))

  for (col_missing in vec_missing_cols) {
    res_raw[[col_missing]] <- NA_character_
  }

  res <-
    res_raw |>
    dplyr::select(
      dplyr::all_of(c("sel_name", vec_expected_cols))
    ) |>
    dplyr::mutate(
      dplyr::across(
        dplyr::everything(),
        as.character
      )
    )

  return(res)
}
#' @title Get Abiotic Data
#' @description
#' Extracts abiotic data from a data frame containing nested abiotic
#' information.
#' @param data
#' A data frame. Must contain the columns `dataset_name` and `data_abiotic`.
#' @return
#' A data frame with columns `dataset_name`, `sample_name`,
#' `abiotic_variable_name`, and `abiotic_value`.
#' @details
#' Validates the input data frame, ensures required columns are present,
#' unnests the `data_abiotic` list-column, and keeps only the four
#' relevant columns.
#' @export
get_abiotic_data <- function(data = NULL) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "data_abiotic") %in% colnames(data)),
    msg = "data must contain columns 'dataset_name' and 'data_abiotic'"
  )

  res <-
    data |>
    dplyr::select(dataset_name, data_abiotic) |>
    tidyr::unnest(
      cols = c(data_abiotic)
    ) |>
    dplyr::select(
      dataset_name, sample_name, abiotic_variable_name, abiotic_value
    )

  return(res)
}
#' @title Add Age to Community Data
#' @description
#' Merges community data with age data based on dataset and sample names.
#' @param data_community
#' A data frame containing community data. Must include `dataset_name` and
#' `sample_name` columns.
#' @param data_ages
#' A data frame containing age data. Must include `dataset_name` and
#' `sample_name` columns.
#' @return
#' A data frame with community data merged with the corresponding age data.
#' @details
#' Performs a left join between community data and age data using
#' `dataset_name` and `sample_name` as keys, so every community row is
#' kept even when no matching age exists.
#' @export
add_age_to_samples <- function(data_community = NULL, data_ages = NULL) {
  assertthat::assert_that(
    is.data.frame(data_community),
    msg = "data_community must be a data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_ages),
    msg = "data_ages must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "sample_name") %in% colnames(data_community)),
    msg = "data_community must contain columns 'dataset_name' and 'sample_name'"
  )

  assertthat::assert_that(
    all(c("dataset_name", "sample_name") %in% colnames(data_ages)),
    msg = "data_ages must contain columns 'dataset_name' and 'sample_name'"
  )

  res <-
    dplyr::left_join(
      x = data_community,
      y = data_ages,
      by = c("dataset_name", "sample_name")
    )

  return(res)
}
#' @title Get ANOVA for sjSDM Model
#' @description
#' Computes the ANOVA decomposition for a fitted sjSDM model,
#' partitioning variance explained by environmental and spatial
#' components.
#' @param mod
#' A fitted model object of class `sjSDM`.
#' @param n_samples
#' Integer; number of Monte Carlo samples used for the variation
#' partitioning calculation. Larger values yield more stable
#' estimates but are slower. Defaults to `5000L`.
#' @param verbose
#' Logical; if `TRUE`, prints detailed output from the ANOVA
#' computation.
#' @return
#' An ANOVA result object as returned by `sjSDM:::anova.sjSDM()`,
#' containing variance partitioning across model components.
#' @details
#' The function wraps the internal `sjSDM:::anova.sjSDM()` method.
#' The `n_samples` argument controls how many Monte Carlo draws are
#' used to approximate the likelihoods in each variation partition.
#' For exploratory runs, a value of 1000 is sufficient; for
#' publication-quality results, 3000-5000 is recommended.
#' @seealso [fit_jsdm_model()]
#' @export
get_anova <- function(
    mod,
    n_samples = 5000L,
    verbose = FALSE) {
  assertthat::assert_that(
    inherits(mod, "sjSDM"),
    msg = "The model must be of class 'sjSDM'."
  )

  assertthat::assert_that(
    assertthat::is.count(n_samples),
    msg = "n_samples must be a single positive integer."
  )

  assertthat::assert_that(
    assertthat::is.flag(verbose),
    !is.na(verbose) && length(verbose) == 1L,
    msg = "verbose must be a single logical value (TRUE or FALSE)."
  )

  # Delegate to the (unexported) sjSDM ANOVA method.
  res <-
    sjSDM:::anova.sjSDM(
      mod,
      samples = n_samples,
      verbose = verbose
    )

  return(res)
}
#' @title Make Community Proportions
#' @description
#' Transforms community pollen count data into proportions, normalising
#' each sample by its total pollen count.
#' @param data
#' A data frame with columns `dataset_name`, `sample_name`, `taxon`,
#' `age`, and `pollen_count`. Must contain a `pollen_count` column with
#' raw pollen counts.
#' @return
#' A data frame with the same structure as the input, but with
#' `pollen_count` replaced by `pollen_prop` (pollen counts divided by
#' sample-level total). The `pollen_count` and `pollen_sum` columns are
#' dropped.
#' @details
#' Per-sample totals are computed with `get_pollen_sum()` and counts are
#' normalised via `transform_to_proportions()`. The result is suitable
#' for passing to `interpolate_community_data()`.
#' @seealso [interpolate_community_data()], [transform_to_proportions()]
#' @export
make_community_proportions <- function(data = NULL) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    "pollen_count" %in% colnames(data),
    msg = "data must contain a 'pollen_count' column"
  )

  res <-
    transform_to_proportions(
      data,
      pollen_sum = get_pollen_sum(data)
    )

  return(res)
}
#' @title Prepare Abiotic Data for Model Fitting
#' @description
#' Reshapes long-format abiotic data into a wide data frame
#' suitable for downstream scaling, filtering to the canonical
#' sample index supplied by `align_sample_ids()`.
#' @param data_abiotic_long
#' A data frame in long format with columns `dataset_name`, `age`,
#' `abiotic_variable_name`, and `abiotic_value`.
#' @param data_sample_ids
#' A data frame of valid `(dataset_name, age)` pairs as returned by
#' `align_sample_ids()`.
#' @return
#' A data frame in wide format with real columns `dataset_name`,
#' `age`, and one column per abiotic variable. Rows are ordered by
#' `dataset_name` then `age`, matching the order in
#' `data_sample_ids`. Missing variable-sample combinations are
#' left as `NA`. The `age` column is preserved as a plain numeric
#' column (not encoded in row names) to simplify downstream
#' scaling.
#' @details
#' Unlike the former `prepare_data_for_fit(type = "abiotic")`, this
#' function retains `dataset_name` and `age` as real columns so
#' that `scale_abiotic_for_fit()` can operate on them without
#' parsing row names. Row names are added by
#' `scale_abiotic_for_fit()` as part of the scaling step.
#' @seealso [align_sample_ids()], [scale_abiotic_for_fit()],
#' [assemble_data_to_fit()]
#' @export
prepare_abiotic_for_fit <- function(
    data_abiotic_long = NULL,
    data_sample_ids = NULL) {
  assertthat::assert_that(
    is.data.frame(data_abiotic_long),
    msg = "data_abiotic_long must be a data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_sample_ids),
    msg = "data_sample_ids must be a data frame"
  )

  assertthat::assert_that(
    all(
      c(
        "dataset_name", "age",
        "abiotic_variable_name", "abiotic_value"
      ) %in% names(data_abiotic_long)
    ),
    msg = paste0(
      "data_abiotic_long must contain columns 'dataset_name',",
      " 'age', 'abiotic_variable_name', 'abiotic_value'"
    )
  )

  assertthat::assert_that(
    all(c("dataset_name", "age") %in% names(data_sample_ids)),
    msg = paste0(
      "data_sample_ids must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  # Keep only samples present in the canonical index, in canonical order.
  data_filtered <-
    data_abiotic_long |>
    dplyr::inner_join(
      data_sample_ids,
      by = dplyr::join_by(dataset_name, age)
    ) |>
    dplyr::arrange(dataset_name, age)

  # Spread one column per abiotic variable; absent combinations stay NA.
  res <-
    data_filtered |>
    dplyr::select(
      dataset_name,
      age,
      abiotic_variable_name,
      abiotic_value
    ) |>
    tidyr::pivot_wider(
      names_from = "abiotic_variable_name",
      values_from = "abiotic_value",
      values_fill = NULL
    )

  return(res)
}
#' @title Scale Spatial Predictors for Model Fitting
#' @description
#' Centres and scales all spatial predictor columns, records
#' the scaling attributes for later back-transformation, and
#' returns both the scaled data frame and the attributes as a
#' named list.
#' @param data_spatial
#' A data frame with row names in the format
#' `"<dataset_name>__<age>"` and numeric columns containing
#' spatial predictor variables (e.g. `coord_x_km`,
#' `coord_y_km`), as returned by
#' `prepare_spatial_predictors_for_fit()`.
#' @return
#' A named list with two elements:
#' \describe{
#'   \item{`data_spatial_scaled`}{A data frame with the same
#'     row names as the input, with all predictor columns
#'     centred and scaled. When only one sample is present,
#'     columns are centred only (scale = FALSE).}
#'   \item{`spatial_scale_attributes`}{A named list of
#'     `center` and `scale` attributes for each predictor
#'     column, which can be used to back-transform
#'     predictions.}
#' }
#' @details
#' All columns are centred (mean subtracted) and divided by
#' their standard deviation using `base::scale()`, provided
#' more than one sample is present. When only one row exists,
#' only centring is applied.
#' @seealso [prepare_spatial_predictors_for_fit()],
#' [assemble_data_to_fit()]
#' @export
scale_spatial_for_fit <- function(data_spatial = NULL) {
  assertthat::assert_that(
    is.data.frame(data_spatial),
    msg = "data_spatial must be a data frame"
  )

  assertthat::assert_that(
    nrow(data_spatial) > 0,
    msg = "data_spatial must have at least one row"
  )

  assertthat::assert_that(
    ncol(data_spatial) > 0,
    msg = "data_spatial must have at least one column"
  )

  # With a single row, the SD is undefined, so only centre.
  is_scalable <- nrow(data_spatial) > 1

  # 1. Capture scale attributes -----

  # scale() returns a matrix whose attributes start with "dim",
  # followed by the "scaled:center" (and, when scaling, the
  # "scaled:scale") values. Drop the leading "dim" and keep the rest.
  capture_scale_attrs <- function(col_values) {
    col_attrs <-
      attributes(
        scale(
          col_values,
          center = TRUE,
          scale = is_scalable
        )
      )
    col_attrs[-1]
  }

  spatial_scale_attributes <-
    purrr::map(data_spatial, capture_scale_attrs)

  # 2. Apply scaling -----

  data_spatial_scaled <-
    data_spatial |>
    dplyr::mutate(
      dplyr::across(
        .cols = dplyr::everything(),
        .fns = function(col_values) {
          as.numeric(
            scale(
              col_values,
              center = TRUE,
              scale = is_scalable
            )
          )
        }
      )
    )

  # 3. Return list -----

  res <-
    list(
      data_spatial_scaled = data_spatial_scaled,
      spatial_scale_attributes = spatial_scale_attributes
    )

  return(res)
}
#' @title Aggregate ANOVA Variance Components Across Time Slices
#' @description
#' Extracts variance partitioning fractions from a named list
#' of sjSDM ANOVA objects and assembles a long-format tibble
#' with one row per age x component combination.
#' @param list_model_anova
#' A named list of sjSDManova objects, one per time slice.
#' Names must end with a numeric age value
#' (e.g. "timeslice_500"). NULL entries and objects without a
#' \code{$results} element are silently discarded.
#' @return
#' A tibble with columns:
#' \describe{
#'   \item{age}{Numeric age (cal yr BP) extracted from the
#'     list-element name.}
#'   \item{component}{Character. Human-readable component label
#'     (e.g. "Abiotic", "Associations", "Spatial").}
#'   \item{R2_Nagelkerke}{Numeric. Nagelkerke R-squared for the
#'     component, clamped to [0, Inf).}
#' }
#' @details
#' The age is parsed from the list element name by extracting
#' the trailing digit sequence (e.g. "timeslice_500" -> 500).
#' Fraction extraction, code-to-label translation, and
#' negative R-squared clamping are delegated to
#' [extract_anova_fractions()] (called with
#' \code{clamp_negative = TRUE}).
#' @seealso [get_anova()], [extract_anova_fractions()]
#' @export
aggregate_anova_components <- function(list_model_anova) {
  assertthat::assert_that(
    base::is.list(list_model_anova),
    msg = "'list_model_anova' must be a list."
  )

  vec_anova_fractions <-
    c("F_A", "F_B", "F_S", "F_AB", "F_AS", "F_BS", "F_ABS")

  res <-
    list_model_anova |>
    purrr::discard(
      \(anova_obj) {
        is.null(anova_obj) || !("results" %in% names(anova_obj))
      }
    ) |>
    purrr::imap(
      .f = \(anova_obj, element_name) {
        # The trailing digits of the element name encode the age.
        age_val <-
          as.numeric(
            stringr::str_extract(element_name, "\\d+$")
          )

        extract_anova_fractions(
          anova_object = anova_obj,
          vec_anova_fractions = vec_anova_fractions,
          clamp_negative = TRUE
        ) |>
          dplyr::mutate(age = age_val)
      }
    ) |>
    purrr::list_rbind()

  return(res)
}
#' @title Interpolate Data
#' @description
#' Interpolates data over a specified age range and timestep using a method.
#' @param data
#' A data frame containing the data to be interpolated.
#' @param by
#' A character vector of column name(s) to group by when nesting data for
#' interpolation (default: "dataset_name").
#' @param age_var
#' Name of the age variable column (default: "age").
#' @param value_var
#' Name of the value variable column (default: "pollen_prop").
#' @param method
#' Interpolation method to use (default: "linear").
#' @param rule
#' Integer specifying the extrapolation rule (default: 1).
#' @param ties
#' Function to handle tied values (default: `mean`).
#' @param age_min
#' Minimum age for interpolation (default: 0).
#' @param age_max
#' Maximum age for interpolation (default: 12000).
#' @param timestep
#' Timestep for interpolation (default: 500).
#' @return
#' A data frame with interpolated values, including dataset name, taxon, age,
#' and value columns.
#' @details
#' Nests data by the `by` columns, interpolates each nested series with
#' `stats::approx()` onto a regular age grid, and returns the result in a
#' flat format. Series that fail to interpolate are silently dropped
#' (via `purrr::possibly()`).
#' @seealso [stats::approx()]
#' @export
interpolate_data <- function(data = NULL,
                             by = "dataset_name",
                             age_var = "age",
                             value_var = "pollen_prop",
                             method = "linear",
                             rule = 1,
                             ties = mean,
                             age_min = 0,
                             age_max = 12e03,
                             timestep = 500) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    is.character(by) && length(by) > 0,
    msg = "by must be a character vector with at least one element"
  )

  assertthat::assert_that(
    all(by %in% colnames(data)),
    msg = paste0(
      "data must contain the following columns: ",
      paste(by, collapse = ", ")
    )
  )

  assertthat::assert_that(
    is.character(age_var) && length(age_var) == 1,
    msg = "age_var must be a single character string"
  )

  assertthat::assert_that(
    is.character(value_var) && length(value_var) == 1,
    msg = "value_var must be a single character string"
  )

  assertthat::assert_that(
    is.character(method) && length(method) == 1,
    msg = "method must be a single character string"
  )

  assertthat::assert_that(
    is.numeric(rule) && length(rule) == 1,
    msg = "rule must be a single numeric value"
  )

  assertthat::assert_that(
    is.function(ties),
    msg = "ties must be a function"
  )

  assertthat::assert_that(
    is.numeric(age_min) && length(age_min) == 1,
    msg = "age_min must be a single numeric value"
  )

  assertthat::assert_that(
    is.numeric(age_max) && length(age_max) == 1,
    msg = "age_max must be a single numeric value"
  )

  assertthat::assert_that(
    age_min < age_max,
    msg = "age_min must be less than age_max"
  )

  assertthat::assert_that(
    is.numeric(timestep) && length(timestep) == 1,
    msg = "timestep must be a single numeric value"
  )

  assertthat::assert_that(
    timestep > 0,
    msg = "timestep must be greater than 0"
  )

  # Regular age grid shared by every interpolated series.
  vec_ages_out <- seq(age_min, age_max, by = timestep)

  # Interpolate one nested series onto the shared age grid.
  interpolate_one <- function(data_single) {
    data_single |>
      dplyr::select(
        !!rlang::sym(age_var),
        !!rlang::sym(value_var)
      ) |>
      grDevices::xy.coords() |>
      stats::approx(
        xout = vec_ages_out,
        ties = ties,
        method = method,
        rule = rule
      ) |>
      tibble::as_tibble() |>
      dplyr::rename(
        !!rlang::sym(age_var) := x,
        !!rlang::sym(value_var) := y
      )
  }

  res <-
    data |>
    tidyr::nest(
      data_nested = !dplyr::any_of(by)
    ) |>
    dplyr::mutate(
      data_interpolated = purrr::map(
        .x = data_nested,
        # possibly(): failing series yield NULL and drop out on unnest.
        .f = purrr::possibly(interpolate_one, otherwise = NULL)
      )
    ) |>
    tidyr::unnest(data_interpolated) |>
    dplyr::select(
      dplyr::any_of(by),
      !!rlang::sym(age_var),
      !!rlang::sym(value_var)
    )

  return(res)
}
| 1 |
#' @title Check Minimum Number of Cores in Spatial Window
#' @description
#' Verifies that the number of cores (distinct pollen sites) in the
#' current spatial window meets the minimum required for analysis. This
#' is an early guard in the pipeline, applied directly to `data_coords`
#' (the output of `get_coords()`) before any expensive community-data
#' processing begins.
#' @param data_coords
#' A data frame of site coordinates, one row per core, as returned by
#' `get_coords()`. Gridpoints are already excluded by that function, so
#' every row represents a real pollen core.
#' @param min_n_cores
#' A single positive numeric value specifying the minimum number of
#' distinct cores required. Typically sourced from
#' `config.data_processing$min_n_cores`.
#' @return
#' `TRUE` invisibly when the check passes.
#' @details
#' Raises a `cli::cli_abort()` error naming the actual core count and
#' the required threshold when `nrow(data_coords) < min_n_cores`. This
#' makes the targets pipeline target fail immediately, preventing all
#' downstream community-data targets from running for spatial windows
#' with too few sites.
#' @seealso [get_coords()], [filter_community_by_n_cores()]
#' @export
check_min_n_cores <- function(
    data_coords = NULL,
    min_n_cores = 2) {
  # Validate inputs up front so errors are clear and early.
  assertthat::assert_that(
    base::is.data.frame(data_coords),
    msg = "'data_coords' must be a data frame"
  )

  assertthat::assert_that(
    base::is.numeric(min_n_cores) &&
      base::length(min_n_cores) == 1,
    msg = "'min_n_cores' must be a single numeric value"
  )

  assertthat::assert_that(
    min_n_cores >= 1,
    msg = "'min_n_cores' must be greater than or equal to 1"
  )

  # One row per core, so the row count is the number of available cores.
  n_found <- base::nrow(data_coords)

  if (n_found >= min_n_cores) {
    # Check passed; invisible so the guard is quiet in pipelines.
    return(invisible(TRUE))
  }

  # Too few cores: abort with the actual count and the threshold.
  cli::cli_abort(
    c(
      "Not enough cores in this spatial window.",
      "i" = base::paste0(
        "Found ", n_found,
        " core(s); at least ", min_n_cores, " required."
      ),
      "i" = paste0(
        "Adjust 'min_n_cores' in config or choose a",
        " larger spatial window."
      )
    )
  )
}
| 1 |
#' @title Evaluate JSDM Model Performance
#' @description
#' Evaluates a fitted sjSDM model and returns comprehensive performance
#' metrics at both model and species level.
#' @param mod_jsdm
#' A fitted sjSDM model object. Must be of class 'sjSDM'.
#' @return
#' A list with three elements:
#' - `model`: Named numeric vector of R-squared values
#'   (McFadden, Nagelkerke)
#' - `species`: A tibble with one row per species and columns:
#'   species, AUC, Accuracy, LogLoss (binomial) or RMSE (other
#'   families)
#' - `convergence`: A list from [check_convergence_jsdm()] with
#'   `linear_trend_slope`, `median_diff`, `convergence_plot`, and
#'   `note`
#' @details
#' For binomial models, species-level classification metrics (AUC,
#' Accuracy, LogLoss) are computed using a 0.5 probability threshold
#' for binary predictions. For other model families, RMSE is computed
#' per species.
#'
#' Convergence is assessed via [check_convergence_jsdm()], which
#' analyses the training loss history. A `linear_trend_slope` < 0.01
#' and `median_diff` < 1 in the returned `convergence` element
#' indicate that the model has converged.
#' @seealso sjSDM::Rsquared, Metrics::auc, check_convergence_jsdm
#' @export
evaluate_jsdm <- function(mod_jsdm = NULL) {
  assertthat::assert_that(
    inherits(mod_jsdm, "sjSDM"),
    msg = "mod_jsdm must be of class sjSDM"
  )

  # Extract observed and predicted values.
  # obs_data: response matrix the model was fitted to (sites x species);
  # pred_prob: in-sample predictions (newdata = NULL predicts on the
  # training data).
  obs_data <-
    mod_jsdm$data$Y
  pred_prob <-
    predict(mod_jsdm, newdata = NULL)

  # Named column index of species so map_dbl() output carries species
  # names.
  vec_species <-
    seq_len(ncol(obs_data)) |>
    rlang::set_names(colnames(obs_data))

  # Initialize evaluation list; elements are filled in below.
  list_eval <-
    list(
      model = NULL,
      species = NULL,
      convergence = NULL
    )

  # 1. R-squared metrics
  # Note: sjSDM::Rsquared() prints to console regardless of verbose = FALSE;
  # capture.output() suppresses this unwanted output.
  invisible(
    utils::capture.output(
      vec_r2 <- c(
        sjSDM::Rsquared(mod_jsdm, method = "McFadden", verbose = FALSE),
        sjSDM::Rsquared(mod_jsdm, method = "Nagelkerke", verbose = FALSE)
      )
    )
  )

  list_eval$model <-
    rlang::set_names(vec_r2, c("R2-McFadden", "R2-Nagelkerke"))

  # 2. Species-level metrics
  # NOTE(review): the triple `$family` drills through sjSDM's nested
  # family wrappers to the underlying family name string -- confirm this
  # access path against the installed sjSDM version.
  if (
    mod_jsdm$family$family$family == "binomial"
  ) {
    # AUC per species
    vec_auc <-
      vec_species |>
      purrr::map_dbl(
        ~ Metrics::auc(
          actual = as.data.frame(obs_data)[, .x],
          predicted = as.data.frame(pred_prob)[, .x]
        )
      )

    # Accuracy per species (binary predictions at 0.5 threshold)
    pred_binary <- pred_prob > 0.5

    vec_accuracy <-
      vec_species |>
      purrr::map_dbl(
        ~ Metrics::accuracy(
          actual = as.data.frame(obs_data)[, .x],
          predicted = as.data.frame(pred_binary)[, .x]
        )
      )

    # Log Loss per species
    vec_logloss <-
      vec_species |>
      purrr::map_dbl(
        ~ Metrics::logLoss(
          actual = as.data.frame(obs_data)[, .x],
          predicted = as.data.frame(pred_prob)[, .x]
        )
      )

    list_eval$species <-
      tibble::tibble(
        species = colnames(obs_data),
        AUC = vec_auc,
        Accuracy = vec_accuracy,
        LogLoss = vec_logloss
      )
  } else {
    # For non-binomial models, use RMSE per species on the raw
    # observed/predicted scale.
    vec_rmse <-
      vec_species |>
      purrr::map_dbl(
        ~ sqrt(mean((obs_data[, .x] - pred_prob[, .x])^2))
      )

    list_eval$species <-
      tibble::tibble(
        species = colnames(obs_data),
        RMSE = vec_rmse
      )
  }

  # 3. Convergence diagnostics (training-loss history analysis).
  list_eval$convergence <-
    check_convergence_jsdm(mod_jsdm)

  return(list_eval)
}
| 1 |
#' @title Get Active Configuration Value
#' @description
#' Retrieves a specific configuration value from a YAML configuration
#' file.
#' @param value
#' A character vector specifying the configuration key(s) to retrieve.
#' @param file
#' Path to the YAML configuration file (default: "config.yml").
#' @return
#' Value(s) associated with the specified key(s) in the configuration
#' file.
#' @details
#' Validates input parameters, ensures the file is readable, and fetches
#' the configuration value(s) with `config::get`. The active
#' configuration is selected through the `R_CONFIG_ACTIVE` environment
#' variable.
#' @export
get_active_config <- function(
    value = NULL,
    file = here::here("config.yml")) {
  assertthat::assert_that(
    is.character(value) && length(value) > 0,
    msg = "value must be a character vector with at least one element"
  )

  assertthat::assert_that(
    assertthat::is.readable(file) && assertthat::has_extension(file, "yml"),
    msg = "file must be a readable YAML file"
  )

  # The active configuration name comes from the environment; parent
  # config inheritance is deliberately disabled.
  res <-
    config::get(
      value = value,
      config = Sys.getenv("R_CONFIG_ACTIVE"),
      use_parent = FALSE,
      file = file
    )

  return(res)
}
| 1 |
#' @title Replace NA in Community Data with Zeros
#' @description
#' Replaces NA values in the taxon columns of community data with zeros.
#' @param data
#' A data frame. Must contain `dataset_name` and `sample_name` columns, and
#' at least one taxon column.
#' @return
#' A data frame with NA values in taxon columns replaced by zeros. Row
#' order and column order are preserved.
#' @details
#' Applies the replacement directly to every column except `dataset_name`
#' and `sample_name` via `dplyr::across()`. This avoids the previous
#' round trip through long format (`pivot_longer()` / `pivot_wider()`),
#' which required all taxon columns to share a single type, moved the id
#' columns to the front, and could collapse rows with duplicated
#' `(dataset_name, sample_name)` pairs.
#' @export
replace_na_community_data_with_zeros <- function(data = NULL) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )
  assertthat::assert_that(
    all(c("dataset_name", "sample_name") %in% colnames(data)),
    msg = "data must contain columns 'dataset_name' and 'sample_name'"
  )

  assertthat::assert_that(
    colnames(data) %>%
      length() > 2,
    msg = "data must contain at least one taxon column"
  )

  # Replace NA in-place in every taxon column. if_else() keeps the same
  # type coercion as the previous long-format implementation (integer
  # columns are promoted to double by the numeric literal 0).
  data %>%
    dplyr::mutate(
      dplyr::across(
        .cols = !dplyr::all_of(c("dataset_name", "sample_name")),
        .fns = ~ dplyr::if_else(is.na(.x), 0, .x)
      )
    ) %>%
    return()
}
| 1 |
#' @title Check Sample IDs Have Minimum Number of Samples
#' @description
#' Guards against running downstream data preparation and model fitting
#' on a time slice with too few `(dataset_name, age)` combinations.
#' Returns `data_sample_ids` unchanged when the row count is at least
#' `min_n_samples`; stops with an informative error otherwise,
#' preventing expensive model fitting on near-empty slices.
#' @param data_sample_ids
#' A data frame with at least the columns `dataset_name` and `age`, as
#' returned by `align_sample_ids()`. Each row represents one valid
#' `(dataset_name, age)` pair.
#' @param min_n_samples
#' A single positive integer giving the minimum number of samples (rows)
#' required to proceed with data preparation and model fitting.
#' Default is 1.
#' @return
#' The input `data_sample_ids` unchanged, when
#' `nrow(data_sample_ids) >= min_n_samples`.
#' @details
#' The check counts `nrow(data_sample_ids)` after the time-slice filter
#' has been applied by `align_sample_ids(subset_age = ...)`. When the
#' count falls below `min_n_samples`, `cli::cli_abort()` reports the
#' actual sample count and the threshold. Place this check in the
#' per-slice pipeline (e.g. `pipe_segment_age_filter`) so that slices
#' without sufficient data fail immediately, before any expensive
#' preparation or model fitting.
#' @seealso [align_sample_ids()],
#' [filter_community_by_n_taxa()]
#' @export
check_data_sample_ids_n <- function(
    data_sample_ids = NULL,
    min_n_samples = 1) {
  # Validate inputs first so failures are explicit.
  assertthat::assert_that(
    is.data.frame(data_sample_ids),
    msg = "data_sample_ids must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "age") %in% names(data_sample_ids)),
    msg = paste0(
      "data_sample_ids must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  assertthat::assert_that(
    is.numeric(min_n_samples) &&
      length(min_n_samples) == 1,
    msg = "min_n_samples must be a single numeric value"
  )

  assertthat::assert_that(
    min_n_samples >= 1,
    msg = "min_n_samples must be greater than or equal to 1"
  )

  # Each row is one (dataset_name, age) pair.
  n_samples <-
    nrow(data_sample_ids)

  if (n_samples >= min_n_samples) {
    # Enough samples: pass the data through untouched.
    return(data_sample_ids)
  }

  # Too few samples for this slice: abort with counts for diagnosis.
  # (cli glue-interpolates {n_samples} and {min_n_samples} from here.)
  cli::cli_abort(
    c(
      paste0(
        "Too few samples in this time slice to proceed",
        " with data preparation and model fitting."
      ),
      "i" = paste0(
        "Found {n_samples} sample(s) but at least",
        " {min_n_samples} are required."
      ),
      "i" = paste0(
        "Adjust `min_n_samples` in the configuration",
        " or review the input data for this age slice."
      )
    )
  )
}
| 1 |
#' @title Prepare Coordinate Data for Model Fitting
#' @description
#' Expands dataset-level coordinate data to the sample level by joining
#' against the canonical `(dataset_name, age)` sample index, producing a
#' data frame whose rows align with the community matrix and abiotic
#' data used for model fitting.
#' @param data_coords
#' A data frame with `dataset_name` as row names and columns
#' `coord_long` and `coord_lat`.
#' @param data_sample_ids
#' A data frame of valid `(dataset_name, age)` pairs as returned by
#' `align_sample_ids()`.
#' @return
#' A data frame with row names in the format `"<dataset_name>__<age>"`
#' and columns `coord_long` and `coord_lat`. Rows are sorted by
#' `dataset_name` then `age`, matching the ordering of
#' `data_sample_ids`.
#' @details
#' Coordinates are stored at dataset level but models require one row
#' per sample, so each dataset's coordinates are replicated across all
#' its valid sample ages. The row-name format matches that of the
#' community matrix and abiotic data frame produced by the respective
#' preparation functions.
#' @seealso [align_sample_ids()], [assemble_data_to_fit()]
#' @export
prepare_coords_for_fit <- function(
    data_coords = NULL,
    data_sample_ids = NULL) {
  assertthat::assert_that(
    is.data.frame(data_coords),
    msg = "data_coords must be a data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_sample_ids),
    msg = "data_sample_ids must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("coord_long", "coord_lat") %in% names(data_coords)
    ),
    msg = paste0(
      "data_coords must contain columns",
      " 'coord_long' and 'coord_lat'"
    )
  )

  assertthat::assert_that(
    all(c("dataset_name", "age") %in% names(data_sample_ids)),
    msg = paste0(
      "data_sample_ids must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  # Turn the dataset-name row names into a join key column.
  coords_with_key <-
    data_coords |>
    tibble::rownames_to_column("dataset_name")

  # Replicate each dataset's coordinates across its valid sample ages.
  # Datasets absent from either side are dropped by the inner join;
  # samples with missing coordinates are removed by drop_na().
  joined <-
    data_sample_ids |>
    dplyr::inner_join(
      coords_with_key,
      by = dplyr::join_by(dataset_name)
    ) |>
    tidyr::drop_na(coord_long, coord_lat)

  # Build "<dataset_name>__<age>" row names matching the community
  # matrix and abiotic data, sorted consistently with data_sample_ids.
  joined |>
    dplyr::arrange(dataset_name, age) |>
    dplyr::mutate(
      .row_name = paste0(dataset_name, "__", age)
    ) |>
    dplyr::select(-dataset_name, -age) |>
    tibble::column_to_rownames(".row_name")
}
| 1 |
#' @title Binarize Community Data Matrix
#' @description
#' Converts a numeric community matrix of proportions (or counts) to a
#' binary presence-absence matrix. Any value strictly greater than zero
#' is recoded as `1L`; zeros remain `0L`. The resulting integer matrix
#' is suitable for `filter_constant_taxa()` when the model uses a
#' binomial error family.
#' @param data_community_matrix
#' A numeric matrix with samples as rows and taxa as columns, as
#' returned by `prepare_community_for_fit()`. Values must be
#' non-negative.
#' @return
#' An integer matrix of the same dimensions and dimnames as the input,
#' with all non-zero values replaced by `1L`.
#' @details
#' Pre-binarization before `filter_constant_taxa()` is essential when
#' using a binomial error family: a taxon recorded at non-zero varying
#' proportions in every sample has positive SD on the proportion scale
#' but becomes a constant-1 column after binarization inside the model,
#' causing implicit intercept saturation. Applying this function first
#' ensures that `filter_constant_taxa()` removes such taxa before they
#' reach the model.
#'
#' For other error families (e.g., Gaussian, future hurdle models) the
#' raw proportional matrix should be passed to `filter_constant_taxa()`
#' directly; use the `error_family` configuration key to control this
#' choice in the pipeline.
#' @seealso [prepare_community_for_fit()],
#' [filter_constant_taxa()], [assemble_data_to_fit()]
#' @export
binarize_community_data <- function(
    data_community_matrix = NULL) {
  assertthat::assert_that(
    is.matrix(data_community_matrix),
    msg = "data_community_matrix must be a matrix"
  )

  assertthat::assert_that(
    is.numeric(data_community_matrix),
    msg = "data_community_matrix must be a numeric matrix"
  )

  assertthat::assert_that(
    nrow(data_community_matrix) > 0,
    msg = "data_community_matrix must have at least one row"
  )

  assertthat::assert_that(
    ncol(data_community_matrix) > 0,
    msg = "data_community_matrix must have at least one column"
  )

  assertthat::assert_that(
    base::all(data_community_matrix >= 0, na.rm = TRUE),
    msg = "data_community_matrix values must all be >= 0"
  )

  # Presence/absence recoding: positive -> 1L, zero -> 0L; NA entries
  # propagate as NA_integer_. Dimnames are carried over explicitly.
  base::matrix(
    base::as.integer(data_community_matrix > 0),
    nrow = base::nrow(data_community_matrix),
    ncol = base::ncol(data_community_matrix),
    dimnames = base::dimnames(data_community_matrix)
  )
}
| 1 |
#' @title Get taxa without classification
#' @description
#' Identifies taxa from a community vector that are absent from a
#' classification table. Returns the names of taxa that do not have a
#' matching entry in the `sel_name` column of the classification table.
#' @param vec_community_taxa
#' A non-empty character vector of taxon names present in the community
#' data.
#' @param data_classification_table
#' A data frame containing a `sel_name` column with unique taxon names
#' used for classification lookup. Must not contain duplicate values in
#' `sel_name`.
#' @return
#' A character vector of taxon names from `vec_community_taxa` that are
#' not found in `data_classification_table$sel_name`. Returns an empty
#' character vector if all taxa are classified.
#' @details
#' Duplicate entries in `vec_community_taxa` are collapsed before the
#' comparison, so each unclassified taxon appears only once in the
#' output, in order of first occurrence.
#' @seealso
#' [classify_taxonomic_resolution()]
#' @export
get_taxa_without_classification <- function(vec_community_taxa, data_classification_table) {
  assertthat::assert_that(
    is.character(vec_community_taxa),
    length(vec_community_taxa) > 0,
    msg = "vec_community_taxa should be a non-empty character vector"
  )

  assertthat::assert_that(
    is.data.frame(data_classification_table),
    msg = "data_classification_table should be a data frame"
  )

  assertthat::assert_that(
    "sel_name" %in% names(data_classification_table),
    msg = "data_classification_table should contain a 'sel_name' column"
  )

  assertthat::assert_that(
    !any(duplicated(data_classification_table$sel_name)),
    msg = "data_classification_table should not contain duplicate 'sel_name' values"
  )

  # Collapse duplicates, keeping first-occurrence order.
  unique_taxa <- unique(vec_community_taxa)
  classified_names <- data_classification_table[["sel_name"]]

  # %in% matches NA to NA and coerces factor lookup columns to
  # character, mirroring the semantics of an anti-join on 'sel_name'.
  missing_taxa <- unique_taxa[!(unique_taxa %in% classified_names)]

  return(missing_taxa)
}
| 1 |
#' @title Plot ANOVA Variance Components by Age
#' @description
#' Creates a line-and-point plot of sjSDM ANOVA variance
#' components (Nagelkerke R²) across time slices (ages).
#' @param data_anova_components
#' A data frame or tibble with columns \code{age} (numeric),
#' \code{component} (character), and
#' \code{R2_Nagelkerke_percentage} (numeric). Typically the output of
#' \code{\link{aggregate_anova_components}()}.
#' @param title
#' Optional character string for the plot title.
#' Defaults to \code{NULL} (no title).
#' @param subtitle
#' Optional character string for the plot subtitle.
#' Defaults to \code{NULL} (no subtitle).
#' @return
#' A \code{ggplot} object.
#' @details
#' The x-axis is reversed so that older ages appear on the left
#' and the y-axis is clamped to [0, NA]. Each variance component
#' is drawn as a coloured line with points.
#' @seealso [aggregate_anova_components()]
#' @export
plot_anova_components_by_age <- function(
    data_anova_components,
    title = NULL,
    subtitle = NULL) {
  assertthat::assert_that(
    base::is.data.frame(data_anova_components),
    msg = "'data_anova_components' must be a data frame."
  )

  # Validate the exact columns used in the aesthetic mapping below.
  assertthat::assert_that(
    base::all(
      c("age", "component", "R2_Nagelkerke_percentage") %in%
        base::names(data_anova_components)
    ),
    msg = paste0(
      "'data_anova_components' must have columns",
      " 'age', 'component', and 'R2_Nagelkerke_percentage'."
    )
  )

  res <-
    data_anova_components |>
    ggplot2::ggplot(
      mapping = ggplot2::aes(
        x = age,
        y = R2_Nagelkerke_percentage,
        colour = component,
        group = component
      )
    ) +
    ggplot2::geom_line() +
    ggplot2::geom_point() +
    # Reverse so older ages plot on the left.
    ggplot2::scale_x_continuous(
      trans = "reverse"
    ) +
    # Clamp without dropping data (coord_cartesian zooms, not filters).
    ggplot2::coord_cartesian(
      ylim = c(0, NA)
    ) +
    ggplot2::labs(
      title = title,
      subtitle = subtitle,
      x = "Age (cal yr BP)",
      # this should read as "Percentage of variance explained (Nagelkerke R²)"
      y = expression("Percentage of variance explained" ~ "(" ~ R^2 ~ "Nagelkerke)"),
      colour = "Component"
    )

  return(res)
}
| 1 |
#' @title Get Spatial Window from Grid Catalogue
#' @description
#' Retrieves the spatial bounding box for a given spatial unit ID from
#' the project's spatial grid CSV catalogue.
#' @param scale_id
#' A single character string identifying the spatial unit. Must match
#' exactly one row in the catalogue file.
#' @param file
#' Path to the spatial grid CSV file.
#' Default: `here::here("Data/Input/spatial_grid.csv")`.
#' @return
#' A named list with two elements:
#' \describe{
#' \item{`x_lim`}{Numeric vector of length 2: `c(x_min, x_max)`.}
#' \item{`y_lim`}{Numeric vector of length 2: `c(y_min, y_max)`.}
#' }
#' @details
#' Reads the CSV using `readr::read_csv`, filters to the row whose
#' `scale_id` column matches the supplied `scale_id` argument, and
#' constructs the bounding box vectors. Validation ensures the file is
#' readable, has a `.csv` extension, and that exactly one row matches
#' the requested `scale_id`.
#' @seealso get_active_config
#' @export
get_spatial_window <- function(
    scale_id,
    file = here::here("Data/Input/spatial_grid.csv")) {
  assertthat::assert_that(
    is.character(scale_id) && length(scale_id) == 1,
    msg = paste0(
      "`scale_id` must be a single character string.",
      " Got length: ", length(scale_id)
    )
  )

  assertthat::assert_that(
    assertthat::is.readable(file) &&
      assertthat::has_extension(file, "csv"),
    msg = "`file` must be a readable CSV file."
  )

  data_grid <-
    readr::read_csv(
      file = file,
      show_col_types = FALSE
    )

  # .data/.env pronouns disambiguate the catalogue column from the
  # function argument of the same name.
  data_row <-
    dplyr::filter(
      data_grid,
      .data$scale_id == .env$scale_id
    )

  assertthat::assert_that(
    base::nrow(data_row) == 1,
    msg = paste0(
      "Expected exactly 1 row for scale_id '", scale_id, "'.",
      " Found: ", base::nrow(data_row)
    )
  )

  # Assemble the bounding box from the single matched row.
  res <-
    list(
      x_lim = c(
        data_row[["x_min"]],
        data_row[["x_max"]]
      ),
      y_lim = c(
        data_row[["y_min"]],
        data_row[["y_max"]]
      )
    )

  return(res)
}
| 1 |
#' @title Extract Age from String
#' @description
#' Extracts the age value from a vector of strings, taking all
#' characters after the double underscore ("__").
#' @param vec_names
#' A character vector containing names with the format "dataset__age".
#' @return
#' A character vector of age values.
#' @export
get_age_from_string <- function(vec_names) {
  assertthat::assert_that(
    is.character(vec_names),
    msg = "Input must be a character vector."
  )

  assertthat::assert_that(
    length(vec_names) > 0,
    msg = "Input vector must not be empty."
  )

  assertthat::assert_that(
    all(stringr::str_detect(vec_names, "__")),
    msg = "Input strings must contain '__' to extract age."
  )

  # Grab everything from the first "__" onwards, ...
  matched_tail <- stringr::str_extract(vec_names, "__(.*)")
  # ... drop the separator itself, ...
  age_part <- stringr::str_remove(matched_tail, "__")
  # ... and strip surrounding whitespace.
  stringr::str_trim(age_part)
}
#' @title Add Dataset Name Column from Row Names
#' @description
#' Adds a 'dataset_name' column to a data frame by extracting dataset names
#' from row names.
#' @param data
#' A data frame with row names in the format "dataset__something".
#' @return
#' The input data frame with an added 'dataset_name' column.
#' @export
add_dataset_name_column_from_rownames <- function(data) {
  # Dataset names are encoded as the part of the row name before "__".
  vec_dataset_names <-
    get_dataset_name_from_string(row.names(data))

  dplyr::mutate(
    data,
    dataset_name = vec_dataset_names
  )
}
#' @title Combine automatic and auxiliary classification tables
#' @description
#' Merges the automatically generated classification table with a
#' manually curated auxiliary table. When the same `sel_name` entry
#' exists in both tables the auxiliary (manual) row takes priority,
#' overriding the automatic classification entirely.
#' @param data_classification_table
#' A data frame produced by `make_classification_table()` with
#' columns `sel_name`, `kingdom`, `phylum`, `class`, `order`,
#' `family`, `genus`, and `species`.
#' @param data_aux_classification_table
#' A data frame produced by `get_aux_classification_table()` with
#' the same columns. May be an empty tibble (zero rows) when no
#' manual overrides exist.
#' @return
#' A tibble containing all unique taxa from both inputs. Manual
#' entries override automatic ones on `sel_name` collision. Only
#' columns present in both inputs are retained (intersection).
#' @details
#' Binding places auxiliary rows before automatic rows and then keeps
#' the first occurrence of each `sel_name` via `dplyr::distinct()`, so
#' manual classifications always win regardless of their relative
#' completeness.
#' @seealso
#' [get_aux_classification_table()],
#' [make_classification_table()],
#' [classify_taxonomic_resolution()]
#' @export
combine_classification_tables <- function(
    data_classification_table,
    data_aux_classification_table) {
  assertthat::assert_that(
    is.data.frame(data_classification_table),
    msg = "data_classification_table must be a data frame"
  )

  assertthat::assert_that(
    "sel_name" %in% colnames(data_classification_table),
    msg = paste(
      "data_classification_table must contain",
      "a 'sel_name' column"
    )
  )

  assertthat::assert_that(
    is.data.frame(data_aux_classification_table),
    msg = "data_aux_classification_table must be a data frame"
  )

  assertthat::assert_that(
    "sel_name" %in% colnames(data_aux_classification_table),
    msg = paste(
      "data_aux_classification_table must contain",
      "a 'sel_name' column"
    )
  )

  # Work only with the columns common to both tables.
  vec_shared_cols <-
    intersect(
      colnames(data_classification_table),
      colnames(data_aux_classification_table)
    )

  data_aux_subset <-
    dplyr::select(
      data_aux_classification_table,
      dplyr::all_of(vec_shared_cols)
    )

  data_auto_subset <-
    dplyr::select(
      data_classification_table,
      dplyr::all_of(vec_shared_cols)
    )

  # Auxiliary rows come first so that dplyr::distinct() keeps the
  # manual classification whenever a sel_name occurs in both tables.
  res <-
    dplyr::bind_rows(data_aux_subset, data_auto_subset) |>
    dplyr::distinct(sel_name, .keep_all = TRUE)

  return(res)
}
#' @title Get Sample Ages
#' @description
#' Extracts sample ages from a data frame containing dataset and sample info.
#' @param data
#' A data frame. Must contain the columns `dataset_name` and `data_samples`.
#' @return
#' A data frame with columns `dataset_name`, `sample_name`, and `age`.
#' @details
#' Validates the input data frame, ensures required columns are present, and
#' extracts sample ages by unnesting the `data_samples` column.
#' @export
get_sample_ages <- function(data = NULL) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "data_samples") %in% colnames(data)),
    msg = "data must contain columns 'dataset_name' and 'data_samples'"
  )

  # Expand the nested per-dataset sample tables, then keep only the
  # identifying columns plus the age of each sample.
  res <-
    data |>
    dplyr::select("dataset_name", "data_samples") |>
    tidyr::unnest(cols = "data_samples") |>
    dplyr::select("dataset_name", "sample_name", "age")

  return(res)
}
#' @title Filter Taxa by Minimum Number of Cores
#' @description
#' Filters out taxa that are not present in a sufficient number of
#' cores (distinct datasets). Only taxa occurring in at least
#' `min_n_cores` distinct `dataset_name` values are retained. This
#' removes taxa that appear in only a single core, which can
#' disproportionately influence the species-species co-occurrence matrix.
#' @param data
#' A data frame containing community data in long format. Must include
#' columns `taxon` and `dataset_name`.
#' @param min_n_cores
#' A single positive integer specifying the minimum number of distinct
#' cores (datasets) a taxon must appear in to be retained. Default is 2.
#' @return
#' A filtered data frame containing only taxa that appear in at least
#' `min_n_cores` distinct datasets. All original columns are preserved.
#' @details
#' Counts distinct `dataset_name` values per `taxon` across the entire
#' dataset and removes taxa below the threshold. An error is raised if
#' no taxa remain after filtering, which may indicate that
#' `min_n_cores` is set too high.
#' @seealso [filter_community_by_n_samples()], [filter_rare_taxa()],
#' [select_n_taxa()]
#' @export
filter_community_by_n_cores <- function(
    data = NULL,
    min_n_cores = 2) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(c("taxon", "dataset_name") %in% names(data)),
    msg = paste(
      "data must contain columns:",
      paste(c("taxon", "dataset_name"), collapse = ", ")
    )
  )

  assertthat::assert_that(
    is.numeric(min_n_cores) && length(min_n_cores) == 1,
    msg = "min_n_cores must be a single numeric value"
  )

  assertthat::assert_that(
    min_n_cores >= 1,
    msg = "min_n_cores must be greater than or equal to 1"
  )

  # Number of distinct cores (datasets) in which each taxon occurs.
  data_core_counts <-
    data |>
    dplyr::distinct(taxon, dataset_name) |>
    dplyr::count(taxon, name = "n_cores")

  vec_taxa_to_keep <-
    data_core_counts |>
    dplyr::filter(n_cores >= min_n_cores) |>
    dplyr::pull(taxon)

  res <-
    dplyr::filter(data, taxon %in% vec_taxa_to_keep)

  assertthat::assert_that(
    nrow(res) > 0,
    msg = paste(
      "No taxa remain after filtering.",
      "The min_n_cores is too high."
    )
  )

  return(res)
}
#' @title Interpolate Community Data
#' @description
#' Interpolates community proportion data to a regular time grid.
#' @param data
#' A data frame with columns `dataset_name`, `taxon`, `age`, and
#' `pollen_prop`. Must already be in proportion form - see
#' [make_community_proportions()].
#' @param ...
#' Additional arguments passed to [interpolate_data()], such as
#' `timestep`, `age_min`, and `age_max`.
#' @return
#' A data frame with interpolated community data at regular time
#' intervals.
#' @details
#' Calls [interpolate_data()] grouped by `dataset_name` and `taxon`.
#' Data must be converted to proportions before calling this function
#' using [make_community_proportions()].
#' @seealso [make_community_proportions()], [interpolate_data()]
#' @export
interpolate_community_data <- function(data, ...) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    "pollen_prop" %in% colnames(data),
    msg = paste(
      "data must contain a 'pollen_prop' column.",
      "Use make_community_proportions() first."
    )
  )

  # Delegate to the generic interpolator, grouping by core and taxon.
  res <-
    interpolate_data(
      data,
      by = c("dataset_name", "taxon"),
      ...
    )

  return(res)
}
#' @title Prepare Community Data for Model Fitting
#' @description
#' Reshapes long-format community data into a wide matrix suitable
#' for model fitting, filtering to the canonical sample index
#' supplied by `align_sample_ids()`.
#' @param data_community_long
#' A data frame in long format with columns `dataset_name`, `age`,
#' `taxon`, and `pollen_prop`.
#' @param data_sample_ids
#' A data frame of valid `(dataset_name, age)` pairs as returned by
#' `align_sample_ids()`.
#' @return
#' A numeric matrix with row names in the format
#' `"<dataset_name>__<age>"` and taxon names as column names.
#' Missing taxon-sample combinations are filled with `0`. Rows are
#' ordered by `dataset_name` then `age`, matching the order in
#' `data_sample_ids`.
#' @details
#' Only samples present in `data_sample_ids` are retained. Rows
#' with `NA` or zero `pollen_prop` are dropped before pivoting.
#' The function widens the data and converts the result to a matrix
#' directly, so the output is ready for `filter_constant_taxa()`
#' and `assemble_data_to_fit()`.
#' @seealso [align_sample_ids()], [filter_constant_taxa()],
#' [assemble_data_to_fit()]
#' @export
prepare_community_for_fit <- function(
    data_community_long = NULL,
    data_sample_ids = NULL) {
  assertthat::assert_that(
    is.data.frame(data_community_long),
    msg = "data_community_long must be a data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_sample_ids),
    msg = "data_sample_ids must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("dataset_name", "age", "taxon", "pollen_prop") %in%
        names(data_community_long)
    ),
    msg = paste0(
      "data_community_long must contain columns",
      " 'dataset_name', 'age', 'taxon', 'pollen_prop'"
    )
  )

  assertthat::assert_that(
    all(c("dataset_name", "age") %in% names(data_sample_ids)),
    msg = paste0(
      "data_sample_ids must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  # Keep only the canonical samples, drop missing/zero proportions,
  # and fix the row order (dataset, then age).
  data_filtered <-
    data_community_long |>
    dplyr::inner_join(
      data_sample_ids,
      by = dplyr::join_by(dataset_name, age)
    ) |>
    tidyr::drop_na(pollen_prop) |>
    dplyr::filter(pollen_prop > 0) |>
    dplyr::arrange(dataset_name, age)

  # Widen to one row per "<dataset_name>__<age>" sample; absent
  # taxon-sample combinations become 0.
  data_wide <-
    data_filtered |>
    dplyr::mutate(
      sample_name = paste0(dataset_name, "__", age)
    ) |>
    dplyr::select(sample_name, taxon, pollen_prop) |>
    tidyr::pivot_wider(
      names_from = "taxon",
      values_from = "pollen_prop",
      values_fill = 0
    )

  res <-
    data_wide |>
    tibble::column_to_rownames("sample_name") |>
    as.matrix()

  return(res)
}
#' @title Add Age Column from Row Names
#' @description
#' Adds an 'age' column to a data frame by extracting age values from row
#' names.
#' @param data
#' A data frame with row names in the format "dataset__age".
#' @return
#' The input data frame with an added 'age' column.
#' @export
add_age_column_from_rownames <- function(data) {
  # Ages are encoded as the part of the row name after "__".
  vec_ages <-
    get_age_from_string(row.names(data))

  dplyr::mutate(
    data,
    age = as.numeric(vec_ages)
  )
}
#' @title Make Community Data Long
#' @description
#' Converts community data from wide format to long format.
#' @param data
#' A data frame. Must contain `dataset_name` and `sample_name` columns.
#' @return
#' A data frame in long format with columns `dataset_name`, `sample_name`,
#' `taxon`, and `pollen_count`.
#' @details
#' Uses `tidyr::pivot_longer` to reshape the data, dropping NA values in the
#' process.
#' @export
make_community_data_long <- function(data = NULL) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "sample_name") %in% colnames(data)),
    msg = "data must contain columns 'dataset_name' and 'sample_name'"
  )

  # Every non-identifier column is a taxon; stack them into
  # taxon/pollen_count pairs and drop missing counts.
  res <-
    tidyr::pivot_longer(
      data,
      cols = !c("dataset_name", "sample_name"),
      names_to = "taxon",
      values_to = "pollen_count",
      values_drop_na = TRUE
    )

  return(res)
}
#' @title Get Dataset Coordinates
#' @description
#' Extracts unique coordinates for each dataset, excluding gridpoints.
#' @param data
#' A data frame containing columns 'dataset_name', 'dataset_type',
#' 'coord_long', and 'coord_lat'. Rows with `dataset_type == "gridpoints"`
#' are excluded.
#' @return
#' A data frame with unique coordinates for each dataset, with
#' `dataset_name` as row names.
#' @export
get_coords <- function(data) {
  # Input validation, consistent with the other exported helpers:
  # fail early with a clear message rather than deep inside dplyr.
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("dataset_name", "dataset_type", "coord_long", "coord_lat") %in%
        colnames(data)
    ),
    msg = paste(
      "data must contain columns 'dataset_name', 'dataset_type',",
      "'coord_long', and 'coord_lat'"
    )
  )

  res <-
    data %>%
    dplyr::filter(
      dataset_type != "gridpoints"
    ) %>%
    dplyr::select(
      "dataset_name",
      "coord_long",
      "coord_lat"
    ) %>%
    dplyr::distinct() %>%
    tibble::column_to_rownames("dataset_name")

  return(res)
}
#' @title Make Classification Table
#' @description
#' Creates a wide-format taxonomic classification table from a list of data
#' frames containing taxon names and ranks. All seven standard taxonomic ranks
#' are retained: `kingdom`, `phylum`, `class`, `order`, `family`, `genus`,
#' and `species`.
#' @param data A list of data frames, each containing columns `sel_name`,
#' `rank`, and `name`.
#' @return A data frame in wide format with up to eight columns (`sel_name`
#' plus one column per taxonomic rank) and one row per `sel_name`. Ranks
#' not available for a given taxon are `NA`.
#' @details
#' Filters for all seven standard taxonomic ranks (`kingdom`, `phylum`,
#' `class`, `order`, `family`, `genus`, `species`), removes duplicate
#' rank-name pairs, and pivots to wide format. `values_fn = dplyr::first`
#' is used as a defensive guard to prevent list columns in the rare case
#' where a taxon has more than one name recorded for the same rank.
#' @export
make_classification_table <- function(data) {
  data_bound <- dplyr::bind_rows(data)

  # Input validation, consistent with the other exported helpers:
  # fail early with a clear message rather than deep inside dplyr.
  assertthat::assert_that(
    all(c("sel_name", "rank", "name") %in% colnames(data_bound)),
    msg = "data must contain columns 'sel_name', 'rank', and 'name'"
  )

  vec_ranks <- c(
    "kingdom", "phylum", "class", "order",
    "family", "genus", "species"
  )

  res <-
    data_bound %>%
    dplyr::filter(
      rank %in% vec_ranks
    ) %>%
    dplyr::distinct(
      sel_name, rank, name
    ) %>%
    tidyr::pivot_wider(
      names_from = rank,
      values_from = name,
      # Defensive guard against list columns when a taxon has more
      # than one name recorded for the same rank.
      values_fn = dplyr::first
    )

  return(res)
}
#' @title Transform to Proportions
#' @description
#' Transforms pollen count data into proportions based on total pollen count.
#' @param data
#' A data frame containing pollen count data. Must contain columns
#' `sample_name` and `pollen_count`.
#' @param pollen_sum
#' A data frame with total pollen counts for each sample. Must contain
#' columns `sample_name` and `pollen_sum`.
#' @return
#' A data frame with pollen proportions, excluding `pollen_sum` and
#' `pollen_count` columns.
#' @details
#' Joins the input data with total pollen counts and calculates proportions
#' using `dplyr::mutate`.
#' @export
transform_to_proportions <- function(data = NULL, pollen_sum = NULL) {
  # Input validation, consistent with the other exported helpers:
  # the NULL defaults otherwise fail deep inside the join.
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(c("sample_name", "pollen_count") %in% colnames(data)),
    msg = "data must contain columns 'sample_name' and 'pollen_count'"
  )

  assertthat::assert_that(
    is.data.frame(pollen_sum),
    msg = "pollen_sum must be a data frame"
  )

  assertthat::assert_that(
    all(c("sample_name", "pollen_sum") %in% colnames(pollen_sum)),
    msg = "pollen_sum must contain columns 'sample_name' and 'pollen_sum'"
  )

  res <-
    data %>%
    dplyr::left_join(
      pollen_sum,
      by = "sample_name"
    ) %>%
    dplyr::mutate(
      pollen_prop = pollen_count / pollen_sum,
      .after = pollen_count
    ) %>%
    dplyr::select(
      !c("pollen_sum", "pollen_count")
    )

  return(res)
}