#' @title Plot Bipartite Network Metrics by Age
#' @description
#' Creates a faceted line-and-point plot of bipartite community
#' network metrics (e.g. connectance, nestedness, modularity)
#' across time slices (ages). Each metric is shown in its own
#' panel with a free y-axis scale.
#' @param data_network_metrics
#' A data frame or tibble with columns \code{age} (numeric),
#' \code{metric} (character), and \code{value} (numeric).
#' Typically the output of
#' \code{\link{compute_network_metrics}()} aggregated across
#' time slices and with the age column already parsed to numeric.
#' @param title
#' Optional character string for the plot title.
#' Defaults to \code{NULL} (no title).
#' @param subtitle
#' Optional character string for the plot subtitle.
#' Defaults to \code{NULL} (no subtitle).
#' @return
#' A \code{ggplot} object.
#' @details
#' The x-axis is reversed so that older ages appear on the left.
#' Each metric is shown in a separate facet panel with an
#' independent y-axis scale (\code{scales = "free_y"}) because
#' connectance, nestedness, and modularity occupy different
#' numeric ranges. Each panel shows a coloured line with points.
#' @seealso [compute_network_metrics()]
#' @export
plot_network_metrics_by_age <- function(
    data_network_metrics,
    title = NULL,
    subtitle = NULL) {
  assertthat::assert_that(
    base::is.data.frame(data_network_metrics),
    msg = "'data_network_metrics' must be a data frame."
  )

  assertthat::assert_that(
    base::all(
      c("age", "metric", "value") %in%
        base::names(data_network_metrics)
    ),
    msg = paste0(
      "'data_network_metrics' must have columns",
      " 'age', 'metric', and 'value'."
    )
  )

  assertthat::assert_that(
    base::is.numeric(data_network_metrics[["age"]]),
    msg = "Column 'age' in 'data_network_metrics' must be numeric."
  )

  assertthat::assert_that(
    base::is.numeric(data_network_metrics[["value"]]),
    msg = "Column 'value' in 'data_network_metrics' must be numeric."
  )

  res <-
    data_network_metrics |>
    ggplot2::ggplot(
      mapping = ggplot2::aes(
        x = age,
        y = value,
        colour = metric,
        group = metric
      )
    ) +
    ggplot2::geom_line() +
    ggplot2::geom_point() +
    # scale_x_reverse() replaces the previous
    # scale_x_continuous(trans = "reverse"): the `trans` argument was
    # renamed to `transform` in ggplot2 3.5 and is deprecated, while
    # scale_x_reverse() is stable across versions and identical in
    # behaviour (older ages end up on the left).
    ggplot2::scale_x_reverse() +
    ggplot2::facet_wrap(
      facets = ggplot2::vars(metric),
      scales = "free_y"
    ) +
    ggplot2::labs(
      title = title,
      subtitle = subtitle,
      x = "Age (cal yr BP)",
      y = "Metric value",
      colour = "Metric"
    ) +
    # Facet strips already name each metric, so the colour legend is
    # redundant and suppressed.
    ggplot2::theme(
      legend.position = "none"
    )

  return(res)
}
#' @title Filter Rare Taxa
#' @description
#' Filters out rare taxa from community data based on a minimum proportion
#' threshold. Only taxa meeting or exceeding the threshold are retained.
#' @param data
#' A data frame containing taxon abundance data. Must include a column
#' named 'pollen_prop' with numeric proportions or abundances.
#' @param minimal_proportion
#' Numeric value between 0 and 1 specifying the minimum proportion
#' threshold for retaining taxa. Default is 0.01 (1%).
#' @return
#' A filtered data frame containing only taxa that meet or exceed the
#' minimal proportion threshold. Preserves all original columns.
#' @details
#' The function validates that minimal_proportion is a numeric value
#' between 0 and 1. After filtering, it checks that at least one taxon
#' remains in the dataset. If no taxa meet the threshold, an error is
#' raised suggesting the threshold may be too high.
#' @export
filter_rare_taxa <- function(
    data = NULL,
    minimal_proportion = 0.01) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  # Fail early with a clear message if the documented 'pollen_prop'
  # column is absent; otherwise dplyr::filter() below would raise a
  # cryptic "object 'pollen_prop' not found" error.
  assertthat::assert_that(
    "pollen_prop" %in% colnames(data),
    msg = "data must contain a 'pollen_prop' column"
  )

  assertthat::assert_that(
    is.numeric(minimal_proportion),
    msg = "minimal_proportion must be a number"
  )

  assertthat::assert_that(
    minimal_proportion > 0,
    msg = "minimal_proportion must be greater than 0"
  )

  assertthat::assert_that(
    minimal_proportion <= 1,
    msg = "minimal_proportion must be less than or equal to 1"
  )

  # Keep only rows whose proportion meets the threshold; NA
  # proportions are dropped by dplyr::filter() as a side effect.
  res <-
    data |>
    dplyr::filter(pollen_prop >= minimal_proportion)

  assertthat::assert_that(
    nrow(res) > 0,
    msg = paste(
      "No taxa found in data. Please check the input data.",
      "The minimal_proportion is too high."
    )
  )

  return(res)
}
#' @title Get predictor collinearity
#' @description
#' Analyses collinearity among abiotic predictors in a long-format
#' data frame and returns a `collinear_output` object produced by
#' `collinear::collinear()`. The function pivots `data_source` from
#' long to wide format (one column per variable), removes any `age`
#' column, screens out zero-variance columns, and then performs the
#' collinearity analysis.
#' @param data_source
#' A data frame in long format containing at minimum the columns
#' `abiotic_variable_name` (character, predictor names) and
#' `abiotic_value` (numeric, predictor values). An optional `age`
#' column is silently dropped before analysis.
#' @return
#' A `collinear_output` object as returned by
#' `collinear::collinear()`. The object contains a `result` element
#' with a `selection` character vector of the non-collinear predictor
#' names that were retained.
#' @details
#' The function validates inputs with `assertthat` and performs
#' post-hoc assertions on the output to guarantee structural
#' integrity before returning. Missing values are filled with `NA`
#' when pivoting to wide format. The `age` column is excluded
#' because it is a sampling dimension rather than a predictor.
#' Predictor names are captured before pivoting so that the
#' zero-variance check is scoped to predictor columns only —
#' ID or metadata columns that survive the pivot are never passed
#' to `collinear::collinear()`. Any predictor whose standard
#' deviation is zero (or undefined, i.e. all values `NA`) across all
#' samples is dropped and reported via `cli::cli_warn()`. If no
#' predictor with non-zero variance remains, the function aborts via
#' `cli::cli_abort()`.
#' @seealso
#' [collinear::collinear()] for the underlying collinearity method,
#' [get_abiotic_data()] for producing the expected input format.
#' @export
get_predictor_collinearity <- function(data_source) {
  assertthat::assert_that(
    is.data.frame(data_source),
    msg = "data_source must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("abiotic_variable_name", "abiotic_value") %in%
        colnames(data_source)
    ),
    msg = paste0(
      "data_source must contain columns",
      " 'abiotic_variable_name' and 'abiotic_value'"
    )
  )

  # Capture predictor names before pivoting so the variation check
  # is restricted to predictors only (not ID/metadata columns)
  vec_predictor_names <-
    dplyr::pull(data_source, abiotic_variable_name) |>
    base::unique() |>
    base::setdiff(c("age"))

  data_wide <-
    data_source |>
    tidyr::pivot_wider(
      names_from = abiotic_variable_name,
      values_from = abiotic_value,
      values_fill = list(abiotic_value = NA)
    ) |>
    dplyr::select(
      !dplyr::any_of(c("age"))
    )

  # isTRUE() guards against all-NA predictors: sd(x, na.rm = TRUE)
  # is NA for such a column, and `NA > 0` is NA. Without the guard,
  # the NA would propagate into this logical vector, corrupt both
  # name-subsetting steps below (NA indices yield NA names), and
  # ultimately make dplyr::all_of() fail. With the guard, all-NA
  # columns are treated as zero-variance and dropped with a warning.
  vec_has_variation <-
    data_wide |>
    dplyr::select(
      dplyr::any_of(vec_predictor_names)
    ) |>
    purrr::map_lgl(
      .f = ~ isTRUE(stats::sd(.x, na.rm = TRUE) > 0)
    )

  vec_cols_zero_var <-
    base::names(vec_has_variation)[!vec_has_variation]

  if (
    base::length(vec_cols_zero_var) > 0
  ) {
    cli::cli_warn(
      c(
        "!" = paste0(
          "{base::length(vec_cols_zero_var)} zero-variance column(s) ",
          "dropped before collinearity analysis:"
        ),
        "i" = "{.val {vec_cols_zero_var}}"
      )
    )
  }

  vec_cols_with_variation <-
    base::names(vec_has_variation)[vec_has_variation]

  if (
    base::length(vec_cols_with_variation) == 0L
  ) {
    cli::cli_abort(
      c(
        "x" = paste0(
          "No columns with non-zero variance remain after ",
          "removing constant columns."
        ),
        "i" = paste0(
          "All {base::length(vec_predictor_names)} predictor column(s)",
          " have zero variance."
        )
      )
    )
  }

  res <-
    data_wide |>
    dplyr::select(
      dplyr::all_of(vec_cols_with_variation)
    ) |>
    collinear::collinear(quiet = TRUE)

  # Post-hoc structural assertions: guarantee downstream code can
  # rely on res$result$selection being a non-empty character vector.
  assertthat::assert_that(
    inherits(res, "collinear_output"),
    msg = paste0(
      "Output of collinear::collinear()",
      " should be a collinear_output object"
    )
  )

  assertthat::assert_that(
    "result" %in% names(res),
    msg = paste0(
      "Output of collinear::collinear()",
      " should contain a 'result' element"
    )
  )

  assertthat::assert_that(
    "selection" %in% names(res$result),
    msg = paste0(
      "Output of collinear::collinear()",
      " should contain a 'selection' element in the 'result'"
    )
  )

  assertthat::assert_that(
    is.character(res$result$selection),
    length(res$result$selection) > 0,
    msg = "Selection of predictors should be a non-empty character vector"
  )

  return(res)
}
#' @title Compute Standard Errors for a Fitted sjSDM Model
#' @description
#' Post-hoc calculation of standard errors for a fitted sjSDM
#' model using `sjSDM::getSe()`. This is intentionally separated
#' from `fit_jsdm_model()` so that the GPU-based model fitting
#' step and the CPU-based SE computation step can be run
#' independently, allowing the SE step to exploit all available
#' CPU cores without being constrained by the GPU device setting.
#' @param mod_jsdm
#' An object of class `sjSDM` returned by `fit_jsdm_model()` or
#' `sjSDM::sjSDM()`. Must not be `NULL`.
#' @param parallel
#' Number of CPU cores to use for the data loader during SE
#' computation. Passed to the `parallel` argument of
#' `sjSDM::getSe()`. Default is `0L` (no parallelisation).
#' Use `config.model_fitting$n_cores` from the project
#' configuration to exploit all available cores.
#' @param step_size
#' Batch size for stochastic gradient descent used during SE
#' computation. Passed to `sjSDM::getSe()`. Default is `NULL`,
#' which lets sjSDM choose automatically.
#' @param verbose
#' Logical scalar. If `FALSE` (default), all console output
#' from `sjSDM::getSe()` — including Python/reticulate stdout
#' — is suppressed. Set to `TRUE` to see progress messages,
#' which is recommended when running inside a pipeline target
#' to monitor long-running SE computation.
#' @return
#' The input `mod_jsdm` object with its `$se` field populated
#' with the computed standard errors.
#' @details
#' `sjSDM::getSe()` uses CPU for SE computation regardless of the
#' device used for model fitting. Separating SE computation into
#' its own pipeline target therefore allows the caller to pass
#' the full number of available CPU cores via `parallel` without
#' conflict with the GPU device setting used during fitting.
#'
#' When `verbose = FALSE`, output is captured with
#' `reticulate::py_capture_output()` (Python stdout) and
#' `utils::capture.output()` (R stdout) so the console remains
#' clean. When `verbose = TRUE`, both output streams are printed
#' for monitoring progress.
#' @seealso
#' `sjSDM::getSe`, `fit_jsdm_model`
#' @export
compute_jsdm_se <- function(
    mod_jsdm = NULL,
    parallel = 0L,
    step_size = NULL,
    verbose = FALSE) {
  assertthat::assert_that(
    inherits(mod_jsdm, "sjSDM"),
    msg = paste0(
      "`mod_jsdm` must be an object of class 'sjSDM'.",
      " Use `fit_jsdm_model()` to produce one."
    )
  )

  assertthat::assert_that(
    is.numeric(parallel),
    length(parallel) == 1L,
    parallel >= 0L,
    msg = paste0(
      "`parallel` must be a single non-negative numeric value"
    )
  )

  assertthat::assert_that(
    is.null(step_size) ||
      (is.numeric(step_size) && length(step_size) == 1L &&
        step_size > 0L),
    msg = paste0(
      "`step_size` must be NULL or a single positive numeric value"
    )
  )

  assertthat::assert_that(
    is.logical(verbose),
    length(verbose) == 1L,
    !is.na(verbose),
    msg = paste0(
      "`verbose` must be a single non-NA logical value"
    )
  )

  # Single definition of the SE call so both branches below invoke
  # exactly the same computation; only output handling differs.
  run_get_se <- function() {
    sjSDM::getSe(
      object = mod_jsdm,
      step_size = step_size,
      parallel = as.integer(parallel)
    )
  }

  if (isFALSE(verbose)) {
    # Suppress both R stdout and Python/reticulate stdout.
    # Note: reticulate::py_capture_output() is needed because
    # sjSDM::getSe() routes Python output through reticulate,
    # which bypasses utils::capture.output(). The outer wrapper
    # intercepts Python-side output; the inner wrapper
    # intercepts any R-level stdout from the call.
    reticulate::py_capture_output(
      utils::capture.output(
        {
          mod_with_se <- run_get_se()
        },
        type = "output"
      )
    )
  } else {
    # Verbose mode: let progress messages flow to the console.
    mod_with_se <- run_get_se()
  }

  return(mod_with_se)
}
#' @title Classify Taxonomic Resolution
#' @description
#' Classifies taxa in a data frame to a specified taxonomic resolution
#' using a classification table, and aggregates pollen proportions
#' accordingly. Supported resolutions are `kingdom`, `phylum`, `class`,
#' `order`, `family`, `genus`, and `species`.
#' @param data
#' A data frame containing taxon data with columns including 'taxon',
#' 'dataset_name', 'age', and 'pollen_prop'.
#' @param data_classification_table
#' A data frame mapping 'sel_name' to taxonomic levels. Must contain
#' at least one rank column at or below `taxonomic_resolution`
#' (e.g. 'family', 'genus', 'species').
#' @param taxonomic_resolution
#' A character string specifying the finest taxonomic level to use.
#' Must be one of `'kingdom'`, `'phylum'`, `'class'`, `'order'`,
#' `'family'`, `'genus'`, or `'species'`. Taxa will be classified at
#' this rank if possible, or at the coarsest available rank below it
#' if not (fallback behaviour).
#' @return
#' A data frame with taxa classified to the finest available rank at
#' or below `taxonomic_resolution` and pollen proportions aggregated
#' accordingly. The output preserves all dataset_name and age
#' combinations for true negatives.
#' @details
#' Performs a left join to map taxa to all available rank columns up
#' to and including `taxonomic_resolution`. The finest non-NA rank is
#' then selected via `dplyr::coalesce()` applied from finest to
#' coarsest. This means a taxon known only to family when genus is
#' requested will be assigned to its family name rather than dropped.
#' Taxa with no valid classification at any available rank are removed
#' with a `cli::cli_warn()` warning. Taxa that fall back to a coarser
#' rank are reported via `cli::cli_inform()`. Ranks finer than
#' `taxonomic_resolution` (e.g. species when genus is requested) are
#' never used, even when present in the classification table. The
#' NA-drop step prevents a column literally named NA appearing in the
#' community matrix produced by downstream `pivot_wider()` calls.
#' @seealso [filter_non_plantae_taxa()], [filter_rare_taxa()]
#' @export
classify_taxonomic_resolution <- function(
    data,
    data_classification_table,
    taxonomic_resolution) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("taxon", "dataset_name", "age", "pollen_prop") %in%
        colnames(data)
    ),
    msg = paste(
      "data must contain columns:",
      "taxon, dataset_name, age, and pollen_prop"
    )
  )

  assertthat::assert_that(
    is.data.frame(data_classification_table),
    msg = "data_classification_table must be a data frame"
  )

  assertthat::assert_that(
    is.character(taxonomic_resolution) &&
      length(taxonomic_resolution) == 1,
    msg = "taxonomic_resolution must be a single character string"
  )

  # Canonical rank order from coarsest to finest.
  vec_all_ranks <- c(
    "kingdom", "phylum", "class", "order",
    "family", "genus", "species"
  )

  assertthat::assert_that(
    taxonomic_resolution %in% vec_all_ranks,
    msg = paste(
      "taxonomic_resolution must be one of",
      "'kingdom', 'phylum', 'class', 'order',",
      "'family', 'genus', or 'species'"
    )
  )

  # All ranks from kingdom up to and including the requested level.
  # These are the only ranks eligible for fallback assignment.
  vec_target_ranks <-
    vec_all_ranks[
      base::seq_len(
        base::which(vec_all_ranks == taxonomic_resolution)
      )
    ]

  # Subset to ranks actually present in the classification table.
  vec_available_ranks <-
    base::intersect(
      vec_target_ranks,
      base::colnames(data_classification_table)
    )

  assertthat::assert_that(
    base::length(vec_available_ranks) > 0,
    msg = paste0(
      "data_classification_table must contain at least one ",
      "rank column at or below '",
      taxonomic_resolution,
      "'. Expected one of: ",
      paste(vec_target_ranks, collapse = ", ")
    )
  )

  data_classification_table_sub <-
    data_classification_table |>
    dplyr::select(
      sel_name,
      dplyr::all_of(vec_available_ranks)
    )

  # Join all available rank columns, then coalesce from finest to
  # coarsest so each taxon gets the most-specific non-NA label.
  data_joined <-
    data |>
    dplyr::left_join(
      data_classification_table_sub,
      by = dplyr::join_by("taxon" == "sel_name")
    ) |>
    dplyr::select(-taxon) |>
    dplyr::mutate(
      taxon = dplyr::coalesce(
        !!!rlang::syms(base::rev(vec_available_ranks))
      )
    )

  # Report taxa that fell back to a coarser rank.
  # These are kept in the data but flagged informally.
  if (taxonomic_resolution %in% vec_available_ranks) {
    n_fallback <-
      data_joined |>
      dplyr::filter(
        base::is.na(!!rlang::sym(taxonomic_resolution)),
        !base::is.na(taxon)
      ) |>
      dplyr::distinct(taxon) |>
      base::nrow()

    if (n_fallback > 0) {
      cli::cli_inform(
        c(
          "i" = paste0(
            "{n_fallback} taxon/taxa could not be classified ",
            "to '{taxonomic_resolution}' and ",
            "{?was/were} assigned to a coarser rank."
          )
        )
      )
    }
  }

  data_classified <-
    data_joined |>
    dplyr::select(-dplyr::all_of(vec_available_ranks))

  # Warn and drop taxa with no valid classification at any available
  # rank. Without this filter, the NA taxon flows into pivot_wider()
  # and creates a column literally named NA in the community matrix.
  # NOTE(review): the original taxon names were discarded by the
  # coalesce above, so distinct(taxon) over NA rows yields at most
  # one row -- this count is effectively a 0/1 flag, not the number
  # of distinct unclassified taxa. Confirm whether a per-taxon count
  # is wanted before relying on the reported number.
  vec_na_taxa <-
    data_classified |>
    dplyr::filter(base::is.na(taxon)) |>
    dplyr::distinct(taxon) |>
    base::nrow()

  if (vec_na_taxa > 0) {
    cli::cli_warn(
      c(
        "!" = paste0(
          "{vec_na_taxa} taxon/taxa ",
          "ha{?s/ve} no classification at any available ",
          "rank up to '{taxonomic_resolution}' and ",
          "{?was/were} dropped."
        ),
        "i" = paste0(
          "Check the classification table for missing ",
          "rank values up to '{taxonomic_resolution}'."
        )
      )
    )
  }

  data_classified <-
    data_classified |>
    dplyr::filter(!base::is.na(taxon))

  # make dummy table with all dataset_name and age combinations
  # this is needed to ensure that all combinations are present in the
  # final output to preserve true negative values
  data_dataset_age_cross_ref <-
    data_classified |>
    dplyr::distinct(dataset_name, age, taxon)

  res <-
    data_classified |>
    tidyr::drop_na(pollen_prop) |>
    dplyr::group_by(
      dataset_name, age, taxon
    ) |>
    dplyr::summarise(
      .groups = "drop",
      pollen_prop = sum(pollen_prop)
    ) |>
    dplyr::full_join(
      data_dataset_age_cross_ref,
      by = c("dataset_name", "age", "taxon")
    ) |>
    dplyr::arrange(age, dataset_name, taxon) |>
    # all_of() makes the external character vector an explicit,
    # strict selection (tidyselect best practice): it errors if a
    # column of `data` unexpectedly went missing instead of
    # silently selecting fewer columns.
    dplyr::select(
      dplyr::all_of(names(data))
    )

  return(res)
}
| 1 |
#' @title Fit an sjSDM Model |
|
| 2 |
#' @description |
|
| 3 |
#' Fits a joint Species Distribution Model (jSDM) using the sjSDM |
|
| 4 |
#' package with specified abiotic, spatial, and error family |
|
| 5 |
#' configurations. |
|
| 6 |
#' @param data_to_fit |
|
| 7 |
#' A list containing the data to fit the model. Must include: |
|
| 8 |
#' - `data_community_to_fit`: matrix of community composition |
|
| 9 |
#' (constant-presence taxa already removed by |
|
| 10 |
#' `filter_constant_taxa()`) |
|
| 11 |
#' - `data_abiotic_to_fit`: data frame of abiotic variables, |
|
| 12 |
#' already scaled by `scale_abiotic_for_fit()` |
|
| 13 |
#' - `data_spatial_to_fit`: data frame of spatial predictors, |
|
| 14 |
#' already scaled by `scale_spatial_for_fit()`. Required |
|
| 15 |
#' only when `spatial_method` is not `"none"`. |
|
| 16 |
#' @param sel_abiotic_formula |
|
| 17 |
#' A formula object specifying the abiotic (environmental) predictors |
|
| 18 |
#' @param abiotic_method |
|
| 19 |
#' Method for modeling abiotic effects. One of "linear" (default) or |
|
| 20 |
#' "DNN" (deep neural network) |
|
| 21 |
#' @param sel_spatial_formula |
|
| 22 |
#' A formula object specifying the spatial predictors. Defaults to an |
|
| 23 |
#' interaction between longitude and latitude |
|
| 24 |
#' (`~ 0 + coord_long:coord_lat`). Only used if |
|
| 25 |
#' `spatial_method` is `"linear"` or `"DNN"`. |
|
| 26 |
#' @param spatial_method |
|
| 27 |
#' Method for modeling spatial effects. One of `"linear"` (default), |
|
| 28 |
#' `"DNN"` (deep neural network), or `"none"` (no spatial structure) |
|
| 29 |
#' @param error_family |
|
| 30 |
#' Error family distribution. One of "gaussian" (default) or "binomial". |
|
| 31 |
#' If "binomial", community data is converted to presence/absence and |
|
| 32 |
#' a probit link is used |
|
| 33 |
#' @param device |
|
| 34 |
#' Computing device to use. One of "cpu" (default) or "gpu" |
|
| 35 |
#' @param compute_se |
|
| 36 |
#' Logical indicating whether to compute standard errors inline |
|
| 37 |
#' during model fitting. Default is `FALSE`. Prefer the |
|
| 38 |
#' post-hoc approach via `compute_jsdm_se()` in a separate |
|
| 39 |
#' pipeline target, which allows CPU parallelisation |
|
| 40 |
#' independent of the GPU device setting. |
|
| 41 |
#' @param parallel |
|
| 42 |
#' Number of CPU cores to use for parallel processing. |
|
| 43 |
#' Only applicable if `device = "cpu"`. Default is `0L` |
|
| 44 |
#' (no parallelisation). |
|
| 45 |
#' @param ... |
|
| 46 |
#' Additional arguments passed to `sjSDM::sjSDM()` (e.g. |
|
| 47 |
#' `sampling`, `step_size`, `seed`). Do NOT pass `iter`, |
|
| 48 |
#' `control` or `early_stopping` directly — use the dedicated |
|
| 49 |
#' `iter` and `n_early_stopping` parameters instead. |
|
| 50 |
#' @param iter |
|
| 51 |
#' Positive integer. Number of training epochs. Default is `100L`. |
|
| 52 |
#' @param n_early_stopping |
|
| 53 |
#' Early stopping patience. Controls how many consecutive epochs |
|
| 54 |
#' without improvement are tolerated before training is halted. |
|
| 55 |
#' Three accepted values: |
|
| 56 |
#' - `NULL` (default): auto-compute as `round(iter * 0.20)`, i.e. |
|
| 57 |
#' at least 20 \% of the epoch budget. |
|
| 58 |
#' - `0L` or negative integer: disables early stopping entirely. |
|
| 59 |
#' - Positive integer: uses `max(value, round(iter * 0.20))` to |
|
| 60 |
#' ensure patience is never set below 20 \% of `iter`. Passed |
|
| 61 |
#' as `early_stopping_training` in `sjSDM::sjSDMControl()`. |
|
| 62 |
#' @return |
|
| 63 |
#' An object of class sjSDM containing the fitted model |
|
| 64 |
#' @details |
|
| 65 |
#' This function prepares the data and fits a joint Species Distribution |
|
| 66 |
#' Model using the sjSDM package. The spatial structure is modeled using |
|
| 67 |
#' an interaction term between longitude and latitude coordinates. When |
|
| 68 |
#' binomial error family is specified, the community data is converted |
|
| 69 |
#' to binary presence/absence data. |
|
| 70 |
#' |
|
| 71 |
#' Standard error computation (`compute_se = TRUE`) may fail with certain |
|
| 72 |
#' model configurations, particularly when using DNN methods or complex |
|
| 73 |
#' spatial structures. If SE computation fails, the model will still be |
|
| 74 |
#' returned with a warning. |
|
| 75 |
#' @seealso sjSDM::sjSDM, sjSDM::linear, sjSDM::DNN, compute_jsdm_se |
|
| 76 |
#' @export |
|
| 77 |
fit_jsdm_model <- function(
    data_to_fit = NULL,
    sel_abiotic_formula = NULL,
    abiotic_method = c("linear", "DNN"),
    sel_spatial_formula = as.formula(~ 0 + coord_long:coord_lat),
    spatial_method = c("linear", "DNN", "none"),
    error_family = c("gaussian", "binomial"),
    device = c("cpu", "gpu"),
    parallel = 0L,
    compute_se = FALSE,
    ...,
    iter = 100L,
    n_early_stopping = NULL,
    verbose = FALSE) {
  # Validate `data_to_fit` structure -----
  assertthat::assert_that(
    is.list(data_to_fit),
    msg = "data_to_fit must be a list"
  )

  assertthat::assert_that(
    "data_community_to_fit" %in% names(data_to_fit),
    msg = "`data_to_fit` must be a list containing `data_community_to_fit`"
  )

  assertthat::assert_that(
    "data_abiotic_to_fit" %in% names(data_to_fit),
    msg = "`data_to_fit` must be a list containing `data_abiotic_to_fit`"
  )

  # Extract data components -----
  data_community <-
    data_to_fit |>
    purrr::chuck("data_community_to_fit")

  data_abiotic <-
    data_to_fit |>
    purrr::chuck("data_abiotic_to_fit")

  # Validate extracted data types -----
  assertthat::assert_that(
    is.matrix(data_community),
    msg = "data_community must be a matrix"
  )

  assertthat::assert_that(
    is.data.frame(data_abiotic),
    msg = "data_abiotic must be a data frame"
  )

  # Validate formula arguments -----
  # `inherits()` replaces `class(x) == "formula"`: comparing against
  # `class()` misbehaves when the class attribute has length > 1,
  # while `inherits()` is always scalar-safe.
  assertthat::assert_that(
    inherits(sel_abiotic_formula, "formula"),
    msg = "sel_abiotic_formula must be a formula object"
  )

  assertthat::assert_that(
    inherits(sel_spatial_formula, "formula"),
    msg = "sel_spatial_formula must be a formula object"
  )

  # Validate and match character arguments -----
  # The `any(... %in% ...)` asserts give a friendlier message than the
  # default match.arg() error before narrowing to a single value.
  assertthat::assert_that(
    any(abiotic_method %in% c("linear", "DNN")),
    msg = "abiotic_method must be either 'linear' or 'DNN'"
  )

  abiotic_method <- match.arg(abiotic_method)

  assertthat::assert_that(
    any(spatial_method %in% c("linear", "DNN", "none")),
    msg = "spatial_method must be either 'linear', 'DNN', or 'none'"
  )

  spatial_method <- match.arg(spatial_method)

  # Extract and validate spatial data when needed -----
  # Note: data_spatial_to_fit is pre-scaled by
  # scale_spatial_for_fit(); no additional scaling applied.
  if (
    spatial_method %in% c("linear", "DNN")
  ) {
    assertthat::assert_that(
      "data_spatial_to_fit" %in% names(data_to_fit),
      msg = paste0(
        "`data_to_fit` must contain `data_spatial_to_fit`",
        " when spatial_method is not 'none'"
      )
    )

    data_spatial <-
      data_to_fit |>
      purrr::chuck("data_spatial_to_fit")

    assertthat::assert_that(
      is.data.frame(data_spatial),
      msg = "data_spatial must be a data frame"
    )
  } else {
    data_spatial <- NULL
  }

  assertthat::assert_that(
    any(error_family %in% c("gaussian", "binomial")),
    msg = "error_family must be either 'gaussian' or 'binomial'"
  )

  error_family <- match.arg(error_family)

  assertthat::assert_that(
    any(device %in% c("cpu", "gpu")),
    msg = "device must be either 'cpu' or 'gpu'"
  )

  device <- match.arg(device)

  # Validate numeric and logical arguments -----
  assertthat::assert_that(
    is.numeric(parallel),
    length(parallel) == 1,
    msg = "parallel must be a numeric value of length 1"
  )

  assertthat::assert_that(
    is.logical(compute_se),
    length(compute_se) == 1,
    msg = "compute_se must be a logical value of length 1"
  )

  assertthat::assert_that(
    is.logical(verbose),
    length(verbose) == 1,
    msg = "verbose must be a logical value of length 1"
  )

  assertthat::assert_that(
    is.numeric(iter),
    length(iter) == 1,
    iter > 0,
    msg = paste0(
      "`iter` must be a single positive numeric value of length 1"
    )
  )

  assertthat::assert_that(
    is.null(n_early_stopping) ||
      (is.numeric(n_early_stopping) && length(n_early_stopping) == 1),
    msg = paste0(
      "`n_early_stopping` must be NULL or a single numeric",
      " value of length 1"
    )
  )

  # Handle device/parallel conflict -----
  if (
    device == "gpu" && parallel > 0L
  ) {
    message(
      paste0(
        "Parallel processing is not supported when device = 'gpu'.",
        " Setting parallel to 0L."
      )
    )
    parallel <- 0L
  }

  # Convert community data to presence/absence for binomial -----
  if (
    error_family == "binomial"
  ) {
    data_community <-
      data_community > 0

    error_family <- binomial("probit")
  } else {
    error_family <- gaussian()
  }

  # Build spatial structure -----
  # Note: sjSDM::linear/DNN use match.call() internally and re-evaluate
  # formula symbols in parent.env(environment()) = namespace:sjSDM.
  # Using do.call passes the formula as an already-evaluated object
  # (class "formula"), so the bare-name eval branch is never triggered.
  if (
    spatial_method == "linear"
  ) {
    spatial <-
      do.call(
        sjSDM::linear,
        list(
          data = data_spatial,
          formula = sel_spatial_formula
        )
      )
  } else if (spatial_method == "DNN") {
    spatial <-
      do.call(
        sjSDM::DNN,
        list(
          data = data_spatial,
          formula = sel_spatial_formula
        )
      )
  } else if (spatial_method == "none") {
    spatial <- NULL
  }

  # Build abiotic (environmental) structure -----
  # Note: data_abiotic is already scaled upstream by
  # scale_abiotic_for_fit(); no additional scaling is applied.
  if (
    abiotic_method == "linear"
  ) {
    sel_biotic <-
      do.call(
        sjSDM::linear,
        list(
          data = data_abiotic,
          formula = sel_abiotic_formula
        )
      )
  } else {
    sel_biotic <-
      do.call(
        sjSDM::DNN,
        list(
          data = data_abiotic,
          formula = sel_abiotic_formula
        )
      )
  }

  # Three-tier early stopping patience:
  # "NULL" -> auto: round(iter * 0.20), ensuring >= 20% of budget
  # <= 0 -> 0 (disabled, maps to sjSDMControl's "disabled" value)
  # > 0 -> max(value, round(iter * 0.20)), floor at 20% of iter
  sel_early_stopping <-
    if (
      base::is.null(n_early_stopping)
    ) {
      base::as.integer(base::round(iter * 0.20))
    } else if (
      n_early_stopping <= 0
    ) {
      0L
    } else {
      base::max(
        base::as.integer(n_early_stopping),
        base::as.integer(base::round(iter * 0.20))
      )
    }

  sel_control <-
    sjSDM::sjSDMControl(early_stopping_training = sel_early_stopping)

  # Fix: `parallel` was previously validated and conflict-handled but
  # never forwarded, so sjSDM always ran with its own default. It is a
  # named formal here, so callers cannot have been passing it via `...`.
  mod_sjsdm <-
    sjSDM::sjSDM(
      Y = as.matrix(data_community),
      env = sel_biotic,
      spatial = spatial,
      se = compute_se,
      family = error_family,
      device = device,
      parallel = parallel,
      verbose = verbose,
      control = sel_control,
      iter = iter,
      ...
    )

  return(mod_sjsdm)
}
| 1 |
#' @title Compute Spatiotemporal Moran Eigenvector Maps |
|
| 2 |
#' @description |
|
| 3 |
#' Computes Moran Eigenvector Maps (MEMs) from a 3-D |
|
| 4 |
#' coordinate matrix `(x_km, y_km, age_kyr)` built at the |
|
| 5 |
#' **sample level** (one row per site × time-slice). |
|
| 6 |
#' All three dimensions are z-scored before eigenvector |
|
| 7 |
#' computation so that spatial and temporal extents |
|
| 8 |
#' contribute equally to the Euclidean distance structure. |
|
| 9 |
#' Returns the first `n_mev` eigenvectors as a data frame |
|
| 10 |
#' with row names in `"dataset_name__age"` format, ready |
|
| 11 |
#' for `scale_spatial_for_fit()` and |
|
| 12 |
#' `assemble_data_to_fit()`. |
|
| 13 |
#' @param data_coords_projected |
|
| 14 |
#' A data frame with `dataset_name` as row names and |
|
| 15 |
#' columns `coord_x_km` and `coord_y_km`, as returned by |
|
| 16 |
#' `project_coords_to_metric()`. One row per unique |
|
| 17 |
#' site/core. |
|
| 18 |
#' @param data_sample_ids |
|
| 19 |
#' A data frame with columns `dataset_name` and `age` |
|
| 20 |
#' giving the valid (site × time-slice) pairs to model, |
|
| 21 |
#' as returned by `align_sample_ids()`. |
|
| 22 |
#' @param n_mev |
|
| 23 |
#' A positive integer giving the number of eigenvectors to |
|
| 24 |
#' return. If it exceeds the number of positive Moran |
|
| 25 |
#' eigenvectors produced by `sjSDM::generateSpatialEV()` |
|
| 26 |
#' for the 3-D coordinate matrix, it is automatically |
|
| 27 |
#' clamped down to that count and a `cli::cli_warn()` |
|
| 28 |
#' message is emitted. Default is `20L`. |
|
| 29 |
#' @return |
|
| 30 |
#' A data frame with row names `"<dataset_name>__<age>"` |
|
| 31 |
#' and `n_mev` columns named `mev_1`, `mev_2`, …, |
|
| 32 |
#' `mev_{n_mev}`. The row order follows
|
|
| 33 |
#' `data_sample_ids` sorted by `dataset_name` then `age`, |
|
| 34 |
#' matching the ordering produced by |
|
| 35 |
#' `prepare_abiotic_for_fit()` and |
|
| 36 |
#' `prepare_community_for_fit()`. |
|
| 37 |
#' @details |
|
| 38 |
#' Unlike `compute_spatial_mev()`, which operates on |
|
| 39 |
#' unique site coordinates and must be expanded to sample |
|
| 40 |
#' level via `prepare_spatial_predictors_for_fit()`, this |
|
| 41 |
#' function builds the eigenvectors directly on the |
|
| 42 |
#' sample-level 3-D coordinate matrix. This means each |
|
| 43 |
#' observation (site × age) gets a unique row in the MEV |
|
| 44 |
#' matrix, and within-core temporal autocorrelation is |
|
| 45 |
#' captured alongside between-site spatial autocorrelation. |
|
| 46 |
#' |
|
| 47 |
#' The 3-D coordinate matrix is constructed as follows: |
|
| 48 |
#' 1. Join `data_coords_projected` (x_km, y_km) onto |
|
| 49 |
#' `data_sample_ids` to get one row per sample. |
|
| 50 |
#' 2. Convert age (years BP) to kiloyears: `age_kyr = |
|
| 51 |
#' age / 1000`. |
|
| 52 |
#' 3. Z-score each dimension independently so that the |
|
| 53 |
#' spatial and temporal extents are on the same scale |
|
| 54 |
#' before Euclidean distances are computed. |
|
| 55 |
#' |
|
| 56 |
#' The z-scoring step is critical: if age is left in |
|
| 57 |
#' kiloyears and coordinates in km, the time axis will |
|
| 58 |
#' dominate or be negligible depending on the data range, |
|
| 59 |
#' producing eigenvectors that are either purely temporal |
|
| 60 |
#' or purely spatial. |
|
| 61 |
#' |
|
| 62 |
#' Because the eigenvectors are computed at the sample |
|
| 63 |
#' level, **no further expansion step is required**. |
|
| 64 |
#' Pass the returned data frame directly to |
|
| 65 |
#' `scale_spatial_for_fit()`. |
|
| 66 |
#' @seealso |
|
| 67 |
#' [compute_spatial_mev()], |
|
| 68 |
#' [project_coords_to_metric()], |
|
| 69 |
#' [align_sample_ids()], |
|
| 70 |
#' [scale_spatial_for_fit()], |
|
| 71 |
#' [assemble_data_to_fit()] |
|
| 72 |
#' @export |
|
| 73 |
compute_spatiotemporal_mev <- function(
    data_coords_projected = NULL,
    data_sample_ids = NULL,
    n_mev = 20L) {
  # Validate inputs -----
  assertthat::assert_that(
    is.data.frame(data_coords_projected),
    msg = "data_coords_projected must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("coord_x_km", "coord_y_km") %in%
        base::names(data_coords_projected)
    ),
    msg = base::paste0(
      "data_coords_projected must contain columns",
      " 'coord_x_km' and 'coord_y_km'"
    )
  )

  assertthat::assert_that(
    is.data.frame(data_sample_ids),
    msg = "data_sample_ids must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("dataset_name", "age") %in%
        base::names(data_sample_ids)
    ),
    msg = base::paste0(
      "data_sample_ids must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  # `is.numeric()` is TRUE for both integer and double vectors, so the
  # former redundant `|| is.integer(n_mev)` branch was removed.
  assertthat::assert_that(
    is.numeric(n_mev),
    length(n_mev) == 1,
    n_mev >= 1,
    msg = "n_mev must be a single positive integer"
  )

  n_mev <- base::as.integer(n_mev)

  # 1. Build sample-level 3-D coordinate data frame -----
  # One row per (dataset_name, age) pair, with projected km coordinates
  # joined on, age converted to kiloyears, and a "name__age" row label.
  data_samples_3d <-
    data_sample_ids |>
    dplyr::arrange(dataset_name, age) |>
    dplyr::inner_join(
      data_coords_projected |>
        tibble::rownames_to_column("dataset_name"),
      by = dplyr::join_by(dataset_name)
    ) |>
    dplyr::mutate(
      age_kyr = age / 1000,
      .row_name = base::paste0(dataset_name, "__", age)
    )

  assertthat::assert_that(
    nrow(data_samples_3d) >= 3,
    msg = base::paste0(
      "The joined sample × site data must have at least",
      " 3 rows (required by sjSDM::generateSpatialEV())"
    )
  )

  # 2. Z-score each dimension separately -----
  # Puts spatial (km) and temporal (kyr) axes on the same scale so
  # neither dominates the Euclidean distance structure.
  mat_coords_3d_raw <-
    data_samples_3d |>
    dplyr::select(coord_x_km, coord_y_km, age_kyr) |>
    base::as.matrix()

  mat_coords_3d <-
    base::scale(
      mat_coords_3d_raw,
      center = TRUE,
      scale = TRUE
    )

  # 3. Compute Moran eigenvectors on 3-D matrix -----
  mat_mev_raw <-
    sjSDM::generateSpatialEV(
      coords = mat_coords_3d
    )

  # Force to matrix: sjSDM returns a vector when exactly
  # one positive eigenvalue is found (drops the dimension)
  mat_mev_all <-
    base::as.matrix(mat_mev_raw)

  # 4. Post-call validation: clamp n_mev if needed -----
  n_produced <-
    base::ncol(mat_mev_all)

  if (
    n_mev > n_produced
  ) {
    cli::cli_warn(
      c(
        "{n_mev} MEV(s) requested; only {n_produced} positive.",
        "i" = "Lowering n_mev from {n_mev} to {n_produced}."
      )
    )
    n_mev <- n_produced
  }

  # 5. Select first n_mev columns -----
  mat_mev <-
    mat_mev_all[, base::seq_len(n_mev), drop = FALSE]

  # 6. Coerce to data frame with named columns -----
  vec_col_names <-
    base::paste0("mev_", base::seq_len(n_mev))

  res <-
    base::as.data.frame(mat_mev)

  base::colnames(res) <- vec_col_names
  base::rownames(res) <- data_samples_3d$.row_name

  return(res)
}
| 1 |
#' @title Save Progress Visualisation |
|
| 2 |
#' @description |
|
| 3 |
#' Generates a visualisation of project progress and saves it as HTML and PNG. |
|
| 4 |
#' @param sel_script |
|
| 5 |
#' The script file to be visualised. |
|
| 6 |
#' @param sel_store |
|
| 7 |
#' Path to the targets store directory. Defaults to the value from the active |
|
| 8 |
#' configuration key "target_store". |
|
| 9 |
#' @param output_file |
|
| 10 |
#' The name of the output file (default: "project_status"). |
|
| 11 |
#' @param output_dir |
|
| 12 |
#' Directory where the output files will be saved |
|
| 13 |
#' (default: "Documentation/Progress"). |
|
| 14 |
#' @param background_color |
|
| 15 |
#' Background color for the visualisation |
|
| 16 |
#' (default: `"#141B22"` ā BIODYNAMICS brand Console Panel; one step lighter |
|
| 17 |
#' than the page background, used for cards and panels). |
|
| 18 |
#' @param physics |
|
| 19 |
#' Logical indicating whether to enable physics simulation in the network |
|
| 20 |
#' graph (default: TRUE). |
|
| 21 |
#' @param level_separation |
|
| 22 |
#' Level separation for the visualisation graph (default: 250). |
|
| 23 |
#' @return |
|
| 24 |
#' No return value. Called for side effects: saves HTML and PNG files to a |
|
| 25 |
#' store-specific subdirectory within `output_dir`. |
|
| 26 |
#' @details |
|
| 27 |
#' Uses `targets::tar_visnetwork` to create a network graph and saves two |
|
| 28 |
#' HTML files (full and targets-only) using `visNetwork::visSave`, plus a |
|
| 29 |
#' static PNG via `webshot2::webshot`. If the browser-backed PNG export fails, |
|
| 30 |
#' the function emits a warning and keeps the HTML outputs so progress saving |
|
| 31 |
#' does not abort the surrounding pipeline run. Files are written to |
|
| 32 |
#' `output_dir/<store_name>/` where `<store_name>` is the last path segment |
|
| 33 |
#' of `sel_store` after the `targets/` directory. |
|
| 34 |
#' |
|
| 35 |
#' The targets store is read only once. `targets_only = FALSE` retrieves the |
|
| 36 |
#' full network; the targets-only variant for the static PNG graph is derived |
|
| 37 |
#' by filtering non-target nodes (`type` values `"function"`, `"object"`, |
|
| 38 |
#' `"value"`) from the same object, avoiding a second store read. If the |
|
| 39 |
#' `type` column is absent (targets internal change), the function falls back |
|
| 40 |
#' to a second `tar_visnetwork()` call so correctness is preserved. |
|
| 41 |
#' |
|
| 42 |
#' When `physics = TRUE`, the static graph for the PNG uses a |
|
| 43 |
#' `stabilizationIterationsDone` vis.js event to disable physics as soon as |
|
| 44 |
#' the silent background layout pass finishes. This ensures the screenshot |
|
| 45 |
#' captures the fully settled layout without requiring a long wait or |
|
| 46 |
#' rendering the graph twice. |
|
| 47 |
#' @export |
|
| 48 |
save_progress_visualisation <- function(
    sel_script,
    sel_store = get_active_config("target_store"),
    output_file = "project_status",
    output_dir = here::here(
      "Documentation/Progress"
    ),
    background_color = "#141B22",
    physics = TRUE,
    level_separation = 250) {
  # Apply the BIODYNAMICS brand theme to a visNetwork graph.
  # Node fill colors (status: up-to-date / outdated / etc.) are set
  # internally by targets and cannot be overridden here; only font
  # and edge styling are applied. Extracted as a local helper because
  # the identical piping was previously duplicated for the full and
  # static graphs.
  apply_brand_theme <- function(graph) {
    graph |>
      visNetwork::visNodes(
        font = base::list(
          color = "#E6EDF3", # Bone Text
          face = "IBM Plex Mono, monospace"
        )
      ) |>
      visNetwork::visEdges(
        color = base::list(
          color = "#2A3441", # Slate Border
          highlight = "#8DF59A", # Phosphor Green
          hover = "#48C7B8" # Moss Teal
        ),
        font = base::list(
          color = "#98A6B3", # Muted Mist
          strokeWidth = 0
        )
      )
  }

  # Test to make sure {pandoc} is installed for webshot to work
  if (
    is.null(rmarkdown::find_pandoc()$dir)
  ) {
    pandoc::pandoc_activate()
  }

  # Read the targets store once. targets_only = FALSE gives the full
  # network; the targets-only static variant is derived below by
  # filtering the same object, avoiding a second expensive store read.
  network_graph_raw <-
    targets::tar_visnetwork(
      script = sel_script,
      outdated = FALSE,
      store = sel_store,
      targets_only = FALSE,
      physics = physics,
      level_separation = level_separation
    )

  network_graph <-
    apply_brand_theme(network_graph_raw)

  # Derive the targets-only static base graph from the raw object already
  # in memory. targets sets type = "function" / "object" / "value" for
  # non-target nodes; filtering them mirrors targets_only = TRUE.
  # The fallback re-calls tar_visnetwork() if targets ever drops the
  # type column, so correctness is preserved across package changes.
  if (
    "type" %in% base::names(network_graph_raw$x$nodes)
  ) {
    vec_target_node_ids <-
      network_graph_raw$x$nodes |>
      dplyr::filter(
        !(.data[["type"]] %in% c("function", "object", "value"))
      ) |>
      dplyr::pull(id)

    network_graph_static_base <- network_graph_raw
    network_graph_static_base$x$nodes <-
      dplyr::filter(
        network_graph_raw$x$nodes,
        .data[["id"]] %in% vec_target_node_ids
      )
    network_graph_static_base$x$edges <-
      dplyr::filter(
        network_graph_raw$x$edges,
        .data[["from"]] %in% vec_target_node_ids,
        .data[["to"]] %in% vec_target_node_ids
      )
  } else {
    # Fallback: targets internals changed and type column is absent.
    network_graph_static_base <-
      targets::tar_visnetwork(
        script = sel_script,
        store = sel_store,
        targets_only = TRUE,
        outdated = FALSE,
        physics = physics,
        level_separation = level_separation
      )
  }

  network_graph_static <-
    apply_brand_theme(network_graph_static_base) |>
    # Freeze the graph the moment background stabilization finishes so
    # the webshot captures the settled layout, not a mid-simulation
    # frame. vis.js fires `stabilizationIterationsDone` after the silent
    # pre-layout pass (fast, < 1 s), then starts the animated physics —
    # disabling physics here prevents any further movement before the
    # screenshot.
    visNetwork::visEvents(
      stabilizationIterationsDone = "function() {
        this.setOptions({ physics: false });
      }"
    )

  # Keep only the last path segment after the `targets/` directory;
  # used as the store-specific output subdirectory name.
  sel_store_simple <-
    stringr::str_replace(
      string = sel_store,
      pattern = ".*/targets/",
      replacement = ""
    )

  output_store_dir <-
    file.path(output_dir, sel_store_simple)

  output_html_path <-
    file.path(output_store_dir, paste0(output_file, ".html"))

  output_small_html_path <-
    file.path(output_store_dir, paste0(output_file, "_small.html"))

  output_png_path <-
    file.path(output_store_dir, paste0(output_file, "_static.png"))

  # Create the output directory if it doesn't exist
  if (
    !dir.exists(output_store_dir)
  ) {
    dir.create(output_store_dir, recursive = TRUE)
  }

  visNetwork::visSave(
    graph = network_graph,
    file = output_html_path,
    selfcontained = TRUE,
    background = background_color
  )

  visNetwork::visSave(
    graph = network_graph_static,
    file = output_small_html_path,
    selfcontained = TRUE,
    background = background_color
  )

  # PNG export depends on launching a browser through chromote/webshot2,
  # which can fail in interactive sessions even when the HTML outputs
  # save. `delay = 3` gives the browser time to complete the background
  # stabilization pass before the screenshot is taken; the visEvents
  # handler then keeps the graph frozen so the captured layout is stable.
  tryCatch(
    webshot2::webshot(
      url = output_small_html_path,
      file = output_png_path,
      vwidth = 950,
      vheight = 750,
      delay = 3
    ),
    error = function(err) {
      cli::cli_warn(
        c(
          "Failed to save static PNG progress visualisation.",
          "i" = "HTML progress files were saved successfully.",
          "i" = paste0(
            "Original error: ",
            base::conditionMessage(err)
          )
        )
      )
      invisible(NULL)
    }
  )

  # Documented contract: called for side effects only, no return value.
  invisible(NULL)
}
| 1 |
#' @title Interpolate 2-D Spatial MEVs to Prediction Grid |
|
| 2 |
#' @description |
|
| 3 |
#' Uses Inverse Distance Weighting (IDW, power = 2) to |
|
| 4 |
#' approximate Moran Eigenvector Map (MEM) values from |
|
| 5 |
#' training-site locations to arbitrary prediction locations, |
|
| 6 |
#' then scales the result using training spatial scale |
|
| 7 |
#' attributes so that the interpolated predictors are on the |
|
| 8 |
#' same scale as those seen during model fitting. |
|
| 9 |
#' @param data_coords_projected_train |
|
| 10 |
#' A data frame with `coord_x_km` and `coord_y_km` columns |
|
| 11 |
#' and `dataset_name` as row names, as returned by |
|
| 12 |
#' `project_coords_to_metric()`. One row per unique training |
|
| 13 |
#' site. |
|
| 14 |
#' @param data_mev_core |
|
| 15 |
#' A data frame with unscaled MEV columns (`mev_1`, `mev_2`, |
|
| 16 |
#' ā¦) and `dataset_name` as row names, as returned by |
|
| 17 |
#' `compute_spatial_mev()`. |
|
| 18 |
#' @param data_coords_projected_pred |
|
| 19 |
#' A data frame with `coord_x_km` and `coord_y_km` columns |
|
| 20 |
#' and arbitrary row names identifying prediction locations |
|
| 21 |
#' (e.g. `"grid_1"`, `"grid_2"`), as returned by |
|
| 22 |
#' `project_coords_to_metric()`. |
|
| 23 |
#' @param spatial_scale_attributes |
|
| 24 |
#' A named list of `"scaled:center"` and `"scaled:scale"` |
|
| 25 |
#' attributes per MEV column, as returned by |
|
| 26 |
#' `scale_spatial_for_fit()` in the `spatial_scale_attributes` |
|
| 27 |
#' element. Used to bring interpolated MEV values onto the |
|
| 28 |
#' same scale as the training spatial predictors. |
|
| 29 |
#' @return |
|
| 30 |
#' A data frame with the same row names as |
|
| 31 |
#' `data_coords_projected_pred` and one column per MEV |
|
| 32 |
#' (names matching `data_mev_core`). All columns are scaled |
|
| 33 |
#' to match the training MEV distribution. |
|
| 34 |
#' @details |
|
| 35 |
#' MEMs are eigenvectors of the spatial connectivity matrix |
|
| 36 |
#' at training sites and cannot be analytically evaluated at |
|
| 37 |
#' new locations. IDW (power = 2) with a small epsilon |
|
| 38 |
#' (1e-10) to prevent division-by-zero provides a smooth |
|
| 39 |
#' spatial interpolation. |
|
| 40 |
#' |
|
| 41 |
#' This function handles the **2-D spatial case** only |
|
| 42 |
#' (x_km, y_km). For models fitted with |
|
| 43 |
#' `spatial_mode = "spatiotemporal"` use |
|
| 44 |
#' `interpolate_st_mev_to_grid()` instead. |
|
| 45 |
#' @seealso |
|
| 46 |
#' [compute_spatial_mev()], |
|
| 47 |
#' [interpolate_st_mev_to_grid()], |
|
| 48 |
#' [project_coords_to_metric()], |
|
| 49 |
#' [scale_spatial_for_fit()] |
|
| 50 |
#' @export |
|
| 51 |
interpolate_mev_to_grid <- function(
    data_coords_projected_train = NULL,
    data_mev_core = NULL,
    data_coords_projected_pred = NULL,
    spatial_scale_attributes = NULL) {
  # Validate inputs -----
  assertthat::assert_that(
    is.data.frame(data_coords_projected_train),
    all(
      c("coord_x_km", "coord_y_km") %in%
        base::names(data_coords_projected_train)
    ),
    msg = paste0(
      "data_coords_projected_train must be a data frame",
      " with columns 'coord_x_km' and 'coord_y_km'"
    )
  )

  assertthat::assert_that(
    is.data.frame(data_mev_core),
    nrow(data_mev_core) > 0,
    ncol(data_mev_core) > 0,
    msg = "data_mev_core must be a non-empty data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_coords_projected_pred),
    all(
      c("coord_x_km", "coord_y_km") %in%
        base::names(data_coords_projected_pred)
    ),
    msg = paste0(
      "data_coords_projected_pred must be a data frame",
      " with columns 'coord_x_km' and 'coord_y_km'"
    )
  )

  assertthat::assert_that(
    is.list(spatial_scale_attributes),
    length(spatial_scale_attributes) > 0,
    msg = "spatial_scale_attributes must be a non-empty list"
  )

  # 1. Attach unscaled MEV values to the training km coords -----
  mev_col_names <-
    base::names(data_mev_core)

  data_train_joined <-
    data_coords_projected_train |>
    tibble::rownames_to_column("dataset_name") |>
    dplyr::inner_join(
      data_mev_core |>
        tibble::rownames_to_column("dataset_name"),
      by = dplyr::join_by(dataset_name)
    )

  # 2. / 3. Pairwise 2-D Euclidean distances -----
  # Row i, column j holds the distance from prediction location i to
  # training site j; the two axis differences are built separately.
  mat_diff_x <-
    base::outer(
      data_coords_projected_pred[["coord_x_km"]],
      data_train_joined[["coord_x_km"]],
      `-`
    )

  mat_diff_y <-
    base::outer(
      data_coords_projected_pred[["coord_y_km"]],
      data_train_joined[["coord_y_km"]],
      `-`
    )

  mat_dist_km <-
    base::sqrt(mat_diff_x^2 + mat_diff_y^2)

  # 4. Row-normalised IDW weights (power = 2) -----
  # The 1e-10 epsilon prevents division by zero when a prediction
  # point coincides with a training site.
  mat_weights_raw <-
    1 / (mat_dist_km^2 + 1e-10)

  mat_weights <-
    mat_weights_raw / base::rowSums(mat_weights_raw)

  # 5. Interpolate unscaled MEV values via weighted averaging -----
  mat_mev_train <-
    base::as.matrix(
      data_train_joined[, mev_col_names, drop = FALSE]
    )

  mat_pred_mev_raw <-
    mat_weights %*% mat_mev_train

  # 6. Rescale each MEV column with the training scale attributes -----
  # so the interpolated predictors match the fitting-time distribution.
  list_scaled_cols <-
    base::lapply(
      mev_col_names,
      function(sel_col) {
        sel_center <- base::as.numeric(
          spatial_scale_attributes[[sel_col]][["scaled:center"]]
        )
        sel_scale <- base::as.numeric(
          spatial_scale_attributes[[sel_col]][["scaled:scale"]]
        )
        (mat_pred_mev_raw[, sel_col] - sel_center) / sel_scale
      }
    )

  data_pred_mev_scaled <-
    base::as.data.frame(
      base::do.call(base::cbind, list_scaled_cols)
    )

  base::colnames(data_pred_mev_scaled) <- mev_col_names
  base::rownames(data_pred_mev_scaled) <-
    base::rownames(data_coords_projected_pred)

  return(data_pred_mev_scaled)
}
| 1 |
#' @title Extract Data from VegVault
#' @description
#' Queries the VegVault SQLite database and returns the data matching
#' the supplied geographic, temporal, and dataset-type constraints.
#' @param path_to_vegvault
#' A character string specifying the path to the VegVault SQLite database
#' (default: "Data/Input/VegVault.sqlite").
#' @param x_lim
#' A numeric vector of length 2 specifying the longitude range.
#' @param y_lim
#' A numeric vector of length 2 specifying the latitude range.
#' @param age_lim
#' A numeric vector of length 2 specifying the age range.
#' @param sel_dataset_type
#' A character vector specifying the dataset types to select.
#' @param sel_abiotic_var_name
#' A character vector specifying the abiotic variable names to select.
#' @return
#' A data frame containing the extracted data.
#' @details
#' The function:
#'
#' 1. Validates all input parameters.
#' 2. Checks the presence of the VegVault SQLite database.
#' 3. Assembles the vaultkeepr query plan (lazy SQL). If vaultkeepr
#'    raises an error during plan assembly (e.g. insufficient data for
#'    the specified constraints), the error is caught and re-thrown via
#'    `cli::cli_abort()` with the original message preserved.
#' 4. Retrieves abiotic data and taxa information.
#' 5. Returns the extracted data as a data frame.
#' @export
extract_data_from_vegvault <- function(
    path_to_vegvault = here::here("Data/Input/VegVault.sqlite"),
    x_lim = NULL,
    y_lim = NULL,
    age_lim = NULL,
    sel_dataset_type = NULL,
    sel_abiotic_var_name = NULL) {
  assertthat::assert_that(
    is.character(path_to_vegvault),
    length(path_to_vegvault) == 1,
    msg = "path_to_vegvault must be a single character string"
  )

  # Check if the VegVault file exists
  check_presence_of_vegvault(path_to_vegvault)

  assertthat::assert_that(
    is.numeric(x_lim) && length(x_lim) == 2,
    msg = "x_lim must be a numeric vector of length 2"
  )

  assertthat::assert_that(
    is.numeric(y_lim) && length(y_lim) == 2,
    msg = "y_lim must be a numeric vector of length 2"
  )

  assertthat::assert_that(
    is.numeric(age_lim) && length(age_lim) == 2,
    msg = "age_lim must be a numeric vector of length 2"
  )

  assertthat::assert_that(
    is.character(sel_dataset_type) && length(sel_dataset_type) > 0,
    msg = "sel_dataset_type must be a character vector of length > 0"
  )

  assertthat::assert_that(
    is.character(sel_abiotic_var_name) && length(sel_abiotic_var_name) > 0,
    msg = "sel_abiotic_var_name must be a character vector of length > 0"
  )

  # Assemble the lazy query plan. On failure the condition object is
  # kept so its message can be re-thrown with context below.
  query_plan <-
    tryCatch(
      expr = {
        # Open the VegVault file
        vaultkeepr::open_vault(
          path = path_to_vegvault
        ) |>
          # Add the dataset information
          vaultkeepr::get_datasets() |>
          # Keep only the requested dataset types
          vaultkeepr::select_dataset_by_type(
            sel_dataset_type = sel_dataset_type
          ) |>
          # Restrict datasets to the requested geographic window
          vaultkeepr::select_dataset_by_geo(
            lat_lim = y_lim,
            long_lim = x_lim,
            verbose = FALSE
          ) |>
          # Add samples
          vaultkeepr::get_samples() |>
          # Restrict samples to the requested age range
          vaultkeepr::select_samples_by_age(
            age_lim = age_lim,
            verbose = FALSE
          ) |>
          # Add abiotic data
          vaultkeepr::get_abiotic_data(verbose = FALSE) |>
          # Keep only the requested abiotic variables
          vaultkeepr::select_abiotic_var_by_name(
            sel_var_name = sel_abiotic_var_name
          ) |>
          # Add taxa
          vaultkeepr::get_taxa()
      },
      error = function(e) e
    )

  if (inherits(query_plan, "error") || is.null(query_plan)) {
    # Preserve the original vaultkeepr error message when one exists;
    # a NULL plan without an error aborts with the two fixed bullets.
    plan_message <-
      if (inherits(query_plan, "error")) {
        conditionMessage(query_plan)
      } else {
        NULL
      }

    cli::cli_abort(
      c(
        "Failed to build the vaultkeepr query plan.",
        "i" = "No data available for the specified constraints.",
        "x" = plan_message
      )
    )
  }

  # Materialise the lazy plan into a data frame
  query_plan |>
    vaultkeepr::extract_data(
      return_raw_data = FALSE,
      verbose = FALSE
    )
}
| 1 |
#' @title Verify sjSDM Setup
#'
#' @description
#' Comprehensive verification of sjSDM installation, including Python
#' environment, PyTorch, CUDA support, and sjSDM functionality.
#' This function checks that Radian and sjSDM are using the same
#' Python environment and that all dependencies are properly installed.
#'
#' @param run_test_model Logical. Should a test model be fitted to
#' verify full functionality? Defaults to `interactive()`, so the test
#' model is fitted only in interactive sessions.
#'
#' @return Invisible list with verification results (logical flags
#' `radian_ok`, `python_ok`, `pytorch_ok`, `cuda_available`,
#' `sjsdm_ok`, `test_model_ok`). Called primarily
#' for side effects (printing verification status).
#'
#' @details
#' This function performs the following checks:
#' 1. Radian configuration (correct Python environment)
#' 2. Python version and location
#' 3. PyTorch installation and version
#' 4. CUDA/GPU availability
#' 5. sjSDM package installation
#' 6. sjSDM Python dependencies
#' 7. Test model fitting (optional)
#'
#' All checks print their status with [OK] (success) or [FAIL] (failure).
#' If any critical check fails, the function provides troubleshooting
#' guidance.
#'
#' NOTE(review): several printed fix instructions contain hard-coded
#' Windows paths for a specific user profile, and the `where radian`
#' lookup is Windows-only -- confirm before relying on this function on
#' other machines.
#'
#' @seealso
#' \code{\link[reticulate]{py_config}}
#' \code{\link[sjSDM]{sjSDM}}
#'
#' @export
verify_sjsdm_setup <- function(run_test_model = interactive()) {
  # Detect runtime environment: RStudio sets the RSTUDIO env var to "1"
  in_rstudio <- Sys.getenv("RSTUDIO") == "1"

  # Initialize results list; each flag is flipped to TRUE by the
  # corresponding check below
  results <-
    list(
      radian_ok = FALSE,
      python_ok = FALSE,
      pytorch_ok = FALSE,
      cuda_available = FALSE,
      sjsdm_ok = FALSE,
      test_model_ok = FALSE
    )

  cat("=============================================================\n")
  cat(" sjSDM Setup Verification\n")

  if (in_rstudio) {
    cat(" Running in: RStudio\n")
  } else {
    cat(" Running in: VS Code / Radian\n")
  }

  cat("=============================================================\n")

  #----------------------------------------------------------#
  # 1. Check Radian / Python environment configuration -----
  #----------------------------------------------------------#

  if (in_rstudio) {
    cat("1. Checking RStudio Python Configuration\n")
    cat(" ----------------------------------------\n")

    reticulate_python <- Sys.getenv("RETICULATE_PYTHON")

    # RETICULATE_PYTHON must point at the dedicated r-sjsdm environment
    if (
      nchar(reticulate_python) > 0 &&
        grepl("r-sjsdm", reticulate_python, ignore.case = TRUE)
    ) {
      results$radian_ok <- TRUE
      cat(" [OK] RETICULATE_PYTHON points to r-sjsdm environment\n")
      cat(" Path: ", reticulate_python, "\n")
    } else if (nchar(reticulate_python) > 0) {
      cat(" [WARN] RETICULATE_PYTHON is set but not pointing to r-sjsdm\n")
      cat(" Current: ", reticulate_python, "\n")
      cat(" Fix: Set RETICULATE_PYTHON in project .Renviron:\n")
      cat(" RETICULATE_PYTHON=C:/Users/ondre/AppData/Local/r-miniconda/envs/r-sjsdm/python.exe\n")
    } else {
      cat(" [WARN] RETICULATE_PYTHON not set in .Renviron\n")
      cat(" sjSDM may not find PyTorch\n")
      cat(" Fix: Add to project .Renviron file:\n")
      cat(" RETICULATE_PYTHON=C:/Users/ondre/AppData/Local/r-miniconda/envs/r-sjsdm/python.exe\n")
      cat(" Then restart R\n")
    }
  } else {
    cat("1. Checking Radian Configuration\n")
    cat(" ----------------------------------------\n")

    # Locate the radian binary; 'where' is a Windows command, so the
    # tryCatch falls back to NA when the lookup fails
    radian_path <-
      tryCatch(
        expr = {
          system("where radian", intern = TRUE)[1]
        },
        error = function(e) NA
      )

    expected_path <-
      "C:\\Users\\ondre\\AppData\\Local\\r-miniconda\\envs\\r-sjsdm\\Scripts\\radian.exe"

    if (
      isFALSE(is.na(radian_path)) &&
        grepl("r-sjsdm", radian_path, ignore.case = TRUE)
    ) {
      results$radian_ok <- TRUE
      cat(" [OK] Radian is from r-sjsdm environment\n")
      cat(" Path: ", radian_path, "\n")
    } else {
      cat(" [FAIL] Radian not from r-sjsdm environment\n")
      cat(" Current: ", radian_path, "\n")
      cat(" Expected: ", expected_path, "\n")
      cat("\n Fix: Update VS Code settings:\n")
      cat(' "r.rterm.windows": "', expected_path, '"\n', sep = "")
    }
  }

  cat("\n")

  #----------------------------------------------------------#
  # 2. Check Python Configuration -----
  #----------------------------------------------------------#

  cat("2. Checking Python Configuration\n")
  cat(" ----------------------------------------\n")

  # reticulate is required for all remaining checks; bail out early
  # (still returning the results list invisibly) when it is missing
  if (
    isFALSE(
      requireNamespace("reticulate", quietly = TRUE)
    )
  ) {
    cat(" [FAIL] reticulate package not installed\n")
    cat(" Fix: install.packages('reticulate')\n\n")
    return(invisible(results))
  }

  # py_config() initialises Python; NULL signals a broken configuration
  py_conf <-
    tryCatch(
      expr = {
        reticulate::py_config()
      },
      error = function(e) NULL
    )

  if (
    isFALSE(is.null(py_conf))
  ) {
    results$python_ok <- TRUE
    python_path <- py_conf$python
    python_version <- py_conf$version

    cat(" [OK] Python found\n")
    cat(" Version: ", as.character(python_version), "\n")
    cat(" Path: ", python_path, "\n")

    if (
      grepl("r-sjsdm", python_path, ignore.case = TRUE)
    ) {
      cat(" [OK] Using r-sjsdm environment\n")
    } else {
      cat(" [WARN] Warning: Not using r-sjsdm environment\n")
      cat(" This may cause issues\n")
    }
  } else {
    cat(" [FAIL] Python configuration failed\n")
  }

  cat("\n")

  #----------------------------------------------------------#
  # 3. Check PyTorch Installation -----
  #----------------------------------------------------------#

  cat("3. Checking PyTorch Installation\n")
  cat(" ----------------------------------------\n")

  # Import torch through reticulate; NULL means PyTorch is not available
  torch <-
    tryCatch(
      expr = {
        reticulate::import("torch")
      },
      error = function(e) NULL
    )

  if (
    isFALSE(is.null(torch))
  ) {
    results$pytorch_ok <- TRUE
    pytorch_version <- torch$`__version__`

    cat(" [OK] PyTorch is installed\n")
    cat(" Version: ", pytorch_version, "\n")

    # Check CUDA -- a failure here simply means CPU-only mode
    cuda_available <-
      tryCatch(
        expr = {
          torch$cuda$is_available()
        },
        error = function(e) FALSE
      )

    results$cuda_available <- cuda_available

    if (cuda_available) {
      cat(" [OK] CUDA available (GPU mode)\n")
      cat(" CUDA version: ", torch$version$cuda, "\n")

      device_name <-
        tryCatch(
          expr = {
            torch$cuda$get_device_name(0L)
          },
          error = function(e) "Unknown"
        )

      cat(" GPU: ", device_name, "\n")
    } else {
      cat(" [WARN] CUDA not available (CPU mode)\n")
      cat(" This is normal if you don't have NVIDIA GPU\n")
      cat(" sjSDM will work but slower for large datasets\n")
    }
  } else {
    cat(" [FAIL] PyTorch not found\n")
    cat("\n Fix: Reinstall PyTorch in r-sjsdm environment\n")
    cat(" Run in PowerShell:\n")
    cat(' & "C:\\Users\\ondre\\AppData\\Local\\r-miniconda\\Scripts\\conda.exe" run -n r-sjsdm pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121\n')
  }

  cat("\n")

  #----------------------------------------------------------#
  # 4. Check sjSDM Package -----
  #----------------------------------------------------------#

  cat("4. Checking sjSDM Package\n")
  cat(" ----------------------------------------\n")

  sjsdm_installed <-
    requireNamespace("sjSDM", quietly = TRUE)

  if (sjsdm_installed) {
    cat(" [OK] sjSDM package is installed\n")

    # Try to load sjSDM; a warning during attach is reported but still
    # counted as a successful load, only an error counts as failure
    sjsdm_loaded <-
      tryCatch(
        expr = {
          library(sjSDM)
          TRUE
        },
        error = function(e) FALSE,
        warning = function(w) {
          cat(" [WARN] Warning: ", conditionMessage(w), "\n")
          TRUE
        }
      )

    if (sjsdm_loaded) {
      results$sjsdm_ok <- TRUE
      cat(" [OK] sjSDM loaded successfully\n")
      cat(" All Python dependencies available\n")
    } else {
      cat(" [FAIL] sjSDM failed to load\n")
      cat("\n Fix: Reinstall sjSDM dependencies\n")
      cat(" sjSDM::install_sjSDM()\n")
    }
  } else {
    cat(" [FAIL] sjSDM package not installed\n")
    cat("\n Fix: install.packages('sjSDM')\n")
  }

  cat("\n")

  #----------------------------------------------------------#
  # 5. Run Test Model -----
  #----------------------------------------------------------#

  # Only attempted when requested AND sjSDM loaded successfully
  if (
    run_test_model &&
      results$sjsdm_ok
  ) {
    cat("5. Running Test Model\n")
    cat(" ----------------------------------------\n")

    test_result <-
      tryCatch(
        expr = {
          # Fixed seed so the simulated test community is reproducible
          set.seed(900723)
          community <-
            sjSDM::simulate_SDM(
              sites = 50,
              species = 5,
              env = 3
            )

          model <-
            sjSDM::sjSDM(
              Y = community$response,
              env = sjSDM::linear(
                data = community$env_weights,
                formula = ~ X1 + X2 + X3
              ),
              verbose = FALSE
            )

          cat(" [OK] Test model fitted successfully\n")
          cat(" LogLik: ", model$logLik[[1]], "\n")

          results$test_model_ok <- TRUE
          TRUE
        },
        error = function(e) {
          cat(" [FAIL] Test model failed\n")
          cat(" Error: ", conditionMessage(e), "\n")
          FALSE
        }
      )
  }

  #----------------------------------------------------------#
  # 6. Summary -----
  #----------------------------------------------------------#

  all_critical_ok <-
    results$python_ok &&
      results$pytorch_ok &&
      results$sjsdm_ok

  # In RStudio, radian_ok reflects RETICULATE_PYTHON configuration;
  # it is informational and doesn't block the critical check.

  if (!all_critical_ok) {
    cat("[FAIL] Some critical checks failed\n\n")
    cat("Issues found:\n")

    if (
      isFALSE(results$python_ok)
    ) {
      cat(" - Python configuration issue\n")
    }

    if (
      isFALSE(results$pytorch_ok)
    ) {
      cat(" - PyTorch not available\n")
    }

    if (
      isFALSE(results$sjsdm_ok)
    ) {
      cat(" - sjSDM not working\n")
    }

    cat("\nRefer to the detailed checks above for solutions.\n")
    cat("See: Documentation/Materials/sjSDM_installation_guide.md\n")
  }

  cat("\n=============================================================\n")

  return(
    invisible(results)
  )
}
| 1 |
#' @title Scale Abiotic Data for Model Fitting
#' @description
#' Centres (and, where appropriate, scales) the abiotic predictor
#' variables, records the scaling attributes for later
#' back-transformation, and returns both the scaled data frame and the
#' attributes as a named list.
#'
#' The `age` column is centred only (mean subtracted, no division by
#' SD). Every other variable is centred and additionally divided by
#' its standard deviation whenever more than one sample is present.
#' @param data_abiotic_wide
#' A data frame in wide format as returned by
#' `prepare_abiotic_for_fit()`, containing the columns `dataset_name`,
#' `age`, and one column per abiotic variable.
#' @return
#' A named list with two elements:
#' \describe{
#'   \item{`data_abiotic_scaled`}{A data frame with row names in the
#'   format `"<dataset_name>__<age>"`, an `age` column (centre-only
#'   scaled), and all other abiotic variable columns
#'   (centre-and-scale). Rows with any `NA` are dropped before
#'   scaling.}
#'   \item{`scale_attributes`}{A named list of `center` and `scale`
#'   attributes for each variable (including `age`), which can be used
#'   to back-transform predictions.}
#' }
#' @details
#' Rows with any `NA` across the abiotic variables are silently
#' dropped via `tidyr::drop_na()` before scaling. The returned
#' `scale_attributes` list preserves the same structure as
#' `attributes(scale(x))[-1]` (i.e., `dim` excluded).
#' @seealso [prepare_abiotic_for_fit()], [assemble_data_to_fit()]
#' @export
scale_abiotic_for_fit <- function(data_abiotic_wide = NULL) {
  assertthat::assert_that(
    is.data.frame(data_abiotic_wide),
    msg = "data_abiotic_wide must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "age") %in% names(data_abiotic_wide)),
    msg = paste0(
      "data_abiotic_wide must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  # 1. Drop rows with any NA -----

  data_complete <-
    tidyr::drop_na(data_abiotic_wide)

  # Dividing by the SD only makes sense with more than one sample
  divide_by_sd <-
    nrow(data_complete) > 1

  # 2. Capture scale attributes -----

  # Helper: scaling attributes of one vector, with the leading 'dim'
  # attribute removed (same structure as attributes(scale(x))[-1])
  capture_scale_attrs <- function(values, use_sd) {
    attributes(
      scale(values, center = TRUE, scale = use_sd)
    )[-1]
  }

  # 'age' (centre only) first, then every other abiotic variable
  # (centre and, when possible, scale)
  scale_attributes <-
    c(
      list(
        age = capture_scale_attrs(data_complete$age, FALSE)
      ),
      data_complete |>
        dplyr::select(-dataset_name, -age) |>
        purrr::map(
          \(values) capture_scale_attrs(values, divide_by_sd)
        )
    )

  # 3. Apply scaling and add row names -----

  data_abiotic_scaled <-
    data_complete |>
    dplyr::mutate(
      # Sample identifier becomes the row name below
      .row_name = paste0(dataset_name, "__", age),
      # age: centre only
      age = as.numeric(
        scale(age, center = TRUE, scale = FALSE)
      )
    ) |>
    dplyr::mutate(
      dplyr::across(
        .cols = -c(dataset_name, age, .row_name),
        .fns = \(values) as.numeric(
          scale(values, center = TRUE, scale = divide_by_sd)
        )
      )
    ) |>
    dplyr::select(-dataset_name) |>
    tibble::column_to_rownames(".row_name")

  # 4. Return list -----

  list(
    data_abiotic_scaled = data_abiotic_scaled,
    scale_attributes = scale_attributes
  )
}
| 1 |
#' @title Assemble Final Data List for Model Fitting
#' @description
#' Validates that the community matrix and scaled abiotic data
#' share the same sample rows in the same order, then bundles
#' them -- together with optional scaled spatial predictors --
#' into the named list expected by `fit_jsdm_model()`.
#' @param data_community_filtered
#' A numeric matrix with row names `"<dataset_name>__<age>"`
#' and taxon columns, as returned by `filter_constant_taxa()`.
#' @param data_abiotic_scaled_list
#' A named list with elements `data_abiotic_scaled` and
#' `scale_attributes`, as returned by
#' `scale_abiotic_for_fit()`. `data_abiotic_scaled` must have
#' row names matching those of the community matrix.
#' @param data_spatial_scaled_list
#' Optional. A named list with elements `data_spatial_scaled`
#' and `spatial_scale_attributes`, as returned by
#' `scale_spatial_for_fit()`. `data_spatial_scaled` must have
#' row names matching those of the community matrix. If `NULL`
#' (default), no spatial predictors are included.
#' @return
#' A named list with three mandatory and up to two optional elements:
#' \describe{
#'   \item{`data_community_to_fit`}{The (filtered) community matrix.}
#'   \item{`data_abiotic_to_fit`}{The scaled abiotic data frame.}
#'   \item{`scale_attributes`}{Abiotic scaling attributes for
#'   back-transformation.}
#'   \item{`data_spatial_to_fit`}{Scaled spatial predictor data frame
#'   (only present when `data_spatial_scaled_list` is supplied).}
#'   \item{`spatial_scale_attributes`}{Spatial scaling attributes
#'   (only present when `data_spatial_scaled_list` is supplied).}
#' }
#' @details
#' This function performs only validation and assembly; all data
#' transformations are handled by the preceding pipeline targets.
#' An error is raised if any two inputs differ in row count or row
#' name ordering. Row-name agreement is checked with `identical()`
#' rather than elementwise `==`, so a matrix with `NULL` row names can
#' no longer slip through the alignment check (with `==`, comparing
#' `NULL` against anything yields `logical(0)` and
#' `all(logical(0))` is `TRUE`).
#' @seealso [filter_constant_taxa()], [scale_abiotic_for_fit()],
#' [scale_spatial_for_fit()], [fit_jsdm_model()]
#' @export
assemble_data_to_fit <- function(
    data_community_filtered = NULL,
    data_abiotic_scaled_list = NULL,
    data_spatial_scaled_list = NULL) {
  assertthat::assert_that(
    is.matrix(data_community_filtered),
    msg = "data_community_filtered must be a matrix"
  )

  assertthat::assert_that(
    is.list(data_abiotic_scaled_list),
    all(
      c("data_abiotic_scaled", "scale_attributes") %in%
        names(data_abiotic_scaled_list)
    ),
    msg = paste0(
      "data_abiotic_scaled_list must be a list with elements",
      " 'data_abiotic_scaled' and 'scale_attributes'"
    )
  )

  if (
    !is.null(data_spatial_scaled_list)
  ) {
    assertthat::assert_that(
      is.list(data_spatial_scaled_list),
      all(
        c(
          "data_spatial_scaled",
          "spatial_scale_attributes"
        ) %in% names(data_spatial_scaled_list)
      ),
      msg = paste0(
        "data_spatial_scaled_list must be a list with",
        " elements 'data_spatial_scaled' and",
        " 'spatial_scale_attributes'"
      )
    )
  }

  # chuck() errors loudly if the element is missing
  data_abiotic_scaled <-
    data_abiotic_scaled_list |>
    purrr::chuck("data_abiotic_scaled")

  scale_attributes <-
    data_abiotic_scaled_list |>
    purrr::chuck("scale_attributes")

  # Validate row alignment: community vs abiotic -----

  assertthat::assert_that(
    nrow(data_community_filtered) == nrow(data_abiotic_scaled),
    msg = paste0(
      "Row counts of community and abiotic data",
      " must be identical"
    )
  )

  # identical() (not all(a == b)): catches NULL rownames on the matrix,
  # which elementwise '==' would silently pass as logical(0)
  assertthat::assert_that(
    identical(
      rownames(data_community_filtered),
      rownames(data_abiotic_scaled)
    ),
    msg = paste0(
      "Row names of community and abiotic data must be",
      " identical and in the same order"
    )
  )

  res <-
    list(
      data_community_to_fit = data_community_filtered,
      data_abiotic_to_fit = data_abiotic_scaled,
      scale_attributes = scale_attributes
    )

  if (
    !is.null(data_spatial_scaled_list)
  ) {
    data_spatial_scaled <-
      data_spatial_scaled_list |>
      purrr::chuck("data_spatial_scaled")

    spatial_scale_attributes <-
      data_spatial_scaled_list |>
      purrr::chuck("spatial_scale_attributes")

    assertthat::assert_that(
      is.data.frame(data_spatial_scaled),
      msg = "data_spatial_scaled must be a data frame"
    )

    assertthat::assert_that(
      nrow(data_spatial_scaled) ==
        nrow(data_community_filtered),
      msg = paste0(
        "Row count of spatial data must match",
        " the community matrix"
      )
    )

    # Same strict rowname comparison as for the abiotic data
    assertthat::assert_that(
      identical(
        rownames(data_spatial_scaled),
        rownames(data_community_filtered)
      ),
      msg = paste0(
        "Row names of spatial data must match",
        " those of the community matrix"
      )
    )

    res[["data_spatial_to_fit"]] <- data_spatial_scaled
    res[["spatial_scale_attributes"]] <-
      spatial_scale_attributes
  }

  return(res)
}
| 1 |
#' @title Select non-collinear predictors from abiotic data |
|
| 2 |
#' @description |
|
| 3 |
#' Filters a data frame of abiotic variables, retaining only predictors |
|
| 4 |
#' identified as non-collinear by a collinearity analysis. The selection |
|
| 5 |
#' is taken from the `result$selection` element of a `collinear_output` |
|
| 6 |
#' object (as returned by `get_predictor_collinearity()`). |
|
| 7 |
#' @param data_source |
|
| 8 |
#' A data frame containing abiotic variables. Must include a column |
|
| 9 |
#' named `abiotic_variable_name` whose values are matched against the |
|
| 10 |
#' selected predictors. |
|
| 11 |
#' @param collinearity_res |
|
| 12 |
#' A `collinear_output` object (as returned by |
|
| 13 |
#' `get_predictor_collinearity()`). Must contain a `result$selection` |
|
| 14 |
#' element ā a non-empty character vector of selected predictor names. |
|
| 15 |
#' @return |
|
| 16 |
#' A filtered data frame (same structure as `data_source`) containing |
|
| 17 |
#' only rows whose `abiotic_variable_name` is in the set of selected |
|
| 18 |
#' non-collinear predictors. |
|
| 19 |
#' @details |
|
| 20 |
#' Input validation is performed with `assertthat`. The function |
|
| 21 |
#' requires that the filtering produces at least one row; if no |
|
| 22 |
#' predictor names match, an error is raised suggesting the user check |
|
| 23 |
#' the collinearity results. |
|
| 24 |
#' @seealso [get_predictor_collinearity()] |
|
| 25 |
#' @export |
|
| 26 |
select_non_collinear_predictors <- function(data_source = NULL,
                                            collinearity_res = NULL) {
  assertthat::assert_that(
    is.data.frame(data_source),
    msg = "data_source must be a data frame"
  )

  # The filter below relies on this column; fail fast with a clear
  # message instead of a cryptic tidy-eval error inside
  # dplyr::filter().
  assertthat::assert_that(
    "abiotic_variable_name" %in% names(data_source),
    msg = "data_source must contain an 'abiotic_variable_name' column"
  )

  assertthat::assert_that(
    inherits(collinearity_res, "collinear_output"),
    msg = "collinearity_res must be a collinear_output object"
  )

  assertthat::assert_that(
    "result" %in% names(collinearity_res),
    msg = "collinearity_res should contain a 'result' element"
  )

  assertthat::assert_that(
    "selection" %in% names(collinearity_res$result),
    msg = "collinearity_res$result should contain a 'selection' element"
  )

  assertthat::assert_that(
    is.character(collinearity_res$result$selection),
    length(collinearity_res$result$selection) > 0,
    msg = "Selection of predictors should be a non-empty character vector"
  )

  # Keep only rows whose variable name was selected as
  # non-collinear. The `.data` pronoun (consistent with the rest of
  # the package) avoids an undefined-global NOTE and accidental
  # capture of a caller-environment variable of the same name.
  res <-
    data_source |>
    dplyr::filter(
      .data$abiotic_variable_name %in% collinearity_res$result$selection
    )

  assertthat::assert_that(
    nrow(res) > 0,
    msg = "No predictors selected after filtering. Check collinearity results."
  )

  return(res)
}
| 1 |
#' @title Recalculate ANOVA Components via Shapley-Allocated |
|
| 2 |
#' Percentages |
|
| 3 |
#' @description |
|
| 4 |
#' Converts a long-format ANOVA results table into per-age |
|
| 5 |
#' percentages for the three unique variance-partitioning |
|
| 6 |
#' components (Abiotic, Associations, Spatial) using a |
|
| 7 |
#' Shapley equal-split allocation of intersection terms. |
|
| 8 |
#' Negative Nagelkerke R² values are clamped to zero before |
|
| 9 |
#' any computation. |
|
| 10 |
#' @param data_source |
|
| 11 |
#' A non-empty data frame with columns: |
|
| 12 |
#' \describe{
|
|
| 13 |
#' \item{age}{Numeric. Age (cal yr BP) of the time slice.}
|
|
| 14 |
#' \item{component}{Character. Component label. May include
|
|
| 15 |
#' any of the seven sjSDM variance-partitioning labels: |
|
| 16 |
#' \code{"Abiotic"}, \code{"Associations"},
|
|
| 17 |
#' \code{"Spatial"}, \code{"Abiotic&Associations"},
|
|
| 18 |
#' \code{"Abiotic&Spatial"},
|
|
| 19 |
#' \code{"Associations&Spatial"}, and
|
|
| 20 |
#' \code{"Abiotic&Associations&Spatial"}.
|
|
| 21 |
#' Missing intersection labels are treated as zero.} |
|
| 22 |
#' \item{R2_Nagelkerke}{Numeric. Nagelkerke R² value.
|
|
| 23 |
#' Values below zero are clamped to 0 internally.} |
|
| 24 |
#' } |
|
| 25 |
#' @return |
|
| 26 |
#' A tibble with exactly 3 rows per age (one per unique |
|
| 27 |
#' component) and columns \code{age}, \code{component},
|
|
| 28 |
#' \code{R2_Nagelkerke_adjusted}, and
|
|
| 29 |
#' \code{R2_Nagelkerke_percentage}:
|
|
| 30 |
#' \describe{
|
|
| 31 |
#' \item{R2_Nagelkerke_adjusted}{Numeric.
|
|
| 32 |
#' Shapley-adjusted R² for the component: the component's |
|
| 33 |
#' unique fraction plus an equal share of every |
|
| 34 |
#' intersection term that includes it.} |
|
| 35 |
#' \item{R2_Nagelkerke_percentage}{Numeric. Percentage of
|
|
| 36 |
#' total adjusted R² explained by this component within |
|
| 37 |
#' the age slice: |
|
| 38 |
#' \code{R2_adjusted / sum(R2_adjusted) * 100}.
|
|
| 39 |
#' \code{NA_real_} when the age-slice total is zero.}
|
|
| 40 |
#' } |
|
| 41 |
#' @details |
|
| 42 |
#' \strong{Negative clamping:} All \code{R2_Nagelkerke}
|
|
| 43 |
#' values are clamped to \eqn{[0, \infty)} before
|
|
| 44 |
#' allocation. Negative values arise from suppressor effects |
|
| 45 |
#' and carry no directional attribution. |
|
| 46 |
#' |
|
| 47 |
#' \strong{Shapley equal-split allocation:} Each intersection
|
|
| 48 |
#' term is divided equally among its constituent unique |
|
| 49 |
#' components, irrespective of unique-fraction magnitudes. |
|
| 50 |
#' The three adjusted values therefore sum to the total of |
|
| 51 |
#' all seven (clamped) fractions, preserving total explained |
|
| 52 |
#' variance. Using equal splits rather than proportional |
|
| 53 |
#' splits avoids a feedback loop in which a larger unique |
|
| 54 |
#' fraction accumulates more shared variance, further |
|
| 55 |
#' inflating its apparent importance. |
|
| 56 |
#' |
|
| 57 |
#' Concretely (where missing fractions are treated as 0): |
|
| 58 |
#' \itemize{
|
|
| 59 |
#' \item Abiotic_adj = F_A + F_AB/2 + F_AS/2 + F_ABS/3 |
|
| 60 |
#' \item Assoc_adj = F_B + F_AB/2 + F_BS/2 + F_ABS/3 |
|
| 61 |
#' \item Spatial_adj = F_S + F_AS/2 + F_BS/2 + F_ABS/3 |
|
| 62 |
#' } |
|
| 63 |
#' @seealso [extract_anova_fractions()], |
|
| 64 |
#' [aggregate_anova_components()] |
|
| 65 |
#' @export |
|
| 66 |
recalculate_anova_components <- function(data_source) {
  assertthat::assert_that(
    is.data.frame(data_source),
    nrow(data_source) > 0,
    msg = "'data_source' must be a non-empty data frame."
  )

  assertthat::assert_that(
    all(
      c("age", "component", "R2_Nagelkerke") %in% colnames(data_source)
    ),
    msg = paste0(
      "'data_source' must contain columns",
      " 'age', 'component', and 'R2_Nagelkerke'."
    )
  )

  # Look up the clamped R2 of one component within a per-age slice.
  # Absent component rows count as zero.
  pull_clamped <- function(group_data, label) {
    hits <- which(group_data$component == label)
    if (length(hits) == 0L) {
      return(0)
    }
    group_data$R2_clamped[[hits[[1L]]]]
  }

  # Shapley equal-split allocation for one age group: every
  # intersection fraction is shared equally among the unique
  # components it involves, independent of how large their unique
  # fractions are. This prevents a rich-get-richer loop where a
  # larger unique fraction would attract even more shared variance.
  allocate_slice <- function(group_data, group_key) {
    frac_abiotic <- pull_clamped(group_data, "Abiotic")
    frac_assoc <- pull_clamped(group_data, "Associations")
    frac_spatial <- pull_clamped(group_data, "Spatial")
    frac_ab <- pull_clamped(group_data, "Abiotic&Associations")
    frac_as <- pull_clamped(group_data, "Abiotic&Spatial")
    frac_bs <- pull_clamped(group_data, "Associations&Spatial")
    frac_abs <- pull_clamped(group_data, "Abiotic&Associations&Spatial")

    tibble::tibble(
      component = c("Abiotic", "Associations", "Spatial"),
      R2_Nagelkerke_adjusted = c(
        frac_abiotic + frac_ab / 2 + frac_as / 2 + frac_abs / 3,
        frac_assoc + frac_ab / 2 + frac_bs / 2 + frac_abs / 3,
        frac_spatial + frac_as / 2 + frac_bs / 2 + frac_abs / 3
      )
    )
  }

  res <-
    data_source |>
    # Suppressor effects can produce negative fractions; they carry
    # no directional attribution, so clamp them at zero before any
    # allocation so they cannot reduce a component's share.
    dplyr::mutate(
      R2_clamped = pmax(.data$R2_Nagelkerke, 0)
    ) |>
    dplyr::group_by(.data$age) |>
    dplyr::group_modify(allocate_slice) |>
    dplyr::ungroup() |>
    dplyr::group_by(.data$age) |>
    dplyr::mutate(
      R2_Nagelkerke_percentage = {
        slice_total <- sum(.data$R2_Nagelkerke_adjusted)
        if (slice_total > 0) {
          .data$R2_Nagelkerke_adjusted / slice_total * 100
        } else {
          # Zero total explained variance: percentage undefined.
          rep(NA_real_, length(.data$R2_Nagelkerke_adjusted))
        }
      }
    ) |>
    dplyr::ungroup()

  return(res)
}
| 1 |
#' @title Interpolate 3-D Spatiotemporal MEVs to Prediction Grid |
|
| 2 |
#' @description |
|
| 3 |
#' Uses Inverse Distance Weighting (IDW, power = 2) in a |
|
| 4 |
#' z-scored 3-D space `(x_km, y_km, age_kyr)` to approximate |
|
| 5 |
#' spatiotemporal Moran Eigenvector Map (MEM) values from |
|
| 6 |
#' training samples to prediction locations at a given age. |
|
| 7 |
#' The interpolated values are then scaled using the training |
|
| 8 |
#' spatial scale attributes. |
|
| 9 |
#' @param data_st_mev_samples |
|
| 10 |
#' A data frame with row names in the format |
|
| 11 |
#' `"<dataset_name>__<age>"` and unscaled spatiotemporal MEV |
|
| 12 |
#' columns (`mev_1`, …), as returned by |
|
| 13 |
#' `compute_spatiotemporal_mev()`. |
|
| 14 |
#' @param data_coords_projected_train |
|
| 15 |
#' A data frame with `coord_x_km` and `coord_y_km` columns |
|
| 16 |
#' and `dataset_name` as row names (site level), as returned |
|
| 17 |
#' by `project_coords_to_metric()`. |
|
| 18 |
#' @param data_coords_projected_pred |
|
| 19 |
#' A data frame with `coord_x_km` and `coord_y_km` columns |
|
| 20 |
#' and arbitrary row names identifying prediction locations, |
|
| 21 |
#' as returned by `project_coords_to_metric()`. |
|
| 22 |
#' @param pred_age |
|
| 23 |
#' Numeric or integer scalar. Age in years BP at which |
|
| 24 |
#' predictions are made. Used as the temporal dimension of |
|
| 25 |
#' the prediction 3-D coordinate matrix (`age_kyr = |
|
| 26 |
#' pred_age / 1000`). |
|
| 27 |
#' @param spatial_scale_attributes |
|
| 28 |
#' A named list of `"scaled:center"` and `"scaled:scale"` |
|
| 29 |
#' attributes per ST-MEV column, as returned by |
|
| 30 |
#' `scale_spatial_for_fit()` in the `spatial_scale_attributes` |
|
| 31 |
#' element. |
|
| 32 |
#' @return |
|
| 33 |
#' A data frame with the same row names as |
|
| 34 |
#' `data_coords_projected_pred` and one column per ST-MEV |
|
| 35 |
#' (names matching `data_st_mev_samples`). All columns are |
|
| 36 |
#' scaled to match the training ST-MEV distribution. |
|
| 37 |
#' @details |
|
| 38 |
#' 3-D coordinates `(x_km, y_km, age_kyr)` of the training |
|
| 39 |
#' samples are z-scored using `colMeans` and column-wise |
|
| 40 |
#' standard deviations before computing Euclidean distances. |
|
| 41 |
#' This ensures the spatial and temporal extents contribute |
|
| 42 |
#' equally. The same z-score parameters are applied to the |
|
| 43 |
#' prediction grid coordinates, fixing the temporal dimension |
|
| 44 |
#' at `pred_age / 1000`. |
|
| 45 |
#' |
|
| 46 |
#' This function handles the **3-D spatiotemporal case**. |
|
| 47 |
#' For models fitted with `spatial_mode = "spatial"` use |
|
| 48 |
#' `interpolate_mev_to_grid()` instead. |
|
| 49 |
#' @seealso |
|
| 50 |
#' [compute_spatiotemporal_mev()], |
|
| 51 |
#' [interpolate_mev_to_grid()], |
|
| 52 |
#' [project_coords_to_metric()], |
|
| 53 |
#' [scale_spatial_for_fit()] |
|
| 54 |
#' @export |
|
| 55 |
interpolate_st_mev_to_grid <- function(
    data_st_mev_samples = NULL,
    data_coords_projected_train = NULL,
    data_coords_projected_pred = NULL,
    pred_age = NULL,
    spatial_scale_attributes = NULL) {
  assertthat::assert_that(
    is.data.frame(data_st_mev_samples),
    nrow(data_st_mev_samples) > 0,
    msg = "data_st_mev_samples must be a non-empty data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_coords_projected_train),
    all(
      c("coord_x_km", "coord_y_km") %in%
        base::names(data_coords_projected_train)
    ),
    msg = paste0(
      "data_coords_projected_train must be a data frame",
      " with columns 'coord_x_km' and 'coord_y_km'"
    )
  )

  assertthat::assert_that(
    is.data.frame(data_coords_projected_pred),
    all(
      c("coord_x_km", "coord_y_km") %in%
        base::names(data_coords_projected_pred)
    ),
    msg = paste0(
      "data_coords_projected_pred must be a data frame",
      " with columns 'coord_x_km' and 'coord_y_km'"
    )
  )

  assertthat::assert_that(
    (is.numeric(pred_age) || is.integer(pred_age)) &&
      length(pred_age) == 1L,
    msg = "pred_age must be a single numeric or integer value"
  )

  assertthat::assert_that(
    is.list(spatial_scale_attributes),
    length(spatial_scale_attributes) > 0,
    msg = "spatial_scale_attributes must be a non-empty list"
  )

  # 1. ST-MEV column names -----
  vec_st_mev_cols <-
    base::names(data_st_mev_samples)

  # Every ST-MEV column needs its own scale attributes; a missing
  # entry would otherwise fail deep inside the scaling step with an
  # unhelpful NULL-subscript error.
  assertthat::assert_that(
    all(vec_st_mev_cols %in% base::names(spatial_scale_attributes)),
    msg = paste0(
      "spatial_scale_attributes must contain an entry for every",
      " ST-MEV column of data_st_mev_samples"
    )
  )

  # 2. Reconstruct training 3-D raw matrix -----
  # Row names follow "dataset_name__age" format.
  data_train_3d_raw <-
    tibble::tibble(
      sample_id = base::rownames(data_st_mev_samples)
    ) |>
    tidyr::separate(
      col = sample_id,
      into = c("dataset_name", "age_chr"),
      sep = "__",
      extra = "merge"
    ) |>
    dplyr::mutate(age = base::as.integer(age_chr)) |>
    dplyr::select(-age_chr) |>
    dplyr::inner_join(
      data_coords_projected_train |>
        tibble::rownames_to_column("dataset_name"),
      by = dplyr::join_by(dataset_name)
    ) |>
    dplyr::mutate(age_kyr = age / 1000)

  # The inner join silently drops samples whose dataset_name has no
  # projected coordinates. That would desynchronise the IDW weight
  # matrix (built from joined rows) from the ST-MEV matrix (built
  # from all samples), so fail fast with a clear message instead of
  # a non-conformable-matrix error downstream.
  assertthat::assert_that(
    base::nrow(data_train_3d_raw) ==
      base::nrow(data_st_mev_samples),
    msg = paste0(
      "All dataset names in data_st_mev_samples row names must be",
      " present in data_coords_projected_train row names"
    )
  )

  mat_3d_train_raw <-
    data_train_3d_raw |>
    dplyr::select(coord_x_km, coord_y_km, age_kyr) |>
    base::as.matrix()

  # 3. Z-score parameters from training samples -----
  # Z-scoring ensures spatial and temporal extents
  # contribute equally to the Euclidean distance.
  vec_3d_center <-
    base::colMeans(mat_3d_train_raw)

  vec_3d_scale <-
    base::apply(mat_3d_train_raw, 2, stats::sd)

  # A zero-variance dimension (e.g. all samples from one age or one
  # site) would make scale() divide by zero and poison every
  # distance with NaN; reject such inputs explicitly.
  assertthat::assert_that(
    base::all(vec_3d_scale > 0),
    msg = paste0(
      "Training 3-D coordinates (x, y, age) must each vary across",
      " samples to allow z-scoring"
    )
  )

  mat_3d_train_z <-
    base::scale(
      mat_3d_train_raw,
      center = vec_3d_center,
      scale = vec_3d_scale
    )

  # 4. Prediction 3-D matrix: grid coords + pred_age -----
  mat_3d_pred_raw <-
    data_coords_projected_pred |>
    dplyr::select(coord_x_km, coord_y_km) |>
    dplyr::mutate(age_kyr = pred_age / 1000) |>
    base::as.matrix()

  # Apply same z-score params as training data -----
  mat_3d_pred_z <-
    base::scale(
      mat_3d_pred_raw,
      center = vec_3d_center,
      scale = vec_3d_scale
    )

  # 5. 3-D Euclidean distances (rows = pred, cols = train) -----
  mat_dist_3d <-
    base::sqrt(
      base::outer(
        mat_3d_pred_z[, 1], mat_3d_train_z[, 1], `-`
      )^2 +
        base::outer(
          mat_3d_pred_z[, 2], mat_3d_train_z[, 2], `-`
        )^2 +
        base::outer(
          mat_3d_pred_z[, 3], mat_3d_train_z[, 3], `-`
        )^2
    )

  # 6. IDW weights (power = 2, epsilon avoids div-by-zero) -----
  mat_idw_weights_3d <-
    1 / (mat_dist_3d^2 + 1e-10)

  # Normalise so each prediction row's weights sum to 1.
  mat_idw_weights_3d <-
    mat_idw_weights_3d / base::rowSums(mat_idw_weights_3d)

  # 7. Interpolate unscaled ST-MEV values -----
  mat_train_st_mev <-
    data_st_mev_samples |>
    dplyr::select(dplyr::all_of(vec_st_mev_cols)) |>
    base::as.matrix()

  data_pred_st_mev_raw <-
    base::as.data.frame(
      mat_idw_weights_3d %*% mat_train_st_mev
    )

  base::colnames(data_pred_st_mev_raw) <- vec_st_mev_cols
  base::rownames(data_pred_st_mev_raw) <-
    base::rownames(data_coords_projected_pred)

  # 8. Scale using training spatial scale attributes -----
  # Apply the per-column training center/scale so the interpolated
  # ST-MEVs match the distribution the model was fitted on.
  data_pred_st_mev_scaled <-
    data_pred_st_mev_raw |>
    dplyr::mutate(
      dplyr::across(
        .cols = dplyr::everything(),
        .fns = ~ {
          col_nm <- dplyr::cur_column()
          center <- base::as.numeric(
            spatial_scale_attributes[[col_nm]][["scaled:center"]]
          )
          sc <- base::as.numeric(
            spatial_scale_attributes[[col_nm]][["scaled:scale"]]
          )
          (.x - center) / sc
        }
      )
    )

  return(data_pred_st_mev_scaled)
}
| 1 |
#' @title Filter Constant Taxa from Community Matrix |
|
| 2 |
#' @description |
|
| 3 |
#' Removes taxa that show no variation across all samples. A |
|
| 4 |
#' taxon is considered constant when its standard deviation is |
|
| 5 |
#' zero, meaning every sample has the same value. Constant |
|
| 6 |
#' taxa cannot contribute to any model likelihood and must be |
|
| 7 |
#' excluded before fitting. Filtering is applied regardless of |
|
| 8 |
#' the error family (binomial, Gaussian, Poisson, beta, etc.). |
|
| 9 |
#' @param data_community_matrix |
|
| 10 |
#' A numeric matrix with samples as rows and taxa as columns, |
|
| 11 |
#' as returned by `prepare_community_for_fit()`. |
|
| 12 |
#' @return |
|
| 13 |
#' A numeric matrix of the same structure as the input, with |
|
| 14 |
#' all constant taxa (standard deviation equal to zero) removed. |
|
| 15 |
#' If no taxa are constant the input matrix is returned |
|
| 16 |
#' unchanged. |
|
| 17 |
#' @details |
|
| 18 |
#' Variation is assessed by computing `stats::sd()` for each |
|
| 19 |
#' column via `purrr::map_dbl()`. A column is retained only |
|
| 20 |
#' when its standard deviation is strictly greater than zero. |
|
| 21 |
#' This family-agnostic approach replaces the previous |
|
| 22 |
#' binomial-only binarisation check and is now a tracked |
|
| 23 |
#' pipeline target. |
|
| 24 |
#' @seealso [prepare_community_for_fit()], |
|
| 25 |
#' [assemble_data_to_fit()] |
|
| 26 |
#' @export |
|
| 27 |
filter_constant_taxa <- function(
    data_community_matrix = NULL) {
  assertthat::assert_that(
    is.matrix(data_community_matrix),
    msg = "data_community_matrix must be a matrix"
  )

  # Per-column standard deviation, iterating over column *indices*
  # rather than column names: the previous colnames()-based
  # iteration produced a zero-length result for a matrix without
  # colnames and silently dropped every taxon. vapply() also
  # guarantees a numeric(1) result per column.
  vec_col_sd <-
    vapply(
      seq_len(ncol(data_community_matrix)),
      function(idx) stats::sd(data_community_matrix[, idx]),
      numeric(1)
    )

  # A taxon is kept only when it varies (sd strictly > 0);
  # constant taxa cannot contribute to any model likelihood,
  # regardless of the error family.
  vec_is_variable <-
    vec_col_sd > 0

  data_community_matrix <-
    data_community_matrix[
      ,
      vec_is_variable,
      drop = FALSE
    ]

  return(data_community_matrix)
}
| 1 |
#' @title Get Community Taxa |
|
| 2 |
#' @description |
|
| 3 |
#' Extracts a vector of unique taxa from a community data frame. |
|
| 4 |
#' @param data A data frame containing a 'taxon' column. |
|
| 5 |
#' @return A character vector of unique taxon names present in the data. |
|
| 6 |
#' @details |
|
| 7 |
#' Uses dplyr to select distinct values from the 'taxon' column and returns |
|
| 8 |
#' them as a vector. |
|
| 9 |
#' @export |
|
| 10 |
get_community_taxa <- function(data) {
  # Validate that we received a data frame at all ...
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  # ... and that the column we extract from is present.
  assertthat::assert_that(
    "taxon" %in% colnames(data),
    msg = "data must contain a 'taxon' column"
  )

  # De-duplicate first, then extract the column as a plain vector.
  data_distinct <- dplyr::distinct(data, taxon)
  vec_taxa <- dplyr::pull(data_distinct, taxon)

  return(vec_taxa)
}
| 1 |
#' @title Get Scale ID from Current Targets Store Path |
|
| 2 |
#' @description |
|
| 3 |
#' Determines whether the currently active targets store corresponds |
|
| 4 |
#' to a spatial unit by inspecting the store path and checking it |
|
| 5 |
#' against the project's spatial grid CSV catalogue. Returns the |
|
| 6 |
#' `scale_id` string for spatial pipelines, or `NULL` for |
|
| 7 |
#' non-spatial (named-project) pipelines. |
|
| 8 |
#' @param store |
|
| 9 |
#' A single character string giving the path to the targets data |
|
| 10 |
#' store. Default: `targets::tar_path_store()`, which resolves |
|
| 11 |
#' correctly both inside target commands (including callr worker |
|
| 12 |
#' processes) and when called interactively. |
|
| 13 |
#' @param file |
|
| 14 |
#' Path to the spatial grid CSV catalogue file. |
|
| 15 |
#' Default: `here::here("Data/Input/spatial_grid.csv")`.
|
|
| 16 |
#' @return |
|
| 17 |
#' A single character string with the `scale_id` when the store |
|
| 18 |
#' path corresponds to a spatial unit in the CSV catalogue, or |
|
| 19 |
#' `NULL` otherwise. |
|
| 20 |
#' @details |
|
| 21 |
#' The store path convention for spatial pipelines is: |
|
| 22 |
#' `{target_store}/{scale_id}/{pipeline_name}/`. The function
|
|
| 23 |
#' extracts the second-to-last path component via |
|
| 24 |
#' `basename(dirname(store))` and checks whether it appears in the |
|
| 25 |
#' `scale_id` column of the spatial grid CSV. For non-spatial |
|
| 26 |
#' pipelines (e.g. `Data/targets/project_cz/pipeline_basic/`) the |
|
| 27 |
#' candidate (`"project_cz"`) is not in the CSV, so `NULL` is |
|
| 28 |
#' returned. When the CSV file does not exist the function returns |
|
| 29 |
#' `NULL` gracefully. |
|
| 30 |
#' @seealso get_spatial_window, get_spatial_model_params |
|
| 31 |
#' @export |
|
| 32 |
get_scale_id_from_store <- function(
    store = targets::tar_path_store(),
    file = here::here("Data/Input/spatial_grid.csv")) {
  # class(x) can be a vector (e.g. c("tbl_df", "data.frame"));
  # collapse it so the assertion message stays a single string
  # instead of being silently vectorised by paste0().
  assertthat::assert_that(
    is.character(store) && length(store) == 1,
    msg = paste0(
      "`store` must be a single character string.",
      " Got class: ", paste(class(store), collapse = "/"),
      ", length: ", length(store)
    )
  )

  # Graceful NULL when the catalogue is absent (e.g. isolated test env)
  if (!file.exists(file)) {
    return(NULL)
  }

  # Spatial store convention: {target_store}/{scale_id}/{pipeline_name}
  # so basename(dirname(store)) is the scale_id for spatial pipelines
  # and the project key (e.g. "project_cz") for non-spatial ones.
  potential_id <-
    basename(dirname(store))

  data_grid <-
    readr::read_csv(
      file = file,
      show_col_types = FALSE
    )

  # A catalogue without a 'scale_id' column cannot match anything;
  # treat it like a non-spatial pipeline instead of emitting a
  # tibble unknown-column warning via `$`.
  if (!"scale_id" %in% names(data_grid)) {
    return(NULL)
  }

  if (potential_id %in% data_grid$scale_id) {
    return(potential_id)
  }

  return(NULL)
}
| 1 |
#' @title Compute Moran Eigenvector Maps for Spatial Filtering |
|
| 2 |
#' @description |
|
| 3 |
#' Computes Moran Eigenvector Maps (MEMs) from projected |
|
| 4 |
#' core locations (km) using |
|
| 5 |
#' `sjSDM::generateSpatialEV()` and returns the first |
|
| 6 |
#' `n_mev` eigenvectors as a data frame suitable for |
|
| 7 |
#' `prepare_spatial_predictors_for_fit()`. |
|
| 8 |
#' @param data_coords_projected |
|
| 9 |
#' A data frame with `dataset_name` as row names and |
|
| 10 |
#' columns `coord_x_km` and `coord_y_km`, as returned by |
|
| 11 |
#' `project_coords_to_metric()`. Must have more than 3 rows |
|
| 12 |
#' (required by `sjSDM::generateSpatialEV()`). Each row |
|
| 13 |
#' represents one unique core/site location. |
|
| 14 |
#' @param n_mev |
|
| 15 |
#' A positive integer giving the number of eigenvectors to |
|
| 16 |
#' return. The actual count of positive Moran eigenvectors |
|
| 17 |
#' produced by `sjSDM::generateSpatialEV()` for the |
|
| 18 |
#' supplied coordinates depends on the spatial structure |
|
| 19 |
#' of the sites and is typically small (often 2). If |
|
| 20 |
#' `n_mev` exceeds the available count, it is clamped |
|
| 21 |
#' down automatically and a `cli::cli_warn()` message is |
|
| 22 |
#' emitted. Default is `20L`. |
|
| 23 |
#' @return |
|
| 24 |
#' A data frame with the same row names as |
|
| 25 |
#' `data_coords_projected`, and `n_mev` columns named |
|
| 26 |
#' `mev_1`, `mev_2`, …, `mev_{n_mev}`, containing the
|
|
| 27 |
#' first `n_mev` Moran eigenvectors. This data frame is a |
|
| 28 |
#' drop-in replacement for |
|
| 29 |
#' `dplyr::select(data_coords_projected, |
|
| 30 |
#' coord_x_km, coord_y_km)` as input to |
|
| 31 |
#' `prepare_spatial_predictors_for_fit()`. |
|
| 32 |
#' @details |
|
| 33 |
#' MEMs capture the spatial autocorrelation structure of |
|
| 34 |
#' the sampling locations and are used as spatial |
|
| 35 |
#' predictors in the sjSDM model. Eigenvectors are |
|
| 36 |
#' computed on the unique core locations; the caller is |
|
| 37 |
#' responsible for expanding to sample level via |
|
| 38 |
#' `prepare_spatial_predictors_for_fit()`. |
|
| 39 |
#' |
|
| 40 |
#' `sjSDM::generateSpatialEV()` returns only eigenvectors |
|
| 41 |
#' with positive eigenvalues; the count often equals 2 for |
|
| 42 |
#' 2-D coordinate sets. If `n_mev` exceeds the number of |
|
| 43 |
#' positive eigenvectors actually produced, the function |
|
| 44 |
#' automatically lowers `n_mev` to that count, emits a |
|
| 45 |
#' `cli::cli_warn()` message, and continues normally. |
|
| 46 |
#' @seealso |
|
| 47 |
#' [project_coords_to_metric()], |
|
| 48 |
#' [prepare_spatial_predictors_for_fit()], |
|
| 49 |
#' [scale_spatial_for_fit()] |
|
| 50 |
#' @export |
|
| 51 |
compute_spatial_mev <- function(
    data_coords_projected = NULL,
    n_mev = 20L) {
  assertthat::assert_that(
    is.data.frame(data_coords_projected),
    msg = "data_coords_projected must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("coord_x_km", "coord_y_km") %in%
        names(data_coords_projected)
    ),
    msg = paste0(
      "data_coords_projected must contain columns",
      " 'coord_x_km' and 'coord_y_km'"
    )
  )

  assertthat::assert_that(
    nrow(data_coords_projected) > 3,
    msg = paste0(
      "data_coords_projected must have more than 3 rows",
      " (required by sjSDM::generateSpatialEV())"
    )
  )

  assertthat::assert_that(
    is.numeric(n_mev) || is.integer(n_mev),
    length(n_mev) == 1,
    n_mev >= 1,
    msg = "n_mev must be a single positive integer"
  )

  n_mev <- as.integer(n_mev)

  # Coordinate matrix (km) feeding the eigenvector decomposition.
  mat_km <-
    as.matrix(
      dplyr::select(data_coords_projected, coord_x_km, coord_y_km)
    )

  # Moran eigenvectors; sjSDM keeps only positive-eigenvalue axes.
  # A single positive eigenvalue comes back as a bare vector, so
  # force a matrix to keep column-wise handling uniform.
  mat_eigenvectors <-
    as.matrix(
      sjSDM::generateSpatialEV(coords = mat_km)
    )

  n_produced <- ncol(mat_eigenvectors)

  # Clamp the request when fewer positive eigenvectors exist,
  # warning so the caller knows the effective dimensionality.
  if (n_produced < n_mev) {
    cli::cli_warn(
      c(
        "{n_mev} MEV(s) requested; only {n_produced} positive.",
        "i" = "Lowering n_mev from {n_mev} to {n_produced}."
      )
    )
    n_mev <- n_produced
  }

  # Keep the first n_mev columns, named mev_1 ... mev_{n_mev},
  # with row names carried over from the input sites.
  res <-
    as.data.frame(
      mat_eigenvectors[, seq_len(n_mev), drop = FALSE]
    )

  colnames(res) <- paste0("mev_", seq_len(n_mev))
  rownames(res) <- rownames(data_coords_projected)

  return(res)
}
| 1 |
#' @title Check for missing taxa and stop pipeline if any are found |
|
| 2 |
#' @description |
|
| 3 |
#' Validates that all community taxa have been classified. When |
|
| 4 |
#' missing taxa are found, stops with an informative error message |
|
| 5 |
#' listing the count of unclassified taxa. Returns `TRUE` invisibly |
|
| 6 |
#' when no taxa are missing. |
|
| 7 |
#' @param vec_taxa_without_classification |
|
| 8 |
#' A character vector of taxon names that could not be classified |
|
| 9 |
#' automatically or via the auxiliary table. An empty vector |
|
| 10 |
#' (`character(0)`) signals full coverage and causes the function |
|
| 11 |
#' to return silently. |
|
| 12 |
#' @return |
|
| 13 |
#' `TRUE` invisibly when every taxon is classified. |
|
| 14 |
#' Stops with an error when any taxa are missing. |
|
| 15 |
#' @details |
|
| 16 |
#' The missing taxa are stored as a targets object |
|
| 17 |
#' `data_missing_taxa_template` in the pipeline store. Inspect them |
|
| 18 |
#' with `targets::tar_read("data_missing_taxa_template")`. The
|
|
| 19 |
#' object is a `tibble` with columns `sel_name`, `kingdom`, |
|
| 20 |
#' `phylum`, `class`, `order`, `family`, `genus`, and `species`; |
|
| 21 |
#' rank columns are left as `NA` for manual completion. Fill in the |
|
| 22 |
#' missing classifications and copy or append rows to |
|
| 23 |
#' `Data/Input/aux_classification_table.csv`, then re-run the |
|
| 24 |
#' pipeline. Use |
|
| 25 |
#' `R/03_Supplementary_analyses/Make_auxiliary_classification_table.R` |
|
| 26 |
#' to coalesce templates across all pipeline stores into one CSV. |
|
| 27 |
#' @seealso |
|
| 28 |
#' [get_aux_classification_table()], |
|
| 29 |
#' [combine_classification_tables()], |
|
| 30 |
#' [get_taxa_without_classification()] |
|
| 31 |
#' @export |
|
| 32 |
check_and_report_missing_taxa <- function(
    vec_taxa_without_classification) {
  assertthat::assert_that(
    is.character(vec_taxa_without_classification),
    msg = paste(
      "vec_taxa_without_classification must be",
      "a character vector"
    )
  )

  n_missing <- length(vec_taxa_without_classification)

  # Any unclassified taxon halts the pipeline with guidance on how
  # to complete the auxiliary classification table and re-run.
  if (n_missing > 0) {
    stop(
      paste0(
        n_missing,
        " taxon/taxa could not be classified.\n",
        "Inspect missing taxa with:\n",
        " targets::tar_read('data_missing_taxa_template')\n",
        "Fill in the missing classifications and copy/append to\n",
        " Data/Input/aux_classification_table.csv\n",
        "then re-run the pipeline."
      ),
      call. = FALSE
    )
  }

  # Full coverage: succeed silently.
  invisible(TRUE)
}
#' @title Extract Dataset Name from String
#' @description
#' Pulls the dataset name out of each string in a vector by keeping
#' the characters preceding the double underscore ("__") delimiter.
#' @param vec_names
#' A character vector of names formatted as "dataset__something".
#' @return
#' A character vector of dataset names, whitespace-trimmed.
#' @export
get_dataset_name_from_string <- function(vec_names) {
  assertthat::assert_that(
    is.character(vec_names),
    msg = "Input must be a character vector."
  )

  assertthat::assert_that(
    length(vec_names) > 0,
    msg = "Input vector must not be empty."
  )

  assertthat::assert_that(
    all(stringr::str_detect(vec_names, "__")),
    msg = "Input strings must contain '__' to extract dataset names."
  )

  # Greedy match keeps everything up to the final "__" delimiter,
  # then surrounding whitespace is stripped.
  res <- stringr::str_trim(
    stringr::str_extract(vec_names, ".*(?=__)")
  )

  return(res)
}
#' @title Select N Taxa
#' @description
#' Keeps only the top N taxa ranked by how many groups (by default,
#' datasets) they occur in.
#' @param data
#' A data frame with a "taxon" column and the grouping column named
#' by `per`.
#' @param n_taxa
#' Number of taxa to retain. Default is Inf (keep all).
#' @param per
#' Name of the column to count occurrences across. Default is
#' "dataset_name".
#' @return
#' The input data frame restricted to rows of the selected taxa.
#' @details
#' Occurrence is counted on distinct (taxon, group) pairs, so a
#' taxon appearing many times within one group counts once. Ties at
#' the cut-off are kept (slice_max default). An error is raised if
#' the filtered result is empty.
#' @export
select_n_taxa <- function(
    data = NULL,
    n_taxa = Inf,
    per = "dataset_name") {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    is.character(per) && length(per) == 1,
    msg = "per must be a single character string"
  )

  assertthat::assert_that(
    all(
      c("taxon", per) %in% names(data)
    ),
    msg = paste(
      "data must contain the following columns:",
      paste(c("taxon", per), collapse = ", ")
    )
  )

  assertthat::assert_that(
    is.numeric(n_taxa),
    msg = "n_taxa must be a number"
  )

  assertthat::assert_that(
    n_taxa > 0,
    msg = "n_taxa must be greater than 0"
  )

  # Rank taxa by the number of distinct groups they occur in and
  # keep the n_taxa most widespread ones (ties included).
  vec_selected_taxa <-
    data |>
    dplyr::distinct(taxon, !!rlang::sym(per)) |>
    dplyr::count(taxon, name = "n_datasets") |>
    dplyr::slice_max(n = n_taxa, order_by = n_datasets) |>
    dplyr::pull(taxon)

  res <-
    data |>
    dplyr::filter(taxon %in% vec_selected_taxa)

  assertthat::assert_that(
    nrow(res) > 0,
    msg = paste(
      "No taxa found in data. Please check the input data.",
      "The number of taxa selected is too high."
    )
  )

  return(res)
}
#' @title Prepare Spatial Predictors for Model Fitting
#' @description
#' Expands dataset-level spatial predictors to the sample level by
#' joining against the canonical `(dataset_name, age)` sample index,
#' yielding rows aligned with the community matrix and abiotic data
#' used for model fitting.
#' @param data_spatial
#' Data frame keyed by `dataset_name` row names, holding one or more
#' spatial predictor columns (e.g. `coord_x_km`, `coord_y_km` from
#' `project_coords_to_metric()`).
#' @param data_sample_ids
#' Data frame of valid `(dataset_name, age)` pairs, as produced by
#' `align_sample_ids()`.
#' @return
#' A data frame with `"<dataset_name>__<age>"` row names and the
#' same predictor columns as `data_spatial`, sorted by
#' `dataset_name` then `age`. Rows containing any `NA` predictor
#' value are dropped.
#' @details
#' Spatial predictors live at the dataset level (one row per site),
#' but models need one row per sample (site x time-slice), so each
#' dataset's values are replicated across its valid sample ages.
#' The row-name format matches the community matrix and abiotic
#' data. Unlike `prepare_coords_for_fit`, no assumptions are made
#' about which spatial predictor columns are present.
#' @seealso
#' [project_coords_to_metric()], [align_sample_ids()],
#' [assemble_data_to_fit()], [prepare_coords_for_fit()]
#' @export
prepare_spatial_predictors_for_fit <- function(
    data_spatial = NULL,
    data_sample_ids = NULL) {
  assertthat::assert_that(
    is.data.frame(data_spatial),
    msg = "data_spatial must be a data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_sample_ids),
    msg = "data_sample_ids must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "age") %in% names(data_sample_ids)),
    msg = paste0(
      "data_sample_ids must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  assertthat::assert_that(
    nrow(data_spatial) > 0,
    msg = "data_spatial must have at least one row"
  )

  assertthat::assert_that(
    ncol(data_spatial) > 0,
    msg = "data_spatial must have at least one column"
  )

  # Promote row names to an explicit key column so they can be joined.
  data_spatial_keyed <-
    tibble::rownames_to_column(data_spatial, "dataset_name")

  # One row per valid (dataset, age) sample; the dataset's spatial
  # values are replicated across all of its ages.
  data_expanded <-
    dplyr::inner_join(
      data_sample_ids,
      data_spatial_keyed,
      by = dplyr::join_by(dataset_name)
    )

  # Drop incomplete predictor rows, order to match the community
  # matrix, and encode the sample identity in the row names.
  res <-
    data_expanded |>
    tidyr::drop_na() |>
    dplyr::arrange(dataset_name, age) |>
    dplyr::mutate(
      .row_name = paste0(dataset_name, "__", age)
    ) |>
    dplyr::select(-dataset_name, -age) |>
    tibble::column_to_rownames(".row_name")

  return(res)
}
#' @title Get Community Data
#' @description
#' Extracts the `dataset_name` and `data_community` columns from a
#' nested data frame and unnests the latter.
#' @param data
#' A data frame containing at least the columns `dataset_name` and
#' `data_community` (a list-column of nested data frames).
#' @return
#' A data frame of `dataset_name` alongside the unnested contents of
#' `data_community`.
#' @details
#' Input validation confirms the object is a data frame carrying the
#' two required columns before selecting and unnesting.
#' @export
get_community_data <- function(data = NULL) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "data_community") %in% colnames(data)),
    msg = "data must contain columns 'dataset_name' and 'data_community'"
  )

  res <-
    data |>
    dplyr::select(dataset_name, data_community) |>
    tidyr::unnest(data_community)

  return(res)
}
#' @title Run Pipeline
#' @description
#' Executes a targets pipeline from a specified script and saves progress
#' visualization. Prevents execution if default configuration is active.
#' @param sel_script
#' Path to the pipeline script to execute (relative to project root).
#' @param store_suffix
#' Optional character string appended as a sub-directory between the
#' config-derived target store root and the pipeline name.
#' When `NULL` (default) the path is
#' `{target_store}/{pipeline_name}/` - identical to the original
#' behaviour. When set, the path becomes
#' `{target_store}/{store_suffix}/{pipeline_name}/`.
#' Useful when iterating over many spatial units that share one config
#' but each need an isolated store (e.g. `store_suffix = "eu_r01"`).
#' @param level_separation
#' Numeric value controlling the vertical separation between levels in the
#' progress visualization network graph. Default is 100.
#' @param check_default_config
#' Logical indicating whether to check if the default configuration is
#' active and stop execution if TRUE. Default is TRUE.
#' @param fresh_run
#' Logical indicating whether to destroy the existing target store before
#' running the pipeline, forcing all targets to be re-computed from
#' scratch. When `TRUE`, calls
#' `targets::tar_destroy(destroy = "all", store = ...)` prior to
#' `targets::tar_make()`. Default is `FALSE`.
#' @param plot_progress
#' Logical indicating whether to save a progress visualisation after the
#' pipeline completes. Default is TRUE.
#' @return
#' No return value. Function is called for side effects: executes the
#' targets pipeline and saves progress visualization to the documentation
#' folder.
#' @details
#' The function constructs pipeline-specific target store paths based on
#' the script name and active configuration. When `fresh_run = TRUE`,
#' the target store is destroyed with
#' `targets::tar_destroy(destroy = "all")` before execution so that every
#' target is rebuilt from scratch. It uses `targets::tar_make()` to
#' execute the pipeline and then calls `save_progress_visualisation()` to
#' generate a network visualization of the pipeline status.
#'
#' NOTE(review): only the `store_suffix` branch pipes the store path
#' through `here::here()`; the `NULL` branch uses the raw `paste0()`
#' result. Whether both resolve to the same location depends on the
#' `target_store` config value - confirm this asymmetry is intended.
#' @seealso save_progress_visualisation, targets::tar_make
#' @export
run_pipeline <- function(
    sel_script,
    store_suffix = NULL,
    level_separation = 100,
    check_default_config = TRUE,
    plot_progress = TRUE,
    fresh_run = FALSE) {
  assertthat::assert_that(
    is.character(sel_script),
    length(sel_script) == 1,
    msg = "sel_script must be a single string specifying the path to the pipeline script."
  )
  assertthat::assert_that(
    file.exists(sel_script),
    msg = paste(
      "The specified script does not exist:", sel_script, "\n",
      "Please provide a valid path relative to the project root."
    )
  )

  assertthat::assert_that(
    is.null(store_suffix) ||
      (
        is.character(store_suffix) && length(store_suffix) == 1
      ),
    msg = "store_suffix must be NULL or a single string."
  )

  assertthat::assert_that(
    is.numeric(level_separation) &&
      length(level_separation) == 1 &&
      level_separation >= 0,
    msg = "level_separation must be a non-negative number."
  )

  assertthat::assert_that(
    assertthat::is.flag(check_default_config),
    msg = "check_default_config must be a single logical value (TRUE or FALSE)."
  )

  assertthat::assert_that(
    assertthat::is.flag(plot_progress),
    msg = "plot_progress must be a single logical value (TRUE or FALSE)."
  )

  assertthat::assert_that(
    assertthat::is.flag(fresh_run),
    msg = "fresh_run must be a single logical value (TRUE or FALSE)."
  )

  # Refuse to run against the "default" config, which is assumed to
  # be a placeholder: the user must select a concrete configuration.
  if (
    isTRUE(check_default_config) && config::is_active("default")
  ) {
    stop(
      paste(
        "The default config is active. Please set specific config.", "\n",
        "See `config.yaml` for available options.", "\n",
        "use Sys.setenv(R_CONFIG_ACTIVE = 'XXX') to set the config."
      )
    )
  }

  # Resolve the script path against the project root.
  sel_script_path <-
    here::here(sel_script)

  # Pipeline name = script file name without the .R extension.
  sel_pipeline_name <-
    stringr::str_replace(
      string = basename(sel_script_path),
      pattern = ".R$",
      replacement = ""
    )

  # Build the store path: {target_store}/[{store_suffix}/]{pipeline_name}/
  # NOTE(review): only the suffix branch goes through here::here() -
  # confirm the asymmetry with the NULL branch is intentional.
  sel_store_path <-
    if (
      is.null(store_suffix)
    ) {
      paste0(
        get_active_config("target_store"), "/",
        sel_pipeline_name, "/"
      )
    } else {
      {
        paste0(
          get_active_config("target_store"), "/",
          store_suffix, "/",
          sel_pipeline_name, "/"
        )
      } |>
        here::here()
    }

  # Optionally wipe the store so all targets are rebuilt from scratch.
  # TAR_ASK is set to "false" for the duration of tar_destroy() to
  # suppress the interactive confirmation prompt, allowing agents and
  # unattended scripts to run without user input.
  if (
    isTRUE(fresh_run)
  ) {
    withr::with_envvar(
      new = c(TAR_ASK = "false"),
      code = targets::tar_destroy(
        destroy = "all",
        store = sel_store_path
      )
    )
  }

  # Run the pipeline. Wrapped in try() so that a failed pipeline still
  # allows the progress visualisation below to be produced.
  try(
    targets::tar_make(
      script = sel_script_path,
      store = sel_store_path,
      reporter = "verbose"
    ),
    silent = FALSE
  )

  # Optionally render the pipeline network graph showing target status.
  if (
    isTRUE(plot_progress)
  ) {
    save_progress_visualisation(
      sel_script = sel_script_path,
      sel_store = sel_store_path,
      level_separation = level_separation
    )
  }
}
#' @title Make Environmental Formula
#' @description
#' Creates a formula for environmental variables from abiotic data. If an
#' 'age' column is present and `use_age = TRUE`, creates interaction terms
#' between age and other variables. When `use_age = FALSE`, age is excluded
#' from the formula even if the column exists in `data`.
#' @param data
#' A data frame containing abiotic environmental variables. Must have at
#' least one column and one row.
#' @param use_age
#' Logical scalar. If `TRUE` (default) and an 'age' column is present in
#' `data`, produces an interaction formula `~ (var1 + var2 + ...) * age - age`.
#' If `FALSE`, age is stripped from the formula regardless of its presence
#' in `data`, producing a simple additive formula `~ var1 + var2 + ...`.
#' @return
#' A formula object suitable for modeling. If 'age' is present and
#' `use_age = TRUE`, returns a formula with interaction terms (age *
#' variables). Otherwise, returns a simple additive formula.
#' @details
#' The function constructs different formulas based on the presence of an
#' 'age' column and the `use_age` flag:
#' - With age column and `use_age = TRUE`: ~ (var1 + var2 + ...) * age - age
#' - Without age column, or `use_age = FALSE`: ~ var1 + var2 + ...
#' The `- age` term removes the standalone age main effect, keeping only
#' its interactions with the other predictors.
#' @seealso [check_and_prepare_data_for_fit()]
#' @export
make_env_formula <- function(data, use_age = TRUE) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    ncol(data) > 0,
    msg = "data must have at least one column"
  )

  assertthat::assert_that(
    nrow(data) > 0,
    msg = "data must have at least one row"
  )

  assertthat::assert_that(
    assertthat::is.flag(use_age) && !is.na(use_age),
    msg = "use_age must be a single logical value (TRUE or FALSE)"
  )

  vec_names <- colnames(data)

  # Age interactions are built only when the column exists AND the
  # caller opted in via use_age.
  is_age_present <- "age" %in% vec_names && isTRUE(use_age)

  # The age column itself is never a standalone predictor; strip it
  # once here instead of duplicating the logic in both branches.
  vec_predictors <- vec_names[!vec_names %in% c("age")]

  if (
    isTRUE(is_age_present)
  ) {
    assertthat::assert_that(
      length(vec_predictors) > 0,
      msg = "data must have at least one column other than 'age' when 'age' is present"
    )

    # Expected shape for 2 bio variables: ~ (bio1 + bio12) * age - age
    formula_text <-
      paste0(
        " ~ ",
        " (",
        paste0(vec_predictors, collapse = " + "),
        ") * age - age"
      )
  } else {
    assertthat::assert_that(
      length(vec_predictors) > 0,
      msg = "data must have at least one column other than 'age'"
    )

    formula_text <-
      paste0(
        " ~ ",
        paste0(vec_predictors, collapse = " + ")
      )
  }

  # Namespaced call: bare as.formula() relies on the stats package
  # being attached, which is not guaranteed in package context.
  res <- stats::as.formula(formula_text)

  return(res)
}
#' @title Check Presence of VegVault File
#' @description
#' Checks whether the `VegVault.sqlite` file exists in the specified directory.
#' @param relative_path
#' Relative path to the `VegVault.sqlite` file (default: "Data/Input/VegVault.sqlite").
#' @return
#' `TRUE` when the file exists. Stops with an error if the file is not
#' found.
#' @details
#' Verifies the presence of the `VegVault.sqlite` file under the project
#' root (via `here::here()`). If not found, throws an error with
#' instructions to consult the `Data/Input/README.md` file.
#' @export
check_presence_of_vegvault <- function(relative_path = "Data/Input/VegVault.sqlite") {
  # TRUE when the SQLite database exists at the project-relative path.
  vegvault_present <-
    file.exists(
      here::here(relative_path)
    )

  if (
    isFALSE(vegvault_present)
  ) {
    # Fix: the original had an unreachable return(vegvault_present)
    # immediately after this stop() call (dead code); it is removed.
    stop(
      paste(
        "The VegVault.sqlite file is not present in",
        "the `Data/Input/` directory.",
        "Please read the `Data/Input/README.md` file for more information."
      )
    )
  }

  return(vegvault_present)
}
#' @title Compute Bipartite Network Metrics from Community Data
#' @description
#' Takes the community matrix from a `data_to_fit` list, binarizes it
#' (any value > 0 becomes 1), and computes whole-network metrics with
#' `bipartite::networklevel()`, returning a tidy tibble with one row
#' per metric.
#' @param data_to_fit
#' A named list as produced by `assemble_data_to_fit()`. Must hold the
#' element `data_community_to_fit`, a numeric matrix with samples as
#' rows and taxa as columns.
#' @param vec_indices
#' Character vector of network-level indices, forwarded to the `index`
#' argument of `bipartite::networklevel()`. Default:
#' `c("connectance", "nestedness", "modularity")`.
#' @return
#' A `tibble` with columns `metric` (character, metric name) and
#' `value` (numeric, computed value).
#' @details
#' The community matrix is treated as a lower-level (samples) x
#' upper-level (taxa) bipartite network. Only presence/absence is
#' used: every value > 0 becomes 1 before the bipartite call. An
#' error is raised when the binarized matrix has no positive entries.
#' @seealso [assemble_data_to_fit()],
#' [binarize_community_data()]
#' @export
compute_network_metrics <- function(
    data_to_fit = NULL,
    vec_indices = c("connectance", "nestedness", "modularity")) {
  assertthat::assert_that(
    is.list(data_to_fit),
    msg = "data_to_fit must be a list"
  )

  assertthat::assert_that(
    "data_community_to_fit" %in% names(data_to_fit),
    msg = "data_to_fit must contain an element named 'data_community_to_fit'"
  )

  data_community_matrix <-
    purrr::chuck(data_to_fit, "data_community_to_fit")

  assertthat::assert_that(
    is.matrix(data_community_matrix),
    msg = "data_to_fit$data_community_to_fit must be a matrix"
  )

  assertthat::assert_that(
    is.numeric(data_community_matrix),
    msg = "data_to_fit$data_community_to_fit must be a numeric matrix"
  )

  assertthat::assert_that(
    is.character(vec_indices),
    msg = "vec_indices must be a character vector"
  )

  assertthat::assert_that(
    length(vec_indices) >= 1L,
    msg = "vec_indices must contain at least one index name"
  )

  # Presence/absence incidence matrix: any positive abundance -> 1.
  mat_incidence <- 1L * (data_community_matrix > 0)

  assertthat::assert_that(
    sum(mat_incidence) > 0,
    msg = paste(
      "The binarized community matrix contains no positive",
      "entries; cannot compute network metrics."
    )
  )

  # Whole-network indices; result is a named numeric vector.
  vec_metrics <-
    bipartite::networklevel(
      web = mat_incidence,
      index = vec_indices
    )

  # Tidy long format: one row per metric.
  res <-
    tibble::tibble(
      metric = names(vec_metrics),
      value = as.numeric(vec_metrics)
    )

  return(res)
}
#' @title Filter Community Matrix by Minimum Number of Taxa
#' @description
#' Guard against fitting a joint species distribution model on data
#' with too few taxa. The matrix is passed through unchanged when it
#' has at least `min_n_taxa` columns; otherwise execution stops with
#' an informative error, avoiding wasteful model fits and meaningless
#' species-species associations from near-empty communities.
#' @param data_community_matrix
#' Numeric matrix, samples as rows and taxa as columns, as produced
#' by `filter_constant_taxa()`.
#' @param min_n_taxa
#' Single positive integer: the minimum taxon (column) count needed
#' to proceed with model fitting. Default is 5.
#' @return
#' The unchanged input matrix when
#' `ncol(data_community_matrix) >= min_n_taxa`.
#' @details
#' Columns are counted after all upstream taxon-level filtering
#' (`filter_rare_taxa()`, `filter_community_by_n_cores()`,
#' `filter_by_n_samples()`, `filter_constant_taxa()`). Below the
#' threshold, `cli::cli_abort()` reports both the actual count and
#' the threshold so the user can adjust the configuration or data.
#' @seealso [filter_constant_taxa()], [assemble_data_to_fit()]
#' @export
filter_community_by_n_taxa <- function(
    data_community_matrix = NULL,
    min_n_taxa = 5) {
  assertthat::assert_that(
    is.matrix(data_community_matrix),
    msg = "data_community_matrix must be a matrix"
  )

  assertthat::assert_that(
    is.numeric(min_n_taxa) && length(min_n_taxa) == 1,
    msg = "min_n_taxa must be a single numeric value"
  )

  assertthat::assert_that(
    min_n_taxa >= 1,
    msg = "min_n_taxa must be greater than or equal to 1"
  )

  # n_taxa is interpolated into the abort message below via glue.
  n_taxa <- ncol(data_community_matrix)

  # Guard clause: abort early when the community is too small to model.
  if (n_taxa < min_n_taxa) {
    cli::cli_abort(
      c(
        "Too few taxa remain after filtering to run the model.",
        "i" = "Found {n_taxa} taxa but at least {min_n_taxa} are required.",
        "i" = "Adjust `min_n_taxa` in the configuration or review upstream filtering steps."
      )
    )
  }

  return(data_community_matrix)
}
#' @title Filter Non-Plantae Taxa
#' @description
#' Drops all taxa that are not classified as kingdom Plantae from a
#' community data frame, using a classification table to resolve each
#' taxon's kingdom.
#' @param data
#' Community data frame holding at minimum a 'taxon' column.
#' @param data_classification_table
#' Data frame mapping taxon names to kingdoms via the columns
#' 'sel_name' and 'kingdom'.
#' @return
#' A data frame with the same structure as `data`, minus all rows of
#' non-Plantae taxa. Taxa whose kingdom is `NA` (unclassifiable) are
#' removed as well.
#' @details
#' A left join of `data` onto `data_classification_table`
#' ('taxon' == 'sel_name') attaches each taxon's kingdom. Any taxon
#' whose kingdom is not exactly "Plantae" - including `NA` - is
#' treated as non-plant and removed. When taxa are dropped,
#' `cli::cli_warn()` reports the count and the removed names.
#' Note: the upstream `get_taxa_classification()` already restricts
#' the taxospace lookup to Plantae, so in practice this function
#' mostly catches genuinely unclassifiable taxa (absent from every
#' classification source).
#' @seealso [classify_taxonomic_resolution()], [filter_rare_taxa()]
#' @export
filter_non_plantae_taxa <- function(data, data_classification_table) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    "taxon" %in% colnames(data),
    msg = "data must contain a 'taxon' column"
  )

  assertthat::assert_that(
    is.data.frame(data_classification_table),
    msg = "data_classification_table must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("sel_name", "kingdom") %in%
        colnames(data_classification_table)
    ),
    msg = paste(
      "data_classification_table must contain",
      "columns: 'sel_name' and 'kingdom'"
    )
  )

  # Attach each taxon's kingdom; unmatched taxa get kingdom = NA.
  data_joined <-
    dplyr::left_join(
      data,
      dplyr::select(data_classification_table, sel_name, kingdom),
      by = dplyr::join_by("taxon" == "sel_name")
    )

  # Names interpolated into the cli warning below must keep this name.
  vec_dropped_taxa <-
    data_joined |>
    dplyr::filter(
      is.na(kingdom) | kingdom != "Plantae"
    ) |>
    dplyr::distinct(taxon) |>
    dplyr::pull(taxon)

  if (length(vec_dropped_taxa) > 0) {
    cli::cli_warn(
      c(
        "!" = "{base::length(vec_dropped_taxa)} taxon/taxa {?was/were} removed as non-Plantae or unclassified.",
        "i" = "Removed: {.val {vec_dropped_taxa}}"
      )
    )
  }

  # Keep only confirmed Plantae rows and drop the helper column.
  res <-
    data_joined |>
    dplyr::filter(
      !is.na(kingdom) & kingdom == "Plantae"
    ) |>
    dplyr::select(-kingdom)

  return(res)
}
| 1 |
#' @title Get and Cache a CHELSA-TraCE21k Raster |
|
| 2 |
#' @description |
|
| 3 |
#' Downloads a cropped CHELSA-TraCE21k raster for a given |
|
| 4 |
#' bioclim variable and age slice. If a cached `.tif` already |
|
| 5 |
#' exists in `cache_dir` it is returned immediately without |
|
| 6 |
#' re-downloading. Absolute-temperature variables (`bio1`, |
|
| 7 |
#' `bio6`) are corrected from Kelvin to degrees Celsius |
|
| 8 |
#' (subtract 273.15) before the raster is written to cache. |
|
| 9 |
#' @param chelsa_var |
|
| 10 |
#' Character scalar. Project-level bioclim variable name, |
|
| 11 |
#' e.g. `"bio1"`, `"bio4"`, `"bio12"`. Single-digit numbers |
|
| 12 |
#' are zero-padded internally to match CHELSA file names |
|
| 13 |
#' (`"bio1"` becomes `"bio01"` in the URL). |
|
| 14 |
#' @param age |
|
| 15 |
#' Numeric or integer scalar. Age in years BP used to encode |
|
| 16 |
#' the CHELSA-TraCE21k time step (e.g. `1000` encodes as |
|
| 17 |
#' time step `"-010"`). |
|
| 18 |
#' @param x_lim |
|
| 19 |
#' Numeric vector of length 2. Longitude extent |
|
| 20 |
#' `c(min, max)` for cropping the downloaded raster. |
|
| 21 |
#' @param y_lim |
|
| 22 |
#' Numeric vector of length 2. Latitude extent |
|
| 23 |
#' `c(min, max)` for cropping the downloaded raster. |
|
| 24 |
#' @param cache_dir |
|
| 25 |
#' Character scalar. Path to the directory where the cropped |
|
| 26 |
#' raster is cached as `{chelsa_var}_{age}_{xmin}_{xmax}_{ymin}_{ymax}.tif`.
|
|
| 27 |
#' The extent values are rounded to 2 decimal places so the cache |
|
| 28 |
#' is shared across calls with identical extents. The directory must |
|
| 29 |
#' already exist before calling this function. |
|
| 30 |
#' @return |
|
| 31 |
#' A `terra::SpatRaster` cropped to `x_lim` / `y_lim`, with |
|
| 32 |
#' corrected units: degrees Celsius for `bio1` and `bio6`; |
|
| 33 |
#' original CHELSA units for all other variables. |
|
| 34 |
#' @details |
|
| 35 |
#' CHELSA-TraCE21k time steps: `age = 0` (present) maps to |
|
| 36 |
#' the special step `"0000"`; all other ages use |
|
| 37 |
#' `sprintf("-%03d", age %/% 100)`, so `age = 100` gives
|
|
| 38 |
#' `"-001"` and `age = 1000` gives `"-010"`. |
|
| 39 |
#' |
|
| 40 |
#' The remote raster is accessed via GDAL `/vsicurl/`, so an |
|
| 41 |
#' internet connection is required the first time each |
|
| 42 |
#' `(chelsa_var, age, x_lim, y_lim)` combination is requested. |
|
| 43 |
#' Subsequent calls load from the cached `.tif` and need no |
|
| 44 |
#' connection. The geographic extent (rounded to 2 decimal |
|
| 45 |
#' places) is embedded in the cache filename, so rasters with |
|
| 46 |
#' different extents are always stored separately. |
|
| 47 |
#' |
|
| 48 |
#' Kelvin correction: `bio1` (Mean Annual Temperature) and |
|
| 49 |
#' `bio6` (Min Temperature of Coldest Month) are absolute |
|
| 50 |
#' temperatures stored in Kelvin; 273.15 is subtracted before |
|
| 51 |
#' caching. Range and seasonality variables (`bio4`, `bio7`) |
|
| 52 |
#' are differences or standard deviations — no offset needed. |
|
| 53 |
#' @seealso |
|
| 54 |
#' [interpolate_mev_to_grid()], |
|
| 55 |
#' [interpolate_st_mev_to_grid()], |
|
| 56 |
#' [project_coords_to_metric()] |
|
| 57 |
#' @export |
|
| 58 |
get_chelsa_raster <- function(
    chelsa_var = NULL,
    age = NULL,
    x_lim = NULL,
    y_lim = NULL,
    cache_dir = NULL) {
  # Validate arguments ----
  assertthat::assert_that(
    assertthat::is.string(chelsa_var),
    msg = "chelsa_var must be a single character string"
  )

  assertthat::assert_that(
    (is.numeric(age) || is.integer(age)) && length(age) == 1L,
    msg = "age must be a single numeric or integer value"
  )

  assertthat::assert_that(
    is.numeric(x_lim) && length(x_lim) == 2L,
    msg = "x_lim must be a numeric vector of length 2"
  )

  assertthat::assert_that(
    is.numeric(y_lim) && length(y_lim) == 2L,
    msg = "y_lim must be a numeric vector of length 2"
  )

  assertthat::assert_that(
    assertthat::is.string(cache_dir),
    base::dir.exists(cache_dir),
    msg = "cache_dir must be a string path to an existing directory"
  )

  # 1. Translate the project variable name to CHELSA's naming ----
  # Single-digit bio numbers are zero-padded: "bio1" -> "bio01"
  var_name_padded <-
    stringr::str_replace(chelsa_var, "^bio(\\d)$", "bio0\\1")

  # 2. Assemble the cache file path ----
  # The rounded extent is embedded in the filename so crops of
  # different geographic areas never collide in the cache.
  extent_tag <-
    base::paste(
      base::round(base::min(x_lim), 2L),
      base::round(base::max(x_lim), 2L),
      base::round(base::min(y_lim), 2L),
      base::round(base::max(y_lim), 2L),
      sep = "_"
    )

  cache_file <-
    base::file.path(
      cache_dir,
      base::paste0(
        chelsa_var, "_", age, "_", extent_tag, ".tif"
      )
    )

  # 3. Short-circuit: serve from cache when available ----
  if (base::file.exists(cache_file)) {
    return(terra::rast(cache_file))
  }

  # 4. Fetch and crop the remote CHELSA-TraCE21k raster ----
  url_base <-
    base::paste0(
      "/vsicurl/https://os.zhdk.cloud.switch.ch/",
      "chelsa01/chelsa_trace21k/global/bioclim/"
    )

  # Present day (age 0) uses the special step "0000";
  # all other ages encode as sprintf("-%03d", age %/% 100).
  age_int <- base::as.integer(age)

  time_step <-
    if (age_int == 0L) {
      "0000"
    } else {
      base::sprintf(
        "-%03d",
        age_int %/% 100L
      )
    }

  url_raster <-
    base::paste0(
      url_base,
      var_name_padded, "/",
      "CHELSA_TraCE21k_",
      var_name_padded, "_",
      time_step,
      "_V.1.0.tif"
    )

  crop_extent <-
    terra::ext(
      base::min(x_lim), base::max(x_lim),
      base::min(y_lim), base::max(y_lim)
    )

  rast_cropped <-
    terra::rast(url_raster) |>
    terra::crop(y = crop_extent)

  # 5. Kelvin -> Celsius correction where needed ----
  # bio1 (mean annual temp) and bio6 (min temp coldest month) are
  # absolute temperatures stored in Kelvin; bio4 (temp seasonality)
  # is a std dev — no offset needed.
  vec_kelvin_vars <- c("bio1", "bio6")

  rast_out <-
    if (chelsa_var %in% vec_kelvin_vars) {
      rast_cropped - 273.15
    } else {
      rast_cropped
    }

  # 6. Persist to cache, then return ----
  terra::writeRaster(rast_out, cache_file, overwrite = TRUE)

  return(rast_out)
}
| 1 |
#' @title Get Spatial Model Fitting Parameters from Grid Catalogue |
|
| 2 |
#' @description |
|
| 3 |
#' Retrieves the model fitting parameters for a given spatial unit ID |
|
| 4 |
#' from the project's spatial grid CSV catalogue. |
|
| 5 |
#' @param scale_id |
|
| 6 |
#' A single character string identifying the spatial unit. |
|
| 7 |
#' Must match exactly one row in the catalogue file. |
|
| 8 |
#' @param file |
|
| 9 |
#' Path to the spatial grid CSV file. |
|
| 10 |
#' Default: `here::here("Data/Input/spatial_grid.csv")`.
|
|
| 11 |
#' @return |
|
| 12 |
#' A named list with five elements: |
|
| 13 |
#' \describe{
|
|
| 14 |
#' \item{`n_iter`}{Integer. Number of training iterations.}
|
|
| 15 |
#' \item{`n_step_size`}{
|
|
| 16 |
#' Integer or `NULL`. SGD mini-batch size. |
|
| 17 |
#' `NULL` means auto (10 \% of sites), corresponding to |
|
| 18 |
#' an `NA` value in the CSV. |
|
| 19 |
#' } |
|
| 20 |
#' \item{`n_sampling`}{
|
|
| 21 |
#' Integer. Monte Carlo samples per epoch. |
|
| 22 |
#' } |
|
| 23 |
#' \item{`n_samples_anova`}{
|
|
| 24 |
#' Integer. Monte Carlo samples for ANOVA |
|
| 25 |
#' variation partitioning. |
|
| 26 |
#' } |
|
| 27 |
#' \item{`n_early_stopping`}{
|
|
| 28 |
#' Integer or `NULL`. Early stopping patience — number of epochs |
|
| 29 |
#' without loss improvement before training halts. |
|
| 30 |
#' `NULL` means auto (20 \% of `iter`), corresponding to |
|
| 31 |
#' an `NA` value in the CSV. |
|
| 32 |
#' Passed as the `n_early_stopping` argument of |
|
| 33 |
#' `fit_jsdm_model()`. |
|
| 34 |
#' } |
|
| 35 |
#' } |
|
| 36 |
#' @details |
|
| 37 |
#' Reads the CSV using `readr::read_csv`, filters to the row whose |
|
| 38 |
#' `scale_id` column matches the supplied `scale_id` argument, and |
|
| 39 |
#' constructs the parameter list. Validation ensures the file is |
|
| 40 |
#' readable, has a `.csv` extension, contains the required columns, |
|
| 41 |
#' and that exactly one row matches the requested `scale_id`. |
|
| 42 |
#' `NA` values in the `n_step_size` and `n_early_stopping` columns are |
|
| 43 |
#' converted to `NULL`. |
|
| 44 |
#' @seealso get_spatial_window, get_active_config |
|
| 45 |
#' @export |
|
| 46 |
get_spatial_model_params <- function(
    scale_id,
    file = here::here("Data/Input/spatial_grid.csv")) {
  assertthat::assert_that(
    is.character(scale_id) && length(scale_id) == 1,
    msg = paste0(
      "`scale_id` must be a single character string.",
      " Got length: ", length(scale_id)
    )
  )

  assertthat::assert_that(
    assertthat::is.readable(file) &&
      assertthat::has_extension(file, "csv"),
    msg = "`file` must be a readable CSV file."
  )

  # Load the full spatial grid catalogue
  data_catalogue <-
    readr::read_csv(
      file = file,
      show_col_types = FALSE
    )

  vec_required_cols <-
    c(
      "scale_id",
      "n_iter",
      "n_step_size",
      "n_sampling",
      "n_samples_anova",
      "n_early_stopping"
    )

  assertthat::assert_that(
    base::all(vec_required_cols %in% base::names(data_catalogue)),
    msg = paste0(
      "`file` must contain columns: ",
      base::paste(vec_required_cols, collapse = ", "), "."
    )
  )

  # Keep only the row describing the requested spatial unit
  # (.data/.env disambiguate the column from the function argument)
  data_match <-
    dplyr::filter(
      data_catalogue,
      .data$scale_id == .env$scale_id
    )

  assertthat::assert_that(
    base::nrow(data_match) == 1,
    msg = paste0(
      "Expected exactly 1 row for scale_id '", scale_id, "'.",
      " Found: ", base::nrow(data_match)
    )
  )

  # An NA in the CSV means "auto" downstream, which is encoded as NULL
  convert_na_to_null <- function(value) {
    if (base::is.na(value)) {
      NULL
    } else {
      value
    }
  }

  res <-
    list(
      n_iter = dplyr::pull(data_match, n_iter),
      n_step_size = convert_na_to_null(
        dplyr::pull(data_match, n_step_size)
      ),
      n_sampling = dplyr::pull(data_match, n_sampling),
      n_samples_anova = dplyr::pull(data_match, n_samples_anova),
      n_early_stopping = convert_na_to_null(
        dplyr::pull(data_match, n_early_stopping)
      )
    )

  return(res)
}
| 1 |
#' @title Get Taxa Classification |
|
| 2 |
#' @description |
|
| 3 |
#' Retrieves taxonomic classification for a vector of taxa using the |
|
| 4 |
#' taxospace package. Filters results to include only plant taxa. |
|
| 5 |
#' @param data A character vector of taxon names to classify. |
|
| 6 |
#' @return A data frame with columns for selected name, taxonomic name, |
|
| 7 |
#' rank, and id. |
|
| 8 |
#' @details |
|
| 9 |
#' Uses taxospace::get_classification to retrieve classification. Flags and |
|
| 10 |
#' filters for plant taxa (kingdom Plantae). Returns an empty tibble if no |
|
| 11 |
#' plant taxa found. |
|
| 12 |
#' @export |
|
| 13 |
get_taxa_classification <- function(data) {
  # Fail fast with a clear error if the classification backend is
  # missing. (The previous `require()` call would only warn and
  # return FALSE, deferring the failure to the first taxospace call.)
  assertthat::assert_that(
    base::requireNamespace("taxospace", quietly = TRUE),
    msg = "The 'taxospace' package must be installed"
  )

  assertthat::assert_that(
    is.character(data) && length(data) > 0,
    msg = "data must be a non-empty character vector"
  )

  # Zero-row tibble returned whenever no (plant) classification is
  # found. All columns must be empty: mixing a length-n `sel_name`
  # with zero-length columns is a tibble size error at runtime.
  make_empty_result <- function() {
    tibble::tibble(
      sel_name = character(),
      name = character(),
      rank = character(),
      id = integer()
    )
  }

  res_classification <-
    taxospace::get_classification(
      taxa_vec = data,
      # this is done so that the best match is returned
      # even if the result is not flagged as "exact"
      use_only_exact_match = FALSE
    )

  # No classification column at all -> nothing could be classified
  if (
    !"classification" %in% names(res_classification)
  ) {
    return(make_empty_result())
  }

  res_plant <-
    res_classification %>%
    # flag taxa whose classification lists kingdom Plantae
    dplyr::mutate(
      is_plant = purrr::map_lgl(
        .x = classification,
        .f = ~ .x %>%
          dplyr::filter(rank == "kingdom") %>%
          dplyr::pull(name) %>%
          stringr::str_detect("Plantae") %>%
          any()
      )
    ) %>%
    # filter only plant taxa
    dplyr::filter(is_plant)

  if (
    isTRUE(nrow(res_plant) == 0)
  ) {
    return(make_empty_result())
  }

  # Unnest the per-taxon classification tables into one long tibble
  res_plant %>%
    dplyr::select(sel_name, classification) %>%
    tidyr::unnest(classification)
}
| 1 |
#' @title Project Geographic Coordinates to Metric (km) |
|
| 2 |
#' @description |
|
| 3 |
#' Converts WGS84 lon/lat coordinates to planar metric |
|
| 4 |
#' coordinates in kilometres using a user-specified equal-area |
|
| 5 |
#' projection. The original lon/lat columns are retained |
|
| 6 |
#' alongside the projected `coord_x_km` and `coord_y_km` |
|
| 7 |
#' columns. |
|
| 8 |
#' @param data_coords |
|
| 9 |
#' A data frame with `dataset_name` as row names and columns |
|
| 10 |
#' `coord_long` and `coord_lat` (decimal degrees, WGS84), as |
|
| 11 |
#' returned by `get_coords()`. |
|
| 12 |
#' @param target_crs |
|
| 13 |
#' An integer EPSG code for the target projected coordinate |
|
| 14 |
#' reference system. Must be a metric (planar) equal-area |
|
| 15 |
#' CRS suitable for the geographic extent of the data. |
|
| 16 |
#' Defaults to `3035L` (ETRS89 Lambert Azimuthal Equal-Area, |
|
| 17 |
#' the standard for European analyses). For other regions, |
|
| 18 |
#' choose an appropriate equal-area projection, e.g.: |
|
| 19 |
#' * `6933L` ā EASE-Grid 2.0 (global) |
|
| 20 |
#' * `5070L` ā NAD83 Conus Albers (North America) |
|
| 21 |
#' * `102022L` ā Africa Albers Equal Area Conic (Africa) |
|
| 22 |
#' @return |
|
| 23 |
#' A data frame with the same row names and columns as |
|
| 24 |
#' `data_coords`, plus two additional columns: |
|
| 25 |
#' \describe{
|
|
| 26 |
#' \item{`coord_x_km`}{Easting in kilometres
|
|
| 27 |
#' (target CRS).} |
|
| 28 |
#' \item{`coord_y_km`}{Northing in kilometres
|
|
| 29 |
#' (target CRS).} |
|
| 30 |
#' } |
|
| 31 |
#' @details |
|
| 32 |
#' Using metric coordinates instead of degrees is necessary |
|
| 33 |
#' for distance-based spatial modelling (e.g., Moran |
|
| 34 |
#' eigenvectors) because degree distances are not uniform |
|
| 35 |
#' across latitudes. The `target_crs` default of EPSG:3035 |
|
| 36 |
#' is the standard equal-area projection for European |
|
| 37 |
#' spatial analyses; change it for non-European study |
|
| 38 |
#' regions. |
|
| 39 |
#' |
|
| 40 |
#' The function requires the \pkg{sf} package to perform
|
|
| 41 |
#' coordinate transformation. Dataset names (row names) are |
|
| 42 |
#' preserved unchanged in the output. |
|
| 43 |
#' @seealso [get_coords()], [prepare_spatial_predictors_for_fit()] |
|
| 44 |
#' @export |
|
| 45 |
project_coords_to_metric <- function(
    data_coords = NULL,
    target_crs = 3035L) {
  # Validate inputs ----
  assertthat::assert_that(
    is.data.frame(data_coords),
    msg = "data_coords must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("coord_long", "coord_lat") %in% names(data_coords)
    ),
    msg = paste0(
      "data_coords must contain columns",
      " 'coord_long' and 'coord_lat'"
    )
  )

  assertthat::assert_that(
    nrow(data_coords) > 0,
    msg = "data_coords must have at least one row"
  )

  assertthat::assert_that(
    is.numeric(target_crs) || is.integer(target_crs),
    length(target_crs) == 1,
    !is.na(target_crs),
    target_crs > 0,
    msg = "target_crs must be a single positive integer EPSG code"
  )

  target_crs <- base::as.integer(target_crs)

  # Remember dataset names so the output keeps identical row names
  vec_row_names <-
    rownames(data_coords)

  # Build sf points in WGS84, then reproject to the target CRS
  sf_points <-
    data_coords |>
    tibble::rownames_to_column("dataset_name") |>
    sf::st_as_sf(
      coords = c("coord_long", "coord_lat"),
      crs = 4326
    )

  # Planar coordinates in metres of the target CRS
  coords_matrix <-
    sf_points |>
    sf::st_transform(crs = target_crs) |>
    sf::st_coordinates()

  # Append km-scaled coordinates next to the original lon/lat columns
  res <-
    dplyr::mutate(
      data_coords,
      coord_x_km = coords_matrix[, "X"] / 1000,
      coord_y_km = coords_matrix[, "Y"] / 1000
    )

  rownames(res) <- vec_row_names

  return(res)
}
| 1 |
#' @title Calculate the Total Pollen Count for Each Sample |
|
| 2 |
#' @description |
|
| 3 |
#' This function computes the total pollen count for each sample |
|
| 4 |
#' in the provided dataset. |
|
| 5 |
#' @param data |
|
| 6 |
#' A data frame containing at least two columns: |
|
| 7 |
#' `sample_name` (the name or identifier of the sample) |
|
| 8 |
#' and `pollen_count` (the count of pollen for each observation). |
|
| 9 |
#' @return |
|
| 10 |
#' A data frame with two columns: `sample_name` and `pollen_sum`, |
|
| 11 |
#' where `pollen_sum` is the total pollen count |
|
| 12 |
#' for each sample. |
|
| 13 |
#' @details |
|
| 14 |
#' The function groups the data by `sample_name`, |
|
| 15 |
#' calculates the sum of `pollen_count` for each group, |
|
| 16 |
#' and removes any grouping structure before returning the result. |
|
| 17 |
#' Missing values (`NA`) in `pollen_count` are ignored. |
|
| 18 |
#' @export |
|
| 19 |
get_pollen_sum <- function(data) {
  # Total pollen count per sample; NA counts are ignored in the sum.
  data |>
    dplyr::group_by(sample_name) |>
    dplyr::summarize(
      pollen_sum = sum(pollen_count, na.rm = TRUE),
      .groups = "drop"
    )
}
| 1 |
#' @title Align Sample IDs Across Data Streams |
|
| 2 |
#' @description |
|
| 3 |
#' Computes the intersection of valid `(dataset_name, age)` pairs |
|
| 4 |
#' present in all three data sources: community (long format), |
|
| 5 |
#' abiotic (long format), and coordinate data. The returned table |
|
| 6 |
#' serves as the canonical sample index used by all subsequent |
|
| 7 |
#' data-preparation functions. |
|
| 8 |
#' @param data_community_long |
|
| 9 |
#' A data frame in long format containing at least the columns |
|
| 10 |
#' `dataset_name` and `age`. |
|
| 11 |
#' @param data_abiotic_long |
|
| 12 |
#' A data frame in long format containing at least the columns |
|
| 13 |
#' `dataset_name` and `age`. |
|
| 14 |
#' @param data_coords |
|
| 15 |
#' A data frame of spatial coordinates with `dataset_name` stored |
|
| 16 |
#' as row names and columns `coord_long` and `coord_lat`. |
|
| 17 |
#' @param subset_age |
|
| 18 |
#' Optional numeric vector specifying age(s) to retain. If `NULL` |
|
| 19 |
#' (default) all intersecting ages are kept. |
|
| 20 |
#' @return |
|
| 21 |
#' A data frame with columns `dataset_name` and `age`, containing |
|
| 22 |
#' only the `(dataset_name, age)` pairs present in all three |
|
| 23 |
#' inputs, arranged by `dataset_name` then `age`. When |
|
| 24 |
#' `subset_age` is supplied the result is further filtered to |
|
| 25 |
#' those ages. |
|
| 26 |
#' @details |
|
| 27 |
#' All three-way intersection logic is performed while the data |
|
| 28 |
#' remain in long (tidy) format, so no rowname parsing is needed. |
|
| 29 |
#' Passing a `subset_age` value here — rather than inside a |
|
| 30 |
#' downstream modelling function — makes the subsetting explicit |
|
| 31 |
#' and cacheable as a pipeline target. |
|
| 32 |
#' @seealso [prepare_community_for_fit()], |
|
| 33 |
#' [prepare_abiotic_for_fit()], [prepare_coords_for_fit()] |
|
| 34 |
#' @export |
|
| 35 |
align_sample_ids <- function(
    data_community_long = NULL,
    data_abiotic_long = NULL,
    data_coords = NULL,
    subset_age = NULL) {
  # Validate inputs ----
  assertthat::assert_that(
    is.data.frame(data_community_long),
    msg = "data_community_long must be a data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_abiotic_long),
    msg = "data_abiotic_long must be a data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_coords),
    msg = "data_coords must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "age") %in% names(data_community_long)),
    msg = paste0(
      "data_community_long must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  assertthat::assert_that(
    all(c("dataset_name", "age") %in% names(data_abiotic_long)),
    msg = paste0(
      "data_abiotic_long must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  # 1. Distinct (dataset_name, age) pairs per data source ----
  ids_community <-
    dplyr::distinct(data_community_long, dataset_name, age)

  ids_abiotic <-
    dplyr::distinct(data_abiotic_long, dataset_name, age)

  # Coordinates carry dataset_name as row names, not a column
  ids_coords <-
    data_coords |>
    tibble::rownames_to_column("dataset_name") |>
    dplyr::distinct(dataset_name)

  # 2. Keep only pairs present in all three sources ----
  res <-
    ids_community |>
    dplyr::inner_join(
      ids_abiotic,
      by = dplyr::join_by(dataset_name, age)
    ) |>
    dplyr::inner_join(
      ids_coords,
      by = dplyr::join_by(dataset_name)
    ) |>
    dplyr::distinct() |>
    dplyr::arrange(dataset_name, age)

  # 3. Optional explicit age subset ----
  if (
    !is.null(subset_age)
  ) {
    assertthat::assert_that(
      is.numeric(subset_age),
      msg = "subset_age must be numeric"
    )

    res <-
      dplyr::filter(res, age %in% subset_age)
  }

  return(res)
}
| 1 |
#' @title Extract ANOVA Fraction Components from a Single Object |
|
| 2 |
#' @description |
|
| 3 |
#' Extracts specified variance partitioning fractions from a |
|
| 4 |
#' single sjSDManova object, returning a two-column tibble with |
|
| 5 |
#' fraction codes and their Nagelkerke R² values. |
|
| 6 |
#' @param anova_object |
|
| 7 |
#' A single sjSDManova-like object containing a |
|
| 8 |
#' \code{$results} element with columns \code{models} and
|
|
| 9 |
#' \code{"R2 Nagelkerke"}. Must be a list.
|
|
| 10 |
#' @param vec_anova_fractions |
|
| 11 |
#' A non-empty character vector of fraction codes to retain |
|
| 12 |
#' (e.g. \code{c("F_A", "F_B", "F_S")}).
|
|
| 13 |
#' @param clamp_negative |
|
| 14 |
#' A single logical (default \code{TRUE}). If \code{TRUE},
|
|
| 15 |
#' negative Nagelkerke R² values are clamped to 0. |
|
| 16 |
#' @return |
|
| 17 |
#' A tibble with columns: |
|
| 18 |
#' \describe{
|
|
| 19 |
#' \item{component}{Character. Human-readable component label
|
|
| 20 |
#' (e.g. "Abiotic", "Associations", "Spatial").} |
|
| 21 |
#' \item{R2_Nagelkerke}{Numeric. Nagelkerke R² value for
|
|
| 22 |
#' each fraction, optionally clamped to [0, Inf).} |
|
| 23 |
#' } |
|
| 24 |
#' @details |
|
| 25 |
#' Accesses \code{anova_object$results} via
|
|
| 26 |
#' \code{purrr::chuck()}, filters rows whose \code{models}
|
|
| 27 |
#' value is in \code{vec_anova_fractions}, renames the
|
|
| 28 |
#' columns to \code{component} and \code{R2_Nagelkerke},
|
|
| 29 |
#' translates internal fraction codes to human-readable |
|
| 30 |
#' labels, and (when \code{clamp_negative = TRUE}) clamps
|
|
| 31 |
#' negative R² values to 0: |
|
| 32 |
#' \itemize{
|
|
| 33 |
#' \item F_A -> "Abiotic" |
|
| 34 |
#' \item F_B -> "Associations" |
|
| 35 |
#' \item F_S -> "Spatial" |
|
| 36 |
#' \item F_AB -> "Abiotic&Associations" |
|
| 37 |
#' \item F_AS -> "Abiotic&Spatial" |
|
| 38 |
#' \item F_BS -> "Associations&Spatial" |
|
| 39 |
#' \item F_ABS -> "Abiotic&Associations&Spatial" |
|
| 40 |
#' } |
|
| 41 |
#' @seealso [aggregate_anova_components()] |
|
| 42 |
#' @export |
|
| 43 |
extract_anova_fractions <- function(
    anova_object,
    vec_anova_fractions = c("F_A", "F_B", "F_S", "F_AB", "F_AS", "F_BS", "F_ABS"),
    clamp_negative = TRUE) {
  assertthat::assert_that(
    base::is.list(anova_object),
    msg = "'anova_object' must be a list."
  )

  assertthat::assert_that(
    base::is.character(vec_anova_fractions),
    msg = "'vec_anova_fractions' must be a character vector."
  )

  assertthat::assert_that(
    base::length(vec_anova_fractions) > 0,
    msg = "'vec_anova_fractions' must not be empty."
  )

  assertthat::assert_that(
    assertthat::is.flag(clamp_negative),
    msg = "'clamp_negative' must be a single logical value."
  )

  # Map internal fraction codes to human-readable labels
  # following the convention used in variance partitioning
  # plots
  vec_component_labels <-
    c(
      "F_A" = "Abiotic",
      "F_B" = "Associations",
      "F_S" = "Spatial",
      "F_AB" = "Abiotic&Associations",
      "F_AS" = "Abiotic&Spatial",
      "F_BS" = "Associations&Spatial",
      "F_ABS" = "Abiotic&Associations&Spatial"
    )

  res <-
    anova_object |>
    purrr::chuck("results") |>
    dplyr::filter(
      models %in% vec_anova_fractions
    ) |>
    dplyr::select(
      component = "models",
      R2_Nagelkerke = "R2 Nagelkerke"
    ) |>
    dplyr::mutate(
      # Replace internal codes with human-readable labels
      component = vec_component_labels[component]
    )

  # `clamp_negative` is a scalar flag, so clamp the whole column at
  # once with vectorised pmax() instead of evaluating the condition
  # per element inside purrr::map2_dbl() as the previous version did.
  if (clamp_negative) {
    res <-
      res |>
      dplyr::mutate(
        R2_Nagelkerke = pmax(R2_Nagelkerke, 0)
      )
  }

  return(res)
}
| 1 |
#' @title Filter Taxa by Minimum Number of Spatio-Temporal Samples |
|
| 2 |
#' @description |
|
| 3 |
#' Filters out taxa that are not present in a sufficient number of |
|
| 4 |
#' spatio-temporal samples (distinct dataset-age combinations). Only |
|
| 5 |
#' taxa occurring in at least `min_n_samples` distinct |
|
| 6 |
#' `(dataset_name, age)` combinations are retained. This removes |
|
| 7 |
#' taxa that are present in too few interpolated time steps to |
|
| 8 |
#' provide reliable co-occurrence signal. |
|
| 9 |
#' @param data |
|
| 10 |
#' A data frame containing community data in long format. Must include |
|
| 11 |
#' columns `taxon`, `dataset_name`, and `age`. |
|
| 12 |
#' @param min_n_samples |
|
| 13 |
#' A single positive integer specifying the minimum number of distinct |
|
| 14 |
#' spatio-temporal samples (dataset-age combinations) a taxon must |
|
| 15 |
#' appear in to be retained. Default is 1 (no filtering). |
|
| 16 |
#' @return |
|
| 17 |
#' A filtered data frame containing only taxa that appear in at least |
|
| 18 |
#' `min_n_samples` distinct spatio-temporal samples. All original |
|
| 19 |
#' columns are preserved. |
|
| 20 |
#' @details |
|
| 21 |
#' The function counts distinct `(dataset_name, age)` combinations per |
|
| 22 |
#' `taxon`. Taxa with fewer combinations than `min_n_samples` are |
|
| 23 |
#' removed. An error is raised if no taxa remain after filtering, which |
|
| 24 |
#' may indicate that `min_n_samples` is set too high. |
|
| 25 |
#' @seealso [filter_community_by_n_cores()], [filter_rare_taxa()], |
|
| 26 |
#' [select_n_taxa()] |
|
| 27 |
#' @export |
|
| 28 |
filter_by_n_samples <- function(
    data = NULL,
    min_n_samples = 1) {
  # Validate inputs ----
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(c("taxon", "dataset_name", "age") %in% names(data)),
    msg = paste(
      "data must contain columns:",
      paste(c("taxon", "dataset_name", "age"), collapse = ", ")
    )
  )

  assertthat::assert_that(
    is.numeric(min_n_samples) && length(min_n_samples) == 1,
    msg = "min_n_samples must be a single numeric value"
  )

  assertthat::assert_that(
    min_n_samples >= 1,
    msg = "min_n_samples must be greater than or equal to 1"
  )

  # Count in how many distinct (dataset_name, age) samples each
  # taxon occurs, then keep taxa meeting the threshold
  vec_taxa_to_keep <-
    data |>
    dplyr::distinct(taxon, dataset_name, age) |>
    dplyr::count(taxon, name = "n_samples") |>
    dplyr::filter(n_samples >= min_n_samples) |>
    dplyr::pull(taxon)

  res <-
    dplyr::filter(data, taxon %in% vec_taxa_to_keep)

  # Guard against an over-aggressive threshold wiping out the data
  assertthat::assert_that(
    nrow(res) > 0,
    msg = paste(
      "No taxa remain after filtering.",
      "The min_n_samples is too high."
    )
  )

  return(res)
}
#' @title Check sjSDM Model Convergence
#' @description
#' Assesses whether a fitted sjSDM model has converged by analysing
#' the training loss history stored in the model object.
#' @param mod_jsdm
#' A fitted sjSDM model object. Must be of class 'sjSDM'.
#' @return
#' A list with six elements:
#' - `linear_trend_slope`: Absolute slope of a linear trend fitted
#'     to the final 10% of loss values. Values < 0.01 indicate
#'     convergence.
#' - `median_diff`: Absolute difference between the median of the
#'     first and last 25% of the tail loss. Values < 1 indicate
#'     convergence.
#' - `convergence_plot`: A ggplot2 object showing the full loss
#'     history with a rolling mean smoother (window of up to 20
#'     epochs). A dashed orange vertical line marks the last epoch
#'     run when early stopping was triggered.
#' - `note`: A character string summarising the thresholds.
#' - `epochs_run`: Integer. The number of epochs actually run
#'     before early stopping halted training (or the full budget
#'     when early stopping was not triggered).
#' - `early_stopping_triggered`: Logical. `TRUE` when the model
#'     stopped before exhausting its epoch budget (i.e. trailing
#'     zeros were detected in `mod_jsdm$history`).
#' @details
#' The function uses the `history` component of the fitted sjSDM
#' model, which stores the per-epoch negative log-likelihood values
#' produced during gradient descent.
#'
#' sjSDM pre-allocates `history` to the full epoch budget. When
#' early stopping fires, the remaining trailing entries are zero.
#' This function detects those trailing zeros, truncates the vector
#' to only the epochs that were actually run, and sets
#' `early_stopping_triggered = TRUE`.
#'
#' Two diagnostic metrics are computed on the tail (final 10% of
#' epochs actually run):
#'
#' 1. **Linear trend slope** - A near-zero slope means the loss is
#'    no longer decreasing, indicating convergence. The recommended
#'    threshold is < 0.01.
#' 2. **Median difference** - A robust comparison of the first
#'    versus last quarter of the tail using medians (insensitive to
#'    spikes). The recommended threshold is < 1.
#'
#' The smoothing window for the convergence plot is capped at the
#' number of epochs actually run, so histories with 10-19 epochs
#' are handled without error.
#' @seealso [fit_jsdm_model()], [evaluate_jsdm()]
#' @export
check_convergence_jsdm <- function(mod_jsdm = NULL) {
  assertthat::assert_that(
    inherits(mod_jsdm, "sjSDM"),
    msg = "mod_jsdm must be of class 'sjSDM'"
  )

  loss_history <- mod_jsdm$history

  assertthat::assert_that(
    is.numeric(loss_history),
    msg = paste(
      "mod_jsdm$history must be a numeric vector of length >= 10.",
      "Did you fit the model with enough iterations?"
    )
  )

  # Strip trailing zeros left by early stopping.
  # sjSDM pre-allocates history to the full epoch budget; epochs not
  # reached remain 0. Metrics must be computed on real epochs only.
  n_budget <- base::length(loss_history)
  vec_nonzero_idx <- base::which(loss_history != 0)
  last_epoch <-
    if (base::length(vec_nonzero_idx) == 0L) {
      0L
    } else {
      base::max(vec_nonzero_idx)
    }
  early_stopping_triggered <- last_epoch < n_budget
  loss_history <- loss_history[base::seq_len(last_epoch)]

  assertthat::assert_that(
    length(loss_history) >= 10L,
    msg = paste(
      "mod_jsdm$history must be a numeric vector of length >= 10.",
      "Did you fit the model with enough iterations?"
    )
  )

  n <- length(loss_history)

  tail_loss <- loss_history[round(n * 0.9):n]

  # 1. Linear trend slope on the tail (should be ~ 0 if converged)
  linear_trend_slope <-
    stats::coef(
      stats::lm(tail_loss ~ seq_along(tail_loss))
    )[2] |>
    abs() |>
    as.numeric() |>
    round(2)

  # 2. Compare median of first vs last quarter of the tail
  # (robust to spikes, unlike range)
  q <- length(tail_loss)

  median_diff <-
    (
      median(tail_loss[1:round(q * 0.25)]) -
        median(tail_loss[round(q * 0.75):q])
    ) |>
    abs() |>
    as.numeric() |>
    round(2)

  # 3. Rolling mean smoother to visualise the trend without noise.
  # Cap the window at the number of epochs actually run: stats::filter()
  # errors when the filter is longer than the series, which would
  # happen for histories with 10-19 epochs under a fixed window of 20.
  smooth_window <- min(20L, n)
  loss_smooth <-
    stats::filter(
      loss_history,
      rep(1 / smooth_window, smooth_window),
      sides = 2
    )

  p_convergence <-
    dplyr::tibble(
      epoch = seq_along(loss_history),
      loss = loss_history,
      loss_smooth = as.numeric(loss_smooth)
    ) |>
    ggplot2::ggplot(
      ggplot2::aes(x = epoch)
    ) +
    ggplot2::geom_line(
      ggplot2::aes(y = loss),
      color = "grey80",
      linewidth = 0.5
    ) +
    ggplot2::geom_line(
      ggplot2::aes(y = loss_smooth),
      color = "red",
      linewidth = 1
    ) +
    ggplot2::labs(
      x = "Epoch",
      y = "Loss",
      title = "Model Convergence: Loss History"
    ) +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      plot.title = ggplot2::element_text(hjust = 0.5)
    )

  # Mark the early-stopping point with a dashed orange line
  if (early_stopping_triggered) {
    p_convergence <-
      p_convergence +
      ggplot2::geom_vline(
        xintercept = last_epoch,
        linetype = "dashed",
        color = "orange"
      )
  }

  return(
    list(
      linear_trend_slope = linear_trend_slope,
      median_diff = median_diff,
      convergence_plot = p_convergence,
      note = paste(
        "Linear trend slope on tail should be < 0.01.",
        "Median difference in tail should be < 1."
      ),
      epochs_run = base::as.integer(last_epoch),
      early_stopping_triggered = early_stopping_triggered
    )
  )
}
#' @title Get auxiliary classification table
#' @description
#' Reads the manually curated auxiliary classification table from a
#' CSV file. When the file does not yet exist, an empty tibble with
#' the required columns is returned so the rest of the pipeline can
#' continue and detect any missing taxa.
#' @param file_path
#' A length-1 character string giving the path to the CSV file.
#' Defaults to
#' `here::here("Data/Input/aux_classification_table.csv")`.
#' The file, if present, must contain a `sel_name` column. All
#' seven taxonomic rank columns (`kingdom`, `phylum`, `class`,
#' `order`, `family`, `genus`, `species`) are expected but
#' optional - any that are absent are filled with `NA_character_`.
#' @return
#' A tibble with columns `sel_name`, `kingdom`, `phylum`, `class`,
#' `order`, `family`, `genus`, and `species` (all character).
#' Returns an empty tibble when the file does not exist.
#' @details
#' Manual classifications in this file override automatic
#' classifications produced by `get_taxa_classification()`. When
#' the file exists it is validated to confirm `sel_name` is
#' present. Any of the seven rank columns that are absent are
#' filled with `NA_character_` rather than raising an error, so
#' that partial tables (e.g., those that only specify `family`,
#' `genus`, and `species`) continue to be accepted.
#' @seealso
#' [combine_classification_tables()],
#' [check_and_report_missing_taxa()]
#' @export
get_aux_classification_table <- function(
    file_path = here::here("Data/Input/aux_classification_table.csv")) {
  assertthat::assert_that(
    is.character(file_path) && length(file_path) == 1,
    msg = "file_path must be a single character string"
  )

  res_empty <-
    tibble::tibble(
      sel_name = character(0),
      kingdom = character(0),
      phylum = character(0),
      class = character(0),
      order = character(0),
      family = character(0),
      genus = character(0),
      species = character(0)
    )

  if (!file.exists(file_path)) {
    return(res_empty)
  }

  res_raw <-
    readr::read_csv(
      file_path,
      show_col_types = FALSE
    )

  assertthat::assert_that(
    "sel_name" %in% colnames(res_raw),
    msg = paste(
      "aux_classification_table.csv must contain",
      "a 'sel_name' column"
    )
  )

  vec_expected_cols <- c(
    "kingdom", "phylum", "class", "order",
    "family", "genus", "species"
  )

  # Fill any missing expected columns with NA so combining works.
  vec_missing_cols <-
    setdiff(vec_expected_cols, colnames(res_raw))

  for (col_missing in vec_missing_cols) {
    res_raw[[col_missing]] <- NA_character_
  }

  res <-
    res_raw |>
    dplyr::select(
      dplyr::all_of(c("sel_name", vec_expected_cols))
    ) |>
    dplyr::mutate(
      dplyr::across(
        dplyr::everything(),
        as.character
      )
    )

  return(res)
}
#' @title Get Abiotic Data
#' @description
#' Extracts abiotic data from a data frame containing nested abiotic
#' information.
#' @param data
#' A data frame. Must contain the columns `dataset_name` and `data_abiotic`.
#' @return
#' A data frame with columns `dataset_name`, `sample_name`,
#' `abiotic_variable_name`, and `abiotic_value`.
#' @details
#' Validates the input data frame, ensures required columns are present,
#' unnests the `data_abiotic` list-column, and keeps only the four
#' relevant columns.
#' @export
get_abiotic_data <- function(data = NULL) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "data_abiotic") %in% colnames(data)),
    msg = "data must contain columns 'dataset_name' and 'data_abiotic'"
  )

  res <-
    data |>
    dplyr::select(dataset_name, data_abiotic) |>
    tidyr::unnest(
      cols = c(data_abiotic)
    ) |>
    dplyr::select(
      dataset_name, sample_name, abiotic_variable_name, abiotic_value
    )

  return(res)
}
#' @title Add Age to Community Data
#' @description
#' Merges community data with age data based on dataset and sample names.
#' @param data_community
#' A data frame containing community data. Must include `dataset_name` and
#' `sample_name` columns.
#' @param data_ages
#' A data frame containing age data. Must include `dataset_name` and
#' `sample_name` columns.
#' @return
#' A data frame with community data merged with the corresponding age data.
#' @details
#' Performs a left join between community data and age data using
#' `dataset_name` and `sample_name` as keys, so every community row is
#' kept even when no matching age exists.
#' @export
add_age_to_samples <- function(data_community = NULL, data_ages = NULL) {
  assertthat::assert_that(
    is.data.frame(data_community),
    msg = "data_community must be a data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_ages),
    msg = "data_ages must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "sample_name") %in% colnames(data_community)),
    msg = "data_community must contain columns 'dataset_name' and 'sample_name'"
  )

  assertthat::assert_that(
    all(c("dataset_name", "sample_name") %in% colnames(data_ages)),
    msg = "data_ages must contain columns 'dataset_name' and 'sample_name'"
  )

  res <-
    dplyr::left_join(
      x = data_community,
      y = data_ages,
      by = c("dataset_name", "sample_name")
    )

  return(res)
}
#' @title Get ANOVA for sjSDM Model
#' @description
#' Computes the ANOVA decomposition for a fitted sjSDM model,
#' partitioning variance explained by environmental and spatial
#' components.
#' @param mod
#' A fitted model object of class `sjSDM`.
#' @param n_samples
#' Integer; number of Monte Carlo samples used for the variation
#' partitioning calculation. Larger values yield more stable
#' estimates but are slower. Defaults to `5000L`.
#' @param verbose
#' Logical; if `TRUE`, prints detailed output from the ANOVA
#' computation.
#' @return
#' An ANOVA result object as returned by `sjSDM:::anova.sjSDM()`,
#' containing variance partitioning across model components.
#' @details
#' The function wraps the internal `sjSDM:::anova.sjSDM()` method.
#' The `n_samples` argument controls how many Monte Carlo draws are
#' used to approximate the likelihoods in each variation partition.
#' For exploratory runs, a value of 1000 is sufficient; for
#' publication-quality results, 3000-5000 is recommended.
#' @seealso [fit_jsdm_model()]
#' @export
get_anova <- function(
    mod,
    n_samples = 5000L,
    verbose = FALSE) {
  assertthat::assert_that(
    inherits(mod, "sjSDM"),
    msg = "The model must be of class 'sjSDM'."
  )

  assertthat::assert_that(
    assertthat::is.count(n_samples),
    msg = "n_samples must be a single positive integer."
  )

  assertthat::assert_that(
    assertthat::is.flag(verbose),
    !is.na(verbose) && length(verbose) == 1L,
    msg = "verbose must be a single logical value (TRUE or FALSE)."
  )

  # Delegate to the (unexported) sjSDM ANOVA method.
  res <-
    sjSDM:::anova.sjSDM(
      mod,
      samples = n_samples,
      verbose = verbose
    )

  return(res)
}
#' @title Make Community Proportions
#' @description
#' Transforms community pollen count data into proportions, normalising
#' each sample by its total pollen count.
#' @param data
#' A data frame with columns `dataset_name`, `sample_name`, `taxon`,
#' `age`, and `pollen_count`. Must contain a `pollen_count` column with
#' raw pollen counts.
#' @return
#' A data frame with the same structure as the input, but with
#' `pollen_count` replaced by `pollen_prop` (pollen counts divided by
#' sample-level total). The `pollen_count` and `pollen_sum` columns are
#' dropped.
#' @details
#' Per-sample totals are computed with `get_pollen_sum()` and counts are
#' normalised via `transform_to_proportions()`. The result is suitable
#' for passing to `interpolate_community_data()`.
#' @seealso [interpolate_community_data()], [transform_to_proportions()]
#' @export
make_community_proportions <- function(data = NULL) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    "pollen_count" %in% colnames(data),
    msg = "data must contain a 'pollen_count' column"
  )

  res <-
    transform_to_proportions(
      data,
      pollen_sum = get_pollen_sum(data)
    )

  return(res)
}
#' @title Prepare Abiotic Data for Model Fitting
#' @description
#' Reshapes long-format abiotic data into a wide data frame
#' suitable for downstream scaling, filtering to the canonical
#' sample index supplied by `align_sample_ids()`.
#' @param data_abiotic_long
#' A data frame in long format with columns `dataset_name`, `age`,
#' `abiotic_variable_name`, and `abiotic_value`.
#' @param data_sample_ids
#' A data frame of valid `(dataset_name, age)` pairs as returned by
#' `align_sample_ids()`.
#' @return
#' A data frame in wide format with real columns `dataset_name`,
#' `age`, and one column per abiotic variable. Rows are ordered by
#' `dataset_name` then `age`, matching the order in
#' `data_sample_ids`. Missing variable-sample combinations are
#' left as `NA`. The `age` column is preserved as a plain numeric
#' column (not encoded in row names) to simplify downstream
#' scaling.
#' @details
#' Unlike the former `prepare_data_for_fit(type = "abiotic")`, this
#' function retains `dataset_name` and `age` as real columns so
#' that `scale_abiotic_for_fit()` can operate on them without
#' parsing row names. Row names are added by
#' `scale_abiotic_for_fit()` as part of the scaling step.
#' @seealso [align_sample_ids()], [scale_abiotic_for_fit()],
#' [assemble_data_to_fit()]
#' @export
prepare_abiotic_for_fit <- function(
    data_abiotic_long = NULL,
    data_sample_ids = NULL) {
  assertthat::assert_that(
    is.data.frame(data_abiotic_long),
    msg = "data_abiotic_long must be a data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_sample_ids),
    msg = "data_sample_ids must be a data frame"
  )

  assertthat::assert_that(
    all(
      c(
        "dataset_name", "age",
        "abiotic_variable_name", "abiotic_value"
      ) %in% names(data_abiotic_long)
    ),
    msg = paste0(
      "data_abiotic_long must contain columns 'dataset_name',",
      " 'age', 'abiotic_variable_name', 'abiotic_value'"
    )
  )

  assertthat::assert_that(
    all(c("dataset_name", "age") %in% names(data_sample_ids)),
    msg = paste0(
      "data_sample_ids must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  # Keep only samples present in the canonical index, in canonical order.
  data_filtered <-
    data_abiotic_long |>
    dplyr::inner_join(
      data_sample_ids,
      by = dplyr::join_by(dataset_name, age)
    ) |>
    dplyr::arrange(dataset_name, age)

  # Spread one column per abiotic variable; absent combinations stay NA.
  res <-
    data_filtered |>
    dplyr::select(
      dataset_name,
      age,
      abiotic_variable_name,
      abiotic_value
    ) |>
    tidyr::pivot_wider(
      names_from = "abiotic_variable_name",
      values_from = "abiotic_value",
      values_fill = NULL
    )

  return(res)
}
#' @title Scale Spatial Predictors for Model Fitting
#' @description
#' Centres and scales all spatial predictor columns, records
#' the scaling attributes for later back-transformation, and
#' returns both the scaled data frame and the attributes as a
#' named list.
#' @param data_spatial
#' A data frame with row names in the format
#' `"<dataset_name>__<age>"` and numeric columns containing
#' spatial predictor variables (e.g. `coord_x_km`,
#' `coord_y_km`), as returned by
#' `prepare_spatial_predictors_for_fit()`.
#' @return
#' A named list with two elements:
#' \describe{
#'   \item{`data_spatial_scaled`}{A data frame with the same
#'     row names as the input, with all predictor columns
#'     centred and scaled. When only one sample is present,
#'     columns are centred only (scale = FALSE).}
#'   \item{`spatial_scale_attributes`}{A named list of
#'     `center` and `scale` attributes for each predictor
#'     column, which can be used to back-transform
#'     predictions.}
#' }
#' @details
#' All columns are centred (mean subtracted) and divided by
#' their standard deviation using `base::scale()`, provided
#' more than one sample is present. When only one row exists,
#' only centring is applied.
#' @seealso [prepare_spatial_predictors_for_fit()],
#' [assemble_data_to_fit()]
#' @export
scale_spatial_for_fit <- function(data_spatial = NULL) {
  assertthat::assert_that(
    is.data.frame(data_spatial),
    msg = "data_spatial must be a data frame"
  )

  assertthat::assert_that(
    nrow(data_spatial) > 0,
    msg = "data_spatial must have at least one row"
  )

  assertthat::assert_that(
    ncol(data_spatial) > 0,
    msg = "data_spatial must have at least one column"
  )

  # With a single row, the SD is undefined, so only centre.
  is_scalable <- nrow(data_spatial) > 1

  # 1. Capture scale attributes -----

  # scale() returns a matrix whose attributes start with "dim",
  # followed by the "scaled:center" (and, when scaling, the
  # "scaled:scale") values. Drop the leading "dim" and keep the rest.
  capture_scale_attrs <- function(col_values) {
    col_attrs <-
      attributes(
        scale(
          col_values,
          center = TRUE,
          scale = is_scalable
        )
      )
    col_attrs[-1]
  }

  spatial_scale_attributes <-
    purrr::map(data_spatial, capture_scale_attrs)

  # 2. Apply scaling -----

  data_spatial_scaled <-
    data_spatial |>
    dplyr::mutate(
      dplyr::across(
        .cols = dplyr::everything(),
        .fns = function(col_values) {
          as.numeric(
            scale(
              col_values,
              center = TRUE,
              scale = is_scalable
            )
          )
        }
      )
    )

  # 3. Return list -----

  res <-
    list(
      data_spatial_scaled = data_spatial_scaled,
      spatial_scale_attributes = spatial_scale_attributes
    )

  return(res)
}
#' @title Aggregate ANOVA Variance Components Across Time Slices
#' @description
#' Extracts variance partitioning fractions from a named list
#' of sjSDM ANOVA objects and assembles a long-format tibble
#' with one row per age x component combination.
#' @param list_model_anova
#' A named list of sjSDManova objects, one per time slice.
#' Names must end with a numeric age value
#' (e.g. "timeslice_500"). NULL entries and objects without a
#' \code{$results} element are silently discarded.
#' @return
#' A tibble with columns:
#' \describe{
#'   \item{age}{Numeric age (cal yr BP) extracted from the
#'     list-element name.}
#'   \item{component}{Character. Human-readable component label
#'     (e.g. "Abiotic", "Associations", "Spatial").}
#'   \item{R2_Nagelkerke}{Numeric. Nagelkerke R-squared for the
#'     component, clamped to [0, Inf).}
#' }
#' @details
#' The age is parsed from the list element name by extracting
#' the trailing digit sequence (e.g. "timeslice_500" -> 500).
#' Fraction extraction, code-to-label translation, and
#' negative R-squared clamping are delegated to
#' [extract_anova_fractions()] (called with
#' \code{clamp_negative = TRUE}).
#' @seealso [get_anova()], [extract_anova_fractions()]
#' @export
aggregate_anova_components <- function(list_model_anova) {
  assertthat::assert_that(
    base::is.list(list_model_anova),
    msg = "'list_model_anova' must be a list."
  )

  vec_anova_fractions <-
    c("F_A", "F_B", "F_S", "F_AB", "F_AS", "F_BS", "F_ABS")

  res <-
    list_model_anova |>
    purrr::discard(
      \(anova_obj) {
        is.null(anova_obj) || !("results" %in% names(anova_obj))
      }
    ) |>
    purrr::imap(
      .f = \(anova_obj, element_name) {
        # The trailing digits of the element name encode the age.
        age_val <-
          as.numeric(
            stringr::str_extract(element_name, "\\d+$")
          )

        extract_anova_fractions(
          anova_object = anova_obj,
          vec_anova_fractions = vec_anova_fractions,
          clamp_negative = TRUE
        ) |>
          dplyr::mutate(age = age_val)
      }
    ) |>
    purrr::list_rbind()

  return(res)
}
#' @title Interpolate Data
#' @description
#' Interpolates data over a specified age range and timestep using a method.
#' @param data
#' A data frame containing the data to be interpolated.
#' @param by
#' A character vector of column name(s) to group by when nesting data for
#' interpolation (default: "dataset_name").
#' @param age_var
#' Name of the age variable column (default: "age").
#' @param value_var
#' Name of the value variable column (default: "pollen_prop").
#' @param method
#' Interpolation method to use (default: "linear").
#' @param rule
#' Integer specifying the extrapolation rule (default: 1).
#' @param ties
#' Function to handle tied values (default: `mean`).
#' @param age_min
#' Minimum age for interpolation (default: 0).
#' @param age_max
#' Maximum age for interpolation (default: 12000).
#' @param timestep
#' Timestep for interpolation (default: 500).
#' @return
#' A data frame with interpolated values, including dataset name, taxon, age,
#' and value columns.
#' @details
#' Nests data by the `by` columns, interpolates each nested series with
#' `stats::approx()` onto a regular age grid, and returns the result in a
#' flat format. Series that fail to interpolate are silently dropped
#' (via `purrr::possibly()`).
#' @seealso [stats::approx()]
#' @export
interpolate_data <- function(data = NULL,
                             by = "dataset_name",
                             age_var = "age",
                             value_var = "pollen_prop",
                             method = "linear",
                             rule = 1,
                             ties = mean,
                             age_min = 0,
                             age_max = 12e03,
                             timestep = 500) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    is.character(by) && length(by) > 0,
    msg = "by must be a character vector with at least one element"
  )

  assertthat::assert_that(
    all(by %in% colnames(data)),
    msg = paste0(
      "data must contain the following columns: ",
      paste(by, collapse = ", ")
    )
  )

  assertthat::assert_that(
    is.character(age_var) && length(age_var) == 1,
    msg = "age_var must be a single character string"
  )

  assertthat::assert_that(
    is.character(value_var) && length(value_var) == 1,
    msg = "value_var must be a single character string"
  )

  assertthat::assert_that(
    is.character(method) && length(method) == 1,
    msg = "method must be a single character string"
  )

  assertthat::assert_that(
    is.numeric(rule) && length(rule) == 1,
    msg = "rule must be a single numeric value"
  )

  assertthat::assert_that(
    is.function(ties),
    msg = "ties must be a function"
  )

  assertthat::assert_that(
    is.numeric(age_min) && length(age_min) == 1,
    msg = "age_min must be a single numeric value"
  )

  assertthat::assert_that(
    is.numeric(age_max) && length(age_max) == 1,
    msg = "age_max must be a single numeric value"
  )

  assertthat::assert_that(
    age_min < age_max,
    msg = "age_min must be less than age_max"
  )

  assertthat::assert_that(
    is.numeric(timestep) && length(timestep) == 1,
    msg = "timestep must be a single numeric value"
  )

  assertthat::assert_that(
    timestep > 0,
    msg = "timestep must be greater than 0"
  )

  # Regular age grid shared by every interpolated series.
  vec_ages_out <- seq(age_min, age_max, by = timestep)

  # Interpolate one nested series onto the shared age grid.
  interpolate_one <- function(data_single) {
    data_single |>
      dplyr::select(
        !!rlang::sym(age_var),
        !!rlang::sym(value_var)
      ) |>
      grDevices::xy.coords() |>
      stats::approx(
        xout = vec_ages_out,
        ties = ties,
        method = method,
        rule = rule
      ) |>
      tibble::as_tibble() |>
      dplyr::rename(
        !!rlang::sym(age_var) := x,
        !!rlang::sym(value_var) := y
      )
  }

  res <-
    data |>
    tidyr::nest(
      data_nested = !dplyr::any_of(by)
    ) |>
    dplyr::mutate(
      data_interpolated = purrr::map(
        .x = data_nested,
        # possibly(): failing series yield NULL and drop out on unnest.
        .f = purrr::possibly(interpolate_one, otherwise = NULL)
      )
    ) |>
    tidyr::unnest(data_interpolated) |>
    dplyr::select(
      dplyr::any_of(by),
      !!rlang::sym(age_var),
      !!rlang::sym(value_var)
    )

  return(res)
}
| 1 |
#' @title Check Minimum Number of Cores in Spatial Window
#' @description
#' Verifies that the number of cores (distinct pollen sites) in the
#' current spatial window meets the minimum required for analysis. This
#' is an early guard in the pipeline, applied directly to `data_coords`
#' (the output of `get_coords()`) before any expensive community-data
#' processing begins.
#' @param data_coords
#' A data frame of site coordinates, one row per core, as returned by
#' `get_coords()`. Gridpoints are already excluded by that function, so
#' every row represents a real pollen core.
#' @param min_n_cores
#' A single positive numeric value specifying the minimum number of
#' distinct cores required. Typically sourced from
#' `config.data_processing$min_n_cores`.
#' @return
#' `TRUE` invisibly when the check passes.
#' @details
#' Raises a `cli::cli_abort()` error naming the actual core count and
#' the required threshold when `nrow(data_coords) < min_n_cores`. This
#' makes the targets pipeline target fail immediately, preventing all
#' downstream community-data targets from running for spatial windows
#' with too few sites.
#' @seealso [get_coords()], [filter_community_by_n_cores()]
#' @export
check_min_n_cores <- function(
    data_coords = NULL,
    min_n_cores = 2) {
  # Validate inputs up front so errors are clear and early.
  assertthat::assert_that(
    base::is.data.frame(data_coords),
    msg = "'data_coords' must be a data frame"
  )

  assertthat::assert_that(
    base::is.numeric(min_n_cores) &&
      base::length(min_n_cores) == 1,
    msg = "'min_n_cores' must be a single numeric value"
  )

  assertthat::assert_that(
    min_n_cores >= 1,
    msg = "'min_n_cores' must be greater than or equal to 1"
  )

  # One row per core, so the row count is the number of available cores.
  n_found <- base::nrow(data_coords)

  if (n_found >= min_n_cores) {
    # Check passed; invisible so the guard is quiet in pipelines.
    return(invisible(TRUE))
  }

  # Too few cores: abort with the actual count and the threshold.
  cli::cli_abort(
    c(
      "Not enough cores in this spatial window.",
      "i" = base::paste0(
        "Found ", n_found,
        " core(s); at least ", min_n_cores, " required."
      ),
      "i" = paste0(
        "Adjust 'min_n_cores' in config or choose a",
        " larger spatial window."
      )
    )
  )
}
| 1 |
#' @title Evaluate JSDM Model Performance
#' @description
#' Evaluates a fitted sjSDM model and returns comprehensive performance
#' metrics at both model and species level.
#' @param mod_jsdm
#' A fitted sjSDM model object. Must be of class 'sjSDM'.
#' @return
#' A list with three elements:
#' - `model`: Named numeric vector of R-squared values
#'   (McFadden, Nagelkerke)
#' - `species`: A tibble with one row per species and columns:
#'   species, AUC, Accuracy, LogLoss (binomial) or RMSE (other
#'   families)
#' - `convergence`: A list from [check_convergence_jsdm()] with
#'   `linear_trend_slope`, `median_diff`, `convergence_plot`, and
#'   `note`
#' @details
#' For binomial models, species-level classification metrics (AUC,
#' Accuracy, LogLoss) are computed using a 0.5 probability threshold
#' for binary predictions. For other model families, RMSE is computed
#' per species.
#'
#' Convergence is assessed via [check_convergence_jsdm()], which
#' analyses the training loss history. A `linear_trend_slope` < 0.01
#' and `median_diff` < 1 in the returned `convergence` element
#' indicate that the model has converged.
#' @seealso sjSDM::Rsquared, Metrics::auc, check_convergence_jsdm
#' @export
evaluate_jsdm <- function(mod_jsdm = NULL) {
  assertthat::assert_that(
    inherits(mod_jsdm, "sjSDM"),
    msg = "mod_jsdm must be of class sjSDM"
  )

  # Extract observed and predicted values.
  # obs_data: response matrix the model was fitted to (sites x species);
  # pred_prob: in-sample predictions (newdata = NULL predicts on the
  # training data).
  obs_data <-
    mod_jsdm$data$Y
  pred_prob <-
    predict(mod_jsdm, newdata = NULL)

  # Named column index of species so map_dbl() output carries species
  # names.
  vec_species <-
    seq_len(ncol(obs_data)) |>
    rlang::set_names(colnames(obs_data))

  # Initialize evaluation list; elements are filled in below.
  list_eval <-
    list(
      model = NULL,
      species = NULL,
      convergence = NULL
    )

  # 1. R-squared metrics
  # Note: sjSDM::Rsquared() prints to console regardless of verbose = FALSE;
  # capture.output() suppresses this unwanted output.
  invisible(
    utils::capture.output(
      vec_r2 <- c(
        sjSDM::Rsquared(mod_jsdm, method = "McFadden", verbose = FALSE),
        sjSDM::Rsquared(mod_jsdm, method = "Nagelkerke", verbose = FALSE)
      )
    )
  )

  list_eval$model <-
    rlang::set_names(vec_r2, c("R2-McFadden", "R2-Nagelkerke"))

  # 2. Species-level metrics
  # NOTE(review): the triple `$family` drills through sjSDM's nested
  # family wrappers to the underlying family name string -- confirm this
  # access path against the installed sjSDM version.
  if (
    mod_jsdm$family$family$family == "binomial"
  ) {
    # AUC per species
    vec_auc <-
      vec_species |>
      purrr::map_dbl(
        ~ Metrics::auc(
          actual = as.data.frame(obs_data)[, .x],
          predicted = as.data.frame(pred_prob)[, .x]
        )
      )

    # Accuracy per species (binary predictions at 0.5 threshold)
    pred_binary <- pred_prob > 0.5

    vec_accuracy <-
      vec_species |>
      purrr::map_dbl(
        ~ Metrics::accuracy(
          actual = as.data.frame(obs_data)[, .x],
          predicted = as.data.frame(pred_binary)[, .x]
        )
      )

    # Log Loss per species
    vec_logloss <-
      vec_species |>
      purrr::map_dbl(
        ~ Metrics::logLoss(
          actual = as.data.frame(obs_data)[, .x],
          predicted = as.data.frame(pred_prob)[, .x]
        )
      )

    list_eval$species <-
      tibble::tibble(
        species = colnames(obs_data),
        AUC = vec_auc,
        Accuracy = vec_accuracy,
        LogLoss = vec_logloss
      )
  } else {
    # For non-binomial models, use RMSE per species on the raw
    # observed/predicted scale.
    vec_rmse <-
      vec_species |>
      purrr::map_dbl(
        ~ sqrt(mean((obs_data[, .x] - pred_prob[, .x])^2))
      )

    list_eval$species <-
      tibble::tibble(
        species = colnames(obs_data),
        RMSE = vec_rmse
      )
  }

  # 3. Convergence diagnostics (training-loss history analysis).
  list_eval$convergence <-
    check_convergence_jsdm(mod_jsdm)

  return(list_eval)
}
| 1 |
#' @title Get Active Configuration Value
#' @description
#' Retrieves a specific configuration value from a YAML configuration
#' file.
#' @param value
#' A character vector specifying the configuration key(s) to retrieve.
#' @param file
#' Path to the YAML configuration file (default: "config.yml").
#' @return
#' Value(s) associated with the specified key(s) in the configuration
#' file.
#' @details
#' Validates input parameters, ensures the file is readable, and fetches
#' the configuration value(s) with `config::get`. The active
#' configuration is selected through the `R_CONFIG_ACTIVE` environment
#' variable.
#' @export
get_active_config <- function(
    value = NULL,
    file = here::here("config.yml")) {
  assertthat::assert_that(
    is.character(value) && length(value) > 0,
    msg = "value must be a character vector with at least one element"
  )

  assertthat::assert_that(
    assertthat::is.readable(file) && assertthat::has_extension(file, "yml"),
    msg = "file must be a readable YAML file"
  )

  # The active configuration name comes from the environment; parent
  # config inheritance is deliberately disabled.
  res <-
    config::get(
      value = value,
      config = Sys.getenv("R_CONFIG_ACTIVE"),
      use_parent = FALSE,
      file = file
    )

  return(res)
}
| 1 |
#' @title Replace NA in Community Data with Zeros
#' @description
#' Replaces NA values in the taxon columns of community data with zeros.
#' @param data
#' A data frame. Must contain `dataset_name` and `sample_name` columns, and
#' at least one taxon column.
#' @return
#' A data frame with NA values in taxon columns replaced by zeros. Row
#' order and column order are preserved.
#' @details
#' Applies the replacement directly to every column except `dataset_name`
#' and `sample_name` via `dplyr::across()`. This avoids the previous
#' round trip through long format (`pivot_longer()` / `pivot_wider()`),
#' which required all taxon columns to share a single type, moved the id
#' columns to the front, and could collapse rows with duplicated
#' `(dataset_name, sample_name)` pairs.
#' @export
replace_na_community_data_with_zeros <- function(data = NULL) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )
  assertthat::assert_that(
    all(c("dataset_name", "sample_name") %in% colnames(data)),
    msg = "data must contain columns 'dataset_name' and 'sample_name'"
  )

  assertthat::assert_that(
    colnames(data) %>%
      length() > 2,
    msg = "data must contain at least one taxon column"
  )

  # Replace NA in-place in every taxon column. if_else() keeps the same
  # type coercion as the previous long-format implementation (integer
  # columns are promoted to double by the numeric literal 0).
  data %>%
    dplyr::mutate(
      dplyr::across(
        .cols = !dplyr::all_of(c("dataset_name", "sample_name")),
        .fns = ~ dplyr::if_else(is.na(.x), 0, .x)
      )
    ) %>%
    return()
}
| 1 |
#' @title Check Sample IDs Have Minimum Number of Samples
#' @description
#' Guards against running downstream data preparation and model fitting
#' on a time slice with too few `(dataset_name, age)` combinations.
#' Returns `data_sample_ids` unchanged when the row count is at least
#' `min_n_samples`; stops with an informative error otherwise,
#' preventing expensive model fitting on near-empty slices.
#' @param data_sample_ids
#' A data frame with at least the columns `dataset_name` and `age`, as
#' returned by `align_sample_ids()`. Each row represents one valid
#' `(dataset_name, age)` pair.
#' @param min_n_samples
#' A single positive integer giving the minimum number of samples (rows)
#' required to proceed with data preparation and model fitting.
#' Default is 1.
#' @return
#' The input `data_sample_ids` unchanged, when
#' `nrow(data_sample_ids) >= min_n_samples`.
#' @details
#' The check counts `nrow(data_sample_ids)` after the time-slice filter
#' has been applied by `align_sample_ids(subset_age = ...)`. When the
#' count falls below `min_n_samples`, `cli::cli_abort()` reports the
#' actual sample count and the threshold. Place this check in the
#' per-slice pipeline (e.g. `pipe_segment_age_filter`) so that slices
#' without sufficient data fail immediately, before any expensive
#' preparation or model fitting.
#' @seealso [align_sample_ids()],
#' [filter_community_by_n_taxa()]
#' @export
check_data_sample_ids_n <- function(
    data_sample_ids = NULL,
    min_n_samples = 1) {
  # Validate inputs first so failures are explicit.
  assertthat::assert_that(
    is.data.frame(data_sample_ids),
    msg = "data_sample_ids must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "age") %in% names(data_sample_ids)),
    msg = paste0(
      "data_sample_ids must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  assertthat::assert_that(
    is.numeric(min_n_samples) &&
      length(min_n_samples) == 1,
    msg = "min_n_samples must be a single numeric value"
  )

  assertthat::assert_that(
    min_n_samples >= 1,
    msg = "min_n_samples must be greater than or equal to 1"
  )

  # Each row is one (dataset_name, age) pair.
  n_samples <-
    nrow(data_sample_ids)

  if (n_samples >= min_n_samples) {
    # Enough samples: pass the data through untouched.
    return(data_sample_ids)
  }

  # Too few samples for this slice: abort with counts for diagnosis.
  # (cli glue-interpolates {n_samples} and {min_n_samples} from here.)
  cli::cli_abort(
    c(
      paste0(
        "Too few samples in this time slice to proceed",
        " with data preparation and model fitting."
      ),
      "i" = paste0(
        "Found {n_samples} sample(s) but at least",
        " {min_n_samples} are required."
      ),
      "i" = paste0(
        "Adjust `min_n_samples` in the configuration",
        " or review the input data for this age slice."
      )
    )
  )
}
| 1 |
#' @title Prepare Coordinate Data for Model Fitting
#' @description
#' Expands dataset-level coordinate data to the sample level by joining
#' against the canonical `(dataset_name, age)` sample index, producing a
#' data frame whose rows align with the community matrix and abiotic
#' data used for model fitting.
#' @param data_coords
#' A data frame with `dataset_name` as row names and columns
#' `coord_long` and `coord_lat`.
#' @param data_sample_ids
#' A data frame of valid `(dataset_name, age)` pairs as returned by
#' `align_sample_ids()`.
#' @return
#' A data frame with row names in the format `"<dataset_name>__<age>"`
#' and columns `coord_long` and `coord_lat`. Rows are sorted by
#' `dataset_name` then `age`, matching the ordering of
#' `data_sample_ids`.
#' @details
#' Coordinates are stored at dataset level but models require one row
#' per sample, so each dataset's coordinates are replicated across all
#' its valid sample ages. The row-name format matches that of the
#' community matrix and abiotic data frame produced by the respective
#' preparation functions.
#' @seealso [align_sample_ids()], [assemble_data_to_fit()]
#' @export
prepare_coords_for_fit <- function(
    data_coords = NULL,
    data_sample_ids = NULL) {
  assertthat::assert_that(
    is.data.frame(data_coords),
    msg = "data_coords must be a data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_sample_ids),
    msg = "data_sample_ids must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("coord_long", "coord_lat") %in% names(data_coords)
    ),
    msg = paste0(
      "data_coords must contain columns",
      " 'coord_long' and 'coord_lat'"
    )
  )

  assertthat::assert_that(
    all(c("dataset_name", "age") %in% names(data_sample_ids)),
    msg = paste0(
      "data_sample_ids must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  # Turn the dataset-name row names into a join key column.
  coords_with_key <-
    data_coords |>
    tibble::rownames_to_column("dataset_name")

  # Replicate each dataset's coordinates across its valid sample ages.
  # Datasets absent from either side are dropped by the inner join;
  # samples with missing coordinates are removed by drop_na().
  joined <-
    data_sample_ids |>
    dplyr::inner_join(
      coords_with_key,
      by = dplyr::join_by(dataset_name)
    ) |>
    tidyr::drop_na(coord_long, coord_lat)

  # Build "<dataset_name>__<age>" row names matching the community
  # matrix and abiotic data, sorted consistently with data_sample_ids.
  joined |>
    dplyr::arrange(dataset_name, age) |>
    dplyr::mutate(
      .row_name = paste0(dataset_name, "__", age)
    ) |>
    dplyr::select(-dataset_name, -age) |>
    tibble::column_to_rownames(".row_name")
}
| 1 |
#' @title Binarize Community Data Matrix
#' @description
#' Converts a numeric community matrix of proportions (or counts) to a
#' binary presence-absence matrix. Any value strictly greater than zero
#' is recoded as `1L`; zeros remain `0L`. The resulting integer matrix
#' is suitable for `filter_constant_taxa()` when the model uses a
#' binomial error family.
#' @param data_community_matrix
#' A numeric matrix with samples as rows and taxa as columns, as
#' returned by `prepare_community_for_fit()`. Values must be
#' non-negative.
#' @return
#' An integer matrix of the same dimensions and dimnames as the input,
#' with all non-zero values replaced by `1L`.
#' @details
#' Pre-binarization before `filter_constant_taxa()` is essential when
#' using a binomial error family: a taxon recorded at non-zero varying
#' proportions in every sample has positive SD on the proportion scale
#' but becomes a constant-1 column after binarization inside the model,
#' causing implicit intercept saturation. Applying this function first
#' ensures that `filter_constant_taxa()` removes such taxa before they
#' reach the model.
#'
#' For other error families (e.g., Gaussian, future hurdle models) the
#' raw proportional matrix should be passed to `filter_constant_taxa()`
#' directly; use the `error_family` configuration key to control this
#' choice in the pipeline.
#' @seealso [prepare_community_for_fit()],
#' [filter_constant_taxa()], [assemble_data_to_fit()]
#' @export
binarize_community_data <- function(
    data_community_matrix = NULL) {
  assertthat::assert_that(
    is.matrix(data_community_matrix),
    msg = "data_community_matrix must be a matrix"
  )

  assertthat::assert_that(
    is.numeric(data_community_matrix),
    msg = "data_community_matrix must be a numeric matrix"
  )

  assertthat::assert_that(
    nrow(data_community_matrix) > 0,
    msg = "data_community_matrix must have at least one row"
  )

  assertthat::assert_that(
    ncol(data_community_matrix) > 0,
    msg = "data_community_matrix must have at least one column"
  )

  assertthat::assert_that(
    base::all(data_community_matrix >= 0, na.rm = TRUE),
    msg = "data_community_matrix values must all be >= 0"
  )

  # Presence/absence recoding: positive -> 1L, zero -> 0L; NA entries
  # propagate as NA_integer_. Dimnames are carried over explicitly.
  base::matrix(
    base::as.integer(data_community_matrix > 0),
    nrow = base::nrow(data_community_matrix),
    ncol = base::ncol(data_community_matrix),
    dimnames = base::dimnames(data_community_matrix)
  )
}
| 1 |
#' @title Get taxa without classification
#' @description
#' Identifies taxa from a community vector that are absent from a
#' classification table. Returns the names of taxa that do not have a
#' matching entry in the `sel_name` column of the classification table.
#' @param vec_community_taxa
#' A non-empty character vector of taxon names present in the community
#' data.
#' @param data_classification_table
#' A data frame containing a `sel_name` column with unique taxon names
#' used for classification lookup. Must not contain duplicate values in
#' `sel_name`.
#' @return
#' A character vector of taxon names from `vec_community_taxa` that are
#' not found in `data_classification_table$sel_name`. Returns an empty
#' character vector if all taxa are classified.
#' @details
#' Duplicate entries in `vec_community_taxa` are collapsed before the
#' comparison, so each unclassified taxon appears only once in the
#' output, in order of first occurrence.
#' @seealso
#' [classify_taxonomic_resolution()]
#' @export
get_taxa_without_classification <- function(vec_community_taxa, data_classification_table) {
  assertthat::assert_that(
    is.character(vec_community_taxa),
    length(vec_community_taxa) > 0,
    msg = "vec_community_taxa should be a non-empty character vector"
  )

  assertthat::assert_that(
    is.data.frame(data_classification_table),
    msg = "data_classification_table should be a data frame"
  )

  assertthat::assert_that(
    "sel_name" %in% names(data_classification_table),
    msg = "data_classification_table should contain a 'sel_name' column"
  )

  assertthat::assert_that(
    !any(duplicated(data_classification_table$sel_name)),
    msg = "data_classification_table should not contain duplicate 'sel_name' values"
  )

  # Collapse duplicates, keeping first-occurrence order.
  unique_taxa <- unique(vec_community_taxa)
  classified_names <- data_classification_table[["sel_name"]]

  # %in% matches NA to NA and coerces factor lookup columns to
  # character, mirroring the semantics of an anti-join on 'sel_name'.
  missing_taxa <- unique_taxa[!(unique_taxa %in% classified_names)]

  return(missing_taxa)
}
| 1 |
#' @title Plot ANOVA Variance Components by Age
#' @description
#' Creates a line-and-point plot of sjSDM ANOVA variance
#' components (Nagelkerke R²) across time slices (ages).
#' @param data_anova_components
#' A data frame or tibble with columns \code{age} (numeric),
#' \code{component} (character), and
#' \code{R2_Nagelkerke_percentage} (numeric). Typically the output of
#' \code{\link{aggregate_anova_components}()}.
#' @param title
#' Optional character string for the plot title.
#' Defaults to \code{NULL} (no title).
#' @param subtitle
#' Optional character string for the plot subtitle.
#' Defaults to \code{NULL} (no subtitle).
#' @return
#' A \code{ggplot} object.
#' @details
#' The x-axis is reversed so that older ages appear on the left
#' and the y-axis is clamped to [0, NA]. Each variance component
#' is drawn as a coloured line with points.
#' @seealso [aggregate_anova_components()]
#' @export
plot_anova_components_by_age <- function(
    data_anova_components,
    title = NULL,
    subtitle = NULL) {
  assertthat::assert_that(
    base::is.data.frame(data_anova_components),
    msg = "'data_anova_components' must be a data frame."
  )

  # Validate the exact columns used in the aesthetic mapping below.
  assertthat::assert_that(
    base::all(
      c("age", "component", "R2_Nagelkerke_percentage") %in%
        base::names(data_anova_components)
    ),
    msg = paste0(
      "'data_anova_components' must have columns",
      " 'age', 'component', and 'R2_Nagelkerke_percentage'."
    )
  )

  res <-
    data_anova_components |>
    ggplot2::ggplot(
      mapping = ggplot2::aes(
        x = age,
        y = R2_Nagelkerke_percentage,
        colour = component,
        group = component
      )
    ) +
    ggplot2::geom_line() +
    ggplot2::geom_point() +
    # Reverse so older ages plot on the left.
    ggplot2::scale_x_continuous(
      trans = "reverse"
    ) +
    # Clamp without dropping data (coord_cartesian zooms, not filters).
    ggplot2::coord_cartesian(
      ylim = c(0, NA)
    ) +
    ggplot2::labs(
      title = title,
      subtitle = subtitle,
      x = "Age (cal yr BP)",
      # this should read as "Percentage of variance explained (Nagelkerke R²)"
      y = expression("Percentage of variance explained" ~ "(" ~ R^2 ~ "Nagelkerke)"),
      colour = "Component"
    )

  return(res)
}
| 1 |
#' @title Get Spatial Window from Grid Catalogue
#' @description
#' Retrieves the spatial bounding box for a given spatial unit ID from
#' the project's spatial grid CSV catalogue.
#' @param scale_id
#' A single character string identifying the spatial unit. Must match
#' exactly one row in the catalogue file.
#' @param file
#' Path to the spatial grid CSV file.
#' Default: `here::here("Data/Input/spatial_grid.csv")`.
#' @return
#' A named list with two elements:
#' \describe{
#' \item{`x_lim`}{Numeric vector of length 2: `c(x_min, x_max)`.}
#' \item{`y_lim`}{Numeric vector of length 2: `c(y_min, y_max)`.}
#' }
#' @details
#' Reads the CSV using `readr::read_csv`, filters to the row whose
#' `scale_id` column matches the supplied `scale_id` argument, and
#' constructs the bounding box vectors. Validation ensures the file is
#' readable, has a `.csv` extension, and that exactly one row matches
#' the requested `scale_id`.
#' @seealso get_active_config
#' @export
get_spatial_window <- function(
    scale_id,
    file = here::here("Data/Input/spatial_grid.csv")) {
  assertthat::assert_that(
    is.character(scale_id) && length(scale_id) == 1,
    msg = paste0(
      "`scale_id` must be a single character string.",
      " Got length: ", length(scale_id)
    )
  )

  assertthat::assert_that(
    assertthat::is.readable(file) &&
      assertthat::has_extension(file, "csv"),
    msg = "`file` must be a readable CSV file."
  )

  data_grid <-
    readr::read_csv(
      file = file,
      show_col_types = FALSE
    )

  # .data/.env pronouns disambiguate the catalogue column from the
  # function argument of the same name.
  data_row <-
    dplyr::filter(
      data_grid,
      .data$scale_id == .env$scale_id
    )

  assertthat::assert_that(
    base::nrow(data_row) == 1,
    msg = paste0(
      "Expected exactly 1 row for scale_id '", scale_id, "'.",
      " Found: ", base::nrow(data_row)
    )
  )

  # Assemble the bounding box from the single matched row.
  res <-
    list(
      x_lim = c(
        data_row[["x_min"]],
        data_row[["x_max"]]
      ),
      y_lim = c(
        data_row[["y_min"]],
        data_row[["y_max"]]
      )
    )

  return(res)
}
| 1 |
#' @title Extract Age from String
#' @description
#' Extracts the age value from a vector of strings, taking all
#' characters after the double underscore ("__").
#' @param vec_names
#' A character vector containing names with the format "dataset__age".
#' @return
#' A character vector of age values.
#' @export
get_age_from_string <- function(vec_names) {
  assertthat::assert_that(
    is.character(vec_names),
    msg = "Input must be a character vector."
  )

  assertthat::assert_that(
    length(vec_names) > 0,
    msg = "Input vector must not be empty."
  )

  assertthat::assert_that(
    all(stringr::str_detect(vec_names, "__")),
    msg = "Input strings must contain '__' to extract age."
  )

  # Grab everything from the first "__" onwards, ...
  matched_tail <- stringr::str_extract(vec_names, "__(.*)")
  # ... drop the separator itself, ...
  age_part <- stringr::str_remove(matched_tail, "__")
  # ... and strip surrounding whitespace.
  stringr::str_trim(age_part)
}
#' @title Add Dataset Name Column from Row Names
#' @description
#' Adds a 'dataset_name' column to a data frame by extracting dataset names
#' from row names.
#' @param data
#' A data frame with row names in the format "dataset__something".
#' @return
#' The input data frame with an added 'dataset_name' column.
#' @export
add_dataset_name_column_from_rownames <- function(data) {
  # Dataset names are encoded as the part of the row name before "__".
  vec_dataset_names <-
    get_dataset_name_from_string(row.names(data))

  dplyr::mutate(
    data,
    dataset_name = vec_dataset_names
  )
}
#' @title Combine automatic and auxiliary classification tables
#' @description
#' Merges the automatically generated classification table with a
#' manually curated auxiliary table. When the same `sel_name` entry
#' exists in both tables the auxiliary (manual) row takes priority,
#' overriding the automatic classification entirely.
#' @param data_classification_table
#' A data frame produced by `make_classification_table()` with
#' columns `sel_name`, `kingdom`, `phylum`, `class`, `order`,
#' `family`, `genus`, and `species`.
#' @param data_aux_classification_table
#' A data frame produced by `get_aux_classification_table()` with
#' the same columns. May be an empty tibble (zero rows) when no
#' manual overrides exist.
#' @return
#' A tibble containing all unique taxa from both inputs. Manual
#' entries override automatic ones on `sel_name` collision. Only
#' columns present in both inputs are retained (intersection).
#' @details
#' Binding places auxiliary rows before automatic rows and then keeps
#' the first occurrence of each `sel_name` via `dplyr::distinct()`, so
#' manual classifications always win regardless of their relative
#' completeness.
#' @seealso
#' [get_aux_classification_table()],
#' [make_classification_table()],
#' [classify_taxonomic_resolution()]
#' @export
combine_classification_tables <- function(
    data_classification_table,
    data_aux_classification_table) {
  assertthat::assert_that(
    is.data.frame(data_classification_table),
    msg = "data_classification_table must be a data frame"
  )

  assertthat::assert_that(
    "sel_name" %in% colnames(data_classification_table),
    msg = paste(
      "data_classification_table must contain",
      "a 'sel_name' column"
    )
  )

  assertthat::assert_that(
    is.data.frame(data_aux_classification_table),
    msg = "data_aux_classification_table must be a data frame"
  )

  assertthat::assert_that(
    "sel_name" %in% colnames(data_aux_classification_table),
    msg = paste(
      "data_aux_classification_table must contain",
      "a 'sel_name' column"
    )
  )

  # Work only with the columns common to both tables.
  vec_shared_cols <-
    intersect(
      colnames(data_classification_table),
      colnames(data_aux_classification_table)
    )

  data_aux_subset <-
    dplyr::select(
      data_aux_classification_table,
      dplyr::all_of(vec_shared_cols)
    )

  data_auto_subset <-
    dplyr::select(
      data_classification_table,
      dplyr::all_of(vec_shared_cols)
    )

  # Auxiliary rows come first so that dplyr::distinct() keeps the
  # manual classification whenever a sel_name occurs in both tables.
  res <-
    dplyr::bind_rows(data_aux_subset, data_auto_subset) |>
    dplyr::distinct(sel_name, .keep_all = TRUE)

  return(res)
}
#' @title Get Sample Ages
#' @description
#' Extracts sample ages from a data frame containing dataset and sample info.
#' @param data
#' A data frame. Must contain the columns `dataset_name` and `data_samples`.
#' @return
#' A data frame with columns `dataset_name`, `sample_name`, and `age`.
#' @details
#' Validates the input data frame, ensures required columns are present, and
#' extracts sample ages by unnesting the `data_samples` column.
#' @export
get_sample_ages <- function(data = NULL) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "data_samples") %in% colnames(data)),
    msg = "data must contain columns 'dataset_name' and 'data_samples'"
  )

  # Expand the nested per-dataset sample tables, then keep only the
  # identifying columns plus the age of each sample.
  res <-
    data |>
    dplyr::select("dataset_name", "data_samples") |>
    tidyr::unnest(cols = "data_samples") |>
    dplyr::select("dataset_name", "sample_name", "age")

  return(res)
}
#' @title Filter Taxa by Minimum Number of Cores
#' @description
#' Filters out taxa that are not present in a sufficient number of
#' cores (distinct datasets). Only taxa occurring in at least
#' `min_n_cores` distinct `dataset_name` values are retained. This
#' removes taxa that appear in only a single core, which can
#' disproportionately influence the species-species co-occurrence matrix.
#' @param data
#' A data frame containing community data in long format. Must include
#' columns `taxon` and `dataset_name`.
#' @param min_n_cores
#' A single positive integer specifying the minimum number of distinct
#' cores (datasets) a taxon must appear in to be retained. Default is 2.
#' @return
#' A filtered data frame containing only taxa that appear in at least
#' `min_n_cores` distinct datasets. All original columns are preserved.
#' @details
#' Counts distinct `dataset_name` values per `taxon` across the entire
#' dataset and removes taxa below the threshold. An error is raised if
#' no taxa remain after filtering, which may indicate that
#' `min_n_cores` is set too high.
#' @seealso [filter_community_by_n_samples()], [filter_rare_taxa()],
#' [select_n_taxa()]
#' @export
filter_community_by_n_cores <- function(
    data = NULL,
    min_n_cores = 2) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(c("taxon", "dataset_name") %in% names(data)),
    msg = paste(
      "data must contain columns:",
      paste(c("taxon", "dataset_name"), collapse = ", ")
    )
  )

  assertthat::assert_that(
    is.numeric(min_n_cores) && length(min_n_cores) == 1,
    msg = "min_n_cores must be a single numeric value"
  )

  assertthat::assert_that(
    min_n_cores >= 1,
    msg = "min_n_cores must be greater than or equal to 1"
  )

  # Number of distinct cores (datasets) in which each taxon occurs.
  data_core_counts <-
    data |>
    dplyr::distinct(taxon, dataset_name) |>
    dplyr::count(taxon, name = "n_cores")

  vec_taxa_to_keep <-
    data_core_counts |>
    dplyr::filter(n_cores >= min_n_cores) |>
    dplyr::pull(taxon)

  res <-
    dplyr::filter(data, taxon %in% vec_taxa_to_keep)

  assertthat::assert_that(
    nrow(res) > 0,
    msg = paste(
      "No taxa remain after filtering.",
      "The min_n_cores is too high."
    )
  )

  return(res)
}
#' @title Interpolate Community Data
#' @description
#' Interpolates community proportion data to a regular time grid.
#' @param data
#' A data frame with columns `dataset_name`, `taxon`, `age`, and
#' `pollen_prop`. Must already be in proportion form - see
#' [make_community_proportions()].
#' @param ...
#' Additional arguments passed to [interpolate_data()], such as
#' `timestep`, `age_min`, and `age_max`.
#' @return
#' A data frame with interpolated community data at regular time
#' intervals.
#' @details
#' Calls [interpolate_data()] grouped by `dataset_name` and `taxon`.
#' Data must be converted to proportions before calling this function
#' using [make_community_proportions()].
#' @seealso [make_community_proportions()], [interpolate_data()]
#' @export
interpolate_community_data <- function(data, ...) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    "pollen_prop" %in% colnames(data),
    msg = paste(
      "data must contain a 'pollen_prop' column.",
      "Use make_community_proportions() first."
    )
  )

  # Delegate to the generic interpolator, grouping by core and taxon.
  res <-
    interpolate_data(
      data,
      by = c("dataset_name", "taxon"),
      ...
    )

  return(res)
}
#' @title Prepare Community Data for Model Fitting
#' @description
#' Reshapes long-format community data into a wide matrix suitable
#' for model fitting, filtering to the canonical sample index
#' supplied by `align_sample_ids()`.
#' @param data_community_long
#' A data frame in long format with columns `dataset_name`, `age`,
#' `taxon`, and `pollen_prop`.
#' @param data_sample_ids
#' A data frame of valid `(dataset_name, age)` pairs as returned by
#' `align_sample_ids()`.
#' @return
#' A numeric matrix with row names in the format
#' `"<dataset_name>__<age>"` and taxon names as column names.
#' Missing taxon-sample combinations are filled with `0`. Rows are
#' ordered by `dataset_name` then `age`, matching the order in
#' `data_sample_ids`.
#' @details
#' Only samples present in `data_sample_ids` are retained. Rows
#' with `NA` or zero `pollen_prop` are dropped before pivoting.
#' The function widens the data and converts the result to a matrix
#' directly, so the output is ready for `filter_constant_taxa()`
#' and `assemble_data_to_fit()`.
#' @seealso [align_sample_ids()], [filter_constant_taxa()],
#' [assemble_data_to_fit()]
#' @export
prepare_community_for_fit <- function(
    data_community_long = NULL,
    data_sample_ids = NULL) {
  assertthat::assert_that(
    is.data.frame(data_community_long),
    msg = "data_community_long must be a data frame"
  )

  assertthat::assert_that(
    is.data.frame(data_sample_ids),
    msg = "data_sample_ids must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("dataset_name", "age", "taxon", "pollen_prop") %in%
        names(data_community_long)
    ),
    msg = paste0(
      "data_community_long must contain columns",
      " 'dataset_name', 'age', 'taxon', 'pollen_prop'"
    )
  )

  assertthat::assert_that(
    all(c("dataset_name", "age") %in% names(data_sample_ids)),
    msg = paste0(
      "data_sample_ids must contain columns",
      " 'dataset_name' and 'age'"
    )
  )

  # Keep only the canonical samples, drop missing/zero proportions,
  # and fix the row order (dataset, then age).
  data_filtered <-
    data_community_long |>
    dplyr::inner_join(
      data_sample_ids,
      by = dplyr::join_by(dataset_name, age)
    ) |>
    tidyr::drop_na(pollen_prop) |>
    dplyr::filter(pollen_prop > 0) |>
    dplyr::arrange(dataset_name, age)

  # Widen to one row per "<dataset_name>__<age>" sample; absent
  # taxon-sample combinations become 0.
  data_wide <-
    data_filtered |>
    dplyr::mutate(
      sample_name = paste0(dataset_name, "__", age)
    ) |>
    dplyr::select(sample_name, taxon, pollen_prop) |>
    tidyr::pivot_wider(
      names_from = "taxon",
      values_from = "pollen_prop",
      values_fill = 0
    )

  res <-
    data_wide |>
    tibble::column_to_rownames("sample_name") |>
    as.matrix()

  return(res)
}
#' @title Add Age Column from Row Names
#' @description
#' Adds an 'age' column to a data frame by extracting age values from row
#' names.
#' @param data
#' A data frame with row names in the format "dataset__age".
#' @return
#' The input data frame with an added 'age' column.
#' @export
add_age_column_from_rownames <- function(data) {
  # Ages are encoded as the part of the row name after "__".
  vec_ages <-
    get_age_from_string(row.names(data))

  dplyr::mutate(
    data,
    age = as.numeric(vec_ages)
  )
}
#' @title Make Community Data Long
#' @description
#' Converts community data from wide format to long format.
#' @param data
#' A data frame. Must contain `dataset_name` and `sample_name` columns.
#' @return
#' A data frame in long format with columns `dataset_name`, `sample_name`,
#' `taxon`, and `pollen_count`.
#' @details
#' Uses `tidyr::pivot_longer` to reshape the data, dropping NA values in the
#' process.
#' @export
make_community_data_long <- function(data = NULL) {
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(c("dataset_name", "sample_name") %in% colnames(data)),
    msg = "data must contain columns 'dataset_name' and 'sample_name'"
  )

  # Every non-identifier column is a taxon; stack them into
  # taxon/pollen_count pairs and drop missing counts.
  res <-
    tidyr::pivot_longer(
      data,
      cols = !c("dataset_name", "sample_name"),
      names_to = "taxon",
      values_to = "pollen_count",
      values_drop_na = TRUE
    )

  return(res)
}
#' @title Get Dataset Coordinates
#' @description
#' Extracts unique coordinates for each dataset, excluding gridpoints.
#' @param data
#' A data frame containing columns 'dataset_name', 'dataset_type',
#' 'coord_long', and 'coord_lat'. Rows with `dataset_type == "gridpoints"`
#' are excluded.
#' @return
#' A data frame with unique coordinates for each dataset, with
#' `dataset_name` as row names.
#' @export
get_coords <- function(data) {
  # Input validation, consistent with the other exported helpers:
  # fail early with a clear message rather than deep inside dplyr.
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(
      c("dataset_name", "dataset_type", "coord_long", "coord_lat") %in%
        colnames(data)
    ),
    msg = paste(
      "data must contain columns 'dataset_name', 'dataset_type',",
      "'coord_long', and 'coord_lat'"
    )
  )

  res <-
    data %>%
    dplyr::filter(
      dataset_type != "gridpoints"
    ) %>%
    dplyr::select(
      "dataset_name",
      "coord_long",
      "coord_lat"
    ) %>%
    dplyr::distinct() %>%
    tibble::column_to_rownames("dataset_name")

  return(res)
}
#' @title Make Classification Table
#' @description
#' Creates a wide-format taxonomic classification table from a list of data
#' frames containing taxon names and ranks. All seven standard taxonomic ranks
#' are retained: `kingdom`, `phylum`, `class`, `order`, `family`, `genus`,
#' and `species`.
#' @param data A list of data frames, each containing columns `sel_name`,
#' `rank`, and `name`.
#' @return A data frame in wide format with up to eight columns (`sel_name`
#' plus one column per taxonomic rank) and one row per `sel_name`. Ranks
#' not available for a given taxon are `NA`.
#' @details
#' Filters for all seven standard taxonomic ranks (`kingdom`, `phylum`,
#' `class`, `order`, `family`, `genus`, `species`), removes duplicate
#' rank-name pairs, and pivots to wide format. `values_fn = dplyr::first`
#' is used as a defensive guard to prevent list columns in the rare case
#' where a taxon has more than one name recorded for the same rank.
#' @export
make_classification_table <- function(data) {
  data_bound <- dplyr::bind_rows(data)

  # Input validation, consistent with the other exported helpers:
  # fail early with a clear message rather than deep inside dplyr.
  assertthat::assert_that(
    all(c("sel_name", "rank", "name") %in% colnames(data_bound)),
    msg = "data must contain columns 'sel_name', 'rank', and 'name'"
  )

  vec_ranks <- c(
    "kingdom", "phylum", "class", "order",
    "family", "genus", "species"
  )

  res <-
    data_bound %>%
    dplyr::filter(
      rank %in% vec_ranks
    ) %>%
    dplyr::distinct(
      sel_name, rank, name
    ) %>%
    tidyr::pivot_wider(
      names_from = rank,
      values_from = name,
      # Defensive guard against list columns when a taxon has more
      # than one name recorded for the same rank.
      values_fn = dplyr::first
    )

  return(res)
}
#' @title Transform to Proportions
#' @description
#' Transforms pollen count data into proportions based on total pollen count.
#' @param data
#' A data frame containing pollen count data. Must contain columns
#' `sample_name` and `pollen_count`.
#' @param pollen_sum
#' A data frame with total pollen counts for each sample. Must contain
#' columns `sample_name` and `pollen_sum`.
#' @return
#' A data frame with pollen proportions, excluding `pollen_sum` and
#' `pollen_count` columns.
#' @details
#' Joins the input data with total pollen counts and calculates proportions
#' using `dplyr::mutate`.
#' @export
transform_to_proportions <- function(data = NULL, pollen_sum = NULL) {
  # Input validation, consistent with the other exported helpers:
  # the NULL defaults otherwise fail deep inside the join.
  assertthat::assert_that(
    is.data.frame(data),
    msg = "data must be a data frame"
  )

  assertthat::assert_that(
    all(c("sample_name", "pollen_count") %in% colnames(data)),
    msg = "data must contain columns 'sample_name' and 'pollen_count'"
  )

  assertthat::assert_that(
    is.data.frame(pollen_sum),
    msg = "pollen_sum must be a data frame"
  )

  assertthat::assert_that(
    all(c("sample_name", "pollen_sum") %in% colnames(pollen_sum)),
    msg = "pollen_sum must contain columns 'sample_name' and 'pollen_sum'"
  )

  res <-
    data %>%
    dplyr::left_join(
      pollen_sum,
      by = "sample_name"
    ) %>%
    dplyr::mutate(
      pollen_prop = pollen_count / pollen_sum,
      .after = pollen_count
    ) %>%
    dplyr::select(
      !c("pollen_sum", "pollen_count")
    )

  return(res)
}