1 |
#' @title Extract Dataset Name from String |
|
2 |
#' @description |
|
3 |
#' Extracts the dataset name from a vector of strings, taking all characters |
|
4 |
#' before the double underscore ("__"). |
|
5 |
#' @param vec_names |
|
6 |
#' A character vector containing names with the format "dataset__something". |
|
7 |
#' @return |
|
8 |
#' A character vector of dataset names. |
|
9 |
#' @export |
|
10 |
get_dataset_name_from_string <- function(vec_names) { |
|
11 | 69x |
assertthat::assert_that( |
12 | 69x |
is.character(vec_names), |
13 | 69x |
msg = "Input must be a character vector." |
14 |
) |
|
15 | ||
16 | 64x |
assertthat::assert_that( |
17 | 64x |
length(vec_names) > 0, |
18 | 64x |
msg = "Input vector must not be empty." |
19 |
) |
|
20 | ||
21 | 63x |
assertthat::assert_that( |
22 | 63x |
all(stringr::str_detect(vec_names, "__")), |
23 | 63x |
msg = "Input strings must contain '__' to extract dataset names." |
24 |
) |
|
25 | ||
26 | 60x |
vec_names %>% |
27 |
# get all values before double "__" |
|
28 | 60x |
stringr::str_extract(".*(?=__)") %>% |
29 | 60x |
stringr::str_trim() %>% |
30 | 60x |
return() |
31 |
} |
1 |
#' @title Get Abiotic Data |
|
2 |
#' @description |
|
3 |
#' Extracts abiotic data from a data frame containing nested abiotic |
|
4 |
#' information. |
|
5 |
#' @param data |
|
6 |
#' A data frame. Must contain the columns `dataset_name` and `data_abiotic`. |
|
7 |
#' @return |
|
8 |
#' A data frame with columns `dataset_name`, `sample_name`, |
|
9 |
#' `abiotic_variable_name`, and `abiotic_value`. |
|
10 |
#' @details |
|
11 |
#' Validates the input data frame, ensures required columns are present, and |
|
12 |
#' unnests the `data_abiotic` column. |
|
13 |
#' @export |
|
14 |
get_abiotic_data <- function(data = NULL) { |
|
15 | 4x |
assertthat::assert_that( |
16 | 4x |
is.data.frame(data), |
17 | 4x |
msg = "data must be a data frame" |
18 |
) |
|
19 | ||
20 | 2x |
assertthat::assert_that( |
21 | 2x |
all(c("dataset_name", "data_abiotic") %in% colnames(data)), |
22 | 2x |
msg = "data must contain columns 'dataset_name' and 'data_abiotic'" |
23 |
) |
|
24 | 2x |
data %>% |
25 | 2x |
dplyr::select(dataset_name, data_abiotic) %>% |
26 | 2x |
tidyr::unnest( |
27 | 2x |
cols = c(data_abiotic) |
28 |
) %>% |
|
29 | 2x |
dplyr::select(dataset_name, sample_name, abiotic_variable_name, abiotic_value) %>% |
30 | 2x |
return() |
31 |
} |
1 |
#' @title Add Model Evaluation |
|
2 |
#' @description |
|
3 |
#' Evaluates a fitted Hmsc model using predicted data and returns a list |
|
4 |
#' containing the model and its evaluation. |
|
5 |
#' @param mod_fitted A fitted Hmsc model object. Must be of class 'Hmsc'. |
|
6 |
#' @param data_pred An array of predicted values. Must be of class 'array'. |
|
7 |
#' @return |
|
8 |
#' A list with two elements: the fitted model ('mod') and the evaluation |
|
9 |
#' results ('eval'). |
|
10 |
#' @seealso Hmsc::evaluateModelFit |
|
11 |
#' @export |
|
12 |
add_model_evaluation <- function(mod_fitted = NULL, |
|
13 |
data_pred = NULL) { |
|
14 | 10x |
assertthat::assert_that( |
15 | 10x |
class(mod_fitted) == "Hmsc", |
16 | 10x |
msg = "mod_fitted must be of class Hmsc" |
17 |
) |
|
18 | ||
19 | 8x |
assertthat::assert_that( |
20 | 8x |
class(data_pred) == "array", |
21 | 8x |
msg = "data_pred must be an array" |
22 |
) |
|
23 | ||
24 | 8x |
list_eval <- |
25 | 8x |
Hmsc::evaluateModelFit( |
26 | 8x |
hM = mod_fitted, |
27 | 8x |
predY = data_pred |
28 |
) |
|
29 | ||
30 | 8x |
res <- |
31 | 8x |
list( |
32 | 8x |
mod = mod_fitted, |
33 | 8x |
eval = list_eval |
34 |
) |
|
35 | ||
36 | 8x |
return(res) |
37 |
} |
1 |
#' @title Get Community Taxa |
|
2 |
#' @description |
|
3 |
#' Extracts a vector of unique taxa from a community data frame. |
|
4 |
#' @param data A data frame containing a 'taxon' column. |
|
5 |
#' @return A character vector of unique taxon names present in the data. |
|
6 |
#' @details |
|
7 |
#' Uses dplyr to select distinct values from the 'taxon' column and returns |
|
8 |
#' them as a vector. |
|
9 |
#' @export |
|
10 |
get_community_taxa <- function(data) { |
|
11 | ||
12 | 4x |
assertthat::assert_that( |
13 | 4x |
is.data.frame(data), |
14 | 4x |
msg = "data must be a data frame" |
15 |
) |
|
16 | ||
17 | 3x |
assertthat::assert_that( |
18 | 3x |
"taxon" %in% colnames(data), |
19 | 3x |
msg = "data must contain a 'taxon' column" |
20 |
) |
|
21 | ||
22 | 2x |
data_res <- |
23 | 2x |
data %>% |
24 | 2x |
dplyr::distinct(taxon) %>% |
25 | 2x |
dplyr::pull(taxon) |
26 | ||
27 | 2x |
return(data_res) |
28 |
} |
1 |
#' @title Interpolate Data |
|
2 |
#' @description |
|
3 |
#' Interpolates data over a specified age range and timestep using a method. |
|
4 |
#' @param data |
|
5 |
#' A data frame containing the data to be interpolated. |
|
6 |
#' @param age_var |
|
7 |
#' Name of the age variable column (default: "age"). |
|
8 |
#' @param value_var |
|
9 |
#' Name of the value variable column (default: "pollen_prop"). |
|
10 |
#' @param method |
|
11 |
#' Interpolation method to use (default: "linear"). |
|
12 |
#' @param rule |
|
13 |
#' Integer specifying the extrapolation rule (default: 1). |
|
14 |
#' @param ties |
|
15 |
#' Function to handle tied values (default: `mean`). |
|
16 |
#' @param age_min |
|
17 |
#' Minimum age for interpolation (default: 0). |
|
18 |
#' @param age_max |
|
19 |
#' Maximum age for interpolation (default: 12000). |
|
20 |
#' @param timestep |
|
21 |
#' Timestep for interpolation (default: 500). |
|
22 |
#' @return |
|
23 |
#' A data frame with interpolated values, including dataset name, taxon, age, |
|
24 |
#' and value columns. |
|
25 |
#' @details |
|
26 |
#' Nests data by dataset and taxon, performs interpolation using `stats::approx`, |
|
27 |
#' and returns the interpolated data in a flat format. |
|
28 |
#' @seealso [stats::approx()] |
|
29 |
#' @export |
|
30 |
interpolate_data <- function(data = NULL, |
|
31 |
by = "dataset_name", |
|
32 |
age_var = "age", |
|
33 |
value_var = "pollen_prop", |
|
34 |
method = "linear", |
|
35 |
rule = 1, |
|
36 |
ties = mean, |
|
37 |
age_min = 0, |
|
38 |
age_max = 12e03, |
|
39 |
timestep = 500) { |
|
40 | 8x |
assertthat::assert_that( |
41 | 8x |
is.data.frame(data), |
42 | 8x |
msg = "data must be a data frame" |
43 |
) |
|
44 | ||
45 | 4x |
assertthat::assert_that( |
46 | 4x |
is.character(by) && length(by) > 0, |
47 | 4x |
msg = "by must be a character vector with at least one element" |
48 |
) |
|
49 | ||
50 | 4x |
assertthat::assert_that( |
51 | 4x |
all(by %in% colnames(data)), |
52 | 4x |
msg = paste0( |
53 | 4x |
"data must contain the following columns: ", |
54 | 4x |
paste(by, collapse = ", ") |
55 |
) |
|
56 |
) |
|
57 | ||
58 | 4x |
assertthat::assert_that( |
59 | 4x |
is.character(age_var) && length(age_var) == 1, |
60 | 4x |
msg = "age_var must be a single character string" |
61 |
) |
|
62 | ||
63 | 4x |
assertthat::assert_that( |
64 | 4x |
is.character(value_var) && length(value_var) == 1, |
65 | 4x |
msg = "value_var must be a single character string" |
66 |
) |
|
67 | ||
68 | 4x |
assertthat::assert_that( |
69 | 4x |
is.character(method) && length(method) == 1, |
70 | 4x |
msg = "method must be a single character string" |
71 |
) |
|
72 | ||
73 | 4x |
assertthat::assert_that( |
74 | 4x |
is.numeric(rule) && length(rule) == 1, |
75 | 4x |
msg = "rule must be a single numeric value" |
76 |
) |
|
77 | ||
78 | 4x |
assertthat::assert_that( |
79 | 4x |
is.function(ties), |
80 | 4x |
msg = "ties must be a function" |
81 |
) |
|
82 | ||
83 | 4x |
assertthat::assert_that( |
84 | 4x |
is.numeric(age_min) && length(age_min) == 1, |
85 | 4x |
msg = "age_min must be a single numeric value" |
86 |
) |
|
87 | ||
88 | 4x |
assertthat::assert_that( |
89 | 4x |
is.numeric(age_max) && length(age_max) == 1, |
90 | 4x |
msg = "age_max must be a single numeric value" |
91 |
) |
|
92 | ||
93 | 4x |
assertthat::assert_that( |
94 | 4x |
age_min < age_max, |
95 | 4x |
msg = "age_min must be less than age_max" |
96 |
) |
|
97 | ||
98 | 4x |
assertthat::assert_that( |
99 | 4x |
is.numeric(timestep) && length(timestep) == 1, |
100 | 4x |
msg = "timestep must be a single numeric value" |
101 |
) |
|
102 | ||
103 | 4x |
assertthat::assert_that( |
104 | 4x |
timestep > 0, |
105 | 4x |
msg = "timestep must be greater than 0" |
106 |
) |
|
107 | ||
108 | 4x |
data %>% |
109 | 4x |
tidyr::nest( |
110 | 4x |
data_nested = !dplyr::any_of(by) |
111 |
) %>% |
|
112 | 4x |
dplyr::mutate( |
113 | 4x |
data_interpolated = purrr::map( |
114 | 4x |
.x = data_nested, |
115 | 4x |
.f = purrr::possibly( |
116 | 4x |
.f = ~ .x %>% |
117 | 4x |
dplyr::select( |
118 | 4x |
!!rlang::sym(age_var), |
119 | 4x |
!!rlang::sym(value_var) |
120 |
) %>% |
|
121 | 4x |
grDevices::xy.coords() %>% |
122 | 4x |
stats::approx( |
123 | 4x |
xout = seq( |
124 | 4x |
age_min, |
125 | 4x |
age_max, |
126 | 4x |
by = timestep |
127 |
), |
|
128 | 4x |
ties = ties, |
129 | 4x |
method = method, |
130 | 4x |
rule = rule |
131 |
) %>% |
|
132 | 4x |
tibble::as_tibble() %>% |
133 | 4x |
dplyr::rename( |
134 | 4x |
!!rlang::sym(age_var) := x, |
135 | 4x |
!!rlang::sym(value_var) := y |
136 |
), |
|
137 | 4x |
otherwise = NULL |
138 |
) |
|
139 |
) |
|
140 |
) %>% |
|
141 | 4x |
tidyr::unnest(data_interpolated) %>% |
142 | 4x |
dplyr::select( |
143 | 4x |
dplyr::any_of(by), |
144 | 4x |
!!rlang::sym(age_var), |
145 | 4x |
!!rlang::sym(value_var) |
146 |
) %>% |
|
147 | 4x |
return() |
148 |
} |
1 |
#' @title Add Age to Community Data |
|
2 |
#' @description |
|
3 |
#' Merges community data with age data based on dataset and sample names. |
|
4 |
#' @param data_community |
|
5 |
#' A data frame containing community data. Must include `dataset_name` and |
|
6 |
#' `sample_name` columns. |
|
7 |
#' @param data_ages |
|
8 |
#' A data frame containing age data. Must include `dataset_name` and |
|
9 |
#' `sample_name` columns. |
|
10 |
#' @return |
|
11 |
#' A data frame with community data merged with the corresponding age data. |
|
12 |
#' @details |
|
13 |
#' Performs a left join between community data and age data using |
|
14 |
#' `dataset_name` and `sample_name` as keys. |
|
15 |
#' @export |
|
16 |
add_age_to_samples <- function(data_community = NULL, data_ages = NULL) { |
|
17 | 4x |
assertthat::assert_that( |
18 | 4x |
is.data.frame(data_community), |
19 | 4x |
msg = "data_community must be a data frame" |
20 |
) |
|
21 | 2x |
assertthat::assert_that( |
22 | 2x |
is.data.frame(data_ages), |
23 | 2x |
msg = "data_ages must be a data frame" |
24 |
) |
|
25 | ||
26 | 2x |
assertthat::assert_that( |
27 | 2x |
all(c("dataset_name", "sample_name") %in% colnames(data_community)), |
28 | 2x |
msg = "data_community must contain columns 'dataset_name' and 'sample_name'" |
29 |
) |
|
30 | 2x |
assertthat::assert_that( |
31 | 2x |
all(c("dataset_name", "sample_name") %in% colnames(data_ages)), |
32 | 2x |
msg = "data_ages must contain columns 'dataset_name' and 'sample_name'" |
33 |
) |
|
34 | ||
35 | 2x |
dplyr::left_join( |
36 | 2x |
x = data_community, |
37 | 2x |
y = data_ages, |
38 | 2x |
by = c("dataset_name", "sample_name") |
39 |
) %>% |
|
40 | 2x |
return() |
41 |
} |
1 |
#' @title Check and Prepare Data for Model Fitting |
|
2 |
#' @description |
|
3 |
#' Cleans and prepares community, abiotic, and coordinate data for HMSC model |
|
4 |
#' fitting, with optional age subsetting. |
|
5 |
#' @param data_community |
|
6 |
#' Data frame of community data. |
|
7 |
#' @param data_abiotic |
|
8 |
#' Data frame of abiotic data. |
|
9 |
#' @param data_coords |
|
10 |
#' Data frame of coordinates. |
|
11 |
#' @param subset_age |
|
12 |
#' Optional age value to subset data. |
|
13 |
#' @seealso [make_hmsc_model()] |
|
14 |
#' @return |
|
15 |
#' A list of cleaned and prepared data frames for model fitting. |
|
16 |
#' @export |
|
17 |
check_and_prepare_data_for_fit <- function( |
|
18 |
data_community = NULL, |
|
19 |
data_abiotic = NULL, |
|
20 |
data_coords = NULL, |
|
21 |
subset_age = NULL) { |
|
22 | 21x |
`%>%` <- magrittr::`%>%` |
23 | ||
24 | 21x |
data_community_no_na <- |
25 | 21x |
tidyr::drop_na(data_community) |
26 | ||
27 | 18x |
data_abiotic_no_na <- |
28 | 18x |
tidyr::drop_na(data_abiotic) |
29 | ||
30 | 15x |
data_coords_no_na <- |
31 | 15x |
tidyr::drop_na(data_coords) %>% |
32 | 15x |
dplyr::distinct() |
33 | ||
34 |
if ( |
|
35 | 12x |
!is.null(subset_age) |
36 |
) { |
|
37 | 2x |
data_community_no_na <- |
38 | 2x |
data_community_no_na %>% |
39 | 2x |
add_age_column_from_rownames() %>% |
40 | 2x |
dplyr::filter(age == subset_age) %>% |
41 | 2x |
dplyr::select(-age) |
42 | ||
43 | 2x |
data_abiotic_no_na <- |
44 | 2x |
data_abiotic_no_na %>% |
45 | 2x |
add_age_column_from_rownames() %>% |
46 | 2x |
dplyr::filter(age == subset_age) %>% |
47 | 2x |
dplyr::select(-age) |
48 |
} |
|
49 | ||
50 | 12x |
data_community_rownames <- |
51 | 12x |
data_community_no_na %>% |
52 | 12x |
add_age_column_from_rownames() %>% |
53 | 12x |
add_dataset_name_column_from_rownames() %>% |
54 | 12x |
dplyr::distinct(dataset_name, age) |
55 | ||
56 | 11x |
data_abiotic_rownames <- |
57 | 11x |
data_abiotic_no_na %>% |
58 | 11x |
add_age_column_from_rownames() %>% |
59 | 11x |
add_dataset_name_column_from_rownames() %>% |
60 | 11x |
dplyr::distinct(dataset_name, age) |
61 | ||
62 | 11x |
data_coords_rownames <- |
63 | 11x |
data_coords_no_na %>% |
64 | 11x |
tibble::rownames_to_column("dataset_name") %>% |
65 | 11x |
dplyr::distinct(dataset_name) |
66 | ||
67 | 11x |
data_intersect <- |
68 | 11x |
dplyr::inner_join( |
69 | 11x |
data_community_rownames, |
70 | 11x |
data_abiotic_rownames, |
71 | 11x |
by = dplyr::join_by(dataset_name, age) |
72 |
) %>% |
|
73 | 11x |
dplyr::inner_join( |
74 | 11x |
data_coords_rownames, |
75 | 11x |
by = dplyr::join_by(dataset_name) |
76 |
) %>% |
|
77 | 11x |
dplyr::distinct() |
78 | ||
79 | 11x |
data_community_to_fit <- |
80 | 11x |
data_community_no_na %>% |
81 | 11x |
add_age_column_from_rownames() %>% |
82 | 11x |
add_dataset_name_column_from_rownames() %>% |
83 | 11x |
tibble::rownames_to_column("row_names") %>% |
84 | 11x |
dplyr::inner_join( |
85 | 11x |
data_intersect, |
86 | 11x |
by = dplyr::join_by(dataset_name, age) |
87 |
) %>% |
|
88 | 11x |
dplyr::select(-dataset_name, -age) %>% |
89 | 11x |
tibble::column_to_rownames("row_names") |
90 | ||
91 | ||
92 | 11x |
data_abiotic_to_fit <- |
93 | 11x |
data_abiotic_no_na %>% |
94 | 11x |
add_age_column_from_rownames() %>% |
95 | 11x |
add_dataset_name_column_from_rownames() %>% |
96 | 11x |
tibble::rownames_to_column("row_names") %>% |
97 | 11x |
dplyr::inner_join( |
98 | 11x |
data_intersect, |
99 | 11x |
by = dplyr::join_by(dataset_name, age) |
100 |
) %>% |
|
101 | 11x |
dplyr::select(-dataset_name, -age) %>% |
102 | 11x |
tibble::column_to_rownames("row_names") |
103 | ||
104 | 11x |
vec_shared_dataset_names <- |
105 | 11x |
data_intersect %>% |
106 | 11x |
dplyr::distinct(dataset_name) %>% |
107 | 11x |
purrr::chuck("dataset_name") %>% |
108 | 11x |
as.character() |
109 | ||
110 | 11x |
data_coords_to_fit <- |
111 | 11x |
data_coords_no_na %>% |
112 | 11x |
as.data.frame() %>% |
113 | 11x |
tibble::rownames_to_column("row_names") %>% |
114 | 11x |
dplyr::filter(row_names %in% vec_shared_dataset_names) %>% |
115 | 11x |
tibble::column_to_rownames("row_names") |
116 | ||
117 | 11x |
vec_age <- |
118 | 11x |
data_intersect %>% |
119 | 11x |
dplyr::distinct(age) %>% |
120 | 11x |
purrr::chuck("age") %>% |
121 | 11x |
as.numeric() %>% |
122 | 11x |
sort() %>% |
123 | 11x |
as.character() |
124 | ||
125 | 11x |
data_ages_to_fit <- |
126 | 11x |
tibble::tibble( |
127 | 11x |
age = as.numeric(vec_age), |
128 | 11x |
row_names = age |
129 |
) %>% |
|
130 | 11x |
tibble::column_to_rownames("row_names") |
131 | ||
132 | 11x |
res <- |
133 | 11x |
list( |
134 | 11x |
data_community_to_fit = data_community_to_fit, |
135 | 11x |
data_abiotic_to_fit = data_abiotic_to_fit, |
136 | 11x |
data_ages_to_fit = data_ages_to_fit, |
137 | 11x |
data_coords_to_fit = data_coords_to_fit |
138 |
) |
|
139 | ||
140 | 11x |
return(res) |
141 |
} |
1 |
#' @title Fit HMSC Model |
|
2 |
#' @description |
|
3 |
#' Sample the MCMC process for the HMSC model. |
|
4 |
#' @param mod_hmsc |
|
5 |
#' An unfitted HMSC model object. |
|
6 |
#' @param n_chains |
|
7 |
#' Number of MCMC chains (default: 20). |
|
8 |
#' @param n_samples |
|
9 |
#' Number of MCMC samples (default: 10,000). |
|
10 |
#' @param n_thin |
|
11 |
#' Thinning interval for MCMC samples (default: 5). |
|
12 |
#' @param n_transient |
|
13 |
#' Number of transient iterations (default: 2,500). |
|
14 |
#' @param n_parallel |
|
15 |
#' Number of parallel chains (default: 20). |
|
16 |
#' @param n_samples_verbose |
|
17 |
#' Verbosity interval for MCMC sampling (default: 500). |
|
18 |
#' @return |
|
19 |
#' Returns a fitted HMSC model object. |
|
20 |
#' @export |
|
21 |
fit_hmsc_model <- function( |
|
22 |
mod_hmsc, |
|
23 |
n_chains = 20, |
|
24 |
n_samples = 10e3, |
|
25 |
n_thin = 1, |
|
26 |
n_transient = 2500, |
|
27 |
n_parallel = 20, |
|
28 |
n_samples_verbose = 500) { |
|
29 | 22x |
assertthat::assert_that( |
30 | 22x |
inherits(mod_hmsc, "Hmsc"), |
31 | 22x |
msg = "mod_hmsc must be an HMSC model object" |
32 |
) |
|
33 | ||
34 | 20x |
assertthat::assert_that( |
35 | 20x |
is.numeric(n_chains) && n_chains > 0, |
36 | 20x |
msg = "n_chains must be a positive number" |
37 |
) |
|
38 | ||
39 | 18x |
assertthat::assert_that( |
40 | 18x |
is.numeric(n_samples) && n_samples > 0, |
41 | 18x |
msg = "n_samples must be a positive number" |
42 |
) |
|
43 | ||
44 | 16x |
assertthat::assert_that( |
45 | 16x |
is.numeric(n_thin) && n_thin > 0, |
46 | 16x |
msg = "n_thin must be a positive number" |
47 |
) |
|
48 | ||
49 | 14x |
assertthat::assert_that( |
50 | 14x |
is.numeric(n_transient) && n_transient > 0, |
51 | 14x |
msg = "n_transient must be a positive number" |
52 |
) |
|
53 | ||
54 | 12x |
assertthat::assert_that( |
55 | 12x |
is.numeric(n_parallel) && n_parallel > 0, |
56 | 12x |
msg = "n_parallel must be a positive number" |
57 |
) |
|
58 | ||
59 | 10x |
assertthat::assert_that( |
60 | 10x |
is.numeric(n_samples_verbose) && n_samples_verbose > 0, |
61 | 10x |
msg = "n_samples_verbose must be a positive number" |
62 |
) |
|
63 | ||
64 | 8x |
mod_hmsc_fitted <- |
65 | 8x |
Hmsc::sampleMcmc( |
66 | 8x |
mod_hmsc, |
67 | 8x |
nChains = n_chains, |
68 | 8x |
samples = n_samples, |
69 | 8x |
thin = n_thin, |
70 | 8x |
transient = n_transient, |
71 | 8x |
verbose = n_samples_verbose, |
72 | 8x |
nParallel = n_parallel |
73 |
) |
|
74 | ||
75 | 8x |
return(mod_hmsc_fitted) |
76 |
} |
1 |
#' @title Prepare Data for Model Fitting |
|
2 |
#' @description |
|
3 |
#' Prepares community or abiotic data for model fitting by reshaping it into |
|
4 |
#' a wide format with appropriate column names. |
|
5 |
#' @param data |
|
6 |
#' A data frame containing the input data. For `type = "community"`, it must |
|
7 |
#' include columns `dataset_name`, `age`, `taxon`, and `pollen_prop`. For |
|
8 |
#' `type = "abiotic"`, it must include columns `dataset_name`, `age`, |
|
9 |
#' `abiotic_variable_name`, and `abiotic_value`. |
|
10 |
#' @param type |
|
11 |
#' A character string specifying the type of data to prepare. Must be either |
|
12 |
#' "community" or "abiotic" (default: "community"). |
|
13 |
#' @return |
|
14 |
#' A data frame in wide format, with `sample_name` as row names and either |
|
15 |
#' taxa or abiotic variable names as columns. For community data, missing |
|
16 |
#' values are filled with 0. For abiotic data, missing values are left as NA. |
|
17 |
#' @details |
|
18 |
#' The function validates the input data and reshapes it based on the |
|
19 |
#' specified `type`. For community data, it combines `dataset_name` and `age` |
|
20 |
#' into a `sample_name` column, selects relevant columns, and pivots the data |
|
21 |
#' to a wide format. For abiotic data, it performs similar steps but uses |
|
22 |
#' abiotic variable names and values. |
|
23 |
#' @export |
|
24 |
prepare_data_for_fit <- function(data = NULL, type = c("community", "abiotic")) { |
|
25 | 9x |
assertthat::assert_that( |
26 | 9x |
is.data.frame(data), |
27 | 9x |
msg = "data must be a data frame" |
28 |
) |
|
29 | ||
30 | 7x |
type <- match.arg(type) |
31 | ||
32 | 6x |
assertthat::assert_that( |
33 | 6x |
type %in% c("community", "abiotic"), |
34 | 6x |
msg = "type must be either 'community' or 'abiotic'" |
35 |
) |
|
36 | ||
37 | 6x |
res <- |
38 | 6x |
switch(type, |
39 | 6x |
"community" = data %>% |
40 | 6x |
dplyr::mutate( |
41 | 6x |
sample_name = paste0(dataset_name, "__", age), |
42 |
) %>% |
|
43 | 6x |
dplyr::select("sample_name", "taxon", "pollen_prop") %>% |
44 | 6x |
tidyr::pivot_wider( |
45 | 6x |
names_from = "taxon", |
46 | 6x |
values_from = "pollen_prop", |
47 | 6x |
values_fill = 0 |
48 |
) %>% |
|
49 | 6x |
tibble::column_to_rownames("sample_name"), |
50 | 6x |
"abiotic" = data %>% |
51 | 6x |
dplyr::mutate( |
52 | 6x |
sample_name = paste0(dataset_name, "__", age), |
53 |
) %>% |
|
54 | 6x |
dplyr::select( |
55 | 6x |
"sample_name", "abiotic_variable_name", "abiotic_value" |
56 |
) %>% |
|
57 | 6x |
tidyr::pivot_wider( |
58 | 6x |
names_from = "abiotic_variable_name", |
59 | 6x |
values_from = "abiotic_value", |
60 | 6x |
values_fill = NULL |
61 |
) %>% |
|
62 | 6x |
tibble::column_to_rownames("sample_name") |
63 |
) |
|
64 | ||
65 | 4x |
return(res) |
66 |
} |
1 |
#' @title Save Progress Visualisation |
|
2 |
#' @description |
|
3 |
#' Generates a visualisation of project progress and saves it as HTML and PNG. |
|
4 |
#' @param sel_script |
|
5 |
#' The script file to be visualised. |
|
6 |
#' @param output_file |
|
7 |
#' The name of the output file (default: "project_status"). |
|
8 |
#' @param output_dir |
|
9 |
#' Directory where the output files will be saved (default: "Outputs/Figures"). |
|
10 |
#' @param level_separation |
|
11 |
#' Level separation for the visualisation graph (default: 250). |
|
12 |
#' @details |
|
13 |
#' Uses `targets::tar_visnetwork` to create a network graph and saves it as |
|
14 |
#' HTML using `visNetwork::visSave`. Also generates a static PNG image using |
|
15 |
#' `webshot2::webshot`. |
|
16 |
#' @export |
|
17 |
save_progress_visualisation <- function( |
|
18 |
sel_script, |
|
19 |
output_file = "project_status", |
|
20 |
output_dir = here::here( |
|
21 |
"Documentation/Progress" |
|
22 |
), |
|
23 |
background_color = "white", |
|
24 |
level_separation = 250) { |
|
25 | ! |
network_graph <- |
26 | ! |
targets::tar_visnetwork( |
27 | ! |
script = sel_script, |
28 | ! |
outdated = FALSE, |
29 | ! |
store = get_active_config("target_store"), |
30 | ! |
targets_only = FALSE, |
31 | ! |
level_separation = level_separation |
32 |
) |
|
33 | ||
34 | ! |
network_graph_static <- |
35 | ! |
targets::tar_visnetwork( |
36 | ! |
script = sel_script, |
37 | ! |
store = get_active_config("target_store"), |
38 | ! |
targets_only = TRUE, |
39 | ! |
outdated = FALSE, |
40 | ! |
level_separation = level_separation |
41 |
) |
|
42 | ||
43 | ! |
visNetwork::visSave( |
44 | ! |
graph = network_graph, |
45 | ! |
file = here::here("Documentation/Progress/project_status.html"), |
46 | ! |
selfcontained = TRUE, |
47 | ! |
background = background_color |
48 |
) |
|
49 | ||
50 | ! |
visNetwork::visSave( |
51 | ! |
graph = network_graph_static, |
52 | ! |
file = paste0(output_dir, "/", output_file, "_small.html"), |
53 | ! |
selfcontained = TRUE, |
54 | ! |
background = background_color |
55 |
) |
|
56 | ||
57 | ! |
webshot2::webshot( |
58 | ! |
url = paste0(output_dir, "/", output_file, "_small.html"), |
59 | ! |
file = paste0(output_dir, "/", output_file, "_static.png"), |
60 | ! |
vwidth = 950, |
61 | ! |
vheight = 750 |
62 |
) |
|
63 |
} |
1 |
#' @title Select N Taxa |
|
2 |
#' @description |
|
3 |
#' Selects the top N taxa based on their occurrence across datasets. |
|
4 |
#' @param data |
|
5 |
#' A data frame containing the input data. Must include columns "taxon" and |
|
6 |
#' the column specified in the `per` parameter. |
|
7 |
#' @param n_taxa |
|
8 |
#' A numeric value specifying the number of taxa to select. Default is Inf. |
|
9 |
#' @param per |
|
10 |
#' A character string specifying the column name to group by. Default is |
|
11 |
#' "dataset_name". |
|
12 |
#' @return |
|
13 |
#' A data frame containing the filtered data with the top N taxa. |
|
14 |
#' @details |
|
15 |
#' The function identifies the most common taxa across datasets by counting |
|
16 |
#' their occurrences. It then filters the input data to include only the |
|
17 |
#' selected taxa. If no taxa are found, an error is raised. |
|
18 |
#' @export |
|
19 |
select_n_taxa <- function( |
|
20 |
data = NULL, |
|
21 |
n_taxa = Inf, |
|
22 |
per = "dataset_name") { |
|
23 | 11x |
assertthat::assert_that( |
24 | 11x |
is.data.frame(data), |
25 | 11x |
msg = "data must be a data frame" |
26 |
) |
|
27 | ||
28 | 9x |
assertthat::assert_that( |
29 | 9x |
is.character(per) && length(per) == 1, |
30 | 9x |
msg = "per must be a single character string" |
31 |
) |
|
32 | ||
33 | 8x |
assertthat::assert_that( |
34 | 8x |
all( |
35 | 8x |
c("taxon", per) %in% names(data) |
36 |
), |
|
37 | 8x |
msg = paste( |
38 | 8x |
"data must contain the following columns:", |
39 | 8x |
paste(c("taxon", per), collapse = ", ") |
40 |
) |
|
41 |
) |
|
42 | ||
43 | 6x |
assertthat::assert_that( |
44 | 6x |
is.numeric(n_taxa), |
45 | 6x |
msg = "n_taxa must be a number" |
46 |
) |
|
47 | ||
48 | 4x |
assertthat::assert_that( |
49 | 4x |
n_taxa > 0, |
50 | 4x |
msg = "n_taxa must be greater than 0" |
51 |
) |
|
52 | ||
53 | ||
54 | 3x |
vec_common_taxa <- |
55 | 3x |
data %>% |
56 | 3x |
dplyr::distinct(taxon, !!rlang::sym(per)) %>% |
57 | 3x |
dplyr::group_by(taxon) %>% |
58 | 3x |
dplyr::summarise( |
59 | 3x |
.groups = "drop", |
60 | 3x |
n_datasets = dplyr::n() |
61 |
) %>% |
|
62 | 3x |
dplyr::slice_max(n = n_taxa, order_by = n_datasets) %>% |
63 | 3x |
dplyr::pull(taxon) |
64 | ||
65 | 3x |
res <- |
66 | 3x |
data %>% |
67 | 3x |
dplyr::filter(taxon %in% vec_common_taxa) |
68 | ||
69 | ||
70 | 3x |
assertthat::assert_that( |
71 | 3x |
nrow(res) > 0, |
72 | 3x |
msg = paste( |
73 | 3x |
"No taxa found in data. Please check the input data.", |
74 | 3x |
"The number of taxa selected is too high." |
75 |
) |
|
76 |
) |
|
77 | ||
78 | 3x |
return(res) |
79 |
} |
1 |
#' @title Get Random Structure for Model |
|
2 |
#' @description |
|
3 |
#' Constructs the random structure for HMSC models based on age and/or space. |
|
4 |
#' @param data |
|
5 |
#' A list containing data for model fitting from the function |
|
6 |
#' [check_and_prepare_data_for_fit()]. |
|
7 |
#' @param type |
|
8 |
#' Character vector specifying random effect types ("age", "space"). |
|
9 |
#' @param min_knots_distance |
|
10 |
#' Minimum distance between knots for spatial random effect (optional). |
|
11 |
#' Only used if "space" is in `type`. |
|
12 |
#' @return |
|
13 |
#' A list describing the random structure for the model. |
|
14 |
#' @export |
|
15 |
get_random_structure_for_model <- function( |
|
16 |
data = NULL, |
|
17 |
type = c("age", "space"), |
|
18 |
min_knots_distance = NULL) { |
|
19 | 15x |
`%>%` <- magrittr::`%>%` |
20 | ||
21 | 15x |
assertthat::assert_that( |
22 | 15x |
class(data) == "list", |
23 | 15x |
msg = "data must be a list" |
24 |
) |
|
25 | ||
26 | 13x |
assertthat::assert_that( |
27 | 13x |
length(data) == 4, |
28 | 13x |
msg = "data must contain four elements" |
29 |
) |
|
30 | ||
31 | 12x |
assertthat::assert_that( |
32 | 12x |
all( |
33 | 12x |
names(data) %in% c( |
34 | 12x |
"data_community_to_fit", |
35 | 12x |
"data_abiotic_to_fit", |
36 | 12x |
"data_ages_to_fit", |
37 | 12x |
"data_coords_to_fit" |
38 |
) |
|
39 |
), |
|
40 | 12x |
msg = "data must contain the elements: data_community_to_fit, data_abiotic_to_fit, data_ages_to_fit, data_coords_to_fit" |
41 |
) |
|
42 | ||
43 | ||
44 | 12x |
assertthat::assert_that( |
45 | 12x |
is.character(type), |
46 | 12x |
msg = "type must be a character vector" |
47 |
) |
|
48 | ||
49 | 11x |
assertthat::assert_that( |
50 | 11x |
length(type) > 0, |
51 | 11x |
msg = "type must be a non-empty character vector" |
52 |
) |
|
53 | ||
54 | 11x |
assertthat::assert_that( |
55 | 11x |
all(type %in% c("age", "space")), |
56 | 11x |
msg = "type must be one of 'age' or 'space'" |
57 |
) |
|
58 | ||
59 | 9x |
assertthat::assert_that( |
60 | 9x |
is.null(min_knots_distance) || is.numeric(min_knots_distance), |
61 | 9x |
msg = "min_knots_distance must be NULL or a numeric value" |
62 |
) |
|
63 | ||
64 | 8x |
assertthat::assert_that( |
65 | 8x |
is.null(min_knots_distance) || min_knots_distance > 0, |
66 | 8x |
msg = "min_knots_distance must be NULL or a positive numeric value" |
67 |
) |
|
68 | ||
69 | 7x |
list_random <- list() |
70 | ||
71 | 7x |
data_study_design <- |
72 | 7x |
data %>% |
73 | 7x |
purrr::chuck("data_community_to_fit") %>% |
74 | 7x |
add_dataset_name_column_from_rownames() %>% |
75 | 7x |
dplyr::mutate( |
76 | 7x |
dataset_name_factor = as.factor(dataset_name) |
77 |
) %>% |
|
78 | 7x |
dplyr::select( |
79 | 7x |
"dataset_name" = "dataset_name_factor" |
80 |
) |
|
81 | ||
82 |
if ( |
|
83 | 7x |
"age" %in% type |
84 |
) { |
|
85 | 5x |
data_study_design <- |
86 | 5x |
data %>% |
87 | 5x |
purrr::chuck("data_community_to_fit") %>% |
88 | 5x |
add_age_column_from_rownames() %>% |
89 | 5x |
add_dataset_name_column_from_rownames() %>% |
90 | 5x |
dplyr::mutate( |
91 | 5x |
age_factor = factor( |
92 | 5x |
x = age, |
93 | 5x |
levels = sort(unique(as.numeric(age))), |
94 | 5x |
ordered = TRUE |
95 |
), |
|
96 | 5x |
dataset_name_factor = as.factor(dataset_name) |
97 |
) %>% |
|
98 | 5x |
dplyr::select( |
99 | 5x |
"dataset_name" = "dataset_name_factor", |
100 | 5x |
"age" = "age_factor" |
101 |
) |
|
102 | ||
103 | 5x |
data_age <- |
104 | 5x |
data %>% |
105 | 5x |
purrr::chuck("data_ages_to_fit") %>% |
106 | 5x |
dplyr::mutate( |
107 | 5x |
age = as.numeric(age), |
108 |
) |
|
109 | ||
110 | 5x |
vec_age <- |
111 | 5x |
data_age %>% |
112 | 5x |
dplyr::distinct(age) %>% |
113 | 5x |
dplyr::pull(age) %>% |
114 | 5x |
as.character() |
115 | ||
116 | 5x |
random_level_age <- |
117 | 5x |
Hmsc::HmscRandomLevel( |
118 | 5x |
sData = data_age |
119 |
) |
|
120 | ||
121 | 5x |
list_random <- |
122 | 5x |
c( |
123 | 5x |
list_random, |
124 | 5x |
list("age" = random_level_age) |
125 |
) |
|
126 |
} |
|
127 | ||
128 | ||
129 |
if ( |
|
130 | 7x |
"space" %in% type |
131 |
) { |
|
132 | 7x |
data_coords <- |
133 | 7x |
data %>% |
134 | 7x |
purrr::chuck("data_coords_to_fit") |
135 | ||
136 | 7x |
random_coors_knots <- |
137 | 7x |
Hmsc::constructKnots( |
138 | 7x |
sData = data_coords, |
139 | 7x |
minKnotDist = min_knots_distance |
140 |
) |
|
141 | ||
142 | 7x |
random_level_coords <- |
143 | 7x |
Hmsc::HmscRandomLevel( |
144 | 7x |
sData = data_coords, |
145 | 7x |
sMethod = "GPP", |
146 | 7x |
sKnot = random_coors_knots |
147 |
) |
|
148 | ||
149 | 7x |
list_random <- |
150 | 7x |
c( |
151 | 7x |
list_random, |
152 | 7x |
list("dataset_name" = random_level_coords) |
153 |
) |
|
154 |
} |
|
155 | ||
156 | 7x |
list( |
157 | 7x |
random_levels = list_random, |
158 | 7x |
study_design = data_study_design |
159 |
) %>% |
|
160 | 7x |
return() |
161 |
} |
1 |
#' @title Get Community Data |
|
2 |
#' @description |
|
3 |
#' This function processes a data frame containing community data |
|
4 |
#' and extracts the relevant columns, unnesting the `data_community` |
|
5 |
#' column in the process. |
|
6 |
#' @param |
|
7 |
#' data A data frame. Must contain the columns `dataset_name` and |
|
8 |
#' `data_community`. |
|
9 |
#' @return |
|
10 |
#' A data frame with the `dataset_name` and unnested `data_community` columns. |
|
11 |
#' @details The function performs the following steps: |
|
12 |
#' \itemize{ |
|
13 |
#' \item Validates that the input is a data frame. |
|
14 |
#' \item Ensures the presence of the `dataset_name` and `data_community` columns. |
|
15 |
#' \item Selects the `dataset_name` and `data_community` columns. |
|
16 |
#' \item Unnests the `data_community` column. |
|
17 |
#' } |
|
18 |
#' @export |
|
19 |
get_community_data <- function(data = NULL) { |
|
20 | 4x |
assertthat::assert_that( |
21 | 4x |
is.data.frame(data), |
22 | 4x |
msg = "data must be a data frame" |
23 |
) |
|
24 | ||
25 | 2x |
assertthat::assert_that( |
26 | 2x |
all(c("dataset_name", "data_community") %in% colnames(data)), |
27 | 2x |
msg = "data must contain columns 'dataset_name' and 'data_community'" |
28 |
) |
|
29 | ||
30 | 2x |
data %>% |
31 | 2x |
dplyr::select(dataset_name, data_community) %>% |
32 | 2x |
tidyr::unnest(data_community) %>% |
33 | 2x |
return() |
34 |
} |
1 |
#' @title Get Sample Ages |
|
2 |
#' @description |
|
3 |
#' Extracts sample ages from a data frame containing dataset and sample info. |
|
4 |
#' @param data |
|
5 |
#' A data frame. Must contain the columns `dataset_name` and `data_samples`. |
|
6 |
#' @return |
|
7 |
#' A data frame with columns `dataset_name`, `sample_name`, and `age`. |
|
8 |
#' @details |
|
9 |
#' Validates the input data frame, ensures required columns are present, and |
|
10 |
#' extracts sample ages by unnesting the `data_samples` column. |
|
11 |
#' @export |
|
12 |
get_sample_ages <- function(data = NULL) { |
|
13 | 4x |
assertthat::assert_that( |
14 | 4x |
is.data.frame(data), |
15 | 4x |
msg = "data must be a data frame" |
16 |
) |
|
17 | ||
18 | 2x |
assertthat::assert_that( |
19 | 2x |
all(c("dataset_name", "data_samples") %in% colnames(data)), |
20 | 2x |
msg = "data must contain columns 'dataset_name' and 'data_samples'" |
21 |
) |
|
22 | ||
23 | 2x |
data %>% |
24 | 2x |
dplyr::select(dataset_name, data_samples) %>% |
25 | 2x |
tidyr::unnest(data_samples) %>% |
26 | 2x |
dplyr::select( |
27 | 2x |
"dataset_name", |
28 | 2x |
"sample_name", |
29 | 2x |
"age" |
30 |
) %>% |
|
31 | 2x |
return() |
32 |
} |
1 |
#' @title Select Better Model Based on Fit |
|
2 |
#' @description |
|
3 |
#' Compares two models and selects the one with better fit based on Tjur's R2. |
|
4 |
#' @param list_models |
|
5 |
#' A list of two fitted model objects. |
|
6 |
#' @return |
|
7 |
#' The model object with better fit. |
|
8 |
#' @export |
|
9 |
get_better_model_based_on_fit <- function(list_models) { |
|
10 | 7x |
assertthat::assert_that( |
11 | 7x |
is.list(list_models), |
12 | 7x |
msg = "list_models must be a list of models" |
13 |
) |
|
14 | ||
15 | 5x |
assertthat::assert_that( |
16 | 5x |
length(list_models) == 2, |
17 | 5x |
msg = "list_models must be a list of two models" |
18 |
) |
|
19 | ||
20 | 4x |
assertthat::assert_that( |
21 | 4x |
!is.null(list_models[[1]]), |
22 | 4x |
msg = "The first model in list_models must not be NULL" |
23 |
) |
|
24 | ||
25 | 3x |
assertthat::assert_that( |
26 | 3x |
!is.null(list_models[[2]]), |
27 | 3x |
msg = "The second model in list_models must not be NULL" |
28 |
) |
|
29 | ||
30 | 2x |
mod_null <- |
31 | 2x |
list_models %>% |
32 | 2x |
purrr::chuck(1) |
33 | ||
34 | 2x |
mod_full <- |
35 | 2x |
list_models %>% |
36 | 2x |
purrr::chuck(2) |
37 | ||
38 |
# by default select the null model |
|
39 | 2x |
res <- mod_null |
40 | ||
41 | 2x |
null_r2 <- |
42 | 2x |
mod_null %>% |
43 | 2x |
purrr::chuck("eval", "TjurR2") |
44 | ||
45 | 2x |
full_r2 <- |
46 | 2x |
mod_full %>% |
47 | 2x |
purrr::chuck("eval", "TjurR2") |
48 | ||
49 | 2x |
n_r2 <- length(null_r2) |
50 | ||
51 | 2x |
n_better_r2 <- |
52 | 2x |
sum( |
53 | 2x |
null_r2 < full_r2, |
54 | 2x |
na.rm = TRUE |
55 |
) |
|
56 | ||
57 |
if ( |
|
58 | 2x |
n_r2 / n_better_r2 >= 0.5 |
59 |
) { |
|
60 | 2x |
res <- |
61 | 2x |
mod_full |
62 |
} |
|
63 | ||
64 | 2x |
return(res) |
65 |
} |
1 |
#' @title Compute Species Associations |
|
2 |
#' @description |
|
3 |
#' Computes species associations from a fitted Hmsc model object. |
|
4 |
#' @param data_source |
|
5 |
#' A list containing a fitted Hmsc model under the 'mod' element. |
|
6 |
#' Generally, this is the output of the function add_model_evaluation() or |
|
7 |
#' # get_better_model_based_on_fit() |
|
8 |
#' @return |
|
9 |
#' A matrix of species associations. |
|
10 |
#' @seealso [add_model_evaluation(), get_better_model_based_on_fit()] |
|
11 |
#' @export |
|
12 |
get_species_association <- function(data_source) { |
|
13 | 5x |
assertthat::assert_that( |
14 | 5x |
is.list(data_source), |
15 | 5x |
msg = "data_source must be a list" |
16 |
) |
|
17 | ||
18 | 5x |
mod <- |
19 | 5x |
data_source %>% |
20 | 5x |
purrr::chuck("mod") |
21 | ||
22 | 5x |
assertthat::assert_that( |
23 | 5x |
assertthat::are_equal( |
24 | 5x |
class(mod), |
25 | 5x |
"Hmsc", |
26 | 5x |
msg = "data_source must be of class Hmsc" |
27 |
) |
|
28 |
) |
|
29 | ||
30 | 5x |
res <- |
31 | 5x |
Hmsc::computeAssociations(mod) %>% |
32 | 5x |
purrr::set_names( |
33 | 5x |
nm = mod %>% |
34 | 5x |
purrr::chuck("ranLevelsUsed") |
35 |
) |
|
36 | ||
37 | 5x |
return(res) |
38 |
} |
1 |
#' @title Get Significant Species Associations |
|
2 |
#' @description |
|
3 |
#' Identifies significant species associations based on support and mean |
|
4 |
#' values for each error level. |
|
5 |
#' @param data_source |
|
6 |
#' A list containing association matrices from a fitted Hmsc model. |
|
7 |
#' Generally, this is the output of the function get_species_association(). |
|
8 |
#' @param alpha |
|
9 |
#' Significance level for support threshold (default: 0.05). |
|
10 |
#' @return |
|
11 |
#' A vector of significant association values. |
|
12 |
#' @seealso [get_species_association()] |
|
13 |
#' @export |
|
14 |
get_significant_associations <- function(data_source, alpha = 0.05) { |
|
15 | 3x |
assertthat::assert_that( |
16 | 3x |
is.list(data_source), |
17 | 3x |
msg = "The data source must be a list." |
18 |
) |
|
19 | ||
20 | 3x |
support_threshold <- 1 - alpha |
21 | ||
22 | 3x |
assertthat::assert_that( |
23 | 3x |
is.numeric(support_threshold) && support_threshold > 0 && support_threshold < 1, |
24 | 3x |
msg = "The support threshold must be a numeric value between 0 and 1." |
25 |
) |
|
26 | ||
27 | 3x |
assertthat::assert_that( |
28 | 3x |
is.list(data_source) && length(data_source) > 0, |
29 | 3x |
msg = "The data source is list and not empty." |
30 |
) |
|
31 | ||
32 | 3x |
assertthat::assert_that( |
33 | 3x |
purrr::map_lgl( |
34 | 3x |
.x = data_source, |
35 | 3x |
.f = ~ is.list(.x) && all(c("mean", "support") %in% names(.x)) |
36 |
) %>% |
|
37 | 3x |
all(), |
38 | 3x |
msg = "The data source must contain lists with 'mean' and 'support' matrices." |
39 |
) |
|
40 | ||
41 | 3x |
result <- |
42 | 3x |
data_source %>% |
43 | 3x |
purrr::map( |
44 | 3x |
.f = ~ { |
45 | 6x |
vec_data_support <- |
46 | 6x |
purrr::pluck(.x, "support") %>% |
47 |
{ |
|
48 | 6x |
.[lower.tri(., diag = FALSE)] |
49 |
} |
|
50 | ||
51 | 6x |
assertthat::assert_that( |
52 | 6x |
is.numeric(vec_data_support), |
53 | 6x |
msg = "The support values must be numeric vectors." |
54 |
) |
|
55 | ||
56 | 6x |
n_values <- length(vec_data_support) |
57 | ||
58 | 6x |
vec_significant <- ( |
59 | 6x |
(vec_data_support > support_threshold) + |
60 | 6x |
((vec_data_support < (1 - support_threshold)) > 0) |
61 |
) |
|
62 | ||
63 | 6x |
n_significant <- sum(vec_significant, na.rm = TRUE) |
64 | ||
65 | 6x |
res <- |
66 | 6x |
list( |
67 | 6x |
n_associations = n_values, |
68 | 6x |
n_significant = n_significant, |
69 | 6x |
proportion_significant = n_significant / n_values |
70 |
) |
|
71 |
} |
|
72 |
) |
|
73 | ||
74 | 3x |
return(result) |
75 |
} |
1 |
#' @title Get Active Configuration Value |
|
2 |
#' @description |
|
3 |
#' Retrieves a specific configuration value from a YAML configuration file. |
|
4 |
#' @param value |
|
5 |
#' A character vector specifying the configuration key(s) to retrieve. |
|
6 |
#' @param file |
|
7 |
#' Path to the YAML configuration file (default: "config.yml"). |
|
8 |
#' @return |
|
9 |
#' Value(s) associated with the specified key(s) in the configuration file. |
|
10 |
#' @details |
|
11 |
#' Validates input parameters, ensures the file is readable, and retrieves |
|
12 |
#' configuration value(s) using `config::get`. Active configuration is set by |
|
13 |
#' the `R_CONFIG_ACTIVE` environment variable. |
|
14 |
#' @export |
|
15 |
get_active_config <- function( |
|
16 |
value = NULL, |
|
17 |
file = here::here("config.yml")) { |
|
18 | 3x |
assertthat::assert_that( |
19 | 3x |
is.character(value) && length(value) > 0, |
20 | 3x |
msg = "value must be a character vector with at least one element" |
21 |
) |
|
22 | ||
23 | 2x |
assertthat::assert_that( |
24 | 2x |
assertthat::is.readable(file) && assertthat::has_extension(file, "yml"), |
25 | 2x |
msg = "file must be a readable YAML file" |
26 |
) |
|
27 | ||
28 | 1x |
config::get( |
29 | 1x |
value = value, |
30 | 1x |
config = Sys.getenv("R_CONFIG_ACTIVE"), |
31 | 1x |
use_parent = FALSE, |
32 | 1x |
file = file |
33 |
) %>% |
|
34 | 1x |
return() |
35 |
} |
1 |
#' @title Check Presence of VegVault File |
|
2 |
#' @description |
|
3 |
#' Checks whether the `VegVault.sqlite` file exists in the specified directory. |
|
4 |
#' @param relative_path |
|
5 |
#' Relative path to the `VegVault.sqlite` file (default: "Data/Input/Vegvault.sqlite"). |
|
6 |
#' @return |
|
7 |
#' Logical value indicating whether the file exists. Stops with an error if |
|
8 |
#' the file is not found. |
|
9 |
#' @details |
|
10 |
#' Verifies the presence of the `VegVault.sqlite` file. If not found, throws |
|
11 |
#' an error with instructions to consult the `Data/Input/README.md` file. |
|
12 |
#' @export |
|
13 |
check_presence_of_vegvault <- function(relative_path = "Data/Input/Vegvault.sqlite") { |
|
14 | 9x |
vegvault_present <- |
15 | 9x |
file.exists( |
16 | 9x |
here::here(relative_path) |
17 |
) |
|
18 | ||
19 |
if ( |
|
20 | 9x |
isFALSE(vegvault_present) |
21 |
) { |
|
22 | 2x |
stop( |
23 | 2x |
paste( |
24 | 2x |
"The VegVault.sqlite file is not present in", |
25 | 2x |
"the `Data/Input/` directory.", |
26 | 2x |
"Please read the `Data/Input/README.md` file for more information." |
27 |
) |
|
28 |
) |
|
29 | ||
30 | ! |
return(vegvault_present) |
31 |
} |
|
32 | ||
33 | 7x |
return(vegvault_present) |
34 |
} |
1 |
#' @title Get Taxa Classification |
|
2 |
#' @description |
|
3 |
#' Retrieves taxonomic classification for a vector of taxa using the |
|
4 |
#' taxospace package. Filters results to include only plant taxa. |
|
5 |
#' @param data A character vector of taxon names to classify. |
|
6 |
#' @return A data frame with columns for selected name, taxonomic name, |
|
7 |
#' rank, and id. |
|
8 |
#' @details |
|
9 |
#' Uses taxospace::get_classification to retrieve classification. Flags and |
|
10 |
#' filters for plant taxa (kingdom Plantae). Returns an empty tibble if no |
|
11 |
#' plant taxa found. |
|
12 |
#' @export |
|
13 |
get_taxa_classification <- function(data) { |
|
14 | 12x |
require(taxospace) |
15 | ||
16 | 12x |
assertthat::assert_that( |
17 | 12x |
is.character(data) && length(data) > 0, |
18 | 12x |
msg = "data must be a non-empty character vector" |
19 |
) |
|
20 | ||
21 | 10x |
res_classification <- |
22 | 10x |
taxospace::get_classification( |
23 | 10x |
taxa_vec = data, |
24 |
# this is done so that the best match is returned |
|
25 |
# even if the result is not flagged as "exact" |
|
26 | 10x |
use_only_exact_match = FALSE |
27 |
) |
|
28 | ||
29 | 10x |
res_plant <- |
30 | 10x |
res_classification %>% |
31 |
# flag taxa that are plants |
|
32 | 10x |
dplyr::mutate( |
33 | 10x |
is_plant = purrr::map_lgl( |
34 | 10x |
.x = classification, |
35 | 10x |
.f = ~ .x %>% |
36 | 10x |
dplyr::filter(rank == "kingdom") %>% |
37 | 10x |
dplyr::pull(name) %>% |
38 | 10x |
stringr::str_detect("Plantae") %>% |
39 | 10x |
any() |
40 |
) |
|
41 |
) %>% |
|
42 |
# filter only plant taxa |
|
43 | 10x |
dplyr::filter(is_plant) |
44 | ||
45 |
if ( |
|
46 | 8x |
isTRUE(nrow(res_plant) == 0) |
47 |
) { |
|
48 | ! |
return( |
49 | ! |
tibble::tibble( |
50 | ! |
sel_name = data, |
51 | ! |
name = character(), |
52 | ! |
rank = character(), |
53 | ! |
id = integer(), |
54 |
) |
|
55 |
) |
|
56 |
} |
|
57 | ||
58 | 8x |
res_plant %>% |
59 | 8x |
dplyr::select(sel_name, classification) %>% |
60 | 8x |
tidyr::unnest(classification) %>% |
61 | 8x |
return() |
62 |
} |
1 |
#' @title Calculate the Total Pollen Count for Each Sample |
|
2 |
#' @description |
|
3 |
#' This function computes the total pollen count for each sample |
|
4 |
#' in the provided dataset. |
|
5 |
#' @param data |
|
6 |
#' A data frame containing at least two columns: |
|
7 |
#' `sample_name` (the name or identifier of the sample) |
|
8 |
#' and `pollen_count` (the count of pollen for each observation). |
|
9 |
#' @return |
|
10 |
#' A data frame with two columns: `sample_name` and `pollen_sum`, |
|
11 |
#' where `pollen_sum` is the total pollen count |
|
12 |
#' for each sample. |
|
13 |
#' @details |
|
14 |
#' The function groups the data by `sample_name`, |
|
15 |
#' calculates the sum of `pollen_count` for each group, |
|
16 |
#' and removes any grouping structure before returning the result. |
|
17 |
#' Missing values (`NA`) in `pollen_count` are ignored. |
|
18 |
#' @export |
|
19 |
get_pollen_sum <- function(data) { |
|
20 | 8x |
data %>% |
21 | 8x |
dplyr::group_by(sample_name) %>% |
22 | 8x |
dplyr::summarize(pollen_sum = sum(pollen_count, na.rm = TRUE)) %>% |
23 | 8x |
dplyr::ungroup() %>% |
24 | 8x |
return() |
25 |
} |
1 |
#' @title Make HMSC Model |
|
2 |
#' @description |
|
3 |
#' Create a Hierarchical Modelling of Species Communities (HMSC) model to |
|
4 |
#' fit community and abiotic data. |
|
5 |
#' @param data_to_fit |
|
6 |
#' A list containing the community and abiotic data to fit the model. |
|
7 |
#' @param random_structure |
|
8 |
#' A list containing the random structure for the model, including the |
|
9 |
#' study design and random levels. |
|
10 |
#' @param error_family |
|
11 |
#' A character string specifying the error family. Options are "normal" or |
|
12 |
#' "binomial" (default: "normal"). |
|
13 |
#' @return |
|
14 |
#' returns an unfitted HMSC model object. |
|
15 |
#' @details |
|
16 |
#' If `error_family` is "binomial", the community data is converted to binary |
|
17 |
#' presence/absence data, and the error family is set to "probit". |
|
18 |
#' @export |
|
19 |
make_hmsc_model <- function( |
|
20 |
data_to_fit = NULL, |
|
21 |
sel_formula = NULL, |
|
22 |
random_structure = NULL, |
|
23 |
error_family = c("normal", "binomial")) { |
|
24 | 13x |
assertthat::assert_that( |
25 | 13x |
is.list(data_to_fit), |
26 | 13x |
msg = "data_to_fit must be a list" |
27 |
) |
|
28 | ||
29 | 11x |
data_community <- |
30 | 11x |
data_to_fit %>% |
31 | 11x |
purrr::chuck("data_community_to_fit") |
32 | ||
33 | ||
34 | 10x |
assertthat::assert_that( |
35 | 10x |
is.data.frame(data_community), |
36 | 10x |
msg = "data_community must be a data frame" |
37 |
) |
|
38 | ||
39 | 10x |
data_abiotic <- |
40 | 10x |
data_to_fit %>% |
41 | 10x |
purrr::chuck("data_abiotic_to_fit") |
42 | ||
43 | ||
44 | 10x |
assertthat::assert_that( |
45 | 10x |
is.data.frame(data_abiotic), |
46 | 10x |
msg = "data_abiotic must be a data frame" |
47 |
) |
|
48 | ||
49 | ||
50 | 10x |
assertthat::assert_that( |
51 | 10x |
is.character(sel_formula), |
52 | 10x |
msg = "sel_formula must be a character string " |
53 |
) |
|
54 | ||
55 | 9x |
assertthat::assert_that( |
56 | 9x |
any( |
57 | 9x |
error_family %in% c("normal", "binomial") |
58 |
), |
|
59 | 9x |
msg = "error_family must be either 'normal' or 'binomial'" |
60 |
) |
|
61 | ||
62 | 6x |
error_family <- match.arg(error_family) |
63 | ||
64 | 6x |
assertthat::assert_that( |
65 | 6x |
length(sel_formula) == 1, |
66 | 6x |
msg = "sel_formula must be a character string of length 1" |
67 |
) |
|
68 | ||
69 |
if ( |
|
70 | 6x |
error_family == "binomial" |
71 |
) { |
|
72 | ! |
data_community <- |
73 | ! |
data_community > 0 |
74 | ||
75 | ! |
error_family <- "probit" |
76 |
} |
|
77 | ||
78 | 6x |
assertthat::assert_that( |
79 | 6x |
is.list(random_structure), |
80 | 6x |
msg = "random_structure must be a list" |
81 |
) |
|
82 | ||
83 | 4x |
assertthat::assert_that( |
84 | 4x |
all(c("study_design", "random_levels") %in% names(random_structure)), |
85 | 4x |
msg = "random_structure must contain study_design and random_levels" |
86 |
) |
|
87 | ||
88 | 3x |
study_design <- |
89 | 3x |
random_structure %>% |
90 | 3x |
purrr::chuck("study_design") |
91 | ||
92 | 3x |
mod_hmsc <- |
93 | 3x |
Hmsc::Hmsc( |
94 | 3x |
Y = data_community, |
95 | 3x |
XData = data_abiotic, |
96 | 3x |
XFormula = as.formula(sel_formula), |
97 | 3x |
distr = error_family, |
98 | 3x |
studyDesign = study_design, |
99 | 3x |
ranLevels = random_structure$random_levels |
100 |
) |
|
101 | ||
102 | 2x |
return(mod_hmsc) |
103 |
} |
1 |
#' @title Extract Data from VegVault |
|
2 |
#' @description |
|
3 |
#' Extracts data from the VegVault SQLite database based on specified |
|
4 |
#' geographic, temporal, and dataset type constraints. |
|
5 |
#' @param path_to_vegvault |
|
6 |
#' A character string specifying the path to the VegVault SQLite database |
|
7 |
#' (default: "Data/Input/VegVault.sqlite"). |
|
8 |
#' @param x_lim |
|
9 |
#' A numeric vector of length 2 specifying the longitude range. |
|
10 |
#' @param y_lim |
|
11 |
#' A numeric vector of length 2 specifying the latitude range. |
|
12 |
#' @param age_lim |
|
13 |
#' A numeric vector of length 2 specifying the age range. |
|
14 |
#' @param sel_dataset_type |
|
15 |
#' A character vector specifying the dataset types to select. |
|
16 |
#' @param sel_abiotic_var_name |
|
17 |
#' A character vector specifying the abiotic variable names to select. |
|
18 |
#' @return |
|
19 |
#' A data frame containing the extracted data. |
|
20 |
#' @details |
|
21 |
#' The function performs the following steps: |
|
22 |
#' |
|
23 |
#' 1. Validates input parameters. |
|
24 |
#' 2. Checks the presence of the VegVault SQLite database. |
|
25 |
#' 3. Accesses the database and filters data based on geographic, temporal, |
|
26 |
#' and dataset type constraints. |
|
27 |
#' 4. Retrieves abiotic data and taxa information. |
|
28 |
#' 5. Returns the extracted data as a data frame. |
|
29 |
#' @export |
|
30 |
extract_data_from_vegvault <- function( |
|
31 |
path_to_vegvault = here::here("Data/Input/VegVault.sqlite"), |
|
32 |
x_lim = NULL, |
|
33 |
y_lim = NULL, |
|
34 |
age_lim = NULL, |
|
35 |
sel_dataset_type = NULL, |
|
36 |
sel_abiotic_var_name = NULL) { |
|
37 | 7x |
`%>%` <- magrittr::`%>%` |
38 | ||
39 | 7x |
assertthat::assert_that( |
40 | 7x |
is.character(path_to_vegvault), |
41 | 7x |
length(path_to_vegvault) == 1, |
42 | 7x |
msg = "path_to_vegvault must be a single character string" |
43 |
) |
|
44 | ||
45 |
# Check if the VegVault file exists |
|
46 | 7x |
check_presence_of_vegvault(path_to_vegvault) |
47 | ||
48 | 6x |
assertthat::assert_that( |
49 | 6x |
is.numeric(x_lim) && length(x_lim) == 2, |
50 | 6x |
msg = "x_lim must be a numeric vector of length 2" |
51 |
) |
|
52 | ||
53 | 5x |
assertthat::assert_that( |
54 | 5x |
is.numeric(y_lim) && length(y_lim) == 2, |
55 | 5x |
msg = "y_lim must be a numeric vector of length 2" |
56 |
) |
|
57 | ||
58 | 4x |
assertthat::assert_that( |
59 | 4x |
is.numeric(age_lim) && length(age_lim) == 2, |
60 | 4x |
msg = "age_lim must be a numeric vector of length 2" |
61 |
) |
|
62 | ||
63 | 3x |
assertthat::assert_that( |
64 | 3x |
is.character(sel_dataset_type) && length(sel_dataset_type) > 0, |
65 | 3x |
msg = "sel_dataset_type must be a character vector of length > 0" |
66 |
) |
|
67 | ||
68 | 2x |
assertthat::assert_that( |
69 | 2x |
is.character(sel_abiotic_var_name) && length(sel_abiotic_var_name) > 0, |
70 | 2x |
msg = "sel_abiotic_var_name must be a character vector of length > 0" |
71 |
) |
|
72 | ||
73 | 1x |
vaultkeepr_plan <- |
74 |
# Access the VegVault file |
|
75 | 1x |
vaultkeepr::open_vault( |
76 | 1x |
path = path_to_vegvault |
77 |
) %>% |
|
78 |
# Add the dataset information |
|
79 | 1x |
vaultkeepr::get_datasets() %>% |
80 |
# Select modern plot data and climate |
|
81 | 1x |
vaultkeepr::select_dataset_by_type( |
82 | 1x |
sel_dataset_type = sel_dataset_type |
83 |
) %>% |
|
84 |
# Limit data to Czech Republic |
|
85 | 1x |
vaultkeepr::select_dataset_by_geo( |
86 | 1x |
lat_lim = y_lim, |
87 | 1x |
long_lim = x_lim, |
88 | 1x |
verbose = FALSE |
89 |
) %>% |
|
90 |
# Add samples |
|
91 | 1x |
vaultkeepr::get_samples() %>% |
92 |
# select only modern data |
|
93 | 1x |
vaultkeepr::select_samples_by_age( |
94 | 1x |
age_lim = age_lim, |
95 | 1x |
verbose = FALSE |
96 |
) %>% |
|
97 |
# Add abiotic data |
|
98 | 1x |
vaultkeepr::get_abiotic_data(verbose = FALSE) %>% |
99 |
# Select only Mean Anual Temperature (bio1) |
|
100 | 1x |
vaultkeepr::select_abiotic_var_by_name( |
101 | 1x |
sel_var_name = sel_abiotic_var_name) %>% |
102 |
# add taxa |
|
103 | 1x |
vaultkeepr::get_taxa() |
104 | ||
105 | 1x |
data_extracted <- |
106 | 1x |
vaultkeepr_plan %>% |
107 | 1x |
vaultkeepr::extract_data( |
108 | 1x |
return_raw_data = FALSE, |
109 | 1x |
verbose = FALSE |
110 |
) |
|
111 | ||
112 | 1x |
return(data_extracted) |
113 |
} |
1 |
#' @title Get Dataset Coordinates |
|
2 |
#' @description |
|
3 |
#' Extracts unique coordinates for each dataset, excluding gridpoints. |
|
4 |
#' @param data |
|
5 |
#' A data frame containing columns 'dataset_name', 'coord_long', and |
|
6 |
#' 'coord_lat'. |
|
7 |
#' @return |
|
8 |
#' A data frame with unique coordinates for each dataset. |
|
9 |
#' @export |
|
10 |
get_coords <- function(data) { |
|
11 | 5x |
data %>% |
12 | 5x |
dplyr::filter( |
13 | 5x |
dataset_type != "gridpoints" |
14 |
) %>% |
|
15 | 5x |
dplyr::select( |
16 | 5x |
"dataset_name", |
17 | 5x |
"coord_long", |
18 | 5x |
"coord_lat" |
19 |
) %>% |
|
20 | 5x |
dplyr::distinct() %>% |
21 | 5x |
tibble::column_to_rownames("dataset_name") %>% |
22 | 5x |
return() |
23 |
} |
1 |
#' @title Classify Taxonomic Resolution |
|
2 |
#' @description |
|
3 |
#' Classifies taxa in a data frame to a specified taxonomic resolution (family, |
|
4 |
#' genus, or species) using a classification table, and aggregates pollen |
|
5 |
#' proportions accordingly. |
|
6 |
#' @param data A data frame containing taxon data with columns including |
|
7 |
#' 'taxon', 'dataset_name', 'age', and 'pollen_prop'. |
|
8 |
#' @param data_classification_table A data frame mapping 'sel_name' to |
|
9 |
#' taxonomic levels (e.g., family, genus, species). |
|
10 |
#' @param taxonomic_resolution A character string specifying the taxonomic |
|
11 |
#' resolution to classify to. Must be one of 'family', 'genus', or 'species'. |
|
12 |
#' @return A data frame with taxa classified to the specified resolution and |
|
13 |
#' pollen proportions aggregated accordingly. The output preserves all |
|
14 |
#' dataset_name and age combinations for true negatives. |
|
15 |
#' @details |
|
16 |
#' Performs a left join to map taxa to the desired resolution, aggregates |
|
17 |
#' pollen proportions, and ensures all dataset_name-age-taxon combinations are |
|
18 |
#' present in the output. |
|
19 |
#' @export |
|
20 |
classify_taxonomic_resolution <- function(data, data_classification_table, taxonomic_resolution) { |
|
21 | 13x |
assertthat::assert_that( |
22 | 13x |
is.data.frame(data), |
23 | 13x |
msg = "data must be a data frame" |
24 |
) |
|
25 | ||
26 | 10x |
assertthat::assert_that( |
27 | 10x |
all(c("taxon", "dataset_name", "age", "pollen_prop") %in% colnames(data)), |
28 | 10x |
msg = "data must contain columns: taxon, dataset_name, age, and pollen_prop" |
29 |
) |
|
30 | ||
31 | 9x |
assertthat::assert_that( |
32 | 9x |
is.data.frame(data_classification_table), |
33 | 9x |
msg = "data_classification_table must be a data frame" |
34 |
) |
|
35 | ||
36 | 6x |
assertthat::assert_that( |
37 | 6x |
is.character(taxonomic_resolution) && length(taxonomic_resolution) == 1, |
38 | 6x |
msg = "taxonomic_resolution must be a single character string" |
39 |
) |
|
40 | ||
41 | 4x |
assertthat::assert_that( |
42 | 4x |
taxonomic_resolution %in% c("family", "genus", "species"), |
43 | 4x |
msg = "taxonomic_resolution must be one of 'family', 'genus', or 'species'" |
44 |
) |
|
45 | ||
46 | 3x |
assertthat::assert_that( |
47 | 3x |
any(taxonomic_resolution %in% colnames(data_classification_table)), |
48 | 3x |
msg = "taxonomic_resolution must be a column in data_classification_table" |
49 |
) |
|
50 | ||
51 | 2x |
data_classification_table_sub <- |
52 | 2x |
data_classification_table %>% |
53 | 2x |
dplyr::select(sel_name, !!taxonomic_resolution) |
54 | ||
55 | 2x |
data_classified <- |
56 | 2x |
data %>% |
57 | 2x |
dplyr::left_join( |
58 | 2x |
data_classification_table_sub, |
59 | 2x |
by = dplyr::join_by("taxon" == "sel_name") |
60 |
) %>% |
|
61 | 2x |
dplyr::select(-taxon) %>% |
62 | 2x |
dplyr::rename( |
63 | 2x |
taxon = !!taxonomic_resolution |
64 |
) |
|
65 | ||
66 |
# make dummy table with all dataset_name and age combinations |
|
67 |
# this is needed to ensure that all combinations are present in the |
|
68 |
# final output to preserve true negative values |
|
69 | 2x |
data_dataset_age_cross_ref <- |
70 | 2x |
data_classified %>% |
71 | 2x |
dplyr::distinct(dataset_name, age, taxon) |
72 | ||
73 | 2x |
res <- |
74 | 2x |
data_classified %>% |
75 | 2x |
tidyr::drop_na(pollen_prop) %>% |
76 | 2x |
dplyr::group_by( |
77 | 2x |
dataset_name, age, taxon |
78 |
) %>% |
|
79 | 2x |
dplyr::summarise( |
80 | 2x |
.groups = "drop", |
81 | 2x |
pollen_prop = sum(pollen_prop) |
82 |
) %>% |
|
83 | 2x |
dplyr::full_join( |
84 | 2x |
data_dataset_age_cross_ref, |
85 | 2x |
by = c("dataset_name", "age", "taxon") |
86 |
) %>% |
|
87 | 2x |
dplyr::arrange(age, dataset_name, taxon) %>% |
88 | 2x |
dplyr::select( |
89 | 2x |
names(data) |
90 |
) |
|
91 | ||
92 | 2x |
return(res) |
93 |
} |
1 |
#' @title Extract Age from String |
|
2 |
#' @description |
|
3 |
#' Extracts the age value from a vector of strings, taking all characters |
|
4 |
#' after the double underscore ("__"). |
|
5 |
#' @param vec_names |
|
6 |
#' A character vector containing names with the format "dataset__age". |
|
7 |
#' @return |
|
8 |
#' A character vector of age values. |
|
9 |
#' @export |
|
10 |
get_age_from_string <- function(vec_names) { |
|
11 | 65x |
assertthat::assert_that( |
12 | 65x |
is.character(vec_names), |
13 | 65x |
msg = "Input must be a character vector." |
14 |
) |
|
15 | ||
16 | 61x |
assertthat::assert_that( |
17 | 61x |
length(vec_names) > 0, |
18 | 61x |
msg = "Input vector must not be empty." |
19 |
) |
|
20 | ||
21 | 59x |
assertthat::assert_that( |
22 | 59x |
all(stringr::str_detect(vec_names, "__")), |
23 | 59x |
msg = "Input strings must contain '__' to extract age." |
24 |
) |
|
25 | ||
26 | 57x |
vec_names %>% |
27 |
# get all values after "__" |
|
28 | 57x |
stringr::str_extract("__(.*)") %>% |
29 |
# remove "__" |
|
30 | 57x |
stringr::str_remove("__") %>% |
31 | 57x |
stringr::str_trim() %>% |
32 | 57x |
return() |
33 |
} |
1 |
#' @title Replace NA in Community Data with Zeros |
|
2 |
#' @description |
|
3 |
#' Replaces NA values in community data with zeros. |
|
4 |
#' @param data |
|
5 |
#' A data frame. Must contain `dataset_name` and `sample_name` columns, and |
|
6 |
#' at least one taxon column. |
|
7 |
#' @return |
|
8 |
#' A data frame with NA values replaced by zeros. |
|
9 |
#' @details |
|
10 |
#' Converts the data to long format, replaces NA values in `pollen_count` |
|
11 |
#' with zeros, and reshapes it back to wide format. |
|
12 |
#' @export |
|
13 |
replace_na_community_data_with_zeros <- function(data = NULL) { |
|
14 | 4x |
assertthat::assert_that( |
15 | 4x |
is.data.frame(data), |
16 | 4x |
msg = "data must be a data frame" |
17 |
) |
|
18 | 2x |
assertthat::assert_that( |
19 | 2x |
all(c("dataset_name", "sample_name") %in% colnames(data)), |
20 | 2x |
msg = "data must contain columns 'dataset_name' and 'sample_name'" |
21 |
) |
|
22 | ||
23 | 2x |
assertthat::assert_that( |
24 | 2x |
colnames(data) %>% |
25 | 2x |
length() > 2, |
26 | 2x |
msg = "data must contain at least one taxon column" |
27 |
) |
|
28 | ||
29 | 2x |
data %>% |
30 | 2x |
tidyr::pivot_longer( |
31 | 2x |
cols = !c("dataset_name", "sample_name"), |
32 | 2x |
names_to = "taxon", |
33 | 2x |
values_to = "pollen_count" |
34 |
) %>% |
|
35 | 2x |
dplyr::mutate( |
36 | 2x |
pollen_count = dplyr::if_else(is.na(pollen_count), 0, pollen_count) |
37 |
) %>% |
|
38 | 2x |
tidyr::pivot_wider( |
39 | 2x |
names_from = "taxon", |
40 | 2x |
values_from = "pollen_count" |
41 |
) %>% |
|
42 | 2x |
return() |
43 |
} |
1 |
#' @title Make Community Data Long |
|
2 |
#' @description |
|
3 |
#' Converts community data from wide format to long format. |
|
4 |
#' @param data |
|
5 |
#' A data frame. Must contain `dataset_name` and `sample_name` columns. |
|
6 |
#' @return |
|
7 |
#' A data frame in long format with columns `dataset_name`, `sample_name`, |
|
8 |
#' `taxon`, and `pollen_count`. |
|
9 |
#' @details |
|
10 |
#' Uses `tidyr::pivot_longer` to reshape the data, dropping NA values in the |
|
11 |
#' process. |
|
12 |
#' @export |
|
13 |
make_community_data_long <- function(data = NULL) { |
|
14 | 4x |
assertthat::assert_that( |
15 | 4x |
is.data.frame(data), |
16 | 4x |
msg = "data must be a data frame" |
17 |
) |
|
18 | ||
19 | 2x |
assertthat::assert_that( |
20 | 2x |
all(c("dataset_name", "sample_name") %in% colnames(data)), |
21 | 2x |
msg = "data must contain columns 'dataset_name' and 'sample_name'" |
22 |
) |
|
23 | ||
24 | 2x |
data %>% |
25 | 2x |
tidyr::pivot_longer( |
26 | 2x |
cols = !c("dataset_name", "sample_name"), |
27 | 2x |
names_to = "taxon", |
28 | 2x |
values_to = "pollen_count", |
29 | 2x |
values_drop_na = TRUE |
30 |
) %>% |
|
31 | 2x |
return() |
32 |
} |
1 |
#' @title Add Dataset Name Column from Row Names |
|
2 |
#' @description |
|
3 |
#' Adds a 'dataset_name' column to a data frame by extracting dataset names |
|
4 |
#' from row names. |
|
5 |
#' @param data |
|
6 |
#' A data frame with row names in the format "dataset__something". |
|
7 |
#' @return |
|
8 |
#' The input data frame with an added 'dataset_name' column. |
|
9 |
#' @export |
|
10 |
add_dataset_name_column_from_rownames <- function(data) { |
|
11 | 62x |
row_names <- |
12 | 62x |
row.names(data) %>% |
13 | 62x |
get_dataset_name_from_string() |
14 | ||
15 | 58x |
data %>% |
16 | 58x |
dplyr::mutate( |
17 | 58x |
dataset_name = row_names |
18 |
) %>% |
|
19 | 58x |
return() |
20 |
} |
1 |
#' @title Add Age Column from Row Names |
|
2 |
#' @description |
|
3 |
#' Adds an 'age' column to a data frame by extracting age values from row |
|
4 |
#' names. |
|
5 |
#' @param data |
|
6 |
#' A data frame with row names in the format "dataset__age". |
|
7 |
#' @return |
|
8 |
#' The input data frame with an added 'age' column. |
|
9 |
#' @export |
|
10 |
add_age_column_from_rownames <- function(data) { |
|
11 | 59x |
row_names <- |
12 | 59x |
row.names(data) %>% |
13 | 59x |
get_age_from_string() |
14 | ||
15 | 55x |
data %>% |
16 | 55x |
dplyr::mutate( |
17 | 55x |
age = as.numeric(row_names) |
18 |
) %>% |
|
19 | 55x |
return() |
20 |
} |
1 |
#' @title Transform to Proportions |
|
2 |
#' @description |
|
3 |
#' Transforms pollen count data into proportions based on total pollen count. |
|
4 |
#' @param data |
|
5 |
#' A data frame containing pollen count data. |
|
6 |
#' @param pollen_sum |
|
7 |
#' A data frame with total pollen counts for each sample. |
|
8 |
#' @return |
|
9 |
#' A data frame with pollen proportions, excluding `pollen_sum` and |
|
10 |
#' `pollen_count` columns. |
|
11 |
#' @details |
|
12 |
#' Joins the input data with total pollen counts and calculates proportions |
|
13 |
#' using `dplyr::mutate`. |
|
14 |
#' @export |
|
15 |
transform_to_proportions <- function(data = NULL, pollen_sum = NULL) { |
|
16 | 8x |
data %>% |
17 | 8x |
dplyr::left_join( |
18 | 8x |
pollen_sum, |
19 | 8x |
by = "sample_name" |
20 |
) %>% |
|
21 | 8x |
dplyr::mutate( |
22 | 8x |
pollen_prop = pollen_count / pollen_sum, |
23 | 8x |
.after = pollen_count |
24 |
) %>% |
|
25 | 8x |
dplyr::select( |
26 | 8x |
!c("pollen_sum", "pollen_count") |
27 |
) %>% |
|
28 | 8x |
return() |
29 |
} |
1 |
#' @title Interpolate Community Data |
|
2 |
#' @description |
|
3 |
#' Transforms community data to proportions, interpolates it, and returns it. |
|
4 |
#' @param data |
|
5 |
#' A data frame containing community data to be transformed and interpolated. |
|
6 |
#' @param ... |
|
7 |
#' Additional arguments passed to the `interpolate_data` function. |
|
8 |
#' @return |
|
9 |
#' A data frame with interpolated community data. |
|
10 |
#' @details |
|
11 |
#' Transforms data to proportions using `transform_to_proportions` and total |
|
12 |
#' pollen count from `get_pollen_sum`. Then interpolates using `interpolate_data`. |
|
13 |
#' @export |
|
14 |
interpolate_community_data <- function(data, ...) { |
|
15 | 4x |
data %>% |
16 | 4x |
transform_to_proportions(pollen_sum = get_pollen_sum(data)) %>% |
17 | 4x |
interpolate_data(by = c("dataset_name", "taxon"), ...) %>% |
18 | 4x |
return() |
19 |
} |
1 |
#' @title Make Classification Table |
|
2 |
#' @description |
|
3 |
#' Creates a wide-format taxonomic classification table from a list of data |
|
4 |
#' frames containing taxon names and ranks (e.g., family, genus, species). |
|
5 |
#' @param data A list of data frames, each containing columns 'sel_name', |
|
6 |
#' 'rank', and 'name'. |
|
7 |
#' @return A data frame in wide format with columns for each taxonomic rank |
|
8 |
#' and one row per 'sel_name'. |
|
9 |
#' @details |
|
10 |
#' Filters for relevant taxonomic ranks, removes duplicates, and pivots the |
|
11 |
#' table to wide format with one column per rank. |
|
12 |
#' @export |
|
13 |
make_classification_table <- function(data) { |
|
14 | 6x |
dplyr::bind_rows(data) %>% |
15 | 6x |
dplyr::filter( |
16 | 6x |
rank %in% c( |
17 | 6x |
"family", "genus", "species" |
18 |
) |
|
19 |
) %>% |
|
20 | 6x |
dplyr::distinct( |
21 | 6x |
sel_name, rank, name |
22 |
) %>% |
|
23 | 6x |
tidyr::pivot_wider( |
24 | 6x |
names_from = rank, |
25 | 6x |
values_from = name |
26 |
) %>% |
|
27 | 6x |
return() |
28 |
} |