#' @return calculate great circle distance in kilometers (km).
#' @param R Earth mean radius (km)
#' @param long1.r convert from degrees to radians for latitudes and longitudes.
#' @export
# Function to calculate Great Circle Distance:
gcd.slc <- function(long1, lat1, long2, lat2) {
R <- 6371
long1.r <- long1 * pi / 180
long2.r <- long2 * pi / 180
lat1.r <- lat1 * pi / 180
lat2.r <- lat2 * pi / 180
cos_d <- sin(lat1.r) * sin(lat2.r) +
cos(lat1.r) * cos(lat2.r) * cos(long2.r - long1.r)
# numerical stability
cos_d <- pmin(1, pmax(-1, cos_d))
acos(cos_d) * R
}
#Function for jaccard
jaccard_similarity <- function(x, cells_distinct_pair) {
all_results <- vector("list", length(x))
for (k in seq_along(x)) {
# cat("Replicate", k, "\n") # will write a message in console if the replicate succeeds
dat <- x[[k]]
# split by interval
split_intervals <- split(dat, dat$interval.ma)
res_list <- list()
for (int_name in names(split_intervals)) {
d <- split_intervals[[int_name]]
# build presence-absence matrix
tab <- table(d$cell, d$accepted_name)
mat <- (tab > 0) * 1
if (nrow(mat) < 2) next
# Jaccard calculation
inter <- mat %*% t(mat)
rs <- rowSums(mat)
union <- outer(rs, rs, "+") - inter
jac <- inter / union
# filter pair table using x and y
pair_df <- cells_distinct_pair |>
dplyr::filter(interval.ma == as.numeric(int_name)) |>
dplyr::filter(x %in% rownames(mat),
y %in% rownames(mat))
if (nrow(pair_df) == 0) next
# match indices
idx_x <- match(pair_df$x, rownames(mat))
idx_y <- match(pair_df$y, rownames(mat))
# remove NA matches
valid <- !is.na(idx_x) & !is.na(idx_y)
if (!any(valid)) next
pair_df <- pair_df[valid, ]
idx_x <- idx_x[valid]
idx_y <- idx_y[valid]
# store results
res_list[[length(res_list) + 1]] <- data.frame(
replicate = k,
interval.ma = as.numeric(int_name),
cell_x = pair_df$x,
cell_y = pair_df$y,
jaccard_similarity = jac[cbind(idx_x, idx_y)]
)
}
# combine results for this replicate
all_results[[k]] <- do.call(rbind, res_list)
}
# combine all replicates
final <- do.call(rbind, all_results)
rownames(final) <- NULL
return(final)
}
# Function to calculate jaccard vs distance
jaccard_distance <- function(x, cells_distinct_pair) {
all_results <- vector("list", length(x))
for (k in seq_along(x)) {
dat <- x[[k]]
split_intervals <- split(dat, dat$interval.ma)
res_list <- vector("list", length(split_intervals))
idx <- 1
for (int_name in names(split_intervals)) {
d <- split_intervals[[int_name]]
# clean coords
coords <- as.data.frame(unique(d[, c("cell", "lat", "long")]))
coords <- coords[!is.na(coords$cell) & coords$cell != "", ]
coords <- coords[!is.na(coords$lat) & !is.na(coords$long), ]
coords$cell <- as.character(coords$cell)
# presence/absence matrix
tab <- table(d$cell, d$accepted_name)
mat <- (tab > 0) * 1
valid_cells <- intersect(rownames(mat), coords$cell)
if (length(valid_cells) < 2) next
mat <- mat[valid_cells, , drop = FALSE]
coords <- coords[match(valid_cells, coords$cell), , drop = FALSE]
# Jaccard matrix
inter <- mat %*% t(mat)
rs <- rowSums(mat)
union <- outer(rs, rs, "+") - inter
jac <- inter / union
# cell pairs
pair_df <- cells_distinct_pair |>
dplyr::filter(interval.ma == as.numeric(int_name)) |>
dplyr::filter(x %in% rownames(mat), y %in% rownames(mat))
if (nrow(pair_df) == 0) next
# calculate Jaccard
idx_x <- match(pair_df$x, rownames(mat))
idx_y <- match(pair_df$y, rownames(mat))
pair_df$jaccard_similarity <- jac[cbind(idx_x, idx_y)]
# get coords
pair_df <- pair_df |>
dplyr::left_join(coords, by = c("x" = "cell")) |>
dplyr::rename(long_x = long, lat_x = lat) |>
dplyr::left_join(coords, by = c("y" = "cell")) |>
dplyr::rename(long_y = long, lat_y = lat)
# compute distance
pair_df$distance_km <- gcd.slc(
pair_df$long_x,
pair_df$lat_x,
pair_df$long_y,
pair_df$lat_y
)
# bin distances
pair_df <- pair_df |>
dplyr::mutate(
dist_bin = floor(distance_km / 2000) * 2000
)
# summarize
result_df <- pair_df |>
dplyr::group_by(interval.ma, dist_bin) |>
dplyr::summarise(
replicate = k,
jaccard_similarity = mean(jaccard_similarity, na.rm = TRUE),
n_pairs = dplyr::n(),
n_cells = dplyr::n_distinct(c(x, y)),
.groups = "drop"
)
res_list[[idx]] <- result_df
idx <- idx + 1
}
all_results[[k]] <- do.call(rbind, res_list)
}
final_results <- do.call(rbind, all_results)
rownames(final_results) <- NULL
return(final_results)
}
## Calculate Czekanowski
# Formula
czekanowski_similarity <- function(df) {
min_sum <- sum(pmin(df$count_cell_x, df$count_cell_y))
2 * min_sum / (sum(df$count_cell_x) + sum(df$count_cell_y))
}
# grid combos
gridComb <- function(x, cell, accepted_name) {
expand.grid(
cell = unique(x[[cell]]),
accepted_name = unique(x[[accepted_name]])
) |>
setNames(c("cell", "accepted_name"))
}
# count taxa per cell
countGen <- function(combinations, age_lists) {
purrr::map(seq_along(age_lists), function(i) {
combinations |>
left_join(
age_lists[[i]] |>
group_by(cell, accepted_name) |>
summarise(n = n(), .groups = "drop"),
by = c("cell", "accepted_name")
) |>
replace_na(list(n = 0))
})
}
# count genus per cell
prepare_xy <- function(count_list) {
list(
X = purrr::map(count_list, ~ rename(.x, cell_x = cell, count_cell_x = n)),
Y = purrr::map(count_list, ~ rename(.x, cell_y = cell, count_cell_y = n))
)
}
# Cross-join function.
gridComb <- function(x, cell, accepted_name) {
cA <- expand.grid(cell = unique(x$cell), unique(x$accepted_name)) |> setNames(nm = c("cell","accepted_name"))
return(cA)
}
# Count taxon occurrence per unique cell combination.
czekanowski_data_prep <- function(x, cell, accepted_name) {
count_taxa_x <- x |>
group_by(cell, accepted_name) |>
summarize(count = n(), .groups = 'drop') |> rename("cell_x" = "cell", "count_cell_x" ="count")
count_taxa_y <- x |>
group_by(cell, accepted_name) |>
summarize(count = n(), .groups = 'drop') |> rename("cell_y" = "cell", "count_cell_y" ="count")
# Cell pairs.
cell <- unique(x[[cell]])
taxa <- unique(x[[accepted_name]])
cell_combinations <- expand.grid(cell_x = cell, cell_y = cell,accepted_name = taxa) |> filter(cell_x != cell_y)
result <- cell_combinations |>
left_join(count_taxa_x, by = c("cell_x","accepted_name"), relationship = "many-to-many") |>
# Second join (y)
left_join(count_taxa_y, by = c("cell_y", "accepted_name")) |>
select("cell_x", "cell_y", "accepted_name", "count_cell_x", "count_cell_y") |>
# Replace NA with 0
replace_na(replace = list(count_cell_x = 0, count_cell_y = 0)) |>
# Remove rows that at 0 in both count fields.
filter(!(count_cell_x == 0 & count_cell_y == 0)) |>
# Remove duplicated cell combinations
filter(cell_x == cell_y | cell_x > cell_y) |>
# Split by cell combination
group_split(cell_x,cell_y)
return(result)
}