
# Analyses for Rare Community Index Project
# October 25th, 2024


# Packages
library(readr)
library(dplyr)
library(lmtest)


#Loading in data
df <- read_csv("Data/RCI Maumee Bay Fish Community Dataset.csv")
df$Year <- as.numeric(ifelse(grepl("HEC2013", df$SiteID), 2013, # Assigning Year column from SiteID
                  substr(df$SiteID, 1, 4)))


# Assigning reciprocal ranks to species by incidence
df_Ranks = df %>%
  group_by(CommonName) %>%
  summarise(n = n()) %>% # Summarize number of detections by species
  mutate(reciprocal = 1 / n, # Assign reciprocal values for ranking
         reciprocal = ifelse(grepl("Unident|Hybrid|No fish", CommonName),0,reciprocal), # Unidentified and hybrid spp and 'no fish' detections get reciprocal value of zero
         rank = dense_rank(reciprocal)-1) # Assign ranks based on reciprocal values
df_Ranks <- arrange(df_Ranks, -rank) # Order dataframe from highest to lowest (important step)

# Calculating RCI 
df_RCI = df %>%
  select(SiteID, Latitude, Longitude, Gear, CommonName) %>%
  left_join(df_Ranks, by = "CommonName") %>% # Attach species rank data to df
  group_by(SiteID) %>%
  mutate(Richness = length(unique(CommonName[!grepl("Unident|Hybrid|No fish", CommonName)])), # Calculate species richness, excluding unidentified and hybrid spp and 'no fish' entries
         Sa = sum(rank)) %>% # See (eqn 1) in manuscript
  ungroup() %>%
  group_by(Richness) %>%
  mutate(Smax = sum(df_Ranks$rank[c(1:Richness)])) %>% # Calculates the highest S value possible given the species richness of the site
  ungroup() %>%
  mutate(RCI = Sa / Smax, # See (eqn 2) in manuscript
         RCI = ifelse(is.na(RCI), 0, RCI)) %>% # set NA to equal 0 RCI
  select(SiteID, Latitude, Longitude, Gear, Richness, Sa, Smax, RCI) %>%
  distinct()




#...............................................................................
## Analyses


# Species Rank by Species Detection - Power Regression

# Remove 'species' (hybrids, "No Fish") with rank = 0 
NOZ <- df_Ranks[df_Ranks$rank != 0,]
# Power regression
lm1 <- lm(log(rank) ~ n, data = NOZ)
summary(lm1)





# RCI by Species Richness - Linear Regression

# Convert RCI (proportion) to logit scale (unconstrained)
df_RCI$logit_RCI <- log(df_RCI$RCI / (1 - df_RCI$RCI))
# Remove Sites where RCI = 0 (i.e., logic_RCI = -Inf)
NOZRCI <- df_RCI[df_RCI$RCI != 0,]

# Linear Regression
RCIlm <- lm(logit_RCI~Richness, data = NOZRCI)
summary(RCIlm)
bptest(RCIlm) # p < 0.001, model is heteroscedastic
# Linear regression is a poor model


# Weighted Least Squares Regression - for heteroscedastic data
# Define weights to use
wt <- 1 / lm(abs(RCIlm$residuals) ~ RCIlm$fitted.values)$fitted.values^2
# Weighted Least Squares Regression Model
wls_model <- lm(logit_RCI ~ Richness, data = NOZRCI, weights=wt)
summary(wls_model)
AIC(RCIlm, wls_model) # wls is a better model than lm





# RCI by gear type - One-Way ANOVA and TukeyHSD

# Remove all observations that are not Bottom Trawl, Electrofishing, Fyke Nets
ANOVA_data = df_RCI %>%
  filter(Gear %in% c("Bottom Trawl", "Electrofishing", "Fyke Net"))

#NRI by Gear ANOVA
gearAOV <- aov(RCI ~ Gear, data = ANOVA_data)
summary(gearAOV)

# Tukey's HSD post-hoc
TukeyHSD(gearAOV, "Gear")
