R tools and scripts for vegetation science and ecology: Various plant community analyses: NMDS with vectors, distance-based models/partial correlations, species composition GLMM etc

#This R code reproduces analyses and plots presented in: Guerin, G.R., Biffin, E. Jardine, D.I., Cross, H.B. & Lowe, A.J. (2014) A spatially predictive baseline for monitoring multivariate species occurrences and phylogenetic shifts in Mediterranean southern Australia. Journal of Vegetation Science 25: 338–348. see http://onlinelibrary.wiley.com/doi/10.1111/jvs.12111/abstract using data files in the SI.

#############

#clear space
rm(list=ls())

#load packages
require(vegan)
require(ecodist)
require(simba)
require(lme4)

#load data
records <- read.table(file="species_data.csv", header=TRUE, sep="\t")

DIST <- as.dist(read.table(file="distance_data.csv", header=TRUE, sep="\t", row.names=1))

variables <- read.table(file="environmental_data.csv", header=TRUE, sep="\t", row.names=1)

#transform aspect in degrees to 'eastness' and'northness':
aspect2 <- variables$aspect
variables <- cbind(variables, aspect2)
variables$aspect <- cos((variables$aspect*pi)/180)
variables$aspect2 <- sin((variables$aspect2*pi)/180)

#generate spatial eigenvectors
PCNM <- pcnm(DIST)
SC2 <- as.data.frame(scores(PCNM))
variables <- cbind(variables, SC2[,1:10]) #add first 10 PCNM axes to variables set

#generate species PA matrix
species <- mama(records)

##################

#matrix functions and look at data
species2 <- species
species2[species2>0] = 1 #convert df 'species2' to binary
freq <- colSums(species2) #calculate frequencies
freq <- sort(freq, decreasing=TRUE) #sort them

diversity <- rowSums(species2) #no spp. in each plot

#plot histograms figure
par(mfrow=c(2,2),mar=c(4,5,2,1))

hist(variables$bio12, breaks=c(0:75,150,225,300, 375, 450, 525, 600, 675, 750, 825, 900, 975, 1050, 1125), density=50, col="lightgray", border="black", xlab="Mean annual rainfall (mm)", ylab="Number of plots", cex.axis=1.5, cex.lab=1.5, main="", freq=TRUE, ylim=c(0,20), xlim=c(200,1200))

mtext("a",3, adj=0, cex=2, font=2)

hist(variables$sand, 10, density=50, col="lightgray", border="black", xlab="%Sand", ylab="Number of plots", cex.axis=1.5, cex.lab=1.5, main="", freq=TRUE, ylim=c(0,25), xlim=c(40,100))

mtext("b",3, adj=0, cex=2, font=2)

hist(variables$pH_CaCl2, 10, density=50, col="lightgray", border="black", xlab="pH", ylab="Number of plots", cex.axis=1.5, cex.lab=1.5, main="", freq=TRUE, ylim=c(0,20), xlim=c(3.5,6.5))

mtext("c",3, adj=0, cex=2, font=2)

DISTkm <- DIST/1000

plot(bio1D~DISTkm, xlab="Geographic distance (km)", xlim=c(0,600), ylim=c(0,5),ylab="Difference in mean temperature", bty="n", cex.axis=1.5, cex.lab=1.5, cex=0.8, pch=19) #shows near sites different and distance sites similar (in addition to usual decay with distance)

mtext("d",3, adj=0, cex=2, font=2)

plot(BRAY~DISTkm, ylim=c(0,1.1), xlim=c(0,600), xlab="Geographic distance (km)", ylab="Bray-Curtis dissimilarity", bty="n", cex.axis=1.5, cex.lab=1.5, cex=0.8, pch=19, yaxs="i")

mtext("a",3, adj=0, cex=2, font=2)

hist(freq, 20, density=50, col="lightgray", border="black", xlab="Number of records", ylab="Number of species", cex.axis=1.5, cex.lab=1.5, main="", xlim=c(0,50)) # plot frequency histogram

mtext("b",3, adj=0, cex=2, font=2)

hist(records$VCA, breaks=c(0:1,4:5,9:10,14:15,19:20,24:25,29:30,34:35,39:40,44:45,49:50,54:55,59:60,64:65,69:70,74:75,79:80,84:85,89:90,94:95,99:100), density=50, col="lightgray", border="black", xlab="Cover-abundance", ylab="Number of records", cex.axis=1.5, cex.lab=1.5, main="", xlim=c(0,100), freq=TRUE) # plot frequency his

mtext("c",3, adj=0, cex=2, font=2)

hist(diversity, 10, density=50, col="lightgray", border="black", xlab="Number of species", ylab="Number of plots", cex.axis=1.5, cex.lab=1.5, main="", freq=TRUE, ylim=c(0,20), xlim=c(10,60))

mtext("d",3, adj=0, cex=2, font=2)

##############

#non-metric multidimensional scaling
BRAY <- vegdist(species)
NMDS <- metaMDS(BRAY, pc=TRUE)
SCORES3 <- as.data.frame(scores(NMDS))

#plot linear correlations of variables with NMDS plot
linears <- with(variables, cbind(bio12, bio1, sand, pH_CaCl2))
colnames(linears) <- c("bio12", "bio1", "%sand", "pH")
VF <- vf(SCORES3, linears, nperm=1000)
print(VF)

par(mar=c(5,5,1,1))
plot(SCORES3, col="white", xlab="NMDS axis-1", ylab="NMDS axis-2", cex.axis=1.5, cex.lab=1.5, bty="l", las=0)

plot(VF, col="black", cex=2, lwd=2)

plot(SCORES3, col="white", xlab="NMDS axis-1", ylab="NMDS axis-2", cex.axis=1.5, cex.lab=1.5, bty="l", las=0)
text(SCORES3, labels=variables$site, cex=0.7, font=2)
arrows(0.06, -0.22, -0.42, -0.02, lty=2, angle=20) #NB re-runs of the NMDS will produce different configurations/orientations and so this arrow won't point to the same plots

####################

#distance based analysis

#create distances for each variable
bio1D <- vegdist(variables$bio1/10, method='euclidean')
bio12D <- vegdist(variables$bio12, method='euclidean')
aspectD <- vegdist(variables$aspect, method='euclidean')
aspect2D <- vegdist(variables$aspect2, method='euclidean')
slopeD <- vegdist(variables$slope, method='euclidean')
sandD <- vegdist(variables$sand, method='euclidean')
outcropD <- vegdist(variables$outcrop, method='euclidean')
surfaceD <- vegdist(variables$surface, method='euclidean')
PD <- vegdist(variables$P, method='euclidean')
NO3D <- vegdist(variables$NO3, method='euclidean')
NH4D <- vegdist(variables$NH4, method='euclidean')
KD <- vegdist(variables$K, method='euclidean')
conductD <- vegdist(variables$conduct, method='euclidean')
pHD <- vegdist(variables$pH_CaCl2, method='euclidean')

##model exponential distance decay:
InverseBRAY <- (1-BRAY) + 1/DIST
BRAYtransform <- -log(InverseBRAY)

multi <- MRM(BRAYtransform ~ DIST + bio1D + NH4D + conductD + aspectD, nperm=10000)

COEF <- multi$coef[,1]

PREDICT1 <- 1-exp(-(COEF[2])*max(DIST))
PREDICT2 <- 1-exp(-(COEF[3])*max(bio1D))
PREDICT3 <- 1-exp(-(COEF[4])*max(NH4D))
PREDICT4 <- 1-exp(-(COEF[5])*max(conductD))
PREDICT5 <- 1-exp(-(COEF[6])*max(aspectD))

PREDICTIONS <- rbind(PREDICT1, PREDICT2, PREDICT3, PREDICT4, PREDICT5)
rownames(PREDICTIONS) <- c("Geographic distance", "Mean temperature", "Ammonium nitrate", "Electrical conductivity", "Aspect")

nothing <- c(1, 2, 3, 4, 5)

par(mar=c(20,15,1,1)+0.5)

plot(nothing ~ PREDICTIONS, yaxt='n', xlim=c(0.5,1), bty="l", las=1, ylab="", cex.axis=1, cex.lab=1.4, xlab="Bray-Curtis dissimilarity", pch=16, cex=2)

axis(2, at=c(1, 2, 3, 4, 5), labels=rownames(PREDICTIONS), cex.axis=1.4, las=2)

##############
#GLMM - multivariate generalised mixed-model of species occurrences

#subset a multivariate binary dataset to include species with a min. number of records
species3 = species2[,colSums(species2) > 4] #i.e. 5 or more records included only
STACK2 <- stack(species3) #stacks P/A of each spp. in each plot into one column
var2 <- do.call("rbind", replicate(ncol(species3), variables, simplify = FALSE)) #number of replicates = number of species with 5 or more records (number of columns in 'species3')
dataset2 <- cbind(STACK2, var2)

#below, site random factor means the slopes of the response don't change btw sites but 'intercepts' can change - accounting for how freq may be biased due to autocorrelation within reps. The random spp. term allows intercept and slope to vary by species, so frequency and response of spp. to gradients can differ:

##run-forward selection on selected terms, having excluded those that increased AIC and/or have high p-values for their addition etc, or have no significant effect individually

#forward selection
#NOTE the following analysis may take several hours on a standard computer
M1 <- lmer(values ~ (1|site), family=binomial(link="logit"), dataset2)

M1

M2 <- update(M1,. ~ . + (1|ind))

M2

M3 <- update(M2,. ~ . - (1|ind) + PCNM1 + (1 + PCNM1|ind))

M3

M4 <- update(M3,. ~ . - (1 + PCNM1|ind) + bio1 + (1 + PCNM1 + bio1|ind))

M4

M5 <- update(M4,. ~ . - (1 + PCNM1 + bio1|ind) + NH4 + (1 + PCNM1 + bio1 + NH4|ind))

M5

M6 <- update(M5,. ~ . - (1 + PCNM1 + bio1 + NH4|ind) + conduct + (1 + PCNM1 + bio1 + NH4 + conduct|ind))

M6

M7 <- update(M6,. ~ . - (1 + PCNM1 + bio1 + NH4 + conduct|ind) + P + (1 + PCNM1 + bio1 + NH4 + conduct + P|ind))

M7

anova(M1, M2, M3, M4, M5, M6, M7)

scor1 <- AIC(M1)
scor2 <- AIC(M2)
scor3 <- AIC(M3)
scor4 <- AIC(M4)
scor5 <- AIC(M5)
scor6 <- AIC(M6)
scor7 <- AIC(M7)

deltaAIC2 <- c(scor2-scor1, scor3-scor2, scor4-scor3,scor5-scor4, scor6-scor5, scor7-scor6)

print(deltaAIC2)

#plot random effects
RANDOM <- ranef(M7, postVar=TRUE, whichel="ind")
colnames(RANDOM$ind) <- c("(Intercept)", "Spatial eigenvector", "Mean temperature", "Ammonium N", "Electrical conductivity", "Colwell P")
qqmath(RANDOM) #plot of species 'random' intercepts & slopes against standard normal distribution

RANDOM2 <- ranef(M7, postVar=TRUE, whichel="site")
dotplot(RANDOM2) #plot of site 'random' intercepts

###############
#correlate phylogenetic shifts with variables

#load packages
require(picante)
require(ppcor)

#generate matching phylogeny and community datasets
PHYLOmine <- match.phylo.comm(phyloTree, species)

#calculate phylogenetic distances
PCDscores <- pcd(PHYLOmine$comm, PHYLOmine$phy)
PCD <- as.numeric(PCDscores$PCD)
PCDp <- as.numeric(PCDscores$PCDp)
PCDc <- as.numeric(PCDscores$PCDc)
UNIFRAC <- unifrac(PHYLOmine$comm, PHYLOmine$phy)
phylouni <- as.numeric(UNIFRAC)

linear <- as.data.frame(cbind(BRAYtransform, DIST, bio1D, bio12D, surfaceD, sandD, outcropD, aspectD, aspect2D, slopeD, PD, NO3D, NH4D, KD, conductD, pHD, phylouni, PCD, PCDp, PCDc))

#Unifrac
subset1 <- with(linear, cbind(DIST, bio1D, sandD, aspectD, outcropD, aspect2D, slopeD, KD, conductD, pHD, phylouni))

#PCD
subset2 <- with(linear, cbind(DIST, bio1D, sandD, aspectD, slopeD, KD, conductD, pHD, PCD))

#PCDp
subset3 <- with(linear, cbind(bio1D, sandD, aspectD, PD, conductD, pHD, PCDp))

#PCDc
subset4 <- with(linear, cbind(DIST, bio1D, surfaceD, sandD, aspectD, aspect2D, slopeD, NH4D, KD, conductD, PCDc))

########
##FUNCTION 'parCorPerm' developed here to run permutational significance test of multivariate pairwise partial correlations by permuting variables and testing correlation against null model. This function has been adapted from a related function for permutational significance testing of 2 variable correlations 'corPerm' by Pierre Legendre, see http://adn.biol.umontreal.ca/~numericalecology/Rcode/, and package 'ppcor' must be loaded

#parCorPerm only tests the partial interactions of multiple variables against 1 response and it assumes the response variable of interest is represented by the final (right hand) row of the input data matrix

#set up functions
sample_func <- function(x)
{
sample(x, length(x))
}

PFUNCTION <- function(x, y, z)
{
if(x > y) z <- z+1
return(z)
}

parCorPerm <- function(x,nperm) #where x is a data.frame with variables in columns and the response of interest in the final (RH) column; and nperm is the desired number of permutations
{
r.ref <- spcor(x) #calculate reference partial correlation on raw data
temp.ref <- as.data.frame(r.ref$estimate) #extract the coefficients

nGT <- as.data.frame(as.vector(rep(1, ncol(x)))) # a vector of 1s same length as number of variables
colnames(nGT) <- "value"

response <- as.data.frame(x[,ncol(x)]) #isolate the response column which must be the last one
number <- as.numeric(ncol(x)-1) #calcuate number of columns without the response
x <- as.data.frame(x[,1:number]) #extract just the variables, exclude the response

for(i in 1:nperm)
{
y.perm <- as.data.frame(apply(x, 2, sample_func)) #permute variables
test_data <- cbind(y.perm, response) #join permuted with unpermuted response

r.perm <- spcor(test_data) #calculate correlation coefficient with permuted variables
temp.perm <- as.data.frame(r.perm$estimate) #extract coefficients

nGT$value <- mapply(PFUNCTION, temp.perm[, ncol(temp.perm)], temp.ref[, ncol(temp.ref)], nGT$value)

}
P <- nGT$value/(nperm+1)
cat('\nPearson partial correlation of response variable to each explanatory variable (i.e. given the others)','\n')
cat('Prob (',nperm,'permutations) =',P,'\n','\n')
return(list(Variables=c(colnames(x), "Response"), Correlation=temp.ref[, ncol(temp.ref)], No.perm=nperm, P.perm=P ))

}

#use function parCorPerm to calculate partial correlations for each metric of phylogenetic distance

parCorPerm(subset1, 1000)
parCorPerm(subset2, 1000)
parCorPerm(subset3, 1000)
parCorPerm(subset4, 1000)

#end

R tools and scripts for vegetation science and ecology

Tuesday, June 3, 2014

Various plant community analyses: NMDS with vectors, distance-based models/partial correlations, species composition GLMM etc

No comments:

Post a Comment

Popular Posts

Search This Blog