Tuesday, June 3, 2014

Various plant community analyses: NMDS with vectors, distance-based models/partial correlations, species composition GLMM etc

#This R code reproduces analyses and plots presented in: Guerin, G.R., Biffin, E. Jardine, D.I., Cross, H.B. & Lowe, A.J. (2014) A spatially predictive baseline for monitoring multivariate species occurrences and phylogenetic shifts in Mediterranean southern Australia. Journal of Vegetation Science 25: 338–348. see http://onlinelibrary.wiley.com/doi/10.1111/jvs.12111/abstract using data files in the SI.

#############

#clear space
rm(list=ls())  

#load packages
require(vegan)
require(ecodist)
require(simba)
require(lme4)

#load data
records <- read.table(file="species_data.csv", header=TRUE, sep="\t")

DIST <- as.dist(read.table(file="distance_data.csv", header=TRUE, sep="\t", row.names=1))

variables <- read.table(file="environmental_data.csv", header=TRUE, sep="\t", row.names=1)


#transform aspect in degrees to 'eastness' and'northness':
aspect2 <- variables$aspect
variables <- cbind(variables, aspect2)
variables$aspect <- cos((variables$aspect*pi)/180)
variables$aspect2 <- sin((variables$aspect2*pi)/180)

#generate spatial eigenvectors
PCNM <- pcnm(DIST)
SC2 <- as.data.frame(scores(PCNM))
variables <- cbind(variables, SC2[,1:10]) #add first 10 PCNM axes to variables set

#generate species PA matrix
species <- mama(records)


##################

#matrix functions and look at data
species2 <- species
species2[species2>0] = 1 #convert df 'species2' to binary
freq <- colSums(species2) #calculate frequencies
freq <- sort(freq, decreasing=TRUE) #sort them

diversity <- rowSums(species2) #no spp. in each plot

#plot histograms figure
par(mfrow=c(2,2),mar=c(4,5,2,1))

hist(variables$bio12, breaks=c(0:75,150,225,300, 375, 450, 525, 600, 675, 750, 825, 900, 975, 1050, 1125), density=50, col="lightgray", border="black", xlab="Mean annual rainfall (mm)", ylab="Number of plots", cex.axis=1.5, cex.lab=1.5, main="", freq=TRUE, ylim=c(0,20), xlim=c(200,1200))

mtext("a",3, adj=0, cex=2, font=2)

hist(variables$sand, 10, density=50, col="lightgray", border="black", xlab="%Sand", ylab="Number of plots", cex.axis=1.5, cex.lab=1.5, main="", freq=TRUE, ylim=c(0,25), xlim=c(40,100))

mtext("b",3, adj=0, cex=2, font=2)

hist(variables$pH_CaCl2, 10, density=50, col="lightgray", border="black", xlab="pH", ylab="Number of plots", cex.axis=1.5, cex.lab=1.5, main="", freq=TRUE, ylim=c(0,20), xlim=c(3.5,6.5))

mtext("c",3, adj=0, cex=2, font=2)

DISTkm <-  DIST/1000

plot(bio1D~DISTkm, xlab="Geographic distance (km)", xlim=c(0,600), ylim=c(0,5),ylab="Difference in mean temperature", bty="n", cex.axis=1.5, cex.lab=1.5, cex=0.8, pch=19)  #shows near sites different and distance sites similar (in addition to usual decay with distance)

mtext("d",3, adj=0, cex=2, font=2)

plot(BRAY~DISTkm, ylim=c(0,1.1), xlim=c(0,600), xlab="Geographic distance (km)", ylab="Bray-Curtis dissimilarity", bty="n", cex.axis=1.5, cex.lab=1.5, cex=0.8, pch=19, yaxs="i")

mtext("a",3, adj=0, cex=2, font=2)

hist(freq, 20, density=50, col="lightgray", border="black", xlab="Number of records", ylab="Number of species", cex.axis=1.5, cex.lab=1.5, main="", xlim=c(0,50)) # plot frequency histogram

mtext("b",3, adj=0, cex=2, font=2)

hist(records$VCA, breaks=c(0:1,4:5,9:10,14:15,19:20,24:25,29:30,34:35,39:40,44:45,49:50,54:55,59:60,64:65,69:70,74:75,79:80,84:85,89:90,94:95,99:100), density=50, col="lightgray", border="black", xlab="Cover-abundance", ylab="Number of records", cex.axis=1.5, cex.lab=1.5, main="", xlim=c(0,100), freq=TRUE) # plot frequency his

mtext("c",3, adj=0, cex=2, font=2)

hist(diversity, 10, density=50, col="lightgray", border="black", xlab="Number of species", ylab="Number of plots", cex.axis=1.5, cex.lab=1.5, main="", freq=TRUE, ylim=c(0,20), xlim=c(10,60))

mtext("d",3, adj=0, cex=2, font=2)


##############

#non-metric multidimensional scaling
BRAY <- vegdist(species)
NMDS <- metaMDS(BRAY, pc=TRUE)
SCORES3 <- as.data.frame(scores(NMDS))

#plot linear correlations of variables with NMDS plot
linears <- with(variables, cbind(bio12, bio1, sand, pH_CaCl2))
colnames(linears) <- c("bio12", "bio1", "%sand", "pH")
VF <- vf(SCORES3, linears, nperm=1000)
print(VF)

par(mar=c(5,5,1,1))
plot(SCORES3, col="white", xlab="NMDS axis-1", ylab="NMDS axis-2", cex.axis=1.5, cex.lab=1.5, bty="l", las=0)

plot(VF, col="black", cex=2, lwd=2)

plot(SCORES3, col="white", xlab="NMDS axis-1", ylab="NMDS axis-2", cex.axis=1.5, cex.lab=1.5, bty="l", las=0)
text(SCORES3, labels=variables$site, cex=0.7, font=2)
arrows(0.06, -0.22, -0.42, -0.02, lty=2, angle=20) #NB re-runs of the NMDS will produce different configurations/orientations and so this arrow won't point to the same plots


####################


#distance based analysis

#create distances for each variable
bio1D <- vegdist(variables$bio1/10, method='euclidean')
bio12D <- vegdist(variables$bio12, method='euclidean')
aspectD <- vegdist(variables$aspect, method='euclidean')
aspect2D <- vegdist(variables$aspect2, method='euclidean')
slopeD <- vegdist(variables$slope, method='euclidean')
sandD <- vegdist(variables$sand, method='euclidean')
outcropD <- vegdist(variables$outcrop, method='euclidean')
surfaceD <- vegdist(variables$surface, method='euclidean')
PD <- vegdist(variables$P, method='euclidean')
NO3D <- vegdist(variables$NO3, method='euclidean')
NH4D <- vegdist(variables$NH4, method='euclidean')
KD <- vegdist(variables$K, method='euclidean')
conductD <- vegdist(variables$conduct, method='euclidean')
pHD <- vegdist(variables$pH_CaCl2, method='euclidean')

##model exponential distance decay:
InverseBRAY <- (1-BRAY) + 1/DIST
BRAYtransform <- -log(InverseBRAY)

multi <- MRM(BRAYtransform ~ DIST + bio1D + NH4D + conductD + aspectD, nperm=10000)

COEF <- multi$coef[,1]

PREDICT1 <- 1-exp(-(COEF[2])*max(DIST))
PREDICT2 <- 1-exp(-(COEF[3])*max(bio1D))
PREDICT3 <- 1-exp(-(COEF[4])*max(NH4D))
PREDICT4 <- 1-exp(-(COEF[5])*max(conductD))
PREDICT5 <- 1-exp(-(COEF[6])*max(aspectD))

PREDICTIONS <- rbind(PREDICT1, PREDICT2, PREDICT3, PREDICT4, PREDICT5)
rownames(PREDICTIONS) <- c("Geographic distance", "Mean temperature", "Ammonium nitrate", "Electrical conductivity", "Aspect")

nothing <- c(1, 2, 3, 4, 5)

par(mar=c(20,15,1,1)+0.5)

plot(nothing ~ PREDICTIONS, yaxt='n', xlim=c(0.5,1), bty="l", las=1, ylab="", cex.axis=1, cex.lab=1.4, xlab="Bray-Curtis dissimilarity", pch=16, cex=2)

axis(2, at=c(1, 2, 3, 4, 5), labels=rownames(PREDICTIONS), cex.axis=1.4, las=2)


##############
#GLMM - multivariate generalised mixed-model of species occurrences

#subset a multivariate binary dataset to include species with a min. number of records
species3 = species2[,colSums(species2) > 4] #i.e. 5 or more records included only
STACK2 <- stack(species3) #stacks P/A of each spp. in each plot into one column
var2 <- do.call("rbind", replicate(ncol(species3), variables, simplify = FALSE)) #number of replicates = number of species with 5 or more records (number of columns in 'species3')
dataset2 <- cbind(STACK2, var2)

#below, site random factor means the slopes of the response don't change btw sites but 'intercepts' can change - accounting for how freq may be biased due to autocorrelation within reps. The random spp. term allows intercept and slope to vary by species, so frequency and response of spp. to gradients can differ:


##run-forward selection on selected terms, having excluded those that increased AIC and/or have high p-values for their addition etc, or have no significant effect individually

#forward selection
#NOTE the following analysis may take several hours on a standard computer
M1 <- lmer(values ~ (1|site), family=binomial(link="logit"), dataset2)

M1

M2 <- update(M1,. ~ . + (1|ind))

M2

M3 <- update(M2,. ~ . - (1|ind) + PCNM1 + (1 + PCNM1|ind))

M3

M4 <- update(M3,. ~ . - (1 + PCNM1|ind) + bio1 + (1 + PCNM1 + bio1|ind))

M4

M5 <- update(M4,. ~ . - (1 + PCNM1 + bio1|ind) + NH4 + (1 + PCNM1 + bio1 + NH4|ind))

M5

M6 <- update(M5,. ~ . - (1 + PCNM1 + bio1 + NH4|ind) + conduct + (1 + PCNM1 + bio1 + NH4 + conduct|ind))

M6

M7 <- update(M6,. ~ . - (1 + PCNM1 + bio1 + NH4 + conduct|ind) + P + (1 + PCNM1 + bio1 + NH4 + conduct + P|ind))

M7

anova(M1, M2, M3, M4, M5, M6, M7)


scor1 <- AIC(M1)
scor2 <- AIC(M2)
scor3 <- AIC(M3)
scor4 <- AIC(M4)
scor5 <- AIC(M5)
scor6 <- AIC(M6)
scor7 <- AIC(M7)

deltaAIC2 <- c(scor2-scor1, scor3-scor2, scor4-scor3,scor5-scor4, scor6-scor5, scor7-scor6)

print(deltaAIC2)


#plot random effects
RANDOM <- ranef(M7, postVar=TRUE, whichel="ind")
colnames(RANDOM$ind) <- c("(Intercept)", "Spatial eigenvector", "Mean temperature", "Ammonium N", "Electrical conductivity", "Colwell P")
qqmath(RANDOM) #plot of species 'random' intercepts & slopes against standard normal distribution

RANDOM2 <- ranef(M7, postVar=TRUE, whichel="site")
dotplot(RANDOM2) #plot of site 'random' intercepts


###############
#correlate phylogenetic shifts with variables

#load packages
require(picante)
require(ppcor)

#generate matching phylogeny and community datasets
PHYLOmine <- match.phylo.comm(phyloTree, species)

#calculate phylogenetic distances
PCDscores <- pcd(PHYLOmine$comm, PHYLOmine$phy)
PCD <- as.numeric(PCDscores$PCD)
PCDp <- as.numeric(PCDscores$PCDp)
PCDc <- as.numeric(PCDscores$PCDc)
UNIFRAC <- unifrac(PHYLOmine$comm, PHYLOmine$phy)
phylouni <- as.numeric(UNIFRAC)

linear <- as.data.frame(cbind(BRAYtransform, DIST, bio1D, bio12D, surfaceD, sandD, outcropD, aspectD, aspect2D, slopeD, PD, NO3D, NH4D, KD, conductD, pHD, phylouni, PCD, PCDp, PCDc))


#Unifrac
subset1 <- with(linear, cbind(DIST, bio1D, sandD, aspectD, outcropD, aspect2D, slopeD, KD, conductD, pHD, phylouni))

#PCD
subset2 <- with(linear, cbind(DIST, bio1D, sandD, aspectD, slopeD, KD, conductD, pHD, PCD))

#PCDp
subset3 <- with(linear, cbind(bio1D, sandD, aspectD, PD, conductD, pHD, PCDp))

#PCDc
subset4 <- with(linear, cbind(DIST, bio1D, surfaceD, sandD, aspectD, aspect2D, slopeD, NH4D, KD, conductD, PCDc))

########
##FUNCTION 'parCorPerm' developed here to run permutational significance test of multivariate pairwise partial correlations by permuting variables and testing correlation against null model. This function has been adapted from a related function for permutational significance testing of 2 variable correlations 'corPerm' by Pierre Legendre, see http://adn.biol.umontreal.ca/~numericalecology/Rcode/, and package 'ppcor' must be loaded

#parCorPerm only tests the partial interactions of multiple variables against 1 response and it assumes the response variable of interest is represented by the final (right hand) row of the input data matrix

#set  up functions
sample_func <- function(x)
{
sample(x, length(x))
}


PFUNCTION <- function(x, y, z)
{
   if(x > y) z <- z+1
   return(z)
}


parCorPerm <- function(x,nperm) #where x is a data.frame with variables in columns and the response of interest in the final (RH) column; and nperm is the desired number of permutations
{
r.ref <- spcor(x) #calculate reference partial correlation on raw data
temp.ref <- as.data.frame(r.ref$estimate) #extract the coefficients

nGT <- as.data.frame(as.vector(rep(1, ncol(x)))) # a vector of 1s same length as number of variables
colnames(nGT) <- "value"

response <- as.data.frame(x[,ncol(x)]) #isolate the response column which must be the last one
number <- as.numeric(ncol(x)-1) #calcuate number of columns without the response
x <- as.data.frame(x[,1:number]) #extract just the variables, exclude the response


for(i in 1:nperm)
   {
   y.perm <- as.data.frame(apply(x, 2, sample_func)) #permute variables
   test_data <- cbind(y.perm, response) #join permuted with unpermuted  response

   r.perm <- spcor(test_data) #calculate correlation coefficient with permuted variables
   temp.perm <- as.data.frame(r.perm$estimate) #extract coefficients



    nGT$value <- mapply(PFUNCTION, temp.perm[, ncol(temp.perm)], temp.ref[, ncol(temp.ref)], nGT$value)
   

     }
P <- nGT$value/(nperm+1)
cat('\nPearson partial correlation of response variable to each explanatory variable (i.e. given the others)','\n')
cat('Prob (',nperm,'permutations) =',P,'\n','\n')
return(list(Variables=c(colnames(x), "Response"), Correlation=temp.ref[, ncol(temp.ref)], No.perm=nperm, P.perm=P ))

}


#use function parCorPerm to calculate partial correlations for each metric of phylogenetic distance

parCorPerm(subset1, 1000)
parCorPerm(subset2, 1000)
parCorPerm(subset3, 1000)
parCorPerm(subset4, 1000)



#end



Calculate and plot bootstrapped 95% confidence intervals for model coeficients


# this R code re-produces Figure 1 of … Guerin G.R. & Lowe, A.J. (2013) Leaf morphology shift: new data and analysis support climate link. Biology Letters 9, 20120860. http://rsbl.royalsocietypublishing.org/content/9/1/20120860.full.pdf ... using the data files in the ESM, see http://intl-rsbl.royalsocietypublishing.org/content/9/1/20120860/suppl/DC1

My version of the figure - a) shows coeficients for temporal change in leaf shape (mm/year) using different data splits, and their bootstrapped  95% CIs. b) shows a shifting latitudinal cline in leaf shape between time periods



########begin code
# load data
all <- read.table(file="ESM_angust_all.csv", header=TRUE, sep="\t") #all data
low <- read.table(file="ESM_angust_low_lats.csv", header=TRUE, sep="\t") #data points between -30 and -31 degree lat. post-1950
predvars <- read.table(file="ESM_predvars.csv", header=TRUE, sep="\t") #dummy variables for GLM predictions pre-1950
predvars2 <- read.table(file="ESM_predvars2.csv", header=TRUE, sep="\t") #dummy variables for GLM predictions post-1990

# subset data
mid <- all[9:209,] #1920-1980
post <- all[31:262,] #1950-2011
other <- all[9:262,] #1920-2011
pre <- all[1:30,] #1880-1950
spli <- all[227:262,] #1990-2011
split <- rbind(pre, spli) #combine pre-1950 with post-1990

#set up boots
require(boot) #package must be installed first
bs <- function(formula, data, indices) {
  d <- data[indices,]
  fit <- lm(formula, data=d)
  return(coef(fit)) 
}

bs2 <- function(formula, data, indices) {
  d <- data[indices,]
  fit <- glm(formula, data=d)
  return(coef(fit)) 
}

# run boots and GLM
results <- boot(data=mid, statistic=bs, R=10000, formula=width~Year+latitude)
BOOTY <- boot.ci(results, index=2, type="basic")

results2 <- boot(data=all, statistic=bs, R=10000, formula=width~Year+latitude)
BOOTY2 <- boot.ci(results2, index=2, type="basic")

results3 <- boot(data=post, statistic=bs, R=10000, formula=width~Year+latitude)
BOOTY3 <- boot.ci(results3, index=2, type="basic")

results4 <- boot(data=other, statistic=bs, R=10000, formula=width~Year+latitude)
BOOTY4 <- boot.ci(results4, index=2, type="basic")

results5 <- boot(data=low, statistic=bs, R=10000, formula=width~Year+latitude)
BOOTY5 <- boot.ci(results5, index=2, type="basic")

results6 <- boot(data=pre, statistic=bs, R=10000, formula=width~Year+latitude)
BOOTY6 <- boot.ci(results6, index=2, type="basic")

GLM <- glm(width~latitude + shift, data=split, family=gaussian (link=identity))
pred1 <- predict.glm(GLM, newdata=predvars)
pred2 <- predict.glm(GLM, newdata=predvars2)

results7 <- boot(data=split, statistic=bs2, R=10000, formula=width~shift + latitude)
BOOTY7 <- boot.ci(results7, index=2, type="basic")


# extract results
pre1950 <- c(BOOTY6$t0, BOOTY6$basic[,4:5])

post1950 <- c(BOOTY3$t0, BOOTY3$basic[,4:5])

allx <- c(BOOTY2$t0, BOOTY2$basic[,4:5])

lowx <- c(BOOTY5$t0, BOOTY5$basic[,4:5])

midx <- c(BOOTY$t0, BOOTY$basic[,4:5])

otherx <- c(BOOTY4$t0, BOOTY4$basic[,4:5])

CIs <- as.data.frame(rbind(pre1950, post1950, allx, lowx, midx, otherx))
colnames(CIs) <- c("slope", "lowerCI", "upperCI")
rownames(CIs) <- c("Pre1950", "Post1950", "1880-2011", "North:post1950", "1920-80", "1920-2011")


#plot
nothing <- c(1,2, 3, 4, 5, 6)

par(mar=c(5,4,1,1)+0.1)

layout(matrix(c(1,1,2,2), 2, 2, byrow = TRUE), respect=TRUE, widths=c(2.5,2.5), heights=c(3,3))

plot(CIs$slope~nothing, ylim=c(-0.09, 0.02), cex=1.5, pch=19, ylab="Slope", xlab="", xaxt='n', xlim=c(0.5, 6.5), cex.axis=1, cex.lab=1.5, las=1, bty="l")
arrows(1, CIs[1,1], x1=1, y1=CIs[1, 2], length=0.1, angle=90, lwd=2)
arrows(1, CIs[1,1], x1=1, y1=CIs[1, 3], length=0.1, angle=90, lwd=2)

arrows(2, CIs[2,1], x1=2, y1=CIs[2, 2], length=0.1, angle=90, lwd=2)
arrows(2, CIs[2,1], x1=2, y1=CIs[2, 3], length=0.1, angle=90, lwd=2)

arrows(3, CIs[3,1], x1=3, y1=CIs[3, 2], length=0.1, angle=90, lwd=2)
arrows(3, CIs[3,1], x1=3, y1=CIs[3, 3], length=0.1, angle=90, lwd=2)

arrows(4, CIs[4,1], x1=4, y1=CIs[4, 2], length=0.1, angle=90, lwd=2)
arrows(4, CIs[4,1], x1=4, y1=CIs[4, 3], length=0.1, angle=90, lwd=2)

arrows(5, CIs[5,1], x1=5, y1=CIs[5, 2], length=0.1, angle=90, lwd=2)
arrows(5, CIs[5,1], x1=5, y1=CIs[5, 3], length=0.1, angle=90, lwd=2)

arrows(6, CIs[6,1], x1=6, y1=CIs[6, 2], length=0.1, angle=90, lwd=2)
arrows(6, CIs[6,1], x1=6, y1=CIs[6, 3], length=0.1, angle=90, lwd=2)

abline(h=0, lty=2)

axis(1, at=c(1,2, 3, 4, 5, 6), labels=c("pre","post", "all", "north", "mid", "other"), cex.axis=1.4, las=2)

text(0.55, 0.02, "(a)", cex=2)

plot(width~latitude, data=pre, pch=16, xlab="Latitude", ylab="Leaf width (mm)", cex=1, cex.lab=1.5, cex.axis=1.5, xlim=c(-35, -30), ylim=c(0, 10), bty="l")
new <- all[252:262,]
points(new$latitude, new$width, pch=3, cex=3)
points(spli$latitude, spli$width, pch=1, cex=0.8)

points(predvars$latitude, pred1, type="l", lwd=2, lty=2)
points(predvars2$latitude, pred2, type="l", lwd=2, lty=1)

text(-34.95, 9.8, "(b)", cex=2)

#end


Mapping species richness in cells based on georeferenced species records


Example of one relatively simple way to map grid cell-based species richness starting with individual georeferenced species records. This is only a method for processing the data, and does not consider sampling biases, for example. There are multiple ways to work between data frames/point data and rasters in R, this is just one. You can also 'rasterize' point data or assign values to a pre-generated raster layer etc.

The basic premise below is to take a set of species records with their geographic (longlat) coordinates, truncate the coordinates so that the records fall within a regular grid pattern (here 0.1 degrees resolution) and then for each point on the truncated grid, count the number of unique species recorded and display as a grid. A moving or sliding (neighbourhood) window can be used to even out the scores a little.



##############################
require(raster)

n <- c(1:2000); for (i in 1:2000) {n[i] <- sample(as.factor(1:100),1)} #create a random set of 100 'species' (numnbers 1 to 100 as factor levels, each representing a species), i.e. pick one species 2000 times

spp_records <- data.frame(latitude=runif(2000, min=-33, max=-30), longitude=runif(2000, min=135, max=140), value=n) #create a data frame simulating 2000 spatial records of the 100 simulated species within a predetermined spatial area - numbers under 'value' are in this case factors representing unique species from the above pool of 100 

#in reality the above step would be replaced by the raw data - a data frame latitude, longitude and a column for the name of the species recorded at those coordinates.

coordinates(spp_records) <- c(1,2) #set to spatial points data frame object by highlighting the latitude and longitude columns

plot(spp_records, pch=20, cex=0.5) # plot as points to visualise where the records were made

spp_records <- data.frame(latitude=round(spp_records$latitude, digits=1), longitude=round(spp_records$longitude, digits=1), value=n) #now recreate, but truncate the coordinates to a regular grid at 0.1 degrees resolution
coordinates(spp_records) <- c(1,2)

rich_count <- function(x) length(unique(x)) # simple function to count the number of unique 'species' (factor levels)

richness <- aggregate(spp_records$value, list(spp_records$latitude, spp_records$longitude), rich_count) #count the number of unique species that have the same truncated coordinates

coordinates(richness) <- c(2,1) #set coordinates

richness2 <- rasterFromXYZ(richness) #create a gridded map (at 0.1 resolution) with values as the number of unique species recorded in that cell

projection(richness2) <- "+proj=longlat +ellps=WGS84" #set the projection for plotting

plot(richness2, col=rev(heat.colors(max(richness$x)))) # cell-based 'species richness' map

Cell-based count of species richness calculated from simulated set of individual georeferenced species records, using 'raster' package in R


plot(focal(richness2, w=matrix(1,3,3), fun=mean, na.rm=TRUE), col=rev(heat.colors(max(richness$x)))) #you can also play around with a moving window type add-on, in this case averaging the richness scores for a given cell using the surrounding set of cells, to even out idiosyncrasies of how records are assigned to one cell or another
The same map of cell-based (simulated) species richness as above with a sliding window average to even out scores among neighbouring cells