# Commit db83fd94 authored by Leite, Marcio
#
# Script for Compositional Analysis of an Ecological Dataset
# --- Data import and taxonomic aggregation ----------------------------------
# NOTE(review): this section was mangled by a web-diff paste — the names on the
# left of several `<-` were lost. Reconstructed names (`otu.tab`, `d.agg`,
# ``) are best guesses; confirm against the original commit.
table_from_biom_w_taxonomy <- read_excel("table.from_biom_w_taxonomy.xlsx",
                                         sheet = "Clean table for analyses")
Mapping_file <- read_excel("Mapping file.xlsx")
# OTU count matrix: drop the leading ID column and the trailing taxonomy
# column, keeping OTU IDs as row names.
otu.tab <- data.frame(table_from_biom_w_taxonomy[, -c(1, ncol(table_from_biom_w_taxonomy))],
                      row.names = table_from_biom_w_taxonomy$`OTU ID`)
dim(otu.tab)
# Aggregate counts per full taxonomy string.
d.agg <- table_from_biom_w_taxonomy %>%
  select(-`OTU ID`) %>%
  group_by(taxonomy) %>%
  summarise_all(sum)   # TODO confirm: the summarise step was lost in the paste
# Short per-taxon codes, with the taxonomy string split into ranks.
taxInfo <- data.frame(Taxname = d.agg$taxonomy,
                      TaxCode = paste0("Tax", 1:nrow(d.agg))) %>%
  separate(Taxname,
           into = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
           sep = "; ", remove = FALSE)
# How many unique taxa exist at each rank?
barplot(unlist(lapply(apply(taxInfo[, -1], 2, unique), length)))
# Family-level table: sum the 25 sample columns within Kingdom..Family, then
# unite the five ranks into a single "Family" label column. <- table_from_biom_w_taxonomy %>%
  select(-`OTU ID`) %>%
  separate(taxonomy,
           into = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
           sep = "; ", remove = FALSE) %>%
  select(1:25, Kingdom:Family) %>%
  group_by(Kingdom, Phylum, Class, Order, Family) %>%
  summarise_all(funs(sum)) %>%
  unite("Family", Kingdom:Family, sep = " ")
# NOTE(review): the original fused line read `rownames( <-$taxonomy`;
# after unite() there is no taxonomy column, so it most plausibly applied to
# the taxonomy-aggregated table — TODO confirm.
rownames(d.agg) <- d.agg$taxonomy
# Compositional analysis of microbial abundance.
# Install and load the required R packages; BiocManager bootstraps the
# Bioconductor dependencies (compositions, zCompositions, ALDEx2).
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")  # NOTE(review): body of this `if` was lost in the paste; restored
library(compositions)  # exploratory data analysis of compositional data
library(zCompositions) # used for 0 substitution (cmultRepl)
library(ALDEx2)        # used for per-OTU comparisons
library(xtable)        # used to generate tables from datasets
library(igraph)        # used to generate graphs from phi data
library(car)
# you will need to download this directly from github
# The files are in a directory called git:
source("chunk/functions.R") # rename proprBayes to propr
# Family-level count matrix: families as rows, samples as columns.
# NOTE(review): the source object name was lost in the paste; `` (the
# family-aggregated table built above) is a reconstruction — TODO confirm.
d <- data.frame([, -1])
tax.d <-[, 1]
# Tidy the sample names embedded in the fastq file names.
colnames(d) <- gsub(".fastq", "", colnames(d))
colnames(d) <- gsub("AMH.Florida", "", colnames(d))
rownames(d) <- tax.d$Family
# Remove OTUs whose mean read count across all samples is <= the cutoff.
# We work on the subset (d.1); the original data (d) remains unaltered.
count <- 10 # this is the chosen cutoff
d.1 <- data.frame(d[which(apply(d, 1, mean) > count), ],
                  check.names = FALSE) # NOTE(review): trailing arguments were truncated in the paste — TODO confirm
# Collapse all filtered-out rows into a single "Others" row so column sums
# are preserved.
ix <- rownames(d) %in% rownames(d.1)
d.1["Others", ] <- colSums(d[!ix, ])
# Zero replacement: samples must be ROWS, so the data frame is transposed
# with t(). cmultRepl() is from the zCompositions package — remember to load
# it: library(zCompositions). method="CZM" = count zero multiplicative.
d.czm <- cmultRepl(t(d.1), label=0, method="CZM")
# Centred log-ratio transform per sample (row): log(x) minus the mean log,
# i.e. the log-ratio to the geometric mean. The result is transposed back so
# samples stay as rows for prcomp below.
d.clr <- t(apply(d.czm, 1, function(x){log(x) - mean(log(x))}))
# Short "TaxN" codes for the (long) family names, used for compact plotting.
TaxFamInfo <- data.frame(TaxNames = colnames(d.clr), TaxCode = paste0("Tax", 1:ncol(d.clr)))
colnames(d.clr) <- TaxFamInfo$TaxCode
# PCA of the CLR data. Samples must be ROWS and features/OTUs COLUMNS.
d.pcx <- prcomp(d.clr)
# Total variance = sum of squared singular values.
d.mvar <- sum(d.pcx$sdev^2)
# Proportion of variance on PC1 and PC2, pasted into axis labels.
PC1 <- paste("PC1: ", round(sum(d.pcx$sdev[1]^2)/d.mvar, 3))
PC2 <- paste("PC2: ", round(sum(d.pcx$sdev[2]^2)/d.mvar, 3))
# Basic compositional biplot (scale=0 = form biplot), with the variance
# explained printed on the axis labels.
biplot(d.pcx, var.axes=F, scale=0, xlab=PC1, ylab=PC2)
# Hacks to make the biplot readable:
# one "." per feature (for the taxa labels) and short sample labels taken
# from a fixed position inside the sample names.
points <- c(rep(".", length(dimnames(d.pcx$rotation)[[1]])))
samples <- stringr::str_sub(rownames(d.pcx$x), -9, -6)
# cex/col are length-2 vectors: first entry = sample labels, second = feature
# points (OTUs). Relative scale, 1 is 100%.
size <- c(0.5, 2)
# NOTE(review): `col` was never defined in the pasted source; a conventional
# black-samples / translucent-red-features pair is restored — TODO confirm.
col <- c("black", rgb(1, 0, 0, 0.2))
biplot(d.pcx, cex = size, col = col, var.axes = TRUE,
       xlab = PC1, ylab = PC2,
       scale = 0, ylabs = points, xlabs = samples)
# Variant with smaller feature labels and no variable arrows.
size.g <- c(0.5, 0.4)
col.g <- c("black", rgb(1, 0, 0, 0.2)) # NOTE(review): undefined in the paste — TODO confirm
biplot(d.pcx, cex = size.g, col = col.g, var.axes = FALSE,
       xlab = paste("PC1: ", round(sum(d.pcx$sdev[1]^2) / d.mvar, 3)),
       ylab = paste("PC2: ", round(sum(d.pcx$sdev[2]^2) / d.mvar, 3)),
       scale = 0, xlabs = samples)
abline(h = 0, lty = 2, col = rgb(0, 0, 0, 0.1))
abline(v = 0, lty = 2, col = rgb(0, 0, 0, 0.1))
# Hierarchical clustering of the CLR data with default parameters
# (dist = euclidean, hclust = complete linkage); hang=-1 lines up the labels.
hc <- hclust(dist(d.clr))
plot(hc, hang = -1)
# Keep more OTUs for this exercise by using a lower mean-count cutoff.
count <- 1
# NOTE(review): fused line in the paste; target name `` recovered from
# its later uses, trailing arguments reconstructed — TODO confirm. <- data.frame(d[which(apply(d, 1, mean) > count), ],
                     check.names = FALSE)
# Condition vector: the year embedded in each sample name.
# CHECK this vector in your console (conds) — it should match the order and
# number of columns of
conds <- stringr::str_sub(colnames(, -9, -6)
conds.Cluster <- ifelse(conds %in% c(2008, 2009), "C1",
                        ifelse(conds %in% c(2013, 2014), "C2", conds))
# Main ALDEx2 CLR step for all downstream analyses. mc.samples is the number
# of Monte-Carlo samples from the Dirichlet distribution; 128 is often enough.
x <- aldex.clr([, conds.Cluster != 2010],
               conds.Cluster[conds.Cluster != 2010],
               mc.samples = 128, verbose = TRUE)
# NOTE(review): this immediately overwrites the clustered fit above with a
# per-year fit — presumably only one of the two was meant to run; confirm.
x <- aldex.clr(, conds, mc.samples = 128, verbose = TRUE)
# Both Welch and Wilcoxon rank-sum tests, plus Benjamini-Hochberg correction. <- aldex.ttest(x, paired.test = FALSE)
# Quick look at the p-value distributions.
apply(, 2, hist)
# Taxa significant by the BH-corrected Wilcoxon test. %>%
  mutate(TaxName = rownames(.)) %>%
  filter(wi.eBH < 0.05)
summary(
# For the t-test analysis we can also obtain the effect size.
x.effect <- aldex.effect(x, verbose = FALSE)
# Plot both p-values and effect sizes (Bland-Altman and effect plots).
x.all <- data.frame(, x.effect)
aldex.plot(x.all, type = "MA", test = "welch")
aldex.plot(x.all, type = "MW", test = "welch")
# Part 3: OTU correlations with phi ------------------------------------------
# Main ALDEx2 CLR step again; mc.samples=128 Dirichlet MC instances is often
# sufficient.
x <- aldex.clr(d.1, mc.samples = 128, verbose = TRUE)
cont.var <- 1:ncol(d.1)
corr.test <- aldex.corr(x, cont.var)
# propr: calculate the phi statistic (from the sourced chunk/functions.R).
d.sma.df <- propr.aldex.phi(x)
# Choose a cutoff: pairs below it count as "connected".
phi.cutoff <- 0.25
# Subset of OTU pairs joined by one or more low-phi connections.
d.sma.lo.phi <- subset(d.sma.df, phi < phi.cutoff)
# igraph: convert the connections into a graph object.
# NOTE(review): the data argument was lost in the paste; the low-phi pair
# table is the only plausible input — TODO confirm column layout.
g <-[, c("row", "col")], directed = FALSE)
# igraph: find the connected components ("clusters").
g.clust <- clusters(g)
# Table to examine cluster membership by hand.
# NOTE(review): this data.frame() call was truncated in the paste; the
# cluster-size column is reconstructed from its use in `big` below.
g.df <- data.frame(name = V(g)$name,
                   cluster = g.clust$membership,
                   cluster.size = g.clust$csize[g.clust$membership])
# Keep clusters larger than some size; minimum is 2 (obviously).
big <- g.df[which(g.df$cluster.size >= 2), ]
colnames(big) <- colnames(g.df)
# igraph: rename the vertices by their short taxon codes.
V(g)$name <- as.vector(TaxFamInfo[TaxFamInfo$TaxNames %in% names(V(g)), "TaxCode"])
# vertex.size controls point/text size, vertex.color the fill,
# vertex.frame.color the outline.
plot(g, vertex.size = 5, vertex.color = rgb(0, 0, 0, 0.2), vertex.frame.color = "white")
# Exactly the same CZM/CLR/PCA procedure as the biplot example, just on the
# larger table.
d.czm <- cmultRepl(t(d.1), label = 0, method = "CZM")
d.clr <- t(apply(d.czm, 1, function(x) { log(x) - mean(log(x)) }))
d.pcx <- prcomp(d.clr)
d.mvar <- sum(d.pcx$sdev^2)
PC1 <- paste("PC1: ", round(sum(d.pcx$sdev[1]^2) / d.mvar, 3))
PC2 <- paste("PC2: ", round(sum(d.pcx$sdev[2]^2) / d.mvar, 3))
size.g <- c(0.5, 0.01) # Relative scale, 1 is 100%
biplot(d.pcx, cex = size.g, col = col.g, var.axes = FALSE,
       xlab = PC1, ylab = PC2,
       scale = 0) # xlabs=samples was commented out in the original
# Step through each cluster in `big` and overlay the genus name at the point
# where the OTU would have been plotted (positions are in d.pcx$rotation).
# Labels are abbreviated to the first three letters of the genus.
# NOTE(review): `taxon` is not defined anywhere in this file — presumably a
# taxonomy lookup table loaded elsewhere; verify before running.
lev <- factor(big$cluster)
for (i in as.numeric(levels(lev))) {
  nms <- rownames(big)[big$cluster == i]
  text(d.pcx$rotation[nms, ][, 1], d.pcx$rotation[nms, ][, 2],
       labels = gsub("(^[A-Za-z]{3}).+", "\\1",
                     taxon[rownames(big)[big$cluster == i], "genus"],
                     perl = TRUE),
       col = colours[i], cex = 1)
} # NOTE(review): closing brace was missing in the paste
abline(h = 0, lty = 2, col = rgb(0, 0, 0, 0.1))
abline(v = 0, lty = 2, col = rgb(0, 0, 0, 0.1))
# Part 1: strip charts showing differences by taxonomic level -----------------
# Plot strip charts at a chosen taxonomic level, cutoff and x axis.
#   aldex.out   - ALDEx2 output data frame (t-test + effect columns)
#   group.label - column name from the `taxon` table to group rows by
#   x.axis      - either "diff.btw" (difference) or "effect"
#   cex         - plot text scale
#   cutoff      - BH-corrected Wilcoxon significance threshold
# Side effect: draws on the current graphics device; no return value.
aldex.stripchart <- function(aldex.out = x.all, group.label = "genus",
                             x.axis = "diff.btw", cex = 0.8, cutoff = 0.1) {
  # Attach the grouping column from the taxon table to a copy of the output.
  aldex.out <- data.frame(aldex.out, taxon[rownames(x.all), group.label])
  colnames(aldex.out)[ncol(aldex.out)] <- group.label
  # Some R housekeeping: drop factor levels emptied by the subset.
  aldex.out <- droplevels(aldex.out)
  # NOTE(review): the original re-assigned `cutoff <- 0.1` here, silently
  # shadowing the function argument; removed so the argument is honoured.
  non.sig <- aldex.out$wi.eBH >= cutoff
  sig.pos <- aldex.out$wi.eBH < cutoff & aldex.out$effect > 0
  sig.neg <- aldex.out$wi.eBH < cutoff & aldex.out$effect < 0
  # Unique taxa in the dataset (there may be empty groups; ignored for now).
  # NOTE(review): the original used aldex.out$group.label, which looks up the
  # literal column "group.label" (NULL) — fixed to use the argument's value.
  groups <- unique(aldex.out[[group.label]])
  # y-axis limit a bit larger than needed; symmetric-ish x-axis range.
  ylim <- c(length(groups) - (length(groups) + 0.5), length(groups) + 0.5)
  xlim <- c(min(-1 * max(abs(aldex.out[, x.axis]))), max(aldex.out[, x.axis]))
  # Wide left margin for the group labels.
  par(mar = c(5, 15, 5, 1), las = 1, cex = cex)
  # Grey = non-significant, blue = significant positive, red = significant
  # negative effect; all jittered on the same axes.
  stripchart(aldex.out[non.sig, x.axis] ~ aldex.out[non.sig, group.label],
             col = rgb(0, 0, 0, 0.3), method = "jitter", pch = 19,
             xlim = xlim, xlab = x.axis) # NOTE(review): call was unterminated in the paste
  stripchart(aldex.out[sig.pos, x.axis] ~ aldex.out[sig.pos, group.label],
             col = rgb(0, 0, 1, 0.5), method = "jitter", pch = 19, add = TRUE)
  stripchart(aldex.out[sig.neg, x.axis] ~ aldex.out[sig.neg, group.label],
             col = rgb(1, 0, 0, 0.5), method = "jitter", pch = 19, add = TRUE)
  abline(v = 0, lty = 2, col = rgb(0, 0, 0, 0.2), lwd = 2)
}
aldex.stripchart(aldex.out = x.all, group.label = "genus", x.axis = "diff.btw", cex = 0.8, cutoff = 0.1)
aldex.stripchart(aldex.out = x.all, group.label = "genus", x.axis = "effect", cex = 0.8, cutoff = 0.1)
aldex.stripchart(, group.label = "phylum", x.axis = "diff.btw", cex = 0.8, cutoff = 0.1)
# you can save any plot by doing
# Part 2: an example bargraph at the genus level
# Get a vector of taxon names in the same order as in the dataset: split the
# united family string back into its ranks ("Family2" = the actual family).
tax <- tax.d %>%
separate(Family, into = c("Kingdom", "Phylum", "Class", "Order","Family2"), sep = " ", remove = FALSE)
# Sum counts by name.
# NOTE(review): the next three assignments each overwrite d1.agg — only the
# final `d1.agg <- d.1` takes effect; the Class/Family aggregations appear to
# be leftover alternatives. Confirm which level was intended.
d1.agg <- aggregate(d, by=list(tax$Class), FUN=sum)
d1.agg <- aggregate(d, by=list(tax.d$Family), FUN=sum)
tax.agg <- d1.agg$Group.1
d1.agg$Group.1 <- NULL
d1.agg <- d.1
tax.agg <- rownames(d1.agg)
# Convert counts to per-sample proportions (each column sums to 1).
d1.prop <- apply(d1.agg, 2, function(x){x/sum(x)})
# Filter so that organisms must be at least `abund` proportionally abundant in
# at least one sample. The pipeline default of 0.001 usually keeps more OTUs
# than can reasonably be plotted; the choice of cutoff is arbitrary and
# depends on your hypothesis — increase it if abundant organisms matter,
# decrease it if rare ones do. 0.01 is usually a happy medium.
abund <- 0.01
d1.abund <- d1.agg[apply(d1.prop, 1, max) > abund,]
tax.abund.u <- tax.agg[apply(d1.prop, 1, max) > abund]
# Add a prior expectation for 0-count reads: all OTUs have at least one column
# with > 0 reads, so our prior expectation is that a 0 represents a sampling
# depth problem. In practice it makes little difference whether we use an
# additive pseudo-count, an actual prior (add 0.5 to all), or the count zero
# multiplicative (CZM) approach of zCompositions. We use zCompositions as usual.
d1.abund <- t(cmultRepl(t(d1.abund), label=0, method="CZM"))
# Proportions of the filtered data for plotting below — in log-ratio speak,
# you are re-closing your dataset.
d1.P.u <- apply(d1.abund, 2, function(x){x/sum(x)})
# Order rows (and their taxon labels, in lockstep) by total OTU abundance.
new.order <- rownames(d1.P.u)[ order( apply(d1.P.u, 1, sum), decreasing=T)]
tax.abund <- tax.abund.u[ order( apply(d1.P.u, 1, sum), decreasing=T)]
d1.P <- d1.P.u[new.order,]
# Apply the centred log-ratio transform as per Aitchison: moves values from a
# constrained simplex to an open set. Distances are differences in the ratio
# to the geometric mean, so a difference of 1 is a 2-fold change (log2).
#### important #### distances between ratios are linear, so Euclidean
# distance (with average linkage below) is valid.
d1.clr <- apply(d1.P, 2, function(x){log2(x) - mean(log2(x))})
dist.d1.clr <- dist(t(d1.clr), method="euclidian")
clust.d1 <- hclust(dist.d1.clr, method="average")
# Standard colour scheme (Jean Macklaim); indexed in plot order below.
colours <- c("steelblue3", "skyblue1", "indianred1", "mediumpurple1", "olivedrab3", "pink", "#FFED6F", "mediumorchid3", "ivory2", "tan1", "aquamarine3", "#C0C0C0", "royalblue4", "mediumvioletred", "#999933", "#666699", "#CC9933", "#006666", "#3399FF", "#993300", "#CCCC99", "#666666", "#FFCC66", "#6699CC", "#663366", "#9999CC", "#CCCCCC", "#669999", "#CCCC66", "#CC6600", "#9999FF", "#0066CC", "#99CCCC", "#999999", "#FFCC00", "#009999", "#FF9900", "#999966", "#66CCCC", "#339966", "#CCCC33", "#EDEDED")
# Set up a composite plot using par(fig=...) regions; location parameters are
# (x1, x2, y1, y2) in device fractions and regions may overlap.
par(fig=c(0,1,0,1), new=TRUE)
# Dendrogram across the full device width.
plot(clust.d1, main=NULL, cex=0.8, xlab="")
# Stacked proportion bargraph in the bottom-left, bars in dendrogram order so
# they line up under the leaves.
par(fig=c(0,0.7,0,0.55), new=TRUE)
barplot(d1.P[,clust.d1$order], space=0, col=colours, las=2, axisnames=F)
# Legend (short taxon codes) in the right one-third of the device.
par(fig=c(0.6,1,0,1), new=TRUE)
legend(x="center", legend=TaxFamInfo[TaxFamInfo$TaxNames %in% tax.abund, "TaxCode"] , col=colours, lwd=5, cex=0.8, border=NULL)
# Environmental covariates from the mapping file, with sample names cleaned
# the same way as the count-table columns so the two tables align.
covariates <- Mapping_file[, -1]
rownames(covariates) <- Mapping_file$`#NAME` %>%
  gsub(".fastq", "", .) %>%
  gsub("AMH.Florida", "", .)
# ade4 PCA of the environment and of the CLR-transformed abundances.
pca1 <- dudi.pca(covariates, scan = FALSE, nf = 4)
pca2 <- dudi.pca(t(d1.clr), scal = FALSE, scan = FALSE, nf = 4)
conds <- factor(conds, levels = c("2008", "2009", "2010", "2013", "2014"))
# Between-class analyses on the year factor.
bet1 <- bca(pca1, conds, scan = FALSE, nf = 2)
bet2 <- bca(pca2, conds, scan = FALSE, nf = 2)
# Plot with adegraphics if it is loaded, otherwise fall back to base graphics.
if (adegraphicsLoaded()) {
  g1 <- s.class(pca1$li, conds, psub.text = "Principal Component Analysis (env)",
                plot = FALSE)
  g2 <- s.class(pca2$li, conds, psub.text = "Principal Component Analysis (spe)",
                plot = FALSE)
  g3 <- s.class(bet1$ls, conds, psub.text = "Between sites PCA (env)", plot = FALSE)
  g4 <- s.class(bet2$ls, conds, psub.text = "Between sites PCA (spe)", plot = FALSE)
  G <- ADEgS(list(g1, g2, g3, g4), layout = c(2, 2))
} else {
  par(mfrow = c(2, 2))
  s.class(pca1$li, conds, sub = "Principal Component Analysis (env)", csub = 1.75)
  s.class(pca2$li, conds, sub = "Principal Component Analysis (spe)", csub = 1.75)
  s.class(bet1$ls, conds, sub = "Between sites PCA (env)", csub = 1.75)
  s.class(bet2$ls, conds, sub = "Between sites PCA (spe)", csub = 1.75)
  par(mfrow = c(1, 1))
} # NOTE(review): closing brace was missing in the paste
# Co-inertia analysis: shared structure between environment and community.
# NOTE(review): the second call overwrites the first — presumably only the
# between-class co-inertia was wanted; confirm.
coib <- coinertia(pca1, pca2, scann = FALSE)
coib <- coinertia(bet1, bet2, scann = FALSE)
# Monte-Carlo (999 permutations) test of the RV coefficient.
RV.rtest(bet1$tab, bet2$tab, 999)
# (GitLab page-footer text removed: "Markdown is supported ... Please register
# or sign in to comment" — residue from copying the script out of the web UI.)