############################################################################## # # R functions for the generation of count matrices and the construction of networks. # # Author: Karoline Faust # # Date: 27/January/2015 # # Examples: # 1) Basic usage # Simulate matrix with 50 taxa and samples using broken-stick to generate taxon probabilities # sim=simMat(taxa=50,samples=50,mode=5) # Compute positive edge percentage and plot the Spearman score distribution # get.pep(mat=sim, plot=T) # Compute taxon probability evenness with Sheldon's and Pielou's index # sheldon(getProbabs(taxa=50,mode=5)) # pielou(getProbabs(taxa=50,mode=5)) # Compute positive edge percentage for network generated with Dirichlet-Multinomial with taxon probabilities following uneven geometric series with k = 0.8 # get.pep(taxa=50, samples=50, mode=6, k=0.8) # Assess evenness of taxon probability vector # sheldon(geom.series(l=50,k=0.8)) # # 2) Impact of normalization and rarefaction on PEP computed on matrix with varying sequencing depth # counts=c(300,400,500,600,400,600,500,600,800,500) # get.pep(taxa=50, samples=length(counts), counts=1000) # around 46% # get.pep(taxa=50, samples=length(counts), counts=counts) # around 84% # get.pep(taxa=50, samples=length(counts), counts=counts, norm=T) # around 46% # get.pep(taxa=50, samples=length(counts), counts=counts, rarefy=T) # around 46% # # 3) Impact of evenness and permutation on total edge number # get.pep(taxa=50, samples=10, mode=6, k=0.8, report.full=T)$total # strongly uneven taxon probabilities # get.pep(taxa=50, samples=10, mode=6, k=0.1, report.full=T)$total # even taxon probabilities (many more edges than in the uneven case) # get.pep(taxa=50, samples=10, mode=6, k=0.8, permut=T, iters=100, report.full=T)$total # edge number reduced compared to the case without permutation # get.pep(taxa=50, samples=10, mode=6, k=0.1, permut=T, iters=100, report.full=T)$total # edge number reduced compared to the case without permutation # # 4) Impact of taxon number on Spearman distribution # median(get.pep(taxa=50,samples=100,report.full=T,T.up=0,T.down=0)$scores) # median(get.pep(taxa=150,samples=100,report.full=T,T.up=0,T.down=0)$scores) # more taxa: scores are right-shifted # # 5) Count matrix with community structure # heatmap(simMatMixed(mode=6,k=0.01,groups=4),xlab="samples",ylab="taxa",main="Count matrix with community structure", Rowv=NA, Colv=NA) # ############################################################################## # load required libraries require(vegan) # vegan is required for Bray Curtis dissimilarity, Shannon and Simpson diversity and broken-stick function require(MCMCpack) # MCMCpack is required for sampling Dirichlet distribution require(gdata) # gdata is required for easy setting of matrix triangles ############################################################################## # Build a network and compute its positive edge percentage (PEP) # # Arguments: # mat = user-provided matrix, if left empty, the matrix will be generated using parameters "taxa", "samples", "distrib" and depending on the distribution: "theta", "mode", "k", "counts" and "maxcount" # method = network construction method, values: "pearson", "spearman", "kld" or "bray" # T.up = upper threshold, edges with scores above (for Pearson or Spearman) or below (for Bray Curtis or Kullback-Leibler) are counted as positive # T.down = lower threshold, edges with scores below (for Pearson or Spearman) or above (for Bray Curtis or Kullback-Leibler) are counted as negative # taxa = number of rows in the simulated matrix # samples = number of columns in the simulated matrix # theta = overdispersion parameter for Dirichlet-Multinomial distribution # mode = generation mode for taxon probability vector needed for Dirichlet-Multinomial distribution, for details see function "getProbabs" # k = parameter for mode 6 # shuffle.samples = shuffle each sample before network construction # distrib = distribution with which to generate count matrix, possible values are "dm" and "unif" # counts = for Dirichlet-Multinomial distribution: the sequencing depth, can be a vector with the length of samples # norm = normalize matrix before network construction # rarefy = if TRUE, rarefy matrix to the minimum total read count, if larger than 1, rarefy matrix to the given total read count # stand.rows = standardize rows by dividing each entry by its corresponding row sum, applied after normalization and rarefaction and before network construction # pval.cor = compute p-values of correlations with cor.test (cannot be selected together with permut, renorm and/or permutandboot or a method that is not a correlation) # permut = compute p-values on edges with a simple permutation test # renorm = compute p-values with a permutation test, using renormalization # permutandboot = compute p-values from both permutation and bootstrap distribution (if renorm is true, this is equivalent to ReBoot) # iters = number of iterations for the permutation test # bh = multiple-test-correct using Benjamini-Hochberg # min.occ = only keep rows with at least the given number of non-zero values (carried out before network construction) # keep.filtered = sum all filtered rows and add the sum vector as additional row # maxcount = for uniform distribution: the largest possible count # plot = plot score or, if permut is true, p-value distribution # report.full = in addition to positive edge percentage, return negative edge percentage, positive edge number, negative edge number, total edge number, score distribution and p-values # verbose = print the number of positive and negative edges and details of p-value computation # # Value: # The percentage of positive edges. If report.full is true, the following information is returned: # "pospercent" = positive edge percentage # "negpercent" = negative edge percentage, # "posnumber" = positive edge number # "negnumber"= negative edge number # "total" = total edge number # "scores" = lower triangle of the correlation/dissimilarity matrix, missing values discarded (if p-values are computed, non-significant scores are modified such that they do not pass the specified thresholds) # "pvalues" = matrix of p-values for each pair-wise correlation/dissimilarity, if bh is set to true, lower triangle reports the corrected p-values, upper triangle the original p-values # ############################################################################## get.pep<-function(mat = matrix(), method="spearman", T.up=0.2, T.down=-0.2, taxa=5, samples=10, theta=0.002, mode=1, k=0.5, shuffle.samples=F, distrib="dm", counts=1000, norm=FALSE, rarefy=0, stand.rows=FALSE, pval.cor=F, permut=F, renorm=F, permutandboot=F, iters=100, bh=F, min.occ=0, keep.filtered=F, maxcount=100, plot=F, report.full=F, verbose=F){ if(nrow(mat) == 1){ mat = simMat(taxa=taxa,samples=samples,theta=theta, distrib=distrib, counts=counts, norm = norm, maxcount=maxcount, mode=mode, k=k, shuffle.samples=shuffle.samples) } min.occ.default=1 keep.indices = c() if(pval.cor == TRUE & (method == "kld" || method == "bray")){ stop("P-value computation with cor.test is only possible for correlations!") } if(pval.cor == TRUE & (permut == TRUE || permutandboot == TRUE || renorm == TRUE)){ stop("P-value computation with cor.test cannot be combined with permut, renorm or permutandboot!") } # remove rows consisting of zeros only if(min.occ > 0){ min.occ.default = min.occ for(i in 1:nrow(mat)){ occ = 0 for(j in 1:ncol(mat)){ if(mat[i,j] > 0){ occ = occ + 1 } } if(occ >= min.occ.default){ keep.indices = c(keep.indices,i) } } if(length(keep.indices) == 0){ stop("No rows left in matrix after rare taxon filtering!") } is.summed=FALSE summed.row=c() if(keep.filtered == TRUE){ filtered.rows=mat[setdiff(c(1:nrow(mat)),keep.indices),] if(nrow(filtered.rows) > 0){ is.summed=TRUE summed.row = filtered.rows[1,] if(nrow(filtered.rows) > 1){ for(f.index in 2:nrow(filtered.rows)){ summed.row=summed.row+filtered.rows[f.index,] } } } } mat = mat[keep.indices,] if(keep.filtered == TRUE & is.summed == TRUE){ mat=rbind(mat,summed.row) } } # if requested, rarefy matrix if(rarefy == TRUE){ mat = rarefy(mat,min = 0) }else if(rarefy > 1){ mat = rarefy(mat, min=rarefy) } # if requested, normalize matrix if(norm == TRUE){ mat=normalize(mat) } # normalize row-wise if(stand.rows == TRUE){ mat = t(normalize(t(mat))) } pvals = matrix(nrow=nrow(mat),ncol=nrow(mat)) # compute p-values if(permut == TRUE || pval.cor == TRUE){ if(iters < 1){ stop("iters should be at least 1.") } for(i in 1:nrow(mat)){ for(j in 1:i){ if(i != j){ if(pval.cor == TRUE){ pvals[i, j] = cor.test(mat[i,],mat[j,],method=method)$p.value }else{ pvals[i, j] = get.pval(mat, i, j, method=method, N.rand=iters, renorm=renorm, permutandboot=permutandboot, verbose=verbose) } pvals[j, i] = pvals[i, j] } } } # if requested, carry out Benjamini-Hochberg correction if(bh == TRUE){ pvec = pvals[lower.tri(pvals)] pvec=p.adjust(pvec,method="BH") # only lower triangle is used later on lowerTriangle(pvals)=pvec } } if(method == "spearman"){ scores=cor(t(mat),method="spearman") }else if(method == "pearson"){ scores=cor(t(mat),method="pearson") }else if(method == "bray"){ scores = vegdist(mat, method="bray") scores=as.matrix(scores) }else if(method == "kld"){ scores = compute.kld(mat) }else{ stop("Choose either spearman, pearson, kld or bray as a method.") } # if p-values were calculated, set all correlations with p-value above 0.05 to 0, set Bray Curtis scores to 0.5 and KLD scores to a value between the lower and upper threshold if(permut == TRUE || pval.cor == TRUE){ FILTER=pvals>0.05 if(method == "bray"){ # Bray-Curtis dissimilarity of 0.5 is filtered out scores[FILTER]=0.5 }else if(method == "kld"){ # KLD value between the thresholds is filtered out scores[FILTER]=T.down+(T.up-T.down)/2 }else{ # correlation of zero is filtered out scores[FILTER]=0 } } # get lower triangle of score matrix scores=scores[lower.tri(scores)] storage=scores # discard missing values scores=scores[!is.na(scores)] if(method == "bray" || method == "kld"){ neg.edges=scores[scores>T.up] pos.edges=scores[scores<(T.down)] }else{ pos.edges=scores[scores>T.up] neg.edges=scores[scores<(T.down)] } if(plot == T){ values = scores what = "Score" if(permut == TRUE){ values = pvals what = "P-value" } hist(values, main=paste(what," distribution for method ",method,sep=""), xlab=paste(what,"s",sep=""), ylab="Frequency") } num.pos=length(pos.edges) num.neg=length(neg.edges) total=num.pos+num.neg one.percent=total/100 pos.percent=num.pos/one.percent neg.percent=num.neg/one.percent if(verbose == T){ print(paste(num.pos,"positive edges",sep=" ")) print(paste(num.neg,"negative edges",sep=" ")) print(paste(pos.percent,"positive percent",sep=" ")) print(paste(neg.percent,"negative percent",sep=" ")) } if(report.full == TRUE){ # append unfiltered score list out=list(pos.percent, neg.percent, num.pos, num.neg, total, storage, pvals) names(out)=c("pospercent", "negpercent","posnumber","negnumber","total","scores","pvalues") return(out) }else{ return(pos.percent) } } ############################################################################## # Simulate a count matrix # # Arguments: # taxa = the number of rows in the matrix # samples = the number of columns in the matrix # counts = either the total number of counts in each sample or a vector specifying the count number in each sample # distrib = "dm" for Dirichlet-Multinomial distribution or "unif" for uniform distribution # maxcount = maximal count number for any taxon (only for uniform distribution) # mode = the mode according to which the taxon probability vector for the DM distribution is to be generated, please find the description of the modes for function "getProbabs" # k = evenness parameter of mode 6 # theta = overdispersion parameter of the DM distribution # norm = normalize matrix column-wise, such that the entries in each column add to one # shuffle.samples = shuffle each sample # # Value: # A count matrix or relative abundance matrix # ############################################################################## simMat<-function(taxa=5, samples=10, counts=1000, distrib="dm", maxcount=100, mode=1, k=0.5, theta=0.002, norm=F, shuffle.samples=F){ if(length(counts)==1 && counts < 1){ stop("counts should be at least 1.") } if(taxa < 1){ stop("There should be at least one taxon.") } if(samples < 1){ stop("There should be at least one sample.") } pi = getProbabs(taxa=taxa, mode=mode, k=k) mat=matrix(nrow=taxa,ncol=samples) # get taxon probabilities for each column sample.count = counts for(i in 1:samples){ if(length(counts)==samples){ sample.count = counts[i] } mat[,i]=simCounts(pi=pi, theta=theta, counts = sample.count, distrib=distrib, maxcount=maxcount) if(shuffle.samples){ mat[,i]=sample(mat[,i]) } } # normalize matrix column-wise if(norm){ mat = normalize(mat) } mat } #################################################################################### # Simulate a count matrix with groups of taxa which are high together across samples # # Arguments: # gtaxa = the number of taxa (rows) per group # gsamples = the number of samples (columns) per group # groups = the number of different communities # method = method with which to introduce groups, "swap" simply swaps blocks of taxa, whereas "select" # replaces all but the group taxa by counts sampled from a multinomial Dirichlet distribution with equal # taxon probabilities (sample count sums are preserved for both methods) # counts = either the total number of counts in each sample or a vector specifying the count number in each sample # distrib = "dm" for Dirichlet-Multinomial distribution or "unif" for uniform distribution # maxcount = maximal count number for any taxon (only for uniform distribution) # mode = the mode according to which the taxon probability vector for the DM distribution is to be generated, please find the description of the modes for function "getProbabs" # k = evenness parameter of mode 6 # theta = overdispersion parameter of the DM distribution # norm = normalize matrix column-wise, such that the entries in each column add to one # shuffle.samples = shuffle each sample # # Value: # A count matrix or relative abundance matrix with a block structure # ############################################################################## simMatMixed<-function(gtaxa=50, gsamples=10, groups=3, method="select", counts=1000, distrib="dm", maxcount=100, mode=1, k=0.5, theta=0.002, norm=F, shuffle.samples=F){ taxa = groups * gtaxa samples = groups * gsamples mat=simMat(taxa = taxa, samples = samples, counts = counts, distrib = distrib, maxcount = maxcount, mode = mode, k = k, theta = theta, norm = norm, shuffle.samples = shuffle.samples) tmp = mat indices=c(1:taxa) if(method == "select"){ community.indices=c(1:gtaxa) # print(paste("community: ",community.indices)) for(j in 1:gsamples){ # print(paste("sample number: ",j)) communityCount = sum(mat[community.indices,j]) if(length(counts) > 1){ countLeft = counts[j] - communityCount }else{ countLeft = counts - communityCount } non.community.indices = setdiff(indices,community.indices) if(countLeft > 0){ fill=simMat(taxa=(groups-1)*gtaxa,samples=1,distrib="dm",mode=1,counts=countLeft) mat[non.community.indices,j]=fill }else{ mat[non.community.indices,j]=0 } } } for(i in 2:groups){ # replace current block by upmost block if(method == "swap"){ mat[(1+(gtaxa*(i-1))):(gtaxa*i),(1+(gsamples*(i-1))):(gsamples*i)] = mat[1:gtaxa,(1+(gsamples*(i-1))):(gsamples*i)] mat[1:gtaxa,(1+(gsamples*(i-1))):(gsamples*i)] = tmp[(1+(gtaxa*(i-1))):(gtaxa*i),(1+(gsamples*(i-1))):(gsamples*i)] }else if(method == "select") { community.indices=(1+(gtaxa*(i-1))):(gtaxa*i) # print(paste("community: ",community.indices)) for(j in 1:gsamples){ samplenum=j+(gsamples*(i-1)) # print(paste("sample number: ",samplenum)) mat[community.indices,samplenum] = mat[1:gtaxa,samplenum] communityCount = sum(mat[community.indices,samplenum]) if(length(counts) > 1){ countLeft = counts[samplenum] - communityCount }else{ countLeft = counts - communityCount } non.community.indices = setdiff(indices,community.indices) if(countLeft > 0){ fill=simMat(taxa=(groups-1)*gtaxa,samples=1,distrib="dm",mode=1,counts=countLeft) mat[non.community.indices,samplenum]=fill }else{ mat[non.community.indices,samplenum]=0 } } }else{ stop("Community method not supported. Please choose either swap or select.") } } mat } ######################## Helper functions ################################### ############################################################################## # Generate counts with Dirichlet-multinomial (dm) or uniform distribution (unif) # # Arguments: # samples = number of samples to generate # counts = total number of counts in each sample # pi = taxon probability vector, its length indicates the number of taxa # theta = overdispersion parameter for dm # maxcount = maximal count number for any taxon (only for uniform distribution) # # Value: # a count vector or matrix # # Note: taken partly from package HMP, function Dirichlet.multinomial and # package dirmult, function simPop # ############################################################################## simCounts<-function(samples=1,counts=1000,pi=rep(1/10,10),theta=0.002, distrib="dm", maxcount=100){ if(sum(pi) != 1){ stop("Taxon probabilities should sum to one!") } if(theta > 1 || theta < 0){ stop("Theta should be between 0 and 1.") } if(distrib == "dm"){ # from a line in function simPop in package dirmult gamma = pi*(1 - theta)/theta # from a line in Dirichlet.multinomial from package HMP res=rmultinom(n=samples, size=counts, prob=rdirichlet(samples,gamma)) }else if(distrib == "unif"){ if(maxcount < 1){ stop("The maximal count should be at least 1.") } res=matrix(runif(length(pi)*samples,0,maxcount),nrow=length(pi),ncol=samples) res = round(res) }else{ stop("Choose dm or unif as generating distribution.") } res } ############################################################################## # Generate geometric series (niche apportionment model). # # Arguments: # l = length of geometric series # counts = sum of geometric series # k = evenness of geometric series, between 0 and 1 (the smaller, the more even) # # Value: # The geometric series # # Note: The geometric series is for instance described in # "Comments about some species abundance patterns: classic, neutral, and niche partitioning models" # by Ferreira and Petrere-Jr., Braz. J. Biol. 2008 # ############################################################################## geom.series<-function(l=10, counts=1000, k=0.5){ if(l < 1){ stop("l should be at least 1.") } if(k > 1 || k < 0){ stop("k should be a number between 0 and 1.") } pi=c() C = 1/(1 - ((1-k)^l)) for(i in 1:l){ pi=c(pi,counts*C*k*(1-k)^(i-1)) } pi=pi/sum(pi) pi } ############################################################################## # Compute evenness using Pielou's index # # Arguments: # x = matrix-like object or vector # # Value: # pielou's evenness # ############################################################################## pielou<-function(x){ H = diversity(x, index="shannon") J = H/log(specnumber(x)) J } ############################################################################## # Compute evenness using Sheldon's index # # Arguments: # x = matrix-like object or vector # correction = whether or not to apply the correction described in Alatalo, Oikos 37, 199-204, 1981 # N2N1 = whether to compute Sheldon's evenness as the ratio of e raised to the power of H (H = Shannon diversity) and Simpson's diversity # although Alatalo recommends this setting, it does not return perfect evenness in case probabilities are exactly equal # e.g. sheldon(rep(1/10,10),N2N1=TRUE)=0.01234568 # # Value: # Sheldon's evenness # ############################################################################## sheldon<-function(x, correction = TRUE, N2N1 = FALSE){ H = diversity(x, index="shannon") if(N2N1){ simpson = diversity(x, index="simpson") numerator = 1/simpson denominator = exp(1)^H }else{ numerator = exp(1)^H denominator = specnumber(x) } if(correction){ numerator = numerator - 1 denominator = denominator - 1 } S = numerator/denominator S } ############################################################################## # Generate a taxon probability vector # # Arguments: # taxa = the length of the taxon probability vector # mode = 1-6 # 1 = perfect evenness (pi = rep(1/N,N)) # 2 = probabilities sampled from uniform distribution and normalized to one # 3 = dominant species has a probability of 0.95 and all other species are equally distributed to add probabilities up to 1 # 4 = probabilities are sampled from a poisson distribution with lambda set to 0.5 # 5 = probabilities are generated using broken-stick function bstick() from vegan # 6 = probabilities are generated with the geometric series of given evenness k # k = evenness parameter for mode 6 # # Value: # A taxon probability vector # ############################################################################## getProbabs<-function(taxa = 10, mode = 1, k = 0.5){ if(taxa < 1){ stop("taxa should be at least 1.") } if(!is.wholenumber(mode)){ stop("The mode should be an integer between 1 and 6") } if(mode == 1){ pi=rep(1/taxa,taxa) }else if(mode == 2){ pi = runif(taxa) pi =pi/sum(pi) }else if(mode == 3){ dominant=0.95 commoner=0.05/(taxa-1) pi=c(dominant,rep(commoner,(taxa-1))) }else if(mode == 4){ pi=rpois(taxa,lambda=0.5) pi=pi/sum(pi) }else if(mode == 5){ pi = bstick(taxa,1) }else if(mode == 6){ pi=geom.series(l=taxa, counts=1, k = k) }else if(mode < 1 || mode > 6){ stop("Choose a mode between 1 and 6.") } pi } ############################################################################## # Compute Kullback Leibler dissimilarity (symmetrized divergence). # # Argument: # x = a matrix with non-negative numbers # pseudocount = this value is added to each zero # # Value: # Kullback Leibler dissimilarity matrix # # Note: # Equation: D(x,y) = SUM(x_i*log(x_i/y_i) + y_i*log(y_i/x_i)) # taken from "Caution! Compositions! Can constraints on omics data lead analyses astray?" # David Lovell et al., Report Number EP10994 # ############################################################################## compute.kld=function(x, pseudocount=0.00000001){ # diagonal is zero kld=matrix(data=0,nrow=nrow(x),ncol=nrow(x)) for(i in 1:nrow(x)){ for(j in 1:i){ kld[i,j]=get.kld(x[i,],x[j,], pseudocount=pseudocount) kld[j,i]=kld[i,j] } } kld } ############################################################################## # Compute Kullback Leibler dissimilarity (symmetrized divergence) between two vectors. # # Argument: # x = vector with non-negative numbers # y = vector with non-negative numbers # pseudocount = this value is added to each zero # # Value: # Kullbakc Leibler dissimilarity # # Note: # Equation: D(x,y) = SUM(x_i*log(x_i/y_i) + y_i*log(y_i/x_i)) # taken from "Caution! Compositions! Can constraints on omics data lead analyses astray?" # David Lovell et al., Report Number EP10994 # ############################################################################## get.kld=function(x,y, pseudocount=0.00000001){ if(length(x) != length(y)){ stop("The two vectors should have the same length!") } x[x==0]=pseudocount y[y==0]=pseudocount dis = 0 x = x/sum(x) y = y/sum(y) for(i in 1:length(x)){ if(!is.nan(x[i]) && !is.nan(y[i])){ ratioxy = log(x[i]/y[i]) ratioyx = log(y[i]/x[i]) dis = x[i]*ratioxy+y[i]*ratioyx + dis } } dis } ############################################################################## # Check whether a number is an integer. # # Argument: # x = number # tol = tolerance # # Value: # boolean true if number is an integer within the tolerance, false otherwise # # Note: function taken from http://stackoverflow.com/questions/3476782/how-to-check-if-the-number-is-integer # ############################################################################## is.wholenumber <- function(x, tol = .Machine$double.eps^0.5) abs(x - round(x)) < tol ############################################################################## # Rarefy a given count matrix using vegan's rrarefy function # # Arguments: # x = matrix-like object or vector # min = minimum count to which x is to be rarefied (if equal to zero, minimum sample sum is taken as min) # # Value: # A rarefied matrix or vector # ############################################################################## rarefy<-function(x,min = 0){ if(min < 0){ stop("Min should be either 0 or positive.") } if(min == 0){ min=min(colsums=apply(x,2,sum)) print(paste("Rarefy to minimum count",min)) } rar=t(rrarefy(t(x),min)) rar } ############################################################################## # Normalize a matrix by dividing each entry by its corresponding column sum # # Arguments: # x = matrix # # Value: # A normalized matrix # ############################################################################## normalize<-function(x){ colsums = apply(x,2,sum) for(i in 1:ncol(x)){ x[,i]=x[,i]/colsums[i] } x } ############################################################################## # Compute the association between two vectors using the given method and # compute its p-value using a permutation test # # Arguments: # matrix = input matrix # x.index = index of first vector in the input matrix # y.index = index of second vector in the input matrix # N.rand = number of iterations used for the permutation test # method = similarity measure (supported measures are: "pearson", "spearman", "bray" and "kld") # renorm = renormalize after permutation # permutandboot = compute a bootstrap distribution in addition to the permutation distribution and # return the p-value as the mean of the permutation distribution under the bootstrap distribution # plot = plot the histogram of the permutation and, if permutandboot is true, of the bootstrap distribution # verbose = print distribution properties and p-value # # Value: # The p-value of the association # # Note: this method was adapted from R code by Fah Sahtirapongsasuti # ############################################################################## get.pval = function(matrix, x.index, y.index, N.rand=1000, method="spearman", renorm=F, permutandboot=F, plot=F, verbose=F) { x = matrix[x.index,] y = matrix[y.index,] lower.tail = TRUE # bray and kld are dissimilarities, so one-sided p-value needs to be computed from the upper tail if(method == "bray" || method == "kld"){ lower.tail = FALSE } if(method == "spearman"){ this.sim = cor(x, y, use="complete.obs", method="spearman") }else if(method == "pearson"){ this.sim = cor(x, y, use="complete.obs", method="pearson") }else if(method == "bray"){ this.sim= vegdist(rbind(x,y),method="bray") }else if(method == "kld"){ this.sim=get.kld(x,y) }else{ stop("Select either spearman, pearson, kld or bray as method.") } rand.sim = rep(NA, N.rand) boot.sim = rep(NA, N.rand) for (i in 1:N.rand) { rand = sample(x, length(x)) if(renorm == T){ mat.copy=matrix mat.copy[x.index,]=rand mat.copy = normalize(mat.copy) rand = mat.copy[x.index,] y = mat.copy[y.index,] } if(method == "spearman"){ rand.sim[i] = cor(rand, y, method="spearman", use="complete.obs") }else if(method == "pearson"){ rand.sim[i] = cor(rand, y, method="pearson",use="complete.obs") }else if(method == "bray"){ rand.sim[i] = vegdist(rbind(rand,y),method="bray") }else if(method == "kld"){ rand.sim[i] = get.kld(rand,y) } } rand.sim = na.omit(rand.sim) if(plot == T){ col1=rgb(0,0,1,1/3) col2=rgb(1,0,0,1/3) hist(rand.sim,col=col1) abline(v=mean(rand.sim),col="blue") } if(permutandboot){ x=matrix[x.index,] y=matrix[y.index,] for (i in 1:N.rand) { rand.idx = sample(1:length(x),replace=TRUE) x.boot=x[rand.idx] y.boot=y[rand.idx] if(method == "spearman"){ boot.sim[i] = cor(x.boot, y.boot, method="spearman", use="complete.obs") }else if(method == "pearson"){ boot.sim[i] = cor(x.boot, y.boot, method="pearson",use="complete.obs") }else if(method == "bray"){ boot.sim[i] = vegdist(rbind(x.boot,y.boot),method="bray") }else if(method == "kld"){ boot.sim[i] = get.kld(x.boot,y.boot) } } boot.sim = na.omit(boot.sim) if(plot == T){ hist(boot.sim,col=col2,add=T) abline(v=mean(boot.sim),col="red") legend(x="topleft", c("Permut","Boot"), bg="white",col=c(col1,col2),lty=rep(1,2),merge=T) } # if we got enough non-NA permutation and bootstrap values, compute p-value if(length(rand.sim) > round(N.rand/3) && length(boot.sim) > round(N.rand/3)){ pval = pnorm(mean(rand.sim),mean=mean(boot.sim),sd=sd(boot.sim), lower.tail=lower.tail) }else{ pval = 0.5 } }else{ # if we got enough non-NA permutation values, compute p-value if(length(rand.sim) > round(N.rand/3)){ if (lower.tail) { pval = (sum(this.sim > rand.sim) / length(rand.sim)) } else { pval = (sum(this.sim < rand.sim) / length(rand.sim)) } }else{ pval = 0.5 } } # set missing value (from constant vector) to intermediate p-value (worst possible p-value in this context) if(is.na(pval)){ pval = 0.5 } # p-values are one-sided, so high p-values signal mutual exclusion and are converted into low ones if(pval > 0.5){ pval = 1 - pval } if(verbose == T){ print(paste("p-value =",pval)) print(paste("original score",this.sim)) print(paste("mean of null distrib",mean(rand.sim))) print(paste("sd of null distrib",sd(rand.sim))) if(permutandboot == T){ print(paste("mean of boot distrib",mean(boot.sim))) print(paste("sd of boot distrib",sd(boot.sim))) } } pval }