Phylogenetic distance analysis and clustering

Ricardo A. Segovia

Improve Research Reproducibility A Bio-protocol resource

Home
Protocols

Preprint

Phylogenetic distance analysis and clustering

RS Ricardo A. Segovia

Last updated date: Jun 29, 2022 Views: 553 Forks: 0

An abbreviated version of this protocol was published in Science Advances in May 2020:

Freezing and water availability structure the evolutionary diversity of trees across the Americas

Download PDF

Ask a question

How to cite

Favorite

# Chunk: PhyloSor distance matrix and k-means clustering

# Ricardo Segovia, Institute of Ecology and Biodiversity (ieb-chile.cl)/ 29/06/2022

## CREATING THE COMMUNITY MATRIX

# Work on a copy of the occurrence table and add a constant column of ones,
# so that summing it per (site, genus) cell counts the records in that cell.
genusXarea <- genusXarea_america
genusXarea$dummy <- 1

# Site-by-genus matrix: rows are the values of V1 (labelled "site"), columns
# are the values of V2 (labelled "genus").
genus_commat <- tapply(
  genusXarea$dummy,
  INDEX = list(site = genusXarea$V1, genus = genusXarea$V2),
  FUN = sum
)
dim(genus_commat)

# tapply() leaves NA for site/genus combinations that have no records; those
# are absences, so store them as 0.
genus_commat[is.na(genus_commat)] <- 0

# Sanity check: the matrix total should match the number of rows in the
# genusXarea table (each row contributed a 1).
sum(genus_commat)
dim(genusXarea)

# MATCHING PHYLOGENY AND MATRIX

# Goal: find out which genera have multiple accessions in the phylogeny and
# which of them are in our genus matrix, so we know which ones need handling.

library(ape)  # read.tree() and Ntip() are used below

# Genus-level phylogeny. Genera present on more than one continent carry a
# region tag in their tip label (author's note: e.g. SA for South America,
# AF for Africa). The code below assumes "_"-separated labels of the form
# Order_Family_Genus[_Region] -- confirm against the tree file.
tree_original <- read.tree("R3019.tre")

# Split every tip label once and keep its first four "_"-separated fields.
# Labels with fewer than four fields get NA in the missing positions.
label_parts <- strsplit(tree_original$tip.label, split = "_")
tree_names_table <- t(
  vapply(label_parts, function(parts) parts[1:4], character(4))
)

colnames(tree_names_table) <- c("Order", "Family", "Genus", "Region")
tree_names_table <- as.data.frame(tree_names_table)

# Index the table by the original tip label so rows can be mapped back to
# tips later on.
rownames(tree_names_table) <- tree_original$tip.label

summary(tree_names_table)

# We have lots of genera with multiple accessions, often found in different
# regions. Focus on the genera that occur in the community matrix.
in_commat <- tree_names_table$Genus %in% colnames(genus_commat)
tree_names_table_sub1 <- tree_names_table[in_commat, ]

# Store Genus as a factor that only has the levels still present.
tree_names_table_sub1$Genus <-
  factor(as.character(tree_names_table_sub1$Genus))

summary(tree_names_table_sub1)

# Tips whose genus is not in the community matrix are gone, but there are
# still repeated genus names to deal with. Count accessions per genus with
# table() so that every genus is counted individually (summary() on a factor
# lumps levels into "(Other)" once `maxsum` is exceeded).
genus_counts <- table(tree_names_table_sub1$Genus)
repeated_genera <- names(genus_counts)[as.vector(genus_counts) > 1]

# Print the accessions of each repeated genus for inspection. Iterating over
# the vector itself means nothing is printed when there are no repeats.
for (genus in repeated_genera) {
  print(tree_names_table_sub1[
    which(tree_names_table_sub1$Genus == genus),
  ])
}

# Author's observation: all repeated taxa have one sequence with a _SA
# appendix, and that one looks more accurately placed than the accession
# without an appendix (e.g. Capparis, Dacryodes), so only that one is kept.

# First keep the tips of genera that are not repeated.
is_repeated <- tree_names_table_sub1$Genus %in% repeated_genera
names2keep <- rownames(tree_names_table_sub1)[!is_repeated]

# Then, among the tips of repeated genera, keep only those whose label
# contains "_SA".
# NOTE(review): a repeated genus with no "_SA" label is dropped entirely, and
# one with several "_SA" labels stays duplicated -- confirm neither occurs.
repeated_labels <- rownames(tree_names_table_sub1)[is_repeated]
repeated_sa_labels <-
  repeated_labels[grep("_SA", repeated_labels, fixed = TRUE)]

names2keep <- c(names2keep, repeated_sa_labels)

# Prune every other tip from the phylogeny.
names2exclude <-
  tree_original$tip.label[!tree_original$tip.label %in% names2keep]

tree_SA <- drop.tip(tree_original, names2exclude)

Ntip(tree_SA)

## Change tip names

# Keep a lookup table of the full tip label next to the genus extracted from
# it (third "_"-separated field of the label).
newnames <- data.frame(newnames = tree_SA$tip.label)

newnames$genus <- vapply(
  strsplit(as.character(newnames$newnames), "_"),
  `[`,
  character(1),
  3
)

# Relabel the tips with the bare genus name so they can be matched against
# the column names of the community matrix.
tree_SA$tip.label <- as.character(newnames$genus)

# A genus kept more than once yields duplicated tip labels; the tree then has
# more tips than the community matrix has matching columns. Flag it early.
duplicated_tips <- unique(tree_SA$tip.label[duplicated(tree_SA$tip.label)])
if (length(duplicated_tips) > 0) {
  warning(
    "Duplicated genus tip labels in tree_SA: ",
    paste(duplicated_tips, collapse = ", "),
    call. = FALSE
  )
}

write.tree(tree_SA, "sa_tree.tre")

## Create the America community matrix

# Keep only the genera (columns) that are tips of the pruned tree.
# drop = FALSE keeps the result a matrix even if a single genus matches.
SA_genus_commat <- genus_commat[
  ,
  colnames(genus_commat) %in% tree_SA$tip.label,
  drop = FALSE
]

dim(SA_genus_commat)

## Cluster analyses

# Before clustering, compute the full PhyloSor object for all sites.
# phylosor.query() must already be available in the session (presumably from
# the PhyloMeasures package -- confirm).
phylosor_all <- phylosor.query(tree_SA, SA_genus_commat)

# Author's note: this call warned "one of the input matrices has fewer
# columns than the number of species in the tree", and it is not clear
# whether that could have affected the result.

# Label both dimensions of the result with the site names of the matrix.
site_names <- rownames(SA_genus_commat)
dimnames(phylosor_all) <- list(site_names, site_names)

# Use 1 - PhyloSor as the dissimilarity between sites.
phylosor_all_dist <- as.dist(1 - phylosor_all)

# Simple hierarchical clustering with average linkage, exported as a tree.
phylosor_all_cluster <- hclust(phylosor_all_dist, method = "average")

phylosor_all_cluster_phylo <- as.phylo(phylosor_all_cluster)

write.tree(phylosor_all_cluster_phylo, "hclustaverage.tre")

#### Elbow analysis (to select the best K)

# kmeans() picks random starting centres; fix the seed so the elbow curve is
# reproducible between runs.
set.seed(123)

# Try K = 1..15, but never more clusters than kmeans() can fit to the
# number of sites.
max_k <- max(1L, min(15L, nrow(phylosor_all) - 1L))

wss <- numeric(max_k)

# K = 1: the total sum of squares of the columns around their means.
wss[1] <- (nrow(phylosor_all) - 1) * sum(apply(phylosor_all, 2, var))

# K >= 2: total within-cluster sum of squares (tot.withinss is
# sum(withinss)).
for (k in seq_len(max_k)[-1]) {
  wss[k] <- kmeans(phylosor_all, centers = k)$tot.withinss
}

plot(
  seq_len(max_k), wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares"
)

## Alternative: a Bayesian (BIC-based) approach to select the best K

library(mclust)

# BIC for 1..15 mixture components across mclust's default EM model names.
d_clust <- mclustBIC(
  phylosor_all,
  G = 1:15,
  modelNames = mclust.options("emModelNames")
)

# mclustBIC() returns the BIC table itself (a classed matrix, not a list), so
# print the object directly; `d_clust$BIC` would fail on it.
print(d_clust)

plot(d_clust)

### Author's note: too slow... advanced 4% in a whole night.

# K-means and silhouette-approach validation

## K-means clustering

# Fixed seed so the random starts (and therefore the clustering) are
# repeatable.
set.seed(123)

# Two clusters, best of 25 random starts, run on the phylosor dist object.
km.res2 <- kmeans(phylosor_all_dist, centers = 2, nstart = 25)