4. Molecular evolution and phylogeny inference - solutions

Author

EPFL - SV - BIO-463

Published

March 11, 2025

Exercise 1 - Molecular evolution in the flu virus

#install.packages("seqinr")
#install.packages("DescTools")

seqs=seqinr::read.fasta(file = "HA_sequences.fasta",
  seqtype = "DNA", as.string = FALSE, forceDNAtolower = FALSE,
  whole.header = TRUE)
seqinr::getAnnot(seqs[1])

[[1]]
[1] ">A/Aichi/2/1968"

seqinr::getLength(seqs[1])

[1] 1694

length(seqs)

[1] 841

#seqinr::getSequence(seqs[1])

lg = matrix(nrow=length(seqs),ncol=1)
for(i in 1:length(seqs)){
lg[i]=seqinr::getLength(seqs[i])
}
min(lg)

[1] 1694

max(lg)

[1] 1694

There are 841 sequences, the length of all sequences is the same and it is 1694 nucleotides.

nseqs=841
seqlength=1694
TheDists = matrix(nrow=nseqs,ncol=3)

for(i in 1:nseqs){
  header=seqinr::getAnnot(seqs[i])
  posSep=tail(unlist(gregexpr('/', header)), n=1) #find position of last '\' in header
  TheDists[i,1]=as.numeric(substr(header,posSep+1,posSep+4)) #get the year and convert it to number
  TheDists[i,2]=DescTools::StrDist( seqinr::getSequence(seqs[1]), seqinr::getSequence(seqs[i]), method = "hamming", mismatch = 1, gap = 1, ignore.case = TRUE)/seqlength
  TheDists[i,3]=-3/4*log(1-4/3*TheDists[i,2])
}

plot(TheDists[,1], TheDists[,2], pch=20, type="p", main='Distances to initial sequence', xlab='Year of sampling', ylab='Distance')
points(TheDists[,1], TheDists[,3], pch=3, type="p", col='red')
legend("topleft", legend=c("Hamming", "Jukes-Cantor"),col=c("black", "red"), pch=c(20,3))

max(TheDists[,2])

[1] 0.1310508

max(TheDists[,3])

[1] 0.1440375

These distances increase over time in an approximately linear way. Sequences gradually diverge from the early ones. A fraction 0.131 of the HA gene has changed due to mutations during this 37 year period. This corresponds to 0.144 mutations per site on average.

No, the UPGMA method would not give a reasonable result. Indeed, it assumes that all sequences are equidistant from the root, while here the sequences we have are not contemporary to each other and the oldest ones are closer to their common ancestor than the most recent ones.

nseqs=841
seqlength=1694
YearDists = numeric() #empty vector
Years=unique(TheDists[,1]) #list of years with data, with no repetition
seqIdx=c(1:nseqs) #list of sequence indices

for(i in 1:length(Years)){ #loop over years
  yearIdx=seqIdx[TheDists[,1]==Years[i]] #indices of sequences from the year of focus
  for (j in 1:length(yearIdx)){
    for (k in 1:length(yearIdx)){
      if (k>j){
      YearDists=append ( YearDists , DescTools::StrDist( seqinr::getSequence(seqs[yearIdx[j]]), seqinr::getSequence(seqs[yearIdx[k]]), method = "hamming", mismatch = 1, gap = 1, ignore.case = TRUE)/seqlength )
      }
    }
  }
}

h1 = hist(YearDists, main='', xlab='Hamming distance between sequences from same year', ylab='Counts')

mean(YearDists)

[1] 0.007861195

max(YearDists)

[1] 0.04014168

The mean and even the maximum distance are substantially smaller within a year than between the first sequences and the sequences of a given year. There is relatively little divergence during a year. The divergence is gradual over years.

The mean distance between sequences from the same year is 0.0079. Over 37 years, the distance between the oldest sequence and the newest ones is about 0.13, and has increased approximately linearly, meaning that sequences diverge from the oldest sequences acquiring 0.13/37=0.0035 differences per nucleotide each year. Thus we need about 2.2 years to accumulate these differences.

0.1310508/37

[1] 0.003541914

0.007861195/(0.1310508/37)

[1] 2.219477

Exercise 2 - Phylogeny reconstruction for influenza hemagglutinin

#install.packages("ape") #install if necessary
#install.packages("adegenet") #install if necessary

library(ape)
seqs = read.dna(file = "US_sequences_93_08.fasta", format = "fasta")
annot = read.csv("US_sequences_93_08_annotations.csv", header=TRUE, row.names=1)
annot

   accession year                              misc
1   CY013200 1993       (A/New York/783/1993(H3N2))
2   CY013781 1993       (A/New York/802/1993(H3N2))
3   CY012128 1993       (A/New York/758/1993(H3N2))
4   CY013613 1993       (A/New York/766/1993(H3N2))
5   CY012160 1993       (A/New York/762/1993(H3N2))
6   CY012272 1994       (A/New York/729/1994(H3N2))
7   CY010988 1994       (A/New York/733/1994(H3N2))
8   CY012288 1994       (A/New York/734/1994(H3N2))
9   CY012568 1994       (A/New York/746/1994(H3N2))
10  CY013016 1994       (A/New York/750/1994(H3N2))
11  CY012480 1995       (A/New York/666/1995(H3N2))
12  CY010748 1995       (A/New York/648/1995(H3N2))
13  CY011528 1995       (A/New York/669/1995(H3N2))
14  CY017291 1995       (A/New York/681/1995(H3N2))
15  CY012504 1995       (A/New York/678/1995(H3N2))
16  CY009476 1996       (A/New York/565/1996(H3N2))
17  CY010028 1996       (A/New York/591/1996(H3N2))
18  CY011128 1996       (A/New York/599/1996(H3N2))
19  CY010036 1996       (A/New York/592/1996(H3N2))
20  CY011424 1996       (A/New York/577/1996(H3N2))
21  CY006259 1997       (A/New York/511/1997(H3N2))
22  CY006243 1997       (A/New York/508/1997(H3N2))
23  CY006267 1997       (A/New York/513/1997(H3N2))
24  CY006235 1997       (A/New York/505/1997(H3N2))
25  CY006627 1997       (A/New York/547/1997(H3N2))
26  CY006787 1998       (A/New York/506/1998(H3N2))
27  CY006563 1998       (A/New York/533/1998(H3N2))
28  CY002384 1998       (A/New York/330/1998(H3N2))
29  CY008964 1998       (A/New York/540/1998(H3N2))
30  CY006595 1998       (A/New York/542/1998(H3N2))
31  CY001453 1999       (A/New York/184/1999(H3N2))
32  CY001413 1999       (A/New York/263/1999(H3N2))
33  CY001704 1999       (A/New York/257/1999(H3N2))
34  CY001616 1999       (A/New York/265/1999(H3N2))
35  CY003785 1999       (A/New York/422/1999(H3N2))
36  CY000737 2000       (A/New York/180/2000(H3N2))
37  CY001365 2000       (A/New York/187/2000(H3N2))
38  CY003272 2000       (A/New York/437/2000(H3N2))
39  CY000705 2000       (A/New York/175/2000(H3N2))
40  CY000657 2000       (A/New York/169/2000(H3N2))
41  CY002816 2001       (A/New York/301/2001(H3N2))
42  CY000584 2001       (A/New York/127/2001(H3N2))
43  CY001720 2001       (A/New York/273/2001(H3N2))
44  CY000185 2001        (A/New York/83/2001(H3N2))
45  CY002328 2001        (A/New York/77/2001(H3N2))
46  CY000297 2002        (A/New York/96/2002(H3N2))
47  CY003096 2002       (A/New York/403/2002(H3N2))
48  CY000545 2002       (A/New York/115/2002(H3N2))
49  CY000289 2002        (A/New York/92/2002(H3N2))
50  CY001152 2002        (A/New York/74/2002(H3N2))
51  CY000105 2003       (A/New York/60A/2003(H3N2))
52  CY002104 2003           (A/Memphis/31/03(H3N2))
53  CY001648 2003       (A/New York/270/2003(H3N2))
54  CY000353 2003        (A/New York/21/2003(H3N2))
55  CY001552 2003       (A/New York/215/2003(H3N2))
56  CY019245 2004       (A/New York/908/2004(H3N2))
57  CY021989 2004       (A/New York/908/2004(H3N2))
58  CY003336 2004       (A/New York/354/2004(H3N2))
59  CY003664 2004       (A/New York/471/2004(H3N2))
60  CY002432 2004       (A/New York/362/2004(H3N2))
61  CY003640 2005       (A/New York/463/2005(H3N2))
62  CY019301 2005       (A/New York/918/2005(H3N2))
63  CY019285 2005       (A/New York/913/2005(H3N2))
64  CY006155 2005       (A/New York/258/2005(H3N2))
65  CY034116 2005       (A/Wisconsin/67/2005(H3N2))
66  EF554795 2006               (A/Ohio/2006(H3N2))
67  CY019859 2006       (A/New York/938/2006(H3N2))
68  EU100713 2006        (A/Maryland/09/2006(H3N2))
69  CY019843 2006       (A/New York/933/2006(H3N2))
70  CY014159 2006         (A/New York/7/2006(H3N2))
71  EU199369 2007       (A/Minnesota/08/2007(H3N2))
72  EU199254 2007           (A/Idaho/01/2007(H3N2))
73  CY031555 2007 (A/Kentucky/UR06-0571/2007(H3N2))
74  EU516036 2007         (A/Georgia/07/2007(H3N2))
75  EU516212 2007      (A/California/33/2007(H3N2))
76  FJ549055 2008        (A/Illinois/14/2008(H3N2))
77  EU779498 2008     (A/Mississippi/01/2008(H3N2))
78  EU779500 2008         (A/Indiana/02/2008(H3N2))
79  CY035190 2008 (A/Pennsylvania/PIT43/2008(H3N2))
80  EU852005 2008           (A/Texas/06/2008(H3N2))

library(ape)
D = dist.dna(seqs, model = "JC") # Jukes-Cantor model - we could use other models, e.g. model = "TN93" for Tamura and Nei (1993), a more complex model where transitions and transversions have different rates etc.
nj_tree = bionj(D)
nj_tree = ladderize(nj_tree) # reorganizes the internal structure of the tree to get the ladderized effect when plotted. Smallest clade is on the right-hand side
nj_tree


Phylogenetic tree with 80 tips and 78 internal nodes.

Tip labels:
  CY013200, CY013781, CY012128, CY013613, CY012160, CY012272, ...

Unrooted; includes branch length(s).

plot(nj_tree)

library(adegenet)
plot(nj_tree, show.tip=FALSE) # remove tip labels from headers which were shown above (and are shown by default)
colorPalette = colorRampPalette(c("red","white","blue")) # use gradually changing color palette
tiplabels(annot$year, bg=num2col(annot$year, col.pal=colorPalette)) # use years from annotations as labels

We choose the first sequence to define the root, as it was sampled in 1993. More precisely, we choose it as outgroup - as the user manual of root says, “If outgroup is of length one (i.e., a single value), then the tree is rerooted using the node below this tip as the new root”.

nj_tree2 = root(nj_tree, out = 1)
nj_tree2 = ladderize(nj_tree2)
plot(nj_tree2, show.tip=FALSE)
tiplabels(annot$year, bg=num2col(annot$year, col.pal=colorPalette))

We observe that the tree is strongly directional, showing a gradual divergence from old sequences. This is consistent with the conclusions of exercise 1, and makes them more precise. The influenza virus evolves rapidly, and successive mutations accumulate over a short period of time. One of the reasons that pushes it to evolve and become different from old strains is the host immunity.

x = as.vector(D)
y = as.vector(as.dist(cophenetic(nj_tree2)))
plot(x, y, xlab="Pairwise distances computed between sequences", ylab="Pairwise distances on the NJ tree")
lines(x, x, col="red") #ideal case: both distances are equal

We observe that the patristic distances on the NJ tree are close to the Jukes-Cantor distances we started with. This means that the inferred NJ tree gives a good representation of the data.

bootstrapValues = boot.phylo(nj_tree2, seqs, function(e) root(nj(dist.dna(e, model = "JC")),1)) #the last argument is the function used to estimate the tree from the original data matrix


Running bootstraps:       100 / 100
Calculating bootstrap values... done.

bootstrapValues

 [1]  NA  16   6   1  63  38  66  42  48 100 100  70  25  52  98  65  39  30   9
[20]  84  98 100  38  39  80  62  48  30  92  49  77 100  98 100 100  98 100  98
[39]  95  77  64  63  29  68  91  36  35  77 100 100  87  92 100   8  55  70  91
[58]  31  42  57  99 100  17  46  32  99  99 100  61  59  28  49  70  60  95  57
[77] 100  59

We observe that some nodes are very well supported, but others less (scores range from 1 to 100).

Remark: it is possible to delete the nodes with low bootstrap support, in order to only keep the higher-confidence parts of the tree. This is done below.

temp = nj_tree2
N = length(nj_tree2$tip.label)
toCollapse = match(which(bootstrapValues<70)+N, temp$edge[,2])
temp$edge.length[toCollapse] = 0 
nj_tree3 = di2multi(temp, tol=0.00001) #deletes all branches smaller than tol and collapses the corresponding dichotomies into a multichotomy
plot(nj_tree3, show.tip=FALSE)
tiplabels(annot$year, bg=num2col(annot$year, col.pal=colorPalette))

Exercise 3: Plot, analyze and manipulate a gene tree

library(ape)

# read the output of RAxML - the reconstructed phylogeny with bootstrap values and plot it
tree = read.tree("RAxML_bipartitions.Homologs")
plot.phylo(tree)

# the tree is "unrooted", we force the root the base of all birds
root = getMRCA(tree, grep("penguin", tree$tip.label))
tree = root(tree, node=root, resolve.root=T)

plot.phylo(tree)

# select bootstrap values > 80 (the node label is the bootstrap values as text, need to convert to number first)
ns = which(as.numeric(tree$node.label) > 80)
nb.tip = length(tree$tip.label)

# select: 1. "human" genes, 2. "salmon" genes, 3. "owl" genes, 4. all the rest
# using grep to identify labels containing a given substring 
ihs = grep("human", tree$tip.label)
ipt = grep("salmo", tree$tip.label)
icn = grep("Barn_owl", tree$tip.label)
irest = (1:length(tree$tip.label))[-c(ihs,ipt,icn)]

# re-plot the tree with only bootstrap values>80, and selected genes in color ('bg'=background color of the label)
par(oma=c(0,0,0,8), xpd=NA) #for legend not to be cut
plot.phylo(tree, show.tip.label=F, use.edge.length=F)
nodelabels(tree$node.label[ns], node=ns+nb.tip, cex=0.6)
tiplabels(tree$tip.label[ihs], ihs, adj=c(0,.5))
tiplabels(tree$tip.label[ipt], ipt, bg='pink', adj=c(0,.5))
tiplabels(tree$tip.label[icn], icn, bg='cyan', adj=c(0,.5))
tiplabels(tree$tip.label[irest], irest, bg=0, frame='none', adj=c(0,.5))

There are 3 paralogs of this gene in mammals, 2 in birds and 4 in fish. black_rat_116909221 is the ortholog of human_5187.

par(oma=c(0,0,0,5), xpd=NA)
subtree.1 = drop.tip(tree, grep("fugu|salmo|zebra", tree$tip.label))
plot.phylo(subtree.1)

subtree.2 = extract.clade(tree, node=getMRCA(tree, grep("_8863|_18628", tree$tip.label)))
plot.phylo(subtree.2)

subtree.3 = keep.tip(tree, grep("penguin|owl", tree$tip.label))
plot.phylo(subtree.3 )