6. Graded assignment number 1 - population genetics and sequence evolution

Author

EPFL - SV - BIO-463

Published

March 26, 2024

Exercise 1 - Evolution of HIV

#install.packages("ape") #install if necessary
#install.packages("adegenet") #install if necessary
library(ape)
seqs = read.dna(file = "sequences1.fasta", format = "fasta")
annot = read.csv("annot1.csv", header=TRUE, row.names=1)
annot
                            Name Year DaysFromSeroconversion SequenceLength
0  1983.-.8888.HXB2-LAI-IIIB-BRU 1983                     NA             NA
9        ZM634F_30Jun2010_CP_9_E 2010                   1072           3539
10      ZM634F_11Jun2008_1Y_13_E 2008                    323           3506
11      ZM634F_11Jun2008_1Y_15_E 2008                    323           3506
12      ZM634F_11Jun2008_1Y_16_E 2008                    323           3503
13      ZM634F_11Jun2008_1Y_17_E 2008                    323           3503
14      ZM634F_11Jun2008_1Y_18_E 2008                    323           3506
15      ZM634F_11Jun2008_1Y_19_E 2008                    323           3506
16      ZM634F_11Jun2008_1Y_22_E 2008                    323           3506
17      ZM634F_11Jun2008_1Y_26_E 2008                    323           3506
18      ZM634F_11Jun2008_1Y_28_E 2008                    323           3506
19      ZM634F_11Jun2008_1Y_30_E 2008                    323           3530
20      ZM634F_18May2011_1A_10_E 2011                   1394           3527
21      ZM634F_11Jan2012_2A_14_E 2012                   1632           3575
22      ZM634F_18May2011_1A_12_E 2011                   1394           3488
23       ZM634F_18May2011_1A_2_E 2011                   1394           3479
24      ZM634F_11Jun2008_1Y_23_E 2008                    323           3506
25      ZM634F_18May2011_1A_18_E 2011                   1394           3554
26       ZM634F_30Jun2010_CP_7_E 2010                   1072           3542
27   ZM634F_11Jan2012_2A_11_E.HY 2012                   1632           3527
28      ZM634F_11Jan2012_2A_12_E 2012                   1632           3554
29       ZM634F_11Jan2012_2A_4_E 2012                   1632           3542
30      ZM634F_11Jun2008_1Y_10_E 2008                    323           3410
31      ZM634F_11Jun2008_1Y_14_E 2008                    323           3503
32       ZM634F_11Jun2008_1Y_1_E 2008                    323           3503
33      ZM634F_11Jun2008_1Y_21_E 2008                    323           3491
34      ZM634F_11Jun2008_1Y_25_E 2008                    323           3506
35      ZM634F_11Jun2008_1Y_33_E 2008                    323           3503
36       ZM634F_11Jun2008_1Y_9_E 2008                    323           3506
37      ZM634F_18May2011_1A_15_E 2011                   1394           3536
38      ZM634F_18May2011_1A_19_E 2011                   1394           3527
39    ZM634F_18May2011_1A_5_E.HY 2011                   1394           3554
40      ZM634F_30Jun2010_CP_14_E 2010                   1072           3521
41      ZM634F_30Jun2010_CP_21_E 2010                   1072           3548
42      ZM634F_30Jun2010_CP_22_E 2010                   1072           3524
43      ZM634F_30Jun2010_CP_25_E 2010                   1072           3524
44       ZM634F_30Jun2010_CP_5_E 2010                   1072           3521
45      ZM634F_18May2011_1A_22_E 2011                   1394           3497
46      ZM634F_18May2011_1A_23_E 2011                   1394           3539
47      ZM634F_18May2011_1A_26_E 2011                   1394           3533
48       ZM634F_18May2011_1A_3_E 2011                   1394           3539
49       ZM634F_18May2011_1A_4_E 2011                   1394           3542
50    ZM634F_18May2011_1A_6_E.HY 2011                   1394           3542
51      ZM634F_30Jun2010_CP_11_E 2010                   1072           3542
52      ZM634F_30Jun2010_CP_15_E 2010                   1072           3542
53      ZM634F_30Jun2010_CP_16_E 2010                   1072           3539
54      ZM634F_30Jun2010_CP_17_E 2010                   1072           3539
55      ZM634F_30Jun2010_CP_19_E 2010                   1072           3539
56       ZM634F_30Jun2010_CP_1_E 2010                   1072           3542
57      ZM634F_30Jun2010_CP_20_E 2010                   1072           3539
58      ZM634F_30Jun2010_CP_23_E 2010                   1072           3542
59      ZM634F_30Jun2010_CP_24_E 2010                   1072           3545
60       ZM634F_30Jun2010_CP_2_E 2010                   1072           3539
61       ZM634F_30Jun2010_CP_3_E 2010                   1072           3542
62       ZM634F_30Jun2010_CP_4_E 2010                   1072           3548
63       ZM634F_30Jun2010_CP_6_E 2010                   1072           3539
64       ZM634F_30Jun2010_CP_8_E 2010                   1072           3542
65      ZM634F_11Jun2008_1Y_20_E 2008                    323           3491
66      ZM634F_11Jun2008_1Y_24_E 2008                    323           3503
67      ZM634F_11Jun2008_1Y_27_E 2008                    323           3503
68      ZM634F_11Jun2008_1Y_29_E 2008                    323           3503
69       ZM634F_11Jun2008_1Y_2_E 2008                    323           3503
70      ZM634F_11Jun2008_1Y_31_E 2008                    323           3506
71      ZM634F_11Jun2008_1Y_32_E 2008                    323           3500
72      ZM634F_11Jun2008_1Y_34_E 2008                    323           3503
73       ZM634F_11Jun2008_1Y_3_E 2008                    323           3491
74       ZM634F_11Jun2008_1Y_4_E 2008                    323           3503
75       ZM634F_11Jun2008_1Y_5_E 2008                    323           3503
76       ZM634F_11Jun2008_1Y_6_E 2008                    323           3503
77       ZM634F_11Jun2008_1Y_7_E 2008                    323           3494
78       ZM634F_11Jun2008_1Y_8_E 2008                    323           3503
79      ZM634F_18May2011_1A_14_E 2011                   1394           3545
80      ZM634F_18May2011_1A_16_E 2011                   1394           3542
81      ZM634F_18May2011_1A_17_E 2011                   1394           3560
82       ZM634F_18May2011_1A_1_E 2011                   1394           3548
83      ZM634F_18May2011_1A_24_E 2011                   1394           3578
84      ZM634F_18May2011_1A_27_E 2011                   1394           3572
85       ZM634F_18May2011_1A_7_E 2011                   1394           3575
86       ZM634F_18May2011_1A_8_E 2011                   1394           3554
87       ZM634F_18May2011_1A_9_E 2011                   1394           3575
88      ZM634F_11Jan2012_2A_13_E 2012                   1632           3542
89   ZM634F_11Jan2012_2A_15_E.HY 2012                   1632           3542
90      ZM634F_11Jan2012_2A_16_E 2012                   1632           3545
91      ZM634F_11Jan2012_2A_17_E 2012                   1632           3536
92      ZM634F_11Jan2012_2A_18_E 2012                   1632           3557
93    ZM634F_11Jan2012_2A_2_E.HY 2012                   1632           3479
94       ZM634F_11Jan2012_2A_3_E 2012                   1632           3545
95       ZM634F_11Jan2012_2A_7_E 2012                   1632           3542
96       ZM634F_11Jan2012_2A_8_E 2012                   1632           3545
97       ZM634F_11Jan2012_2A_9_E 2012                   1632           3539
98      ZM634F_11Jun2008_1Y_11_E 2008                    323           3506
99      ZM634F_11Jun2008_1Y_12_E 2008                    323           3503
   Organism
0     HIV-1
9     HIV-1
10    HIV-1
11    HIV-1
12    HIV-1
13    HIV-1
14    HIV-1
15    HIV-1
16    HIV-1
17    HIV-1
18    HIV-1
19    HIV-1
20    HIV-1
21    HIV-1
22    HIV-1
23    HIV-1
24    HIV-1
25    HIV-1
26    HIV-1
27    HIV-1
28    HIV-1
29    HIV-1
30    HIV-1
31    HIV-1
32    HIV-1
33    HIV-1
34    HIV-1
35    HIV-1
36    HIV-1
37    HIV-1
38    HIV-1
39    HIV-1
40    HIV-1
41    HIV-1
42    HIV-1
43    HIV-1
44    HIV-1
45    HIV-1
46    HIV-1
47    HIV-1
48    HIV-1
49    HIV-1
50    HIV-1
51    HIV-1
52    HIV-1
53    HIV-1
54    HIV-1
55    HIV-1
56    HIV-1
57    HIV-1
58    HIV-1
59    HIV-1
60    HIV-1
61    HIV-1
62    HIV-1
63    HIV-1
64    HIV-1
65    HIV-1
66    HIV-1
67    HIV-1
68    HIV-1
69    HIV-1
70    HIV-1
71    HIV-1
72    HIV-1
73    HIV-1
74    HIV-1
75    HIV-1
76    HIV-1
77    HIV-1
78    HIV-1
79    HIV-1
80    HIV-1
81    HIV-1
82    HIV-1
83    HIV-1
84    HIV-1
85    HIV-1
86    HIV-1
87    HIV-1
88    HIV-1
89    HIV-1
90    HIV-1
91    HIV-1
92    HIV-1
93    HIV-1
94    HIV-1
95    HIV-1
96    HIV-1
97    HIV-1
98    HIV-1
99    HIV-1
library(ape)
library(adegenet)
Loading required package: ade4

   /// adegenet 2.1.11 is loaded ////////////

   > overview: '?adegenet'
   > tutorials/doc/questions: 'adegenetWeb()' 
   > bug reports/feature requests: adegenetIssues()
D = dist.dna(seqs, model = "JC") # Jukes-Cantor model - we could use other models, e.g. model = "TN93" for Tamura and Nei (1993), a more complex model where transitions and transversions have different rates etc.
nj_tree = bionj(D)
nj_tree2 = root(nj_tree, out = 1) #We choose the first sequence as the outgroup to root the tree, as it is the reference sequence.
nj_tree2 = ladderize(nj_tree2) # reorganizes the internal structure of the tree to get the ladderized effect when plotted. Smallest clade is on the right-hand side
plot(nj_tree2, show.tip=FALSE)
colorPalette = colorRampPalette(c("red","white","blue")) # use gradually changing color palette
tiplabels(annot$DaysFromSeroconversion, bg=num2col(annot$DaysFromSeroconversion, col.pal=colorPalette))

  1. The large majority of sequences collected at recent time points are positioned on a specific part of the tree, which is distinct from the part of the tree that includes older sequences - the main branching event separates these two ensembles. This reflects that the virus has evolved and acquired new mutations, so that the more recent sequences are distinct from the older ones. There are 6 exceptions to this trend, and 5 of them were collected at 1394 and the last one at 1832 days after seroconversion. This implies that a few viruses with sequences close to initial ones still exist in the patient’s body, even though most of the sequences are distinct and have substantially evolved.

Reference: Brooks K, Jones BR, Dilernia DA, Wilkins DJ, Claiborne DT, McInally S, et al. (2020) HIV-1 variants are archived throughout infection and persist in the reservoir. PLoS Pathog 16(6): e1008378. https://doi.org/10.1371/journal.ppat.1008378 Patient Code 634F - Patient Id 51599 - https://www.hiv.lanl.gov/components/sequence/HIV/search/patient.comp?pat_id=51599

library(ape)
seqs = read.dna(file = "sequences2.fasta", format = "fasta")
annot = read.csv("annot2.csv", header=TRUE, row.names=1)
annot
                            Name Year DaysFromSeroconversion SequenceLength
0  1983.-.8888.HXB2-LAI-IIIB-BRU 1983                     NA             NA
5           074-M-201-2-5_w0_220 2015                      0           9001
6           074-M-201-2-5_w0_224 2015                      0           8989
7           074-M-201-2-5_w0_232 2015                      0           9014
8           074-M-201-2-5_w0_229 2015                      0           9028
9            074-P-201-1-4_w0_33 2015                     31           9018
10          074-P-201-1-4_w0_247 2015                      0           8996
11           074-P-201-1-4_w0_38 2015                     31           9018
12          074-P-201-1-4_w0_250 2015                      0           8996
13           074-P-201-1-4_w4_52 2015                     59           9018
14           074-P-201-1-4_w4_61 2015                     59           9018
15           074-P-201-1-4_w4_51 2015                     59           7334
16           074-P-201-1-4_w0_44 2015                     31           9018
17           074-P-201-1-4_w0_39 2015                     31           9018
18          074-P-201-1-4_w0_249 2015                      0           8996
19           074-P-201-1-4_w0_47 2015                     31           9018
20           074-P-201-1-4_w0_32 2015                     31           9018
21          074-P-201-1-4_w0_237 2015                      0           8996
22           074-P-201-1-4_w4_43 2015                     59           9018
23           074-P-201-1-4_w0_37 2015                     31           9018
24          074-P-201-1-4_w0_242 2015                      0           8996
25           074-P-201-1-4_w0_45 2015                     31           9018
26         074-P-201-1-4_w24_194 2015                    199           9018
27         074-P-201-1-4_w12_185 2015                    115           9018
28           074-P-201-1-4_w0_30 2015                     31           9018
29           074-P-201-1-4_w0_34 2015                     31           9018
30          074-P-201-1-4_w0_235 2015                      0           8996
31           074-P-201-1-4_w0_36 2015                     31           9018
32           074-P-201-1-4_w4_58 2015                     59           9018
33          074-P-201-1-4_w0_243 2015                      0           8996
34          074-P-201-1-4_w0_236 2015                      0           8996
35           074-P-201-1-4_w0_50 2015                     31           9018
36           074-P-201-1-4_w0_31 2015                     31           9018
37           074-P-201-1-4_w4_59 2015                     59           9018
38          074-P-201-1-4_w0_244 2015                      0           8996
39           074-P-201-1-4_w0_40 2015                     31           9018
40          074-P-201-1-4_w0_240 2015                      0           8996
41          074-P-201-1-4_w0_238 2015                      0           8996
42          074-M-201-2-5_w0_223 2015                      0           9051
43           074-P-201-1-4_w0_48 2015                     31           9018
44          074-P-201-1-4_w0_245 2015                      0           8997
45          074-P-201-1-4_w0_246 2015                      0           8996
47          074-P-201-1-4_w0_239 2015                      0           8996
48          074-P-201-1-4_w0_248 2015                      0           8997
52          074-P-201-1-4_w0_241 2015                      0           8996
53          074-M-201-2-5_w0_221 2015                      0           9018
54          074-M-201-2-5_w0_222 2015                      0           8997
55          074-M-201-2-5_w0_234 2015                      0           9018
56           074-P-201-1-4_w0_49 2015                     31           9018
   Organism
0     HIV-1
5     HIV-1
6     HIV-1
7     HIV-1
8     HIV-1
9     HIV-1
10    HIV-1
11    HIV-1
12    HIV-1
13    HIV-1
14    HIV-1
15    HIV-1
16    HIV-1
17    HIV-1
18    HIV-1
19    HIV-1
20    HIV-1
21    HIV-1
22    HIV-1
23    HIV-1
24    HIV-1
25    HIV-1
26    HIV-1
27    HIV-1
28    HIV-1
29    HIV-1
30    HIV-1
31    HIV-1
32    HIV-1
33    HIV-1
34    HIV-1
35    HIV-1
36    HIV-1
37    HIV-1
38    HIV-1
39    HIV-1
40    HIV-1
41    HIV-1
42    HIV-1
43    HIV-1
44    HIV-1
45    HIV-1
47    HIV-1
48    HIV-1
52    HIV-1
53    HIV-1
54    HIV-1
55    HIV-1
56    HIV-1
D = dist.dna(seqs, model = "JC") # Jukes-Cantor model - we could use other models, e.g. model = "TN93" for Tamura and Nei (1993), a more complex model where transitions and transversions have different rates etc.
nj_tree = bionj(D)
nj_tree2 = root(nj_tree, out = 1) #We choose the first sequence as the outgroup to root the tree, as it is the reference sequence.
nj_tree2 = ladderize(nj_tree2) # reorganizes the internal structure of the tree to get the ladderized effect when plotted. Smallest clade is on the right-hand side
plot(nj_tree2, show.tip=FALSE)
tiplabels(annot$DaysFromSeroconversion, bg=num2col(annot$DaysFromSeroconversion, col.pal=colorPalette))

  1. In this tree, there are only 2 sequences that were collected more than 100 days after seroconversion. They are positioned in a way that does not seem different from older sequences - no major branching event separates them from other sequences. This is surprising in light of the previous analysis (questions 2-3), as the main branching event was then separating new and older sequences. The difference between these two patients is that the first one is not treated (or at least, was not treated during most of this data sampling) while the second one is treated (was treated very soon after seroconversion). The treatment reduces the number of viruses present and their replication and evolution.

Reference: Pilar Garcia-Broncano et al (2019) Early antiretroviral therapy in neonates with HIV-1 infection restricts viral reservoir size and induces a distinct innate immune profile, Sci Transl Med 11(520):eaax7350 doi: 10.1126/scitranslmed.aax7350. The patient is an infant infected at birth (Mother->Child); diagnosed at 31 days after birth, and began ART at that time. Patient Code 201 - Patient Id 116109 - https://www.hiv.lanl.gov/components/sequence/HIV/search/patient.comp?pat_id=116109

indices = which(annot$DaysFromSeroconversion==0)
annot_extr=annot[indices, ]
seqs_extr=seqs[indices, ]
D_extr = dist.dna(seqs_extr, model = "JC") # Jukes-Cantor model - we could use other models, e.g. model = "TN93" for Tamura and Nei (1993), a more complex model where transitions and transversions have different rates etc.
h1 = hist(D_extr, main='', xlab='JC distance between sequences', ylab='Counts')

mean(D_extr)
[1] 0.004534841
max(D_extr)
[1] 0.01065449
  1. Here, within one patient, the mean Jukes-Cantor distance between all sequences collected at seroconversion is 0.004 and the maximum Jukes-Cantor distance between sequences collected at seroconversion is 0.01. For the flu virus, the mean Jukes-Cantor distance between all sequences collected in a given year is 0.008 and the maximum Jukes-Cantor distance between sequences in a given year is 0.04. We observe that the values for the flu are similar to those observed here in question 6, in orders of magnitude. But here, we are considering only one patient, and the virus has evolved only for about one month. For the flu, all patients in the world were potentially included, during one year. This tells us that HIV evolution is much faster than that of the influenza virus.

Exercise 2 - Simulating experimental evolution with serial passage

  1. We have \(100=e^t\), which leads to \(t=\log(100)\).

  2. Because of exponential growth, we have \(x'=\frac{x e^{st}}{1+x(e^{st}-1)}\).

s=0.01
t=5
x=0.01
xp=x*exp(s*t)/(1+x*(exp(s*t)-1))
print(xp)
[1] 0.01050732
s=0.5
t=5
x=0.01
xp=x*exp(s*t)/(1+x*(exp(s*t)-1))
print(xp)
[1] 0.1095721

We see that the change of the fraction x due to growth is small if s=0.01 but that the fraction substantially increases if s=0.5. As a result, the final mutant fraction is much larger in the second case than in the first case.

#install.packages("flan")
library(flan)
sampled_data=rflan(1000, mutations = 1, fitness = 1/1.01, mfn = 100) 
nmut=sampled_data$mc
#ntot=sampled_data$fn
mean(nmut)
[1] 6.565
var(nmut)
[1] 1235.92

We observe that the variance is much larger than the mean. This is because mutations can arise early in the exponential growth, which results in large numbers of mutants, as mutations are inherited. Thus, in these rare cases (jackpot events), there are lots of mutants. This gives a large variance.

#library(flan)
n_rep=10
mutant_frac = matrix(nrow=n_rep,ncol=3)
for(i in 1:n_rep){
  sampled_data=rflan(5e6, mutations = 0.1, fitness = 1/1.01, mfn = 100) 
  nmut=sampled_data$mc
  ntot=sampled_data$fn
  mutant_frac[i,1]=mean(nmut)
  mutant_frac[i,2]=var(nmut)
  mutant_frac[i,3]=sum(nmut)/sum(ntot)
}
#nmut_preex
mean(mutant_frac[,3])
[1] 0.0124307
var(mutant_frac[,3])
[1] 7.53131e-06

We observe that the fraction has a small variability across replicates. This is despite the fact that each cell can give rise to a very variable number of mutant offspring after growth (see question 3), because we are here aggregating a very large number of cells (\(K=5\times 10^6\)).

  1. \(k\) follows a binomial distribution. We have \(P(k)={K\choose k} (x')^{k} \left(1-x'\right)^{K-\,k}\).

  2. We can use the function rbinom.

s=0.01
t=5
x=0.01
K=5e6
xp=x*exp(s*t)/(1+x*(exp(s*t)-1))
n=1000
vals=rbinom(n, K, xp)
mean(vals)
[1] 52538.44
sd(vals)
[1] 220.9481
h1 = hist(vals, main='', xlab='Number of mutants sampled', ylab='Counts')

K=5e6
s=0.01
t=5
n_gen=300
n_rep=10
mutant_frac = matrix(nrow=n_gen,ncol=n_rep)
for(i in 1:n_rep){
  mutant_frac[1,i]=0.01
  for(j in 2:n_gen){
    mutant_frac[j,i] = rbinom( 1, K, mutant_frac[j-1,i]*exp(s*t)/(1+mutant_frac[j-1,i]*(exp(s*t)-1)) ) / K
  }
}
matplot(1:n_gen, mutant_frac, pch=20, type="l", lty="solid", xlab='Bottleneck', ylab='Mutant fraction')

We observe that all trajectories end in fixation, and that there is very little noise: we have smooth ascending trajectories for the fraction x of mutants. This is because we are in the (quasi-)deterministic regime. (Indeed, K>>1, st>>1/K and x>>1/(Kst).)

0.01*log(100) #st
[1] 0.0460517
1/(5e6*0.01*log(100)) #1/(Kst)
[1] 4.342945e-06