#install.packages("ape") #install if necessary
#install.packages("adegenet") #install if necessary6. Graded assignment number 1 - population genetics and sequence evolution
Exercise 1 - Evolution of HIV
library(ape)
seqs = read.dna(file = "sequences1.fasta", format = "fasta")
annot = read.csv("annot1.csv", header=TRUE, row.names=1)
annot Name Year DaysFromSeroconversion SequenceLength
0 1983.-.8888.HXB2-LAI-IIIB-BRU 1983 NA NA
9 ZM634F_30Jun2010_CP_9_E 2010 1072 3539
10 ZM634F_11Jun2008_1Y_13_E 2008 323 3506
11 ZM634F_11Jun2008_1Y_15_E 2008 323 3506
12 ZM634F_11Jun2008_1Y_16_E 2008 323 3503
13 ZM634F_11Jun2008_1Y_17_E 2008 323 3503
14 ZM634F_11Jun2008_1Y_18_E 2008 323 3506
15 ZM634F_11Jun2008_1Y_19_E 2008 323 3506
16 ZM634F_11Jun2008_1Y_22_E 2008 323 3506
17 ZM634F_11Jun2008_1Y_26_E 2008 323 3506
18 ZM634F_11Jun2008_1Y_28_E 2008 323 3506
19 ZM634F_11Jun2008_1Y_30_E 2008 323 3530
20 ZM634F_18May2011_1A_10_E 2011 1394 3527
21 ZM634F_11Jan2012_2A_14_E 2012 1632 3575
22 ZM634F_18May2011_1A_12_E 2011 1394 3488
23 ZM634F_18May2011_1A_2_E 2011 1394 3479
24 ZM634F_11Jun2008_1Y_23_E 2008 323 3506
25 ZM634F_18May2011_1A_18_E 2011 1394 3554
26 ZM634F_30Jun2010_CP_7_E 2010 1072 3542
27 ZM634F_11Jan2012_2A_11_E.HY 2012 1632 3527
28 ZM634F_11Jan2012_2A_12_E 2012 1632 3554
29 ZM634F_11Jan2012_2A_4_E 2012 1632 3542
30 ZM634F_11Jun2008_1Y_10_E 2008 323 3410
31 ZM634F_11Jun2008_1Y_14_E 2008 323 3503
32 ZM634F_11Jun2008_1Y_1_E 2008 323 3503
33 ZM634F_11Jun2008_1Y_21_E 2008 323 3491
34 ZM634F_11Jun2008_1Y_25_E 2008 323 3506
35 ZM634F_11Jun2008_1Y_33_E 2008 323 3503
36 ZM634F_11Jun2008_1Y_9_E 2008 323 3506
37 ZM634F_18May2011_1A_15_E 2011 1394 3536
38 ZM634F_18May2011_1A_19_E 2011 1394 3527
39 ZM634F_18May2011_1A_5_E.HY 2011 1394 3554
40 ZM634F_30Jun2010_CP_14_E 2010 1072 3521
41 ZM634F_30Jun2010_CP_21_E 2010 1072 3548
42 ZM634F_30Jun2010_CP_22_E 2010 1072 3524
43 ZM634F_30Jun2010_CP_25_E 2010 1072 3524
44 ZM634F_30Jun2010_CP_5_E 2010 1072 3521
45 ZM634F_18May2011_1A_22_E 2011 1394 3497
46 ZM634F_18May2011_1A_23_E 2011 1394 3539
47 ZM634F_18May2011_1A_26_E 2011 1394 3533
48 ZM634F_18May2011_1A_3_E 2011 1394 3539
49 ZM634F_18May2011_1A_4_E 2011 1394 3542
50 ZM634F_18May2011_1A_6_E.HY 2011 1394 3542
51 ZM634F_30Jun2010_CP_11_E 2010 1072 3542
52 ZM634F_30Jun2010_CP_15_E 2010 1072 3542
53 ZM634F_30Jun2010_CP_16_E 2010 1072 3539
54 ZM634F_30Jun2010_CP_17_E 2010 1072 3539
55 ZM634F_30Jun2010_CP_19_E 2010 1072 3539
56 ZM634F_30Jun2010_CP_1_E 2010 1072 3542
57 ZM634F_30Jun2010_CP_20_E 2010 1072 3539
58 ZM634F_30Jun2010_CP_23_E 2010 1072 3542
59 ZM634F_30Jun2010_CP_24_E 2010 1072 3545
60 ZM634F_30Jun2010_CP_2_E 2010 1072 3539
61 ZM634F_30Jun2010_CP_3_E 2010 1072 3542
62 ZM634F_30Jun2010_CP_4_E 2010 1072 3548
63 ZM634F_30Jun2010_CP_6_E 2010 1072 3539
64 ZM634F_30Jun2010_CP_8_E 2010 1072 3542
65 ZM634F_11Jun2008_1Y_20_E 2008 323 3491
66 ZM634F_11Jun2008_1Y_24_E 2008 323 3503
67 ZM634F_11Jun2008_1Y_27_E 2008 323 3503
68 ZM634F_11Jun2008_1Y_29_E 2008 323 3503
69 ZM634F_11Jun2008_1Y_2_E 2008 323 3503
70 ZM634F_11Jun2008_1Y_31_E 2008 323 3506
71 ZM634F_11Jun2008_1Y_32_E 2008 323 3500
72 ZM634F_11Jun2008_1Y_34_E 2008 323 3503
73 ZM634F_11Jun2008_1Y_3_E 2008 323 3491
74 ZM634F_11Jun2008_1Y_4_E 2008 323 3503
75 ZM634F_11Jun2008_1Y_5_E 2008 323 3503
76 ZM634F_11Jun2008_1Y_6_E 2008 323 3503
77 ZM634F_11Jun2008_1Y_7_E 2008 323 3494
78 ZM634F_11Jun2008_1Y_8_E 2008 323 3503
79 ZM634F_18May2011_1A_14_E 2011 1394 3545
80 ZM634F_18May2011_1A_16_E 2011 1394 3542
81 ZM634F_18May2011_1A_17_E 2011 1394 3560
82 ZM634F_18May2011_1A_1_E 2011 1394 3548
83 ZM634F_18May2011_1A_24_E 2011 1394 3578
84 ZM634F_18May2011_1A_27_E 2011 1394 3572
85 ZM634F_18May2011_1A_7_E 2011 1394 3575
86 ZM634F_18May2011_1A_8_E 2011 1394 3554
87 ZM634F_18May2011_1A_9_E 2011 1394 3575
88 ZM634F_11Jan2012_2A_13_E 2012 1632 3542
89 ZM634F_11Jan2012_2A_15_E.HY 2012 1632 3542
90 ZM634F_11Jan2012_2A_16_E 2012 1632 3545
91 ZM634F_11Jan2012_2A_17_E 2012 1632 3536
92 ZM634F_11Jan2012_2A_18_E 2012 1632 3557
93 ZM634F_11Jan2012_2A_2_E.HY 2012 1632 3479
94 ZM634F_11Jan2012_2A_3_E 2012 1632 3545
95 ZM634F_11Jan2012_2A_7_E 2012 1632 3542
96 ZM634F_11Jan2012_2A_8_E 2012 1632 3545
97 ZM634F_11Jan2012_2A_9_E 2012 1632 3539
98 ZM634F_11Jun2008_1Y_11_E 2008 323 3506
99 ZM634F_11Jun2008_1Y_12_E 2008 323 3503
Organism
0 HIV-1
9 HIV-1
10 HIV-1
11 HIV-1
12 HIV-1
13 HIV-1
14 HIV-1
15 HIV-1
16 HIV-1
17 HIV-1
18 HIV-1
19 HIV-1
20 HIV-1
21 HIV-1
22 HIV-1
23 HIV-1
24 HIV-1
25 HIV-1
26 HIV-1
27 HIV-1
28 HIV-1
29 HIV-1
30 HIV-1
31 HIV-1
32 HIV-1
33 HIV-1
34 HIV-1
35 HIV-1
36 HIV-1
37 HIV-1
38 HIV-1
39 HIV-1
40 HIV-1
41 HIV-1
42 HIV-1
43 HIV-1
44 HIV-1
45 HIV-1
46 HIV-1
47 HIV-1
48 HIV-1
49 HIV-1
50 HIV-1
51 HIV-1
52 HIV-1
53 HIV-1
54 HIV-1
55 HIV-1
56 HIV-1
57 HIV-1
58 HIV-1
59 HIV-1
60 HIV-1
61 HIV-1
62 HIV-1
63 HIV-1
64 HIV-1
65 HIV-1
66 HIV-1
67 HIV-1
68 HIV-1
69 HIV-1
70 HIV-1
71 HIV-1
72 HIV-1
73 HIV-1
74 HIV-1
75 HIV-1
76 HIV-1
77 HIV-1
78 HIV-1
79 HIV-1
80 HIV-1
81 HIV-1
82 HIV-1
83 HIV-1
84 HIV-1
85 HIV-1
86 HIV-1
87 HIV-1
88 HIV-1
89 HIV-1
90 HIV-1
91 HIV-1
92 HIV-1
93 HIV-1
94 HIV-1
95 HIV-1
96 HIV-1
97 HIV-1
98 HIV-1
99 HIV-1
library(ape)
library(adegenet)Loading required package: ade4
/// adegenet 2.1.11 is loaded ////////////
> overview: '?adegenet'
> tutorials/doc/questions: 'adegenetWeb()'
> bug reports/feature requests: adegenetIssues()
D = dist.dna(seqs, model = "JC") # Jukes-Cantor model - we could use other models, e.g. model = "TN93" for Tamura and Nei (1993), a more complex model where transitions and transversions have different rates etc.
nj_tree = bionj(D)
nj_tree2 = root(nj_tree, out = 1) #We choose the first sequence as the outgroup to root the tree, as it is the reference sequence.
nj_tree2 = ladderize(nj_tree2) # reorganizes the internal structure of the tree to get the ladderized effect when plotted. Smallest clade is on the right-hand side
plot(nj_tree2, show.tip=FALSE)
colorPalette = colorRampPalette(c("red","white","blue")) # use gradually changing color palette
tiplabels(annot$DaysFromSeroconversion, bg=num2col(annot$DaysFromSeroconversion, col.pal=colorPalette))- The large majority of sequences collected at recent time points are positioned on a specific part of the tree, which is distinct from the part of the tree that includes older sequences - the main branching event separates these two ensembles. This reflects that the virus has evolved and acquired new mutations, so that the more recent sequences are distinct from the older ones. There are 6 exceptions to this trend, and 5 of them were collected at 1394 and the last one at 1832 days after seroconversion. This implies that a few viruses with sequences close to initial ones still exist in the patient’s body, even though most of the sequences are distinct and have substantially evolved.
Reference: Brooks K, Jones BR, Dilernia DA, Wilkins DJ, Claiborne DT, McInally S, et al. (2020) HIV-1 variants are archived throughout infection and persist in the reservoir. PLoS Pathog 16(6): e1008378. https://doi.org/10.1371/journal.ppat.1008378 Patient Code 634F - Patient Id 51599 - https://www.hiv.lanl.gov/components/sequence/HIV/search/patient.comp?pat_id=51599
library(ape)
seqs = read.dna(file = "sequences2.fasta", format = "fasta")
annot = read.csv("annot2.csv", header=TRUE, row.names=1)
annot Name Year DaysFromSeroconversion SequenceLength
0 1983.-.8888.HXB2-LAI-IIIB-BRU 1983 NA NA
5 074-M-201-2-5_w0_220 2015 0 9001
6 074-M-201-2-5_w0_224 2015 0 8989
7 074-M-201-2-5_w0_232 2015 0 9014
8 074-M-201-2-5_w0_229 2015 0 9028
9 074-P-201-1-4_w0_33 2015 31 9018
10 074-P-201-1-4_w0_247 2015 0 8996
11 074-P-201-1-4_w0_38 2015 31 9018
12 074-P-201-1-4_w0_250 2015 0 8996
13 074-P-201-1-4_w4_52 2015 59 9018
14 074-P-201-1-4_w4_61 2015 59 9018
15 074-P-201-1-4_w4_51 2015 59 7334
16 074-P-201-1-4_w0_44 2015 31 9018
17 074-P-201-1-4_w0_39 2015 31 9018
18 074-P-201-1-4_w0_249 2015 0 8996
19 074-P-201-1-4_w0_47 2015 31 9018
20 074-P-201-1-4_w0_32 2015 31 9018
21 074-P-201-1-4_w0_237 2015 0 8996
22 074-P-201-1-4_w4_43 2015 59 9018
23 074-P-201-1-4_w0_37 2015 31 9018
24 074-P-201-1-4_w0_242 2015 0 8996
25 074-P-201-1-4_w0_45 2015 31 9018
26 074-P-201-1-4_w24_194 2015 199 9018
27 074-P-201-1-4_w12_185 2015 115 9018
28 074-P-201-1-4_w0_30 2015 31 9018
29 074-P-201-1-4_w0_34 2015 31 9018
30 074-P-201-1-4_w0_235 2015 0 8996
31 074-P-201-1-4_w0_36 2015 31 9018
32 074-P-201-1-4_w4_58 2015 59 9018
33 074-P-201-1-4_w0_243 2015 0 8996
34 074-P-201-1-4_w0_236 2015 0 8996
35 074-P-201-1-4_w0_50 2015 31 9018
36 074-P-201-1-4_w0_31 2015 31 9018
37 074-P-201-1-4_w4_59 2015 59 9018
38 074-P-201-1-4_w0_244 2015 0 8996
39 074-P-201-1-4_w0_40 2015 31 9018
40 074-P-201-1-4_w0_240 2015 0 8996
41 074-P-201-1-4_w0_238 2015 0 8996
42 074-M-201-2-5_w0_223 2015 0 9051
43 074-P-201-1-4_w0_48 2015 31 9018
44 074-P-201-1-4_w0_245 2015 0 8997
45 074-P-201-1-4_w0_246 2015 0 8996
47 074-P-201-1-4_w0_239 2015 0 8996
48 074-P-201-1-4_w0_248 2015 0 8997
52 074-P-201-1-4_w0_241 2015 0 8996
53 074-M-201-2-5_w0_221 2015 0 9018
54 074-M-201-2-5_w0_222 2015 0 8997
55 074-M-201-2-5_w0_234 2015 0 9018
56 074-P-201-1-4_w0_49 2015 31 9018
Organism
0 HIV-1
5 HIV-1
6 HIV-1
7 HIV-1
8 HIV-1
9 HIV-1
10 HIV-1
11 HIV-1
12 HIV-1
13 HIV-1
14 HIV-1
15 HIV-1
16 HIV-1
17 HIV-1
18 HIV-1
19 HIV-1
20 HIV-1
21 HIV-1
22 HIV-1
23 HIV-1
24 HIV-1
25 HIV-1
26 HIV-1
27 HIV-1
28 HIV-1
29 HIV-1
30 HIV-1
31 HIV-1
32 HIV-1
33 HIV-1
34 HIV-1
35 HIV-1
36 HIV-1
37 HIV-1
38 HIV-1
39 HIV-1
40 HIV-1
41 HIV-1
42 HIV-1
43 HIV-1
44 HIV-1
45 HIV-1
47 HIV-1
48 HIV-1
52 HIV-1
53 HIV-1
54 HIV-1
55 HIV-1
56 HIV-1
D = dist.dna(seqs, model = "JC") # Jukes-Cantor model - we could use other models, e.g. model = "TN93" for Tamura and Nei (1993), a more complex model where transitions and transversions have different rates etc.
nj_tree = bionj(D)
nj_tree2 = root(nj_tree, out = 1) #We choose the first sequence as the outgroup to root the tree, as it is the reference sequence.
nj_tree2 = ladderize(nj_tree2) # reorganizes the internal structure of the tree to get the ladderized effect when plotted. Smallest clade is on the right-hand side
plot(nj_tree2, show.tip=FALSE)
tiplabels(annot$DaysFromSeroconversion, bg=num2col(annot$DaysFromSeroconversion, col.pal=colorPalette))- In this tree, there are only 2 sequences that were collected more than 100 days after seroconversion. They are positioned in a way that does not seem different from older sequences - no major branching event separates them from other sequences. This is surprising in light of the previous analysis (questions 2-3), as the main branching event was then separating new and older sequences. The difference between these two patients is that the first one is not treated (or at least, was not treated during most of this data sampling) while the second one is treated (was treated very soon after seroconversion). The treatment reduces the number of viruses present and their replication and evolution.
Reference: Pilar Garcia-Broncano et al (2019) Early antiretroviral therapy in neonates with HIV-1 infection restricts viral reservoir size and induces a distinct innate immune profile, Sci Transl Med 11(520):eaax7350 doi: 10.1126/scitranslmed.aax7350. The patient is an infant infected at birth (Mother->Child); diagnosed at 31 days after birth, and began ART at that time. Patient Code 201 - Patient Id 116109 - https://www.hiv.lanl.gov/components/sequence/HIV/search/patient.comp?pat_id=116109
indices = which(annot$DaysFromSeroconversion==0)
annot_extr=annot[indices, ]
seqs_extr=seqs[indices, ]D_extr = dist.dna(seqs_extr, model = "JC") # Jukes-Cantor model - we could use other models, e.g. model = "TN93" for Tamura and Nei (1993), a more complex model where transitions and transversions have different rates etc.
h1 = hist(D_extr, main='', xlab='JC distance between sequences', ylab='Counts')mean(D_extr)[1] 0.004534841
max(D_extr)[1] 0.01065449
- Here, within one patient, the mean Jukes-Cantor distance between all sequences collected at seroconversion is 0.004 and the maximum Jukes-Cantor distance between sequences collected at seroconversion is 0.01. For the flu virus, the mean Jukes-Cantor distance between all sequences collected in a given year is 0.008 and the maximum Jukes-Cantor distance between sequences in a given year is 0.04. We observe that the values for the flu are similar to those observed here in question 6, in orders of magnitude. But here, we are considering only one patient, and the virus has evolved only for about one month. For the flu, all patients in the world were potentially included, during one year. This tells us that HIV evolution is much faster than that of the influenza virus.
Exercise 2 - Simulating experimental evolution with serial passage
We have \(100=e^t\), which leads to \(t=\log(100)\).
Because of exponential growth, we have \(x'=\frac{x e^{st}}{1+x(e^{st}-1)}\).
s=0.01
t=5
x=0.01
xp=x*exp(s*t)/(1+x*(exp(s*t)-1))
print(xp)[1] 0.01050732
s=0.5
t=5
x=0.01
xp=x*exp(s*t)/(1+x*(exp(s*t)-1))
print(xp)[1] 0.1095721
We see that the change of the fraction x due to growth is small if s=0.01 but that the fraction substantially increases if s=0.5. As a result, the final mutant fraction is much larger in the second case than in the first case.
#install.packages("flan")
library(flan)
sampled_data=rflan(1000, mutations = 1, fitness = 1/1.01, mfn = 100)
nmut=sampled_data$mc
#ntot=sampled_data$fn
mean(nmut)[1] 6.565
var(nmut)[1] 1235.92
We observe that the variance is much larger than the mean. This is because mutations can arise early in the exponential growth, which results in large numbers of mutants, as mutations are inherited. Thus, in these rare cases (jackpot events), there are lots of mutants. This gives a large variance.
#library(flan)
n_rep=10
mutant_frac = matrix(nrow=n_rep,ncol=3)
for(i in 1:n_rep){
sampled_data=rflan(5e6, mutations = 0.1, fitness = 1/1.01, mfn = 100)
nmut=sampled_data$mc
ntot=sampled_data$fn
mutant_frac[i,1]=mean(nmut)
mutant_frac[i,2]=var(nmut)
mutant_frac[i,3]=sum(nmut)/sum(ntot)
}
#nmut_preex
mean(mutant_frac[,3])[1] 0.0124307
var(mutant_frac[,3])[1] 7.53131e-06
We observe that the fraction has a small variability across replicates. This is despite the fact that each cell can give rise to a very variable number of mutant offspring after growth (see question 3), because we are here aggregating a very large number of cells (\(K=5\times 10^6\)).
\(k\) follows a binomial distribution. We have \(P(k)={K\choose k} (x')^{k} \left(1-x'\right)^{K-\,k}\).
We can use the function rbinom.
s=0.01
t=5
x=0.01
K=5e6
xp=x*exp(s*t)/(1+x*(exp(s*t)-1))
n=1000
vals=rbinom(n, K, xp)
mean(vals)[1] 52538.44
sd(vals)[1] 220.9481
h1 = hist(vals, main='', xlab='Number of mutants sampled', ylab='Counts')K=5e6
s=0.01
t=5
n_gen=300
n_rep=10
mutant_frac = matrix(nrow=n_gen,ncol=n_rep)
for(i in 1:n_rep){
mutant_frac[1,i]=0.01
for(j in 2:n_gen){
mutant_frac[j,i] = rbinom( 1, K, mutant_frac[j-1,i]*exp(s*t)/(1+mutant_frac[j-1,i]*(exp(s*t)-1)) ) / K
}
}
matplot(1:n_gen, mutant_frac, pch=20, type="l", lty="solid", xlab='Bottleneck', ylab='Mutant fraction')We observe that all trajectories end in fixation, and that there is very little noise: we have smooth ascending trajectories for the fraction x of mutants. This is because we are in the (quasi-)deterministic regime. (Indeed, K>>1, st>>1/K and x>>1/(Kst).)
0.01*log(100) #st[1] 0.0460517
1/(5e6*0.01*log(100)) #1/(Kst)[1] 4.342945e-06