3. Hidden Markov Models - solutions

Author

EPFL - SV - BIO-463

Published

March 4, 2025

Exercise 1: Translate genes, find longest open reading frame

Load the file GeneSequences.fa
Translate the sequences (in 3 frames: only the forward strand)
Find the longest ORF in each sequence: use matchPattern for residues “*” and “M”
Save it as a fasta file named orf.fa

library("Biostrings")
genes = readDNAStringSet("GeneSequences.fa")
f1 = genes
f2 = subseq(genes, 2)
f3 = subseq(genes, 3)
frames = translate(c(f1, f2, f3), no.init.codon=T, if.fuzzy.codon="solve")
longest.orf = AAStringSet(rep('', length(genes)))
names(longest.orf) = names(genes)

for (nf in 1:length(frames)) {
    frm = frames[[nf]]
    nme = names(frames)[[nf]]
    ### 1. find positions of "stops", namely "*" characters
    stops = start(matchPattern("*", frm))
    n0 = 1
    ### 2. repeat for each stop found 
    for (n1 in stops) {
        ### find first start ("M") between n0 and n1
        starts = start(matchPattern("M", frm[n0:n1]))
        if (length(starts) > 0) {
            n01 = n0+starts[1]-1
            if (nchar(longest.orf[nme]) < n1-n01+1) longest.orf[[nme]] = frm[n01:n1]
        }
        ### next search will be between this stop and the next
        n0 = n1
    }
}
longest.orf

AAStringSet object of length 2:
    width seq                                               names               
[1]  1291 MSGPLEGADGGGDPRPGESFCPG...EEEEEGRSSSSPALPTAGNCTS* human_5187
[2]  1237 MDVGKPRGGSSERGASWSPAAAA...CEASVADDSMEKKGNSPLLLQR* Barn_owl_104355733

writeXStringSet(longest.orf, "orf.fa")

Exercise 2: Construct an HMM to find ORFs

The states S1, S2, S3 represent the 3 consecutive nucleotides of a start codon, E1, E21, E22, E32, E33 represent the 3 possible stop codons, B is background and I[123] are “inner” codons.
The emitted symbols are nucleotides A, C, G, T, background and inner codons emit with uniform probabilities.
The emission probabilities of start or end states must be specified (easy to guess).
The transition probabilities not specified on the schema should be easy to guess. The probabilities must be calculated so that the 3 possible stop codons have the same probability.

states = c("B", "S1", "S2", "S3", "I1", "I2", "I3", "E1", "E21", "E22","E31", "E32")
nstates = length(states)
symbols = c("A", "C", "G", "T")
nsym = length(symbols)

### Most emission prob. will be 1/4 so fill the entire matrix with 1/4,
### then modify the ones that are different
Emat = matrix(1/4, ncol=nsym, nrow=nstates, dimnames=list(states, symbols))
### start is ATG
Emat["S1",] = c(1, 0, 0, 0)
Emat["S2",] = c(0, 0, 0, 1)
Emat["S3",] = c(0, 0, 1, 0)
### stop is one of TAG, TAA, TGA
### first char is T
Emat["E1",] = c(0, 0, 0, 1)
### second is A or G
Emat["E21",] = c(1, 0, 0, 0)
Emat["E22",] = c(0, 0, 1, 0)
### third is A or G after A, and only A after G
Emat["E31",] = c(0, 0, 1, 0)
Emat["E32",] = c(1, 0, 0, 0)

### Transition matrix: according to the schema
Mmat = matrix(0, ncol=nstates, nrow=nstates, dimnames=list(states, states))
Mmat["B", "B"] = 0.98
Mmat["S1", "S2"] = 1
Mmat["S2", "S3"] = 1
Mmat["S3", "I1"] = 1
Mmat["I1", "I2"] = 1
Mmat["I2", "I3"] = 1
Mmat["I3", "I1"] = 0.9
Mmat["E31", "B"] = 0.95
Mmat["E32", "B"] = 0.95
### Sum of probabilities going out of any state must be 1
Mmat["B", "S1"] = 0.02
Mmat["I3", "E1"] = 0.1
Mmat["E31", "S1"] = 0.05
Mmat["E32", "S1"] = 0.05
### These were calculated so that each path through the E states has probability 1/3
Mmat["E1", "E21"] = 2/3
Mmat["E1", "E22"] = 1/3
Mmat["E21", "E31"] = 0.5
Mmat["E21", "E32"] = 0.5
Mmat["E22", "E32"] = 1

Create the corresponding HMM object
Plot the HMM schema (see plot.HMM)
Run the Viterbi algorithm on the segment 1501:1800 of the human gene and display the resulting states

library("aphid")

### convert DNA sequence to a list of individual characters,
### keep only the positions 1501-1800
seq = strsplit(as.character(genes[[1]]), '')[[1]][1501:1800]

### we create an artificial "Begin" that goes directly to "B":
### add 1 row and 1 column to Mmat
M2 = cbind(rep(0, nstates+1), rbind(rep(0, nstates), Mmat))
colnames(M2)[1] = "Begin"
rownames(M2)[1] = "Begin"
### transit from "Begin" to "B"
M2["Begin", "B"] = 1

### Emat / Mmat are in units of probability,
### the HMM calculations are with log(probability)
hmm.orf = structure(list(A=log(M2), E=log(Emat), qe=rep(.25,4)), class="HMM")

plot(hmm.orf)

hmm.vtrb = Viterbi(hmm.orf, seq)

vis.gene = as.character(genes[[1]][1501:1800])
state.letter = substr(states,1,1)
vis.hmmpath = paste(state.letter[hmm.vtrb$path+1], collapse='')
c(vis.gene, vis.hmmpath)

[1] "GGGGAGTATGTCACCATGGACACCAGCTGGGCTGGCTTTGTGCACCCCTGGAGCCGCAAGGTAGCCTTCGTGTTGGGCCGCCACAAAGTACGCACGGCCCCCCTGAATGAGGACGTGTTCACTCCCCCGGCCCCCAGCCCAGCTCCCTCCCTGGACACTGATATCCAGGAGCTGTCAGAGCAGATCCACCGGCTGCTGCTGCAGCCCGTCCACAGCCCCAGCCCCACGGGACTCTGTGGAGTCGGCGCCGTGACATCCCCAGGCCCTCTCCACAGCCCTGGGTCCTCCAGTGATAGCAAC"
[2] "BBBBBBBSSSIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIEEEBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB"