3. Hidden Markov Models - solutions

Author

EPFL - SV - BIO-463

Published

March 4, 2025

Exercise 1: Translate genes, find longest open reading frame

  1. Load the file GeneSequences.fa
  2. Translate the sequences (in 3 frames: only the forward strand)
  3. Find the longest ORF in each sequence: use matchPattern for residues “*” and “M”
  4. Save it as a fasta file named orf.fa
library("Biostrings")
genes = readDNAStringSet("GeneSequences.fa")
f1 = genes
f2 = subseq(genes, 2)
f3 = subseq(genes, 3)
frames = translate(c(f1, f2, f3), no.init.codon=T, if.fuzzy.codon="solve")
longest.orf = AAStringSet(rep('', length(genes)))
names(longest.orf) = names(genes)

for (nf in 1:length(frames)) {
    frm = frames[[nf]]
    nme = names(frames)[[nf]]
    ### 1. find positions of "stops", namely "*" characters
    stops = start(matchPattern("*", frm))
    n0 = 1
    ### 2. repeat for each stop found 
    for (n1 in stops) {
        ### find first start ("M") between n0 and n1
        starts = start(matchPattern("M", frm[n0:n1]))
        if (length(starts) > 0) {
            n01 = n0+starts[1]-1
            if (nchar(longest.orf[nme]) < n1-n01+1) longest.orf[[nme]] = frm[n01:n1]
        }
        ### next search will be between this stop and the next
        n0 = n1
    }
}
longest.orf
AAStringSet object of length 2:
    width seq                                               names               
[1]  1291 MSGPLEGADGGGDPRPGESFCPG...EEEEEGRSSSSPALPTAGNCTS* human_5187
[2]  1237 MDVGKPRGGSSERGASWSPAAAA...CEASVADDSMEKKGNSPLLLQR* Barn_owl_104355733
writeXStringSet(longest.orf, "orf.fa")

Exercise 2: Construct an HMM to find ORFs

  • The states S1, S2, S3 represent the 3 consecutive nucleotides of a start codon, E1, E21, E22, E32, E33 represent the 3 possible stop codons, B is background and I[123] are “inner” codons.
  • The emitted symbols are nucleotides A, C, G, T, background and inner codons emit with uniform probabilities.
  • The emission probabilities of start or end states must be specified (easy to guess).
  • The transition probabilities not specified on the schema should be easy to guess. The probabilities must be calculated so that the 3 possible stop codons have the same probability.
states = c("B", "S1", "S2", "S3", "I1", "I2", "I3", "E1", "E21", "E22","E31", "E32")
nstates = length(states)
symbols = c("A", "C", "G", "T")
nsym = length(symbols)

### Most emission prob. will be 1/4 so fill the entire matrix with 1/4,
### then modify the ones that are different
Emat = matrix(1/4, ncol=nsym, nrow=nstates, dimnames=list(states, symbols))
### start is ATG
Emat["S1",] = c(1, 0, 0, 0)
Emat["S2",] = c(0, 0, 0, 1)
Emat["S3",] = c(0, 0, 1, 0)
### stop is one of TAG, TAA, TGA
### first char is T
Emat["E1",] = c(0, 0, 0, 1)
### second is A or G
Emat["E21",] = c(1, 0, 0, 0)
Emat["E22",] = c(0, 0, 1, 0)
### third is A or G after A, and only A after G
Emat["E31",] = c(0, 0, 1, 0)
Emat["E32",] = c(1, 0, 0, 0)

### Transition matrix: according to the schema
Mmat = matrix(0, ncol=nstates, nrow=nstates, dimnames=list(states, states))
Mmat["B", "B"] = 0.98
Mmat["S1", "S2"] = 1
Mmat["S2", "S3"] = 1
Mmat["S3", "I1"] = 1
Mmat["I1", "I2"] = 1
Mmat["I2", "I3"] = 1
Mmat["I3", "I1"] = 0.9
Mmat["E31", "B"] = 0.95
Mmat["E32", "B"] = 0.95
### Sum of probabilities going out of any state must be 1
Mmat["B", "S1"] = 0.02
Mmat["I3", "E1"] = 0.1
Mmat["E31", "S1"] = 0.05
Mmat["E32", "S1"] = 0.05
### These were calculated so that each path through the E states has probability 1/3
Mmat["E1", "E21"] = 2/3
Mmat["E1", "E22"] = 1/3
Mmat["E21", "E31"] = 0.5
Mmat["E21", "E32"] = 0.5
Mmat["E22", "E32"] = 1
library("aphid")

### convert DNA sequence to a list of individual characters,
### keep only the positions 1501-1800
seq = strsplit(as.character(genes[[1]]), '')[[1]][1501:1800]

### we create an artificial "Begin" that goes directly to "B":
### add 1 row and 1 column to Mmat
M2 = cbind(rep(0, nstates+1), rbind(rep(0, nstates), Mmat))
colnames(M2)[1] = "Begin"
rownames(M2)[1] = "Begin"
### transit from "Begin" to "B"
M2["Begin", "B"] = 1

### Emat / Mmat are in units of probability,
### the HMM calculations are with log(probability)
hmm.orf = structure(list(A=log(M2), E=log(Emat), qe=rep(.25,4)), class="HMM")

plot(hmm.orf)

hmm.vtrb = Viterbi(hmm.orf, seq)

vis.gene = as.character(genes[[1]][1501:1800])
state.letter = substr(states,1,1)
vis.hmmpath = paste(state.letter[hmm.vtrb$path+1], collapse='')
c(vis.gene, vis.hmmpath)
[1] "GGGGAGTATGTCACCATGGACACCAGCTGGGCTGGCTTTGTGCACCCCTGGAGCCGCAAGGTAGCCTTCGTGTTGGGCCGCCACAAAGTACGCACGGCCCCCCTGAATGAGGACGTGTTCACTCCCCCGGCCCCCAGCCCAGCTCCCTCCCTGGACACTGATATCCAGGAGCTGTCAGAGCAGATCCACCGGCTGCTGCTGCAGCCCGTCCACAGCCCCAGCCCCACGGGACTCTGTGGAGTCGGCGCCGTGACATCCCCAGGCCCTCTCCACAGCCCTGGGTCCTCCAGTGATAGCAAC"
[2] "BBBBBBBSSSIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIEEEBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB"