library("Biostrings")
library("Rgraphviz")
library("biomaRt")2. Genome assembly, sequence alignments - solutions
Exercise 1: Eulerian path in a graph
Sequencing reads
#### scan() the file as a list of "character" strings
datafile = scan("SequencingReads.txt", character())
head(datafile) [1] "GGGGTGGTAAAGATCAAGAACAAGA" "TCTACACGCACACTGATCAGTCCCA"
[3] "AGAAGAAAGTGTTGGACTCCTCCCG" "AGTCCTTTTTCTGCAGGATCAGGGG"
[5] "CTGACACGTTTGTGGCTGTGTTTTC" "ACCGAAGAGTGTTTCAGCCAGATGT"
#### convert to a DNAStringSet
reads = DNAStringSet(datafile)
readsDNAStringSet object of length 22:
width seq
[1] 25 GGGGTGGTAAAGATCAAGAACAAGA
[2] 25 TCTACACGCACACTGATCAGTCCCA
[3] 25 AGAAGAAAGTGTTGGACTCCTCCCG
[4] 25 AGTCCTTTTTCTGCAGGATCAGGGG
[5] 25 CTGACACGTTTGTGGCTGTGTTTTC
... ... ...
[18] 25 CTGAACACACTCCAAAAAACACTGA
[19] 25 CCCGGTTTGTAGAGCTGCTTGTCCC
[20] 25 TCCGAATTACCCCATACTTGGTCCA
[21] 25 CCCACCTGCCGCTTTGGAACATGGA
[22] 25 ACTTTTTCCAGGCTCTGAATGACCG
#### nb of reads, size of reads
numreads = length(reads)
readlength = width(reads[1])Constructing the graph
#### subset reads: first 4 nucleotides, then last 4
readstarts = as.character(subseq(reads, 1, 4))
readends = as.character(subseq(reads, readlength-3, readlength))
### alternative: end is beginning of reversed read. Works for non-constant length!
readends = as.character(reverse(subseq(reverse(reads), 1, 4)))
###
nodes = sort(unique(c(readstarts, readends)))
numnodes = length(nodes)
edgelabels = 1:numreads
names(edgelabels) = paste(readstarts, readends, sep='~')- Create the adjacency matrix A: its columns and rows represent nodes (4-mers) from the nodes list, and the matrix element A[x,y] is the number of reads connecting x to y.
#### Adjacency matrix numnodes x numnodes, rows and cols are named with elements of "nodes"
A = matrix(0, nrow=numnodes, ncol=numnodes, dimnames=list(nodes, nodes))
for (n in 1:numreads) {
nfrom = readstarts[n]
nto = readends[n]
A[nfrom, nto] = A[nfrom, nto]+1
}- Now display the corresponding graph as follows (you can then play with the options to improve the looks of your graph):
grEuler = graphAM(adjMat=A, values=list(weight=1), edgemode="directed")
grattr = getDefaultAttrs()
grattr$graph$size = c(7,10)
grattr$edge$minlen = 1.5
grattr$edge$arrowsize=.5
grattr$edge$labelfontsize = 18
edgeattr = list(label=edgelabels)
plot(grEuler, edgeAttrs=edgeattr, attrs=grattr)Eulerian path and contig
# this is (visually) the order which edges are visted on a path through the graph
edgepath = c(22, 6, 9, 10, 18, 5, 15, 13, 12, 3, 19, 14, 2, 21, 17, 11, 4, 1, 7, 20, 16, 8)
sortedreads = reads[edgepath]
#### This is ok: cut nucleotides 1-4 of each sequence:
contig = DNAStringSet(paste(subseq(sortedreads, 5), collapse=''))
#### But you would like to keep the very first read intact. So paste it back at the begining:
contig = DNAStringSet(paste(c(subseq(sortedreads[1], 1, 4), subseq(sortedreads, 5)), collapse=''))names(contig) = "Week2Contig"
writeXStringSet(contig, "Week2Contig.fa")- Go the the NCBI blast and run the following operations:
- Choose “Nucleotide Blast”
- Upload your fasta
- Choose “refseq_rna” database
- Organism: Vertebrata
- Run “Blast”
- Which gene of which species have you assembled?
Gene PER3 of Tylo alba (Barn owl):
Nucleotide statistics
- Create a barplot of the sliding window base frequencies.
base.freqs = letterFrequencyInSlidingView(contig[[1]], 20, DNA_BASES, as.prob=T)
#### cosmetic aspects of the plot with par()
par(las=1, bg="white")
#### need to transpose the matrix to get frequencies vertically
barplot(t(base.freqs), col=2:5, border='NA')
#### add a legend to show which color is which base
legend("top", DNA_BASES, col=2:5, pch=15, cex=.8, horiz=T)
axis(1)Exercise 2: Align your contig to the Chicken genome
- Start from the result of Exercise 1: load the assembled contig as a DNAString
- Download the transcript found at chr21:295510-295980 on the Chicken genome using biomaRt and keep only the part between nucleotides 335 and 806
- Align these two sequences using a match score of +1, mismatch score of -1 and gap penalty of -2 (you need to define the missing variables here: see scoring matrices)
ensembl = useMart("ensembl", "ggallus_gene_ensembl")
ensembl_qry = getSequence(chromosome=21, start=295510, end=295980,
type=c("uniprot_gn_symbol","start_position","strand"),
seqType="cdna", mart=ensembl)
gg_seq = subseq(DNAStringSet(ensembl_qry$cdna), 335, 806)
qry_seq = readDNAStringSet("Week2Contig.fa")
scMatrix = nucleotideSubstitutionMatrix(match=1, mismatch=-1, baseOnly=TRUE)
gapPenalty = -2
pairwiseAlignment(qry_seq[[1]], gg_seq[[1]], substitutionMatrix=scMatrix, gapOpening=0, gapExtension=gapPenalty)Global PairwiseAlignmentsSingleSubject (1 of 1)
pattern: A-C------T---TTTT-CCAGGCTCTGAATGAC...CTTCTGTCCATGTGGATGCAGAATGCTGCTGCTT
subject: AGCAAACAATGAGTTTTTCCAGGCTCTCAATGAC...CCTCTGTCCATGTGGATGCAGAGT-C--CTGC--
score: 329
- Go to UCSC’s Blat alignment page, select chicken genome (galGal6 release) and paste the contig sequence (or upload the file). See to which genomic region it aligns to and observe the intron/exon structure.