# Connect to helvetios and request an interactive session ssh @helvetios.epfl.ch # On the login node module add tmux tmux # or tmux attach cd /scratch/ Sinteract -a bio-693 -c 20 -t 08:00:00 -m 40g # On the cluster node cp /scratch/iseli/course/SraRunTable.txt . # Do not forget/omit the dot (.) # Launch the container singularity shell -s /bin/bash -B /scratch/$USER -B /scratch/iseli /scratch/iseli/MicMap-2.4-f38.sif # Retrieve all the samples # Please do not actually run this - it would take too long for n in `cut -d , -f 1 SraRunTable.txt | grep ^SRR`; do echo "Processing $n"; fasterq-dump --split-3 --progress $n; pigz --fast ${n}*.fastq; sra-stat --quick $n >${n}_stat.txt; done # Instead, create symbolic links to my data ln -s /scratch/iseli/course/*.fastq.gz . ln -s /scratch/iseli/course/*_stat.txt . # Check retrieved data for f in SRR*_stat.txt; do n=`basename $f _stat.txt`; cntFQ=`perl -ne '($c)=$_=~/^[^:]+\|(\d+):/;$t+=$c;END{print "$t\n"}' $f`; linesFQ=`gzip -dc ${n}_1.fastq.gz | wc -l`; if [[ $cntFQ -ne $((linesFQ / 4)) ]]; then echo "We have a problem for $n - $cntFQ vs $((linesFQ / 4))"; else echo "count is correct for $n - $cntFQ"; fi; done # Get symbolic links to the reference data ln -s /scratch/iseli/course/GRCm39.primary_assembly.genome.fa . ln -s /scratch/iseli/course/gencode.vM32.annotation.gtf . # You now could prepare the STAR index like so (but no need to actually run it here) STAR --runMode genomeGenerate \ --genomeDir STAR_mm39vM32/ \ --genomeFastaFiles \ GRCm39.primary_assembly.genome.fa \ --sjdbGTFfile gencode.vM32.annotation.gtf # Instead use a symlink to my indices ln -s /scratch/iseli/course/STAR_mm39vM32 . # And now you can run the alignment n=SRR16219925 STAR --runMode alignReads \ --runThreadN 20 \ --genomeDir STAR_mm39vM32 \ --twopassMode Basic \ --outFileNamePrefix ${n}_mm39vM32/ \ --outSAMtype BAM SortedByCoordinate \ --outSAMattributes All \ --outSAMattrRGline ID:$n LB:$n SM:$n PL:ILLUMINA \ --readFilesCommand "gzip -dc" \ --quantMode GeneCounts \ --limitBAMsortRAM 16000000000 \ --readFilesIn ${n}_?.fastq.gz # Check some of the output cat ${n}_mm39vM32/Log.final.out less ${n}_mm39vM32/ReadsPerGene.out.tab sort -k 4,4nr ${n}_mm39vM32/ReadsPerGene.out.tab | less grep ENSMUSG00000064351.1 gencode.vM32.annotation.gtf | less samtools view -h Aligned.sortedByCoord.out.bam | less samtools index Aligned.sortedByCoord.out.bam samtools tview -p chr1:5153351 ${n}_mm39vM32/Aligned.sortedByCoord.out.bam GRCm39.primary_assembly.genome.fa # You can also look at chr1:5171281 – probable alt splicing # and chr1:5194491 – variants # In case you want to look at potential PCR duplicates java -jar /usr/local/share/java/picard.jar MarkDuplicates \ -I ${n}_mm39vM32/Aligned.sortedByCoord.out.bam \ -O ${n}_mm39vM32/Aligned.sortedByCoord.MD.bam \ -M ${n}_mm39vM32/marked_dup_metrics.txt grep -A 1 ^LIBRARY ${n}_mm39vM32/marked_dup_metrics.txt | column -t # Secondary and PCR duplicates samtools view ${n}_mm39vM32/Aligned.sortedByCoord.out.bam chr1:5153351-5153600 | less samtools index ${n}_mm39vM32/Aligned.sortedByCoord.MD.bam samtools view ${n}_mm39vM32/Aligned.sortedByCoord.MD.bam chr1:5153351-5153600 | less # You can play with –f 256 and –f 1024 to filter reads using samtools view # Getting raw counts featureCounts –T 20 \ --primary -s 2 -p -B -Q 10 \ -a gencode.vM32.annotation.gtf \ -o featureCounts.txt \ /scratch/iseli/course/SRR162199*_mm39vM32/Aligned.sortedByCoord.out.bam # Have a look sed 's+/scratch/iseli/course/++;s/_mm39vM32\/Aligned.sortedByCoord.out.bam//g' featureCounts.txt.summary | column –t # Generate table ready to load in R cut -f1,7- featureCounts.txt | sed 1d | sed 's+/scratch/iseli/course/++;s/_mm39vM32\/Aligned.sortedByCoord.out.bam//g' > final_counts.txt # vim: filetype=bash ###