# Connect to helvetios and request an interactive session
ssh <username>@helvetios.epfl.ch

# On the login node
module add tmux
tmux
# or
tmux attach
cd /scratch/<username>
Sinteract -a bio-693 -c 20 -t 08:00:00 -m 40g

# On the cluster node
cp /scratch/iseli/course/SraRunTable.txt .

# Do not forget/omit the dot (.)

# Launch the container
singularity shell -s /bin/bash -B /scratch/$USER -B /scratch/iseli /scratch/iseli/MicMap-2.4-f38.sif


# Retrieve all the samples
# Please do not actually run this - it would take too long

for n in `cut -d , -f 1 SraRunTable.txt | grep ^SRR`; do
  echo "Processing $n";
  fasterq-dump --split-3 --progress $n;
  pigz --fast ${n}*.fastq;
  sra-stat --quick $n >${n}_stat.txt;
done

# Instead, create symbolic links to my data

ln -s /scratch/iseli/course/*.fastq.gz .
ln -s /scratch/iseli/course/*_stat.txt .

# Check retrieved data
for f in SRR*_stat.txt; do
  n=`basename $f _stat.txt`;
  cntFQ=`perl -ne '($c)=$_=~/^[^:]+\|(\d+):/;$t+=$c;END{print "$t\n"}' $f`;
  linesFQ=`gzip -dc ${n}_1.fastq.gz | wc -l`;
  if [[ $cntFQ -ne $((linesFQ / 4)) ]]; then
    echo "We have a problem for $n - $cntFQ vs $((linesFQ / 4))";
  else
    echo "count is correct for $n - $cntFQ";
  fi;
done

# Get symbolic links to the reference data

ln -s /scratch/iseli/course/GRCm39.primary_assembly.genome.fa .
ln -s /scratch/iseli/course/gencode.vM32.annotation.gtf .

# You now could prepare the STAR index like so (but no need to actually run it here)
STAR --runMode genomeGenerate \
  --genomeDir STAR_mm39vM32/ \
  --genomeFastaFiles \
      GRCm39.primary_assembly.genome.fa \
  --sjdbGTFfile gencode.vM32.annotation.gtf

# Instead use a symlink to my indices
ln -s /scratch/iseli/course/STAR_mm39vM32 .

# And now you can run the alignment
n=SRR16219925
STAR --runMode alignReads \
  --runThreadN 20 \
  --genomeDir STAR_mm39vM32 \
  --twopassMode Basic \
  --outFileNamePrefix ${n}_mm39vM32/ \
  --outSAMtype BAM SortedByCoordinate \
  --outSAMattributes All \
  --outSAMattrRGline ID:$n LB:$n SM:$n PL:ILLUMINA \
  --readFilesCommand "gzip -dc" \
  --quantMode GeneCounts \
  --limitBAMsortRAM 16000000000 \
  --readFilesIn ${n}_?.fastq.gz

# Check some of the output
cat ${n}_mm39vM32/Log.final.out
less ${n}_mm39vM32/ReadsPerGene.out.tab
sort -k 4,4nr ${n}_mm39vM32/ReadsPerGene.out.tab | less
grep ENSMUSG00000064351.1 gencode.vM32.annotation.gtf | less
samtools view -h Aligned.sortedByCoord.out.bam | less
samtools index Aligned.sortedByCoord.out.bam
samtools tview -p chr1:5153351 ${n}_mm39vM32/Aligned.sortedByCoord.out.bam GRCm39.primary_assembly.genome.fa

# You can also look at chr1:5171281 – probable alt splicing
# and chr1:5194491 – variants

# In case you want to look at potential PCR duplicates
java -jar /usr/local/share/java/picard.jar MarkDuplicates \
  -I ${n}_mm39vM32/Aligned.sortedByCoord.out.bam \
  -O ${n}_mm39vM32/Aligned.sortedByCoord.MD.bam \
  -M ${n}_mm39vM32/marked_dup_metrics.txt
grep -A 1 ^LIBRARY ${n}_mm39vM32/marked_dup_metrics.txt | column -t

# Secondary and PCR duplicates
samtools view ${n}_mm39vM32/Aligned.sortedByCoord.out.bam chr1:5153351-5153600 | less
samtools index ${n}_mm39vM32/Aligned.sortedByCoord.MD.bam
samtools view ${n}_mm39vM32/Aligned.sortedByCoord.MD.bam chr1:5153351-5153600 | less
# You can play with –f 256 and –f 1024 to filter reads using samtools view

# Getting raw counts
featureCounts –T 20 \
  --primary -s 2 -p -B -Q 10 \
  -a gencode.vM32.annotation.gtf \
  -o featureCounts.txt \
  /scratch/iseli/course/SRR162199*_mm39vM32/Aligned.sortedByCoord.out.bam

# Have a look
sed 's+/scratch/iseli/course/++;s/_mm39vM32\/Aligned.sortedByCoord.out.bam//g' featureCounts.txt.summary | column –t

# Generate table ready to load in R
cut -f1,7-  featureCounts.txt | sed 1d | sed 's+/scratch/iseli/course/++;s/_mm39vM32\/Aligned.sortedByCoord.out.bam//g' > final_counts.txt

# vim: filetype=bash
###