biyelunwen/99.scripts/workflow/transcripts_assembly/02.referenced_assembly.sh

38 lines
1.5 KiB
Bash

#! /bin/bash
set -e
MAX_MEMORY=${MAX_MEMORY:-"50G"}
VIRIDI=${VIRIDI:-"$PROJECTHOME/01.reference/viridiplantae_odb12/"}
ROSALES=${ROSALES:-"$PROJECTHOME/01.reference/rosales_odb12/"}
SCRIPTS=${SCRIPTS:-"$PROJECTHOME/99.scripts"}
if [ "$#" -ne 5 ]; then
echo "Usage: $0 <reads_1.fastq> <reads_2.fastq> <ref> <stem> <threads>"
echo "Perform reference-guided transcriptome assembly using Hisat2 and Trinity"
exit 1
fi
fq1=$1
fq2=$2
ref=$3
stem=$4
outdir="$stem"_trinity_out_dir
THREADS=$5
# Run Hisat2 for reads mapping to reference genome
hisat2 -p "$THREADS" --dta -x "$ref" -1 "$fq1" -2 "$fq2" -S "$stem".sam
samtools view -b -@ "$THREADS" -o "$stem".raw.bam "$stem".sam
samtools sort -@ "$THREADS" -o "$stem".sorted.bam "$stem".raw.bam
samtools index "$stem".sorted.bam
rm "$stem".sam "$stem".raw.bam
# Run Trinity for de novo transcriptome assembly
Trinity --genome_guided_bam "$stem".sorted.bam --genome_guided_max_intron 10000 --max_memory "$MAX_MEMORY" --CPU "$THREADS" --output "$outdir"
# Get Longest isoform per gene
perl "$SCRIPTS"/trinity_utils/util/misc/get_longest_isoform_seq_per_trinity_gene.pl "$outdir/Trinity-GG.fasta" >"$outdir/longest_isoform.fasta"
# BUSCO assessment
busco -i "$outdir"/longest_isoform.fasta -l "$VIRIDI" -m tran --cpu "$THREADS" -o "$outdir"/busco_viridi -f
busco -i "$outdir"/longest_isoform.fasta -l "$ROSALES" -m tran --cpu "$THREADS" -o "$outdir"/busco_rosales -f
# Length Statistics
TrinityStats.pl "$outdir"/Trinity-GG.fasta >"$outdir"/length_stat.txt