Plant genome assembly and annotation pipeline using snakemake
This Snakemake pipeline aims for plant genome assembly with HiFi data and genome annotation with RNA-Seq/IsoSeq.
Assembly
Modified from the VGP assembly VGP tutorial
Annotation
-
Maker running inside
snakemake
still have the problem since the NSF lock and submit system is already included. -
IsoSeq integration is still in development. For diploid or polyploidy, FLNC mapping is better than the clustering of the PacBio pipeline if you have haplotype-resolved assembly.
-
Automatic pipeline always confused the complex gene clusters (metabolic gene clusters / NLR gene clusters), please manually curation it before any further interpretation.
WebApollo
orIGV-GSAman
could be a good start for manual curation. (https://www.bilibili.com/video/BV1x84y1z7ZW/)
Code Snippets
17 18 19 20 21 22 23 24 25 26 | shell: """ {module} {asm} samtools faidx {input.fa} ln -sf {params.fa} {output.fa} ln -sf {params.fa} {output.ref} ln -sf {params.fai} {output.fai} ln -sf {params.fai} {output.ref_fai} """ |
40 41 42 43 44 45 | shell: """ {module} {EDTA} cd {params.DIR} && EDTA_raw.pl --genome {wildcards.name}.fa --species {species} --type ltr --threads {threads} """ |
58 59 60 61 62 63 64 | shell: """ {module} {EDTA} cd {params.DIR} EDTA_raw.pl --genome {wildcards.name}.fa --species {species} --type tir --threads {threads} """ |
78 79 80 81 82 83 | shell: """ {module} {EDTA} cd {params.DIR} && EDTA_raw.pl --genome {wildcards.name}.fa --species {species} --type helitron --threads {threads} """ |
103 104 105 106 107 108 109 110 111 112 113 | run: #cmd = EDTA + " && " + RepeatMasker + " && " + RepeatModeler + " && " + blast + " && " cmd = module + "&&" + EDTA + " && " cmd = cmd + " cd " + params.DIR + " && EDTA.pl --genome " + wildcards.name + ".fa --species " + species if lib : cmd = cmd + " --curatedlib " + lib if cds : cmd = cmd + " --cds " + cds cmd = cmd + " --sensitive " + str(sensitive) + " --anno 1 --step " + step + " -t " + str(threads) #cmd = cmd + " --blast " + params.blast + " --repeatmasker " + params.masker + " --repeatmodeler " + params.modeler + " -protlib " + params.problib shell("{cmd}") |
125 126 127 128 129 130 131 | shell: """ {module} {EDTA} grep -v -P "Satellite|rich|Simple_repeat|Low_complexity|tRNA|rRNA" {input.gff} | bedtools maskfasta -fi {input.fa} -bed - -fo {output.soft} -soft grep -v -P "Satellite|rich|Simple_repeat|Low_complexity|tRNA|rRNA" {input.gff} | bedtools maskfasta -fi {input.fa} -bed - -fo {output.hard} """ |
24 25 26 27 28 29 | shell: """ {module} {QC} || echo "true" fastp -a auto --adapter_sequence_r2 auto --detect_adapter_for_pe -w {threads} -i {input.R1} -I {input.R2} --n_base_limit {params.nbase} --cut_window_size {params.window} --cut_mean_quality {params.mean_qual} --length_required {params.length} --qualified_quality_phred {params.quality} -o {output.R1} -O {output.R2} --json {output.json} --html {output.html} >&2 2>{log} """ |
42 43 44 45 46 | shell: """ md5sum {input.R1} 1> {output.md5_1} md5sum {input.R2} 1> {output.md5_2} """ |
65 66 67 68 69 70 | shell: """ {module} {QC} fastqc -t {threads} {input.R1} {input.R2} -o {params.outdir} >&2 2>{log} """ |
98 99 100 101 102 103 104 | shell: """ {mapping} cp ctl/.gm_key ~/ cp -r /public/home/baozhigui/miniconda3/envs/braker2/config/ ./ hisat2-build -p {threads} {input.ref} {params.DIR}/{wildcards.name} """ |
121 122 123 124 125 | shell: """ {mapping} hisat2 --dta -x {params.index} -1 {input.R1} -2 {input.R2} --rna-strandness RF --summary-file {output.summary} --new-summary -p {threads} | samtools view -@ 4 -Sb - | samtools sort -@ 6 -o {output.bam} - >&2 2>{log} """ |
136 137 138 139 140 141 | shell: """ {mapping} stringtie -p {threads} --rf -l {wildcards.sample} -o {output.gtf} {input.bam} gffread -E {output.gtf} -o - | sed "s#transcript#match#g" | sed "s#exon#match_part#g" > {output.gff3} """ |
156 157 158 159 160 | shell: """ {mapping} hisat2-build -p {threads} {input.ref} {params.DIR}/hardmask.{wildcards.name} """ |
177 178 179 180 181 | shell: """ {mapping} hisat2 --dta -x {params.index} -1 {input.R1} -2 {input.R2} --rna-strandness RF --summary-file {output.summary} --new-summary -p {threads} | samtools view -@ 4 -Sb - | samtools sort -@ 6 -o {output.bam} - >&2 2>{log} """ |
196 197 198 199 200 | shell: """ {mapping} minimap2 -ax splice:hq -uf {input.ref} {input.fa} -t {threads} | samtools view -@ 4 -Shb - | samtools sort -@ 4 -o {output.bam} """ |
216 217 218 | run: bam = ",".join(input.bam) + "," + ",".join(input.iso_bam) shell("{module} && {genemark} && {braker2} && braker.pl --species=braker_{name} --genome={input.ref} --workingdir={params.DIR} --gff3 --AUGUSTUS_CONFIG_PATH={augustus_conf} --nocleanup --softmasking --bam={bam} --cores {threads}") |
233 234 235 | run: bam = ",".join(input.bam) shell("{module} && {genemark} && {braker2} && braker.pl --species=braker_{name} --genome={input.ref} --workingdir={params.DIR} --gff3 --AUGUSTUS_CONFIG_PATH={augustus_conf} --nocleanup --softmasking --bam={bam} --cores {threads}") |
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 | shell: """ {module} {maker} mkdir -p results/03.annotation/06.maker/round1 && cd results/03.annotation/06.maker/round1 cp ../../../../ctl/.gm_key ~/ cp ../../../../ctl/maker/round1/*.ctl ./ num=`ls {outdir}/results/03.annotation/04.RNA-seq/stringtie/*.gff3|wc -l` if [ $num -gt 1 ];then gff=`ls {outdir}/results/03.annotation/04.RNA-seq/stringtie/*.gff3|tr '\\n' ','|sed "s/,$//g"` else gff=`ls {outdir}/results/03.annotation/04.RNA-seq/stringtie/*.gff3` fi echo $gff sed -i 's#^genome=#genome={params.fasta}#g' maker_opts.ctl sed -i -e "s|^est_gff=|est_gff=$gff|g" maker_opts.ctl sed -i 's#^protein=#protein={params.pro}#g' maker_opts.ctl sed -i 's#^rmlib=#rmlib={params.te}#g' maker_opts.ctl mpiexec -n {threads} maker -base {wildcards.name} maker_bopts.ctl maker_exe.ctl maker_opts.ctl >&2 2>{log} cd {wildcards.name}.maker.output gff3_merge -d ./{wildcards.name}_master_datastore_index.log -o {wildcards.name}.maker.gff fasta_merge -d ./{wildcards.name}_master_datastore_index.log -o {wildcards.name} """ |
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 | shell: """ mkdir -p results/03.annotation/06.maker/round2/Training/SNAP cd results/03.annotation/06.maker/round2/Training/SNAP {module} {maker} maker2zff -x {params.aed} ../../../round1/{wildcards.name}.maker.output/{wildcards.name}.maker.gff # gather some stats and validate fathom genome.ann genome.dna -gene-stats > gene-stats.log 2>&1 fathom genome.ann genome.dna -validate > validate.log 2>&1 # collect the training sequences and annotations, plus 1000 surrounding bp for training fathom genome.ann genome.dna -categorize 1000 > categorize.log 2>&1 fathom uni.ann uni.dna -export 1000 -plus > uni-plus.log 2>&1 mkdir params cd params forge ../export.ann ../export.dna > ../forge.log 2>&1 cd .. hmm-assembler.pl genome params > {params.hmm} """ |
92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | shell: """ {module} {maker} cp -r results/03.annotation/05.braker_{wildcards.name}/species/* config/species/ export AUGUSTUS_CONFIG_PATH={params.config} mkdir -p results/03.annotation/06.maker/round2 && cd results/03.annotation/06.maker/round2 cp ../../../../ctl/maker/round2/*.ctl ./ sed -i 's#^genome=#genome={params.fasta}#g' maker_opts.ctl sed -i 's#^maker_gff=#maker_gff={params.gff}#g' maker_opts.ctl sed -i "s#^snaphmm=#snaphmm={params.snap}#g" maker_opts.ctl sed -i "s#^gmhmm=#gmhmm={params.gm}#g" maker_opts.ctl sed -i "s#^augustus_species=#augustus_species=braker_{wildcards.name}#g" maker_opts.ctl mpiexec -n {threads} maker -base {wildcards.name} maker_bopts.ctl maker_exe.ctl maker_opts.ctl >&2 2> maker_round2.log gff3_merge -d ./{wildcards.name}.maker.output/{wildcards.name}_master_datastore_index.log -o {wildcards.name}.maker.all.gff awk '$2 == "maker"' {wildcards.name}.maker.all.gff > {wildcards.name}.maker.only.gff fasta_merge -d ./{wildcards.name}.maker.output/{wildcards.name}_master_datastore_index.log -o {wildcards.name} """ |
Support
- Future updates
Related Workflows





