Modular Shotgun Sequence Analysis Workflow: Oecophylla - Harnessing Snakemake
Canonically pronounced ee-co-fill-uh, Oecophylla is a Snakemake wrapper for shotgun sequence analysis.
Rather than being a single monolithic tool, Oecophylla is composed of a series of modules, each of which performs a set of related tasks on the data---examples include `qc`, `assemble`, and `taxonomy`. These modules can be installed independently, and are easy to add or change if you want to try a new analysis on an existing data set.
Because Oecophylla is written using the Snakemake bioinformatics workflow system, it inherits many of that tool's advantages, including reproducible execution, automatic updating of downstream steps after upstream modifications, and cluster-enabled parallel execution. To learn more about Snakemake, please read its documentation.
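For readers new to Snakemake, the snippets below all follow the same pattern: a rule declares its inputs and outputs, and Snakemake re-runs the rule whenever an input is newer than its output, which is how downstream steps update automatically. Here is a minimal, illustrative sketch of that mechanism (not an actual Oecophylla rule; the file paths are made up):

```python
# Hypothetical example rule, for illustration only.
# Snakemake re-runs it whenever data/<sample>.reads.fastq.gz changes,
# and any rule that consumes the output is then re-run as well.
rule count_reads:
    input:
        "data/{sample}.reads.fastq.gz"      # made-up input path
    output:
        "results/{sample}.read_count.txt"   # made-up output path
    shell:
        # a FASTQ record is four lines, so reads = lines / 4
        "echo $(( $(zcat {input} | wc -l) / 4 )) > {output}"
```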
Documentation
Instructions for installing and using Oecophylla can be found in the documentation on ReadTheDocs.io.
Code Snippets
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        dbname = os.path.basename(output.db)
        h5name = os.path.basename(output.h5)
        shell("""
            set +u; {params.env}; set -u

            anvi-gen-contigs-database -f {input} \
                -o {temp_dir}/{dbname} \
                -n '{wildcards.bin_sample} contigs db' 2> {log} 1>&2

            anvi-run-hmms -c {temp_dir}/{dbname} --num-threads {threads} 2>> {log} 1>&2

            scp {temp_dir}/{dbname} {output.db}
            scp {temp_dir}/{h5name} {output.h5}
            """)
```
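Two idioms recur throughout these snippets. First, work is done in a temporary directory obtained from `find_local_scratch(TMP_DIR_ROOT)` and results are copied back afterwards, so heavy I/O lands on node-local scratch space. Second, `set +u; {params.env}; set -u` temporarily relaxes bash's "unset variable" check while the per-module environment activation command in `params.env` runs, since activation scripts commonly reference unset variables. The helper itself is not shown in the snippets; a minimal sketch of what it might look like, under those assumptions (the real Oecophylla implementation may differ):

```python
import os
import tempfile

# Hypothetical sketch of the scratch-directory helper used above.
# Prefer the configured node-local scratch root if it exists; otherwise
# fall back to the system default temporary directory.
def find_local_scratch(tmp_dir_root):
    if tmp_dir_root and os.path.isdir(tmp_dir_root):
        return tmp_dir_root
    return tempfile.gettempdir()
```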
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        out_name = os.path.basename(output.gene_calls)
        db = anvio_dir + "{s}/{s}.db".format(s=wildcards.bin_sample)
        shell("""
            set +u; {params.env}; set -u

            anvi-get-dna-sequences-for-gene-calls -c {db} -o {temp_dir}/{out_name} 2> {log} 1>&2

            scp {temp_dir}/{out_name} {output.gene_calls}
            """)
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        hits_name = os.path.basename(output.hits)
        report_name = os.path.basename(output.report)
        db = anvio_dir + "{s}/{s}.db".format(s=wildcards.bin_sample)
        shell("""
            set +u; {params.cent_env}; set -u

            centrifuge -f --threads {threads} \
                -x {params.centrifuge_db} \
                {input.fa} \
                -S {temp_dir}/{hits_name} \
                --report-file {temp_dir}/{report_name}

            scp {temp_dir}/{hits_name} {output.hits}
            scp {temp_dir}/{report_name} {output.report}

            set +u; {params.anvi_env}; set -u

            anvi-import-taxonomy -c {db} \
                -i {output.report} {output.hits} \
                -p centrifuge 2> {log} 1>&2
            """)
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        prof_dir = os.path.dirname(output.prof)
        db = anvio_dir + "{s}/{s}.db".format(s=wildcards.bin_sample)
        shell("""
            set +u; {params.env}; set -u

            scp {input.bam} {temp_dir}/{wildcards.bin_sample}_{wildcards.abund_sample}.bam
            scp {input.bai} {temp_dir}/{wildcards.bin_sample}_{wildcards.abund_sample}.bam.bai

            anvi-profile -i {temp_dir}/{wildcards.bin_sample}_{wildcards.abund_sample}.bam \
                --num-threads {threads} --write-buffer-size 1000 \
                -c {db} \
                --skip-SNV-profiling \
                --overwrite-output-destinations \
                -o {temp_dir}/out 2> {log} 1>&2

            scp {temp_dir}/out/* {prof_dir}/.
            """)

lambda wildcards: expand(map_dir + "{bin_sample}/mapping/{bin_sample}_{abund_sample}.bam",
                         abund_sample=bin_config[wildcards.bin_sample],
                         bin_sample=wildcards.bin_sample)
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        merge_dir = os.path.dirname(output.prof)
        db = anvio_dir + "{s}/{s}.db".format(s=wildcards.bin_sample)
        shell("""
            set +u; {params.env}; set -u

            anvi-merge {input.profiles} \
                -o {temp_dir}/SAMPLES_MERGED \
                -c {db} \
                -W 2> {log} 1>&2

            scp -r {temp_dir}/SAMPLES_MERGED/* {merge_dir}/.
            """)
```
```python
run:
    db = anvio_dir + "{s}/{s}.db".format(s=wildcards.bin_sample)
    shell("""
        set +u; {params.env}; set -u

        anvi-import-collection -p {input.prof} \
            -c {db} \
            -C "MaxBin2" \
            --contigs-mode \
            {input.bins} 2> {log} 1>&2
        """)
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        db = anvio_dir + "{s}/{s}.db".format(s=wildcards.bin_sample)
        shell("""
            set +u; {params.env}; set -u

            anvi-summarize -p {input.prof} \
                -c {db} \
                -o {temp_dir}/{wildcards.bin_sample}_samples-summary_CONCOCT \
                -C CONCOCT 2> {log} 1>&2

            scp {temp_dir}/{wildcards.bin_sample}_samples-summary_CONCOCT/bins_summary.txt {output.txt}
            scp {temp_dir}/{wildcards.bin_sample}_samples-summary_CONCOCT/index.html {output.report}

            tar -czvf {temp_dir}/{wildcards.bin_sample}_samples-summary_CONCOCT.tar.gz {temp_dir}/{wildcards.bin_sample}_samples-summary_CONCOCT

            scp {temp_dir}/{wildcards.bin_sample}_samples-summary_CONCOCT.tar.gz {output.tar}
            """)
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        mem_b = params.memory * 1000000000
        outdir = os.path.dirname(output[0])
        shell("""
            set +u; {params.env}; set -u

            #rm -r {outdir}

            megahit -1 {input.forward} -2 {input.reverse} \
                -m {mem_b} -t {threads} -o {temp_dir}/megahit \
                --out-prefix {wildcards.sample} \
                2> {log} 1>&2

            scp {temp_dir}/megahit/{{*.log,*.fa,*.txt}} {outdir}/.
            """)
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        outdir = os.path.dirname(output[0])
        contigs_fp = os.path.abspath(os.path.join(outdir, 'contigs.fasta'))
        shell("""
            set +u; {params.env}; set -u

            spades.py --meta -t {threads} -m {params.mem} \
                -1 {input.forward} -2 {input.reverse} -o {temp_dir} \
                2> {log} 1>&2

            scp -r {temp_dir}/{{spades.log,*.fasta,assembly_graph*,*.paths}} {outdir}/.

            ln -s {contigs_fp} {outdir}/{wildcards.sample}.contigs.fa
            """)
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        outdir = os.path.dirname(output[0])
        contigs_fp = os.path.abspath(os.path.join(outdir, 'contigs.fasta'))
        shell("""
            set +u; {params.env}; set -u

            spades.py -t {threads} -m {params.mem} \
                -1 {input.forward} -2 {input.reverse} -o {temp_dir} \
                2> {log} 1>&2

            scp -r {temp_dir}/{{spades.log,*.fasta,assembly_graph*,*.paths}} {outdir}/.

            ln -s {contigs_fp} {outdir}/{wildcards.sample}.contigs.fa
            """)
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        outdir = os.path.dirname(output.done)
        assemblies = ' '.join(input)
        shell("""
            set +u; {params.env}; set -u

            metaquast.py -t {threads} -o {temp_dir}/metaquast \
                {assemblies} 2> {log} 1>&2

            scp -r {temp_dir}/metaquast/* {outdir}/.

            touch {output.done}
            """)
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        outdir = os.path.dirname(output.done)
        assemblies = ' '.join(input)
        shell("""
            set +u; {params.env}; set -u

            quast.py -t {threads} -o {temp_dir}/quast \
                {assemblies} 2> {log} 1>&2

            scp -r {temp_dir}/quast/* {outdir}/.

            touch {output.done}
            """)
```
```python
run:
    out_dir = os.path.dirname(output[0])
    shell("""
        set +u; {params.env}; set -u

        multiqc -f -o {out_dir} {assemble_dir}/*/quast/report.tsv 2> {log} 1>&2
        """)
```
```python
run:
    f_fastqs = ' '.join(input.forward)
    r_fastqs = ' '.join(input.reverse)
    shell("""
        cat {f_fastqs} > {output.forward} 2> {log}
        cat {r_fastqs} > {output.reverse} 2> {log}
        """)
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        out_fp = os.path.abspath(output[0])
        for bam in input.bams:
            fn = os.path.basename(bam).split('.')[0]
            shell("""
                set +u; {params.env}; set -u;

                samtools view {bam} --threads {threads} -h | \
                    pileup.sh -Xmx8g out={temp_dir}/{fn}_pileup.txt 2> {log} 1>&2

                awk '{{print $1"\t"$2}}' {temp_dir}/{fn}_pileup.txt | \
                    grep -v '^#' > {temp_dir}/{fn}_abund.txt
                """)
        shell("""
            cd {temp_dir}

            mkdir {wildcards.bin_sample}_abund_files

            mv *.txt {wildcards.bin_sample}_abund_files/.

            tar -czvf {wildcards.bin_sample}_abund_files.tar.gz {wildcards.bin_sample}_abund_files

            scp {wildcards.bin_sample}_abund_files.tar.gz {out_fp}
            """)
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        out_base = os.path.dirname(output[0])
        if params.maxbin is None:
            params.maxbin = ''
        shell("""
            set +u; {params.env}; set -u;

            mkdir {temp_dir}/abund_files

            tar -xzvf {input.abund} -C {temp_dir}/abund_files --strip-components=1

            touch {temp_dir}/abund_list.txt

            for f in {temp_dir}/abund_files/*.txt
            do
                echo $f >> {temp_dir}/abund_list.txt
            done

            run_MaxBin.pl -contig {input.ref} \
                -out {temp_dir}/{wildcards.bin_sample} \
                -abund_list {temp_dir}/abund_list.txt \
                {params.maxbin} -thread {threads} 2> {log} 1>&2

            rm -r {temp_dir}/abund_files

            scp {temp_dir}/* {out_base}/.
            """)
```
```python
run:
    bin_dir = os.path.dirname(input[0])
    bins = ''
    for file in os.listdir(bin_dir):
        if file.endswith(".fasta"):
            bin = 'Bin_' + str(file.split('.')[-2])
            with open(os.path.join(bin_dir, file), 'r') as f:
                for line in f:
                    if line.strip().startswith('>'):
                        name = line.strip()[1:]
                        bins += "{0}\t{1}\n".format(name, bin)
    with open(output[0], 'w') as out:
        out.write(bins)
```
```python
run:
    output_base = os.path.splitext(output[0])[0]
    shell("""
        set +u; {params.env}; set -u

        mash sketch -r -k {params.kmer} \
            -m {params.m} -s {params.size} \
            -o {output_base} {input.forward}
        """)
```
```python
run:
    from itertools import combinations
    for i, j in combinations(input, 2):
        shell('mash dist {i} {j} >> {output[0]}')
```
```python
run:
    from skbio.stats.distance import DissimilarityMatrix
    import pandas as pd
    import numpy as np

    mash_vec = pd.read_csv(input[0], sep='\t', header=None)

    # get sorted list of samples
    samples = sorted(set(mash_vec[0]) | set(mash_vec[1]))

    dm = np.zeros([len(samples), len(samples)])
    pm = np.zeros([len(samples), len(samples)])

    # fill matrices with values
    for s1, s2, d, p in zip(mash_vec[0], mash_vec[1], mash_vec[2], mash_vec[3]):
        i1 = samples.index(s1)
        i2 = samples.index(s2)
        print('s1: %s, s2: %s, i1: %s, i2: %s, d: %s, p: %s' % (s1, s2, i1, i2, d, p))
        dm[i1, i2] = d
        dm[i2, i1] = d
        pm[i1, i2] = p
        pm[i2, i1] = p

    ids = [os.path.basename(x) for x in samples]

    sk_dm = DissimilarityMatrix(dm, ids=ids)
    sk_pm = DissimilarityMatrix(pm, ids=ids)

    sk_dm.write(output['dist_matrix'])
    sk_pm.write(output['p_matrix'])
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        shell("""
            set +u; {params.env}; set -u

            zcat {input.forward} {input.reverse} > {temp_dir}/{wildcards.sample}

            sourmash compute --scaled {params.scaled} \
                -k {params.kmer} -o {output.sketch} \
                {temp_dir}/{wildcards.sample}
            """)
```
```python
run:
    from skbio.stats.distance import DistanceMatrix
    from skbio.io import write
    import pandas as pd
    import numpy as np

    i = ' '.join(input)

    shell("""set +u; {params.env}; set -u
        sourmash compare {i} --csv {output.sim_list}""")

    sim = pd.read_csv(output['sim_list'])

    ids = [os.path.basename(x) for x in sim.columns]

    dist = (1 - sim).values

    # because numerical overflow, set diagonal to zero explicitly
    np.fill_diagonal(dist, 0)

    dm = DistanceMatrix(dist, ids=ids)
    dm.write(output.dist_matrix)
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        for file in input:
            shell("cp {0} {1}/.".format(file, temp_dir))
        shell("""
            set +u; {params.env}; set -u

            humann2_join_tables --input {temp_dir} \
                --output {output.joint_prof} 2> {log} 1>&2

            humann2_reduce_table --input {output.joint_prof} \
                --output {output.max_prof} --function max \
                --sort-by level 2>> {log} 1>&2
            """)
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        shell("""
            set +u; {params.env}; set -u

            zcat {input.forward} {input.reverse} > {temp_dir}/input.fastq

            humann2 --input {temp_dir}/input.fastq \
                --output {temp_dir}/{wildcards.sample} \
                --output-basename {wildcards.sample} \
                --nucleotide-database {params.nt_db} \
                --protein-database {params.aa_db} \
                --taxonomic-profile {input.metaphlan_in} \
                --o-log {log} \
                --threads {threads} \
                {params.other} 2> {log} 1>&2

            scp {temp_dir}/{wildcards.sample}/{wildcards.sample}_genefamilies.tsv {output.genefamilies}
            scp {temp_dir}/{wildcards.sample}/{wildcards.sample}_pathcoverage.tsv {output.pathcoverage}
            scp {temp_dir}/{wildcards.sample}/{wildcards.sample}_pathabundance.tsv {output.pathabundance}
            """)
```
```python
run:
    out_dir = os.path.join(func_dir, "humann2")
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        shell("""
            set +u; {params.env}; set -u

            # copy per-sample tables
            mkdir {temp_dir}/genef
            cp {input.genef} {temp_dir}/genef/.

            mkdir {temp_dir}/pathc
            cp {input.pathc} {temp_dir}/pathc/.

            mkdir {temp_dir}/patha
            cp {input.patha} {temp_dir}/patha/.

            # combine tables
            humann2_join_tables --input {temp_dir}/genef \
                --output {temp_dir}/genefamilies.tsv \
                --file_name genefamilies 2> {log} 1>&2

            humann2_join_tables --input {temp_dir}/pathc \
                --output {temp_dir}/pathcoverage.tsv \
                --file_name pathcoverage 2>> {log} 1>&2

            humann2_join_tables --input {temp_dir}/patha \
                --output {temp_dir}/pathabundance.tsv \
                --file_name pathabundance 2>> {log} 1>&2

            # normalize
            humann2_renorm_table \
                --input {temp_dir}/genefamilies.tsv \
                --output {temp_dir}/genefamilies_cpm.tsv \
                --units cpm -s n 2>> {log} 1>&2

            humann2_renorm_table \
                --input {temp_dir}/pathcoverage.tsv \
                --output {temp_dir}/pathcoverage_relab.tsv \
                --units relab -s n 2>> {log} 1>&2

            humann2_renorm_table \
                --input {temp_dir}/pathabundance.tsv \
                --output {temp_dir}/pathabundance_relab.tsv \
                --units relab -s n 2>> {log} 1>&2

            # stratify
            humann2_split_stratified_table \
                --input {temp_dir}/genefamilies_cpm.tsv \
                --output {temp_dir} 2>> {log} 1>&2

            humann2_split_stratified_table \
                --input {temp_dir}/pathcoverage_relab.tsv \
                --output {temp_dir} 2>> {log} 1>&2

            humann2_split_stratified_table \
                --input {temp_dir}/pathabundance_relab.tsv \
                --output {temp_dir} 2>> {log} 1>&2

            # convert to biom
            for f in {temp_dir}/*.tsv
            do
                fn=$(basename "$f")
                biom convert -i $f -o {temp_dir}/"${{fn%.*}}".biom --to-hdf5
            done

            # copy bioms to output
            cp {temp_dir}/*.biom {out_dir}/.
            """)
```
```python
run:
    out_dir = os.path.dirname(output[0])
    # out_dir = os.path.join(func_dir, "{sample}/shogun")
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        shell("""
            set +u; {params.env}; set -u

            cp {input.profile} {temp_dir}/{wildcards.sample}.txt

            for level in genus species strain
            do
                shogun functional \
                    --database {params.db} \
                    --input {temp_dir}/{wildcards.sample}.txt \
                    --output {temp_dir} \
                    --level $level \
                    2>> {log} 1>&2
            done

            cp {temp_dir}/{wildcards.sample}.*.txt {out_dir}/
            """)
```
```python
run:
    for level in ('genus', 'species', 'strain'):
        for target in ('kegg', 'modules', 'modcov', 'pathways', 'pathcov'):
            item = '%s_%s' % (level, target)
            pandas2biom(output[item], combine_profiles(zip(samples, input[item])))
```
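The combining rules above and in the taxonomy module rely on two workflow helpers, `combine_profiles()` and `pandas2biom()`, that are not shown in these snippets. Purely as a hypothetical sketch of what such helpers could look like (the actual Oecophylla implementations may differ), assuming per-sample profiles are two-column, tab-separated feature/count tables:

```python
import pandas as pd
from biom import Table
from biom.util import biom_open

def combine_profiles(sample_profile_pairs):
    # Hypothetical: merge per-sample "feature <tab> count" tables into one
    # features-by-samples DataFrame, filling absent features with zero.
    series = {}
    for sample, fp in sample_profile_pairs:
        df = pd.read_csv(fp, sep='\t', header=None, index_col=0)
        series[sample] = df.iloc[:, -1]          # last column holds the counts
    return pd.DataFrame(series).fillna(0)

def pandas2biom(out_fp, table):
    # Hypothetical: write a features-by-samples DataFrame as an HDF5 BIOM table.
    biom_table = Table(table.values,
                       observation_ids=table.index,
                       sample_ids=table.columns)
    with biom_open(out_fp, 'w') as f:
        biom_table.to_hdf5(f, 'oecophylla')
```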
```python
run:
    prepend = '{0}_{1}_contig_'.format(wildcards.bin_sample,
                                       config['params']['mapping_assembler'])
    simplify_headers(input[0], prepend=prepend,
                     output_fp=output.fasta, header_fp=output.headers)
```
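`simplify_headers()` is another helper defined elsewhere in the workflow. Judging from how it is called, it rewrites contig headers to a short uniform prefix and saves the mapping back to the original names; a minimal hypothetical sketch under that assumption (the real implementation may differ):

```python
def simplify_headers(fasta_fp, prepend, output_fp, header_fp):
    # Hypothetical sketch: rename each FASTA header to "<prepend><n>" and
    # record "new header <tab> original header" in a separate mapping file.
    with open(fasta_fp) as fin, \
         open(output_fp, 'w') as fout, \
         open(header_fp, 'w') as fmap:
        n = 0
        for line in fin:
            if line.startswith('>'):
                n += 1
                new = '%s%d' % (prepend, n)
                fmap.write('%s\t%s\n' % (new, line[1:].strip()))
                fout.write('>%s\n' % new)
            else:
                fout.write(line)
```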
```python
run:
    outdir = os.path.dirname(output[0])
    shell("""
        set +u; {params.env}; set -u;
        bowtie2-build {input} {outdir}/{wildcards.bin_sample} 2> {log} 1>&2""")
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        idx_base = os.path.join(os.path.dirname(input.idx[0]), wildcards.bin_sample)
        shell("""
            set +u; {params.env}; set -u;

            bowtie2 -x {idx_base} -p {threads} --no-unal \
                -q -1 {input.forward} -2 {input.reverse} 2> {log.bowtie} | \
                samtools sort -O bam -l 0 -T {temp_dir} -o {temp_dir}/out.bam 2> {log.other}

            samtools index {temp_dir}/out.bam

            scp {temp_dir}/out.bam {output.bam}
            scp {temp_dir}/out.bam.bai {output.bai}
            """)
```
```python
run:
    out_dir = os.path.dirname(output[0])
    in_fastqs = ' '.join(input.forward + input.reverse)
    shell("""
        set +u; {params.env}; set -u

        fastqc --threads {threads} --outdir {out_dir} {in_fastqs} 2> {log} 1>&2

        touch {output}
        """)
```
```python
run:
    out_dir = os.path.dirname(output[0])
    shell("""
        set +u; {params.env}; set -u

        multiqc -f -o {out_dir} {raw_dir}/*/fastqc_per_file 2> {log} 1>&2
        """)
```
```python
run:
    f_fastqs = ' '.join(input.forward)
    r_fastqs = ' '.join(input.reverse)
    shell("""
        cat {f_fastqs} > {output.forward} 2> {log}
        cat {r_fastqs} > {output.reverse} 2> {log}
        """)
```
```python
run:
    out_dir = os.path.dirname(output[0])
    print(out_dir)
    shell("""
        set +u; {params.env}; set -u

        fastqc --threads {threads} --outdir {out_dir} {input.forward} {input.reverse} 2> {log} 1>&2
        """)
```
```python
run:
    out_dir = os.path.dirname(output[0])
    shell("""
        set +u; {params.env}; set -u

        multiqc -f -o {out_dir} {raw_dir}/*/fastqc_per_sample 2> {log} 1>&2
        """)
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        f_fp = os.path.basename(output.forward)
        r_fp = os.path.basename(output.reverse)
        shell("""
            set +u; {params.env}; set -u

            atropos --threads {threads} {params.atropos} --report-file {log} --report-formats txt -o {temp_dir}/{f_fp} -p {temp_dir}/{r_fp} -pe1 {input.forward} -pe2 {input.reverse}

            scp {temp_dir}/{f_fp} {output.forward}
            scp {temp_dir}/{r_fp} {output.reverse}
            """)
```
```python
run:
    if params.filter_db is None:
        f_fp = os.path.abspath(input.forward)
        r_fp = os.path.abspath(input.reverse)
        shell("""
            ln -s {f_fp} {output.forward}
            ln -s {r_fp} {output.reverse}

            echo 'No DB provided; sample not filtered.' > {log.bowtie}
            echo 'No DB provided; sample not filtered.' > {log.other}
            """)
    else:
        with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
            shell("""
                set +u; {params.env}; set -u

                bowtie2 -p {threads} -x {params.filter_db} --very-sensitive -1 {input.forward} -2 {input.reverse} 2> {log.bowtie} | \
                    samtools view -f 12 -F 256 -b -o {temp_dir}/{wildcards.sample}.unsorted.bam 2> {log.other}

                samtools sort -T {temp_dir}/{wildcards.sample} -@ {threads} -n \
                    -o {temp_dir}/{wildcards.sample}.bam {temp_dir}/{wildcards.sample}.unsorted.bam 2> {log.other}

                bedtools bamtofastq -i {temp_dir}/{wildcards.sample}.bam -fq {temp_dir}/{wildcards.sample}.R1.trimmed.filtered.fastq -fq2 {temp_dir}/{wildcards.sample}.R2.trimmed.filtered.fastq 2> {log.other}

                pigz -p {threads} -c {temp_dir}/{wildcards.sample}.R1.trimmed.filtered.fastq > {temp_dir}/{wildcards.sample}.R1.trimmed.filtered.fastq.gz
                pigz -p {threads} -c {temp_dir}/{wildcards.sample}.R2.trimmed.filtered.fastq > {temp_dir}/{wildcards.sample}.R2.trimmed.filtered.fastq.gz

                cp {temp_dir}/{wildcards.sample}.R1.trimmed.filtered.fastq.gz {output.forward}
                cp {temp_dir}/{wildcards.sample}.R2.trimmed.filtered.fastq.gz {output.reverse}
                """)
```
```python
run:
    out_dir = os.path.dirname(output[0])
    shell("""
        set +u; {params.env}; set -u

        fastqc --threads {threads} --outdir {out_dir} {input.forward} {input.reverse} 2> {log} 1>&2
        """)
```
```python
run:
    out_dir = os.path.dirname(output[0])
    shell("""
        set +u; {params.env}; set -u

        multiqc -f -s -o {out_dir} {qc_dir}/*/fastqc_per_sample {qc_dir}/logs 2> {log} 1>&2
        """)
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        shell("""
            set +u; {params.env}; set -u

            # get stem file path
            stem={output.profile}
            stem=${{stem%.profile.txt}}

            # merge input files
            zcat {input.forward} {input.reverse} > {temp_dir}/input.fastq

            # run MetaPhlAn2 to generate taxonomic profile
            metaphlan2.py {temp_dir}/input.fastq \
                --input_type fastq \
                --mpa_pkl {params.db}.pkl \
                --bowtie2db {params.db} \
                --nproc {threads} \
                --tmp_dir {temp_dir} \
                --bowtie2out {temp_dir}/map.tmp \
                -o {output.profile} \
                2> {log} 1>&2

            # keep mapping file
            if [[ "{params.map}" == "True" ]]
            then
                gzip -c {temp_dir}/map.tmp > $stem.map.txt.gz
            fi
            """)
```
```python
run:
    table = combine_profiles(zip(samples, input))
    pandas2biom(output[0], table)
    name2tid = None
    if params['name2tid']:
        with open(params['name2tid'], 'r') as f:
            name2tid = dict(x.split('\t') for x in f.read().splitlines())
    for level in params['levels'].split(','):
        pandas2biom('%s/metaphlan2/combined_profile.%s.biom' % (taxonomy_dir, level),
                    extract_level(table, level[0].lower(), delim='|', dic=name2tid))
```
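Here `extract_level()` is another helper not shown in the snippets. Based on its call signature, it appears to slice a MetaPhlAn2-style, pipe-delimited lineage table down to a single rank; a hypothetical sketch under that assumption (the real implementation may differ):

```python
def extract_level(table, rank, delim='|', dic=None):
    # Hypothetical sketch: keep only rows whose lineage terminates at the
    # requested rank (e.g. 'g' for genus, matching a trailing "g__" field),
    # and optionally rename features via a name-to-taxid dictionary.
    prefix = rank + '__'
    keep = [ix for ix in table.index
            if ix.split(delim)[-1].startswith(prefix)]
    sub = table.loc[keep].copy()
    names = [ix.split(delim)[-1][len(prefix):] for ix in keep]
    if dic is not None:
        names = [dic.get(n, n) for n in names]
    sub.index = names
    return sub
```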
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        shell("""
            set +u; {params.env}; set -u

            # get stem file path
            stem={output.report}
            stem=${{stem%.report.txt}}

            # run Kraken to align reads against reference genomes
            kraken {input.forward} {input.reverse} \
                --db {params.db} \
                --paired \
                --fastq-input \
                --gzip-compressed \
                --only-classified-output \
                --threads {threads} \
                1> {temp_dir}/map.tmp \
                2> {log}

            # generate hierarchical report
            kraken-report {temp_dir}/map.tmp \
                --db {params.db} \
                1> {output.report} \
                2>> {log}

            # generate lineage to count table
            kraken-mpa-report {temp_dir}/map.tmp \
                --db {params.db} \
                1> {output.profile} \
                2>> {log}

            # keep mapping file
            if [[ "{params.map}" == "True" ]]
            then
                gzip -c {temp_dir}/map.tmp > $stem.map.txt.gz
            fi

            # run Bracken to re-estimate abundance at given rank
            if [[ ! -z {params.levels} ]]
            then
                IFS=',' read -r -a levels <<< "{params.levels}"
                for level in "${{levels[@]}}"
                do
                    est_abundance.py -i {output.report} \
                        -k {params.kmers} \
                        -t 10 \
                        -l $(echo $level | head -c 1 | tr a-z A-Z) \
                        -o $stem.redist.$level.txt \
                        2>> {log} 1>&2
                    rm $stem.report_bracken.txt
                done
            fi
            """)
```
```python
run:
    pandas2biom(output[0], combine_profiles(zip(samples, input)))
    for level in params['levels'].split(','):
        redists = ['%s/%s/kraken/%s.redist.%s.txt' % (taxonomy_dir, sample, sample, level)
                   for sample in samples]
        pandas2biom('%s/kraken/combined_redist.%s.biom' % (taxonomy_dir, level),
                    combine_bracken(zip(samples, redists)))
```
```python
run:
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        shell("""
            set +u; {params.env}; set -u

            # get stem file path
            stem={output.report}
            stem=${{stem%.report.txt}}

            # run Centrifuge to align reads against reference genomes
            centrifuge \
                -1 {input.forward} \
                -2 {input.reverse} \
                -x {params.db} \
                -p {threads} \
                -S {temp_dir}/map.tmp \
                --report-file {output.profile} \
                2> {log} 1>&2

            # generate Kraken-style hierarchical report
            centrifuge-kreport {temp_dir}/map.tmp \
                -x {params.db} \
                1> {output.report} \
                2>> {log}

            # keep mapping file
            if [[ "{params.map}" == "True" ]]
            then
                gzip -c {temp_dir}/map.tmp > $stem.map.txt.gz
            fi
            """)
```
```python
run:
    # this is not a typo. Centrifuge produces Kraken-style reports.
    comb, lv2tids = combine_kraken(zip(samples, input))
    pandas2biom(output[0], comb)
    for level in params['levels'].split(','):
        pandas2biom('%s/centrifuge/combined_profile.%s.biom' % (taxonomy_dir, level),
                    comb[comb.index.isin(lv2tids[level[0].upper()])])
```
```python
run:
    aln2ext = {'utree': 'tsv', 'burst': 'b6', 'bowtie2': 'sam'}
    ext = aln2ext[params['aligner']]
    with tempfile.TemporaryDirectory(dir=find_local_scratch(TMP_DIR_ROOT)) as temp_dir:
        shell("""
            set +u; {params.env}; set -u

            # get stem file path
            stem={output.profile}
            stem=${{stem%.profile.txt}}

            # interleave paired fastq's and convert to fasta
            seqtk mergepe {input.forward} {input.reverse} | \
                seqtk seq -A > {temp_dir}/{wildcards.sample}.fna

            # map reads to reference database
            shogun align \
                --aligner {params.aligner} \
                --threads {threads} \
                --database {params.db} \
                --input {temp_dir}/{wildcards.sample}.fna \
                --output {temp_dir} \
                2> {log} 1>&2

            # build taxonomic profile based on read map
            shogun assign_taxonomy \
                --aligner {params.aligner} \
                --database {params.db} \
                --input {temp_dir}/alignment.{params.aligner}.{ext} \
                --output {output.profile} \
                2> {log} 1>&2

            # keep mapping file
            if [[ "{params.map}" == "True" ]]
            then
                gzip -c {temp_dir}/alignment.{params.aligner}.{ext} > $stem.{params.aligner}.{ext}.gz
            fi

            # redistribute reads to given taxonomic ranks
            if [[ ! -z {params.levels} ]]
            then
                IFS=',' read -r -a levels <<< "{params.levels}"
                for level in "${{levels[@]}}"
                do
                    shogun redistribute \
                        --database {params.db} \
                        --level $level \
                        --input {output.profile} \
                        --output $stem.redist.$level.txt \
                        2> {log} 1>&2
                done
            fi
            """)
```
```python
run:
    pandas2biom(output[0], combine_profiles(zip(samples, input)))
    for level in params['levels'].split(','):
        redists = ['%s/%s/shogun/%s.redist.%s.txt' % (taxonomy_dir, sample, sample, level)
                   for sample in samples]
        pandas2biom('%s/shogun/combined_redist.%s.biom' % (taxonomy_dir, level),
                    combine_profiles(zip(samples, redists)))
```
```python
shell:
    "rm -rf " + raw_dir

rule clean_qc:
    shell:
        "rm -rf " + qc_dir

rule clean_assemble:
    shell:
        "rm -rf " + assemble_dir

rule clean_map:
    shell:
        "rm -rf " + map_dir

rule clean_bin:
    shell:
        "rm -rf " + bin_dir

rule clean_anvio:
    shell:
        "rm -rf " + anvio_dir

rule clean:
    shell:
        "rm -rf " + raw_dir + " " + "results"
```
Support
- Future updates