Snakemake pipelines for nanopore sequencing data archiving and processing
Documentation

Detailed documentation and tutorials are available at readthedocs.
Nanopype is a Snakemake-based pipeline providing convenient Oxford Nanopore Technologies (ONT) data processing and storage solutions.
The pipeline is split into a processing part, covering basecalling and alignment, and an analysis part covering further downstream applications. A summary of all included tools is given in the tools section.
To get started, the installation chapter describes the available installation options depending on the operating system, available hardware, and already existing environments.
Recurring steps of nanopore data analysis are covered in the workflow section for both local and cluster usage.
The modules part provides an in-depth description of all available tools and workflows together with their respective configuration options; it is the main reference for the pipeline.
Finally, new users may find the tutorial helpful for learning the general concepts and usage of the pipeline. The test reads included in the package are sufficient to complete the tutorial; no separate wet-lab experiment is required.
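
For orientation, processing in Nanopype is target driven: you request an output file and Snakemake resolves the required basecalling and alignment steps. The following invocation is an illustrative sketch only; the Snakefile location, target path, tag, and cluster submission command are assumptions that depend on your installation and configuration (see the workflow and modules chapters for the exact syntax).

```
# Local run: request a merged alignment for an example tag against an example reference
# (illustrative target path; adjust basecaller, aligner, tag, and reference to your setup)
snakemake --snakefile /path/to/nanopype/Snakefile --cores 16 \
    alignments/minimap2/guppy/WA01.hg38.bam

# Cluster run via Snakemake's generic cluster support (submission command is only an example)
snakemake --snakefile /path/to/nanopype/Snakefile --jobs 100 \
    --cluster "qsub -cwd -pe smp {threads}" \
    alignments/minimap2/guppy/WA01.hg38.bam
```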
Code Snippets
```
shell:
    """
    {config[bin_singularity][minimap2]} -t {threads} {config[alignment_minimap2_flags]} {input.reference} {input.sequence} 1>> {output} 2> >(tee {output}.log >&2)
    if [ $(grep 'ERROR' {output}.log | wc -l) -gt 0 ]; then exit 1; else rm {output}.log; fi
    """
```

```
shell:
    """
    {config[bin_singularity][graphmap2]} align -r {input.reference} -d {input.sequence} -t {threads} {config[alignment_graphmap2_flags]} >> {output}
    """
```

```
shell:
    """
    {config[bin_singularity][graphmap2]} align -r {input.fasta} --index-only
    """
```

```
shell:
    """
    cat {input.sequence} | {config[bin_singularity][ngmlr]} -r {input.reference} -t {threads} {config[alignment_ngmlr_flags]} >> {output}
    """
```

```
shell:
    """
    echo '' | {config[bin_singularity][ngmlr]} -r {input.fasta}
    touch {output.index}
    """
```

```
shell:
    """
    {config[bin_singularity][samtools]} faidx {input.fasta}
    """
```

```
shell:
    """
    cat {input.sam} | perl -anle 'BEGIN{{$header=1}}; if($header == 1){{ if($_ =~ /^@/) {{print $_}} else {{$header=0; print "\@RG\tID:{wildcards.batch}"}}}} else {{print $_}}' | perl -anle 'if($_ =~ /^@/){{print $_}}else{{print join("\t", @F, "RG:Z:{wildcards.batch}")}}' | {config[bin_singularity][samtools]} view -b {config[alignment_samtools_flags]} - | {config[bin_singularity][samtools]} sort -m 4G > {output.bam}
    {config[bin_singularity][samtools]} index {output.bam}
    """
```
```
run:
    with open(output.txt, 'w') as fp:
        print('\n'.join(input.bam), file=fp)
```

```
shell:
    """
    split -l $((`ulimit -n` -10)) {input.bam} {params.input_prefix}.part_
    for f in {params.input_prefix}.part_*; do {config[bin_singularity][samtools]} merge ${{f}}.bam -b ${{f}} -p -@ {threads}; done
    for f in {params.input_prefix}.part_*.bam; do {config[bin_singularity][samtools]} index ${{f}} -@ {threads}; done
    {config[bin_singularity][samtools]} merge {output.bam} {params.input_prefix}.part_*.bam -p -@ {threads}
    {config[bin_singularity][samtools]} index {output.bam} -@ {threads}
    rm {params.input_prefix}.part_*
    """
```

```
run:
    with open(output.txt, 'w') as fp_out:
        for f in input.txt:
            with open(f, 'r') as fp_in:
                fp_out.write(fp_in.read())
```

```
shell:
    """
    split -l $((`ulimit -n` -10)) {input.bam} {params.input_prefix}.part_
    for f in {params.input_prefix}.part_*; do {config[bin_singularity][samtools]} merge ${{f}}.bam -b ${{f}} -p -@ {threads}; done
    for f in {params.input_prefix}.part_*.bam; do {config[bin_singularity][samtools]} index ${{f}} -@ {threads}; done
    {config[bin_singularity][samtools]} merge {output.bam} {params.input_prefix}.part_*.bam -p -@ {threads}
    {config[bin_singularity][samtools]} index {output.bam} -@ {threads}
    rm {params.input_prefix}.part_*
    """
```

```
shell:
    """
    {config[bin_singularity][samtools]} view -F 4 {input} | {config[bin_singularity][python]} {config[sbin_singularity][alignment_1D2.py]} --buffer {params.buffer} --tolerance {params.tolerance} > {output}
    """
```

```
shell:
    """
    while IFS= read -r bam_file; do {config[bin_singularity][samtools]} view ${{bam_file}}; done < {input} | {config[bin_singularity][python]} {config[sbin_singularity][alignment_stats.py]} {output}
    """
```

```
shell:
    """
    while IFS= read -r bam_file; do {config[bin_singularity][samtools]} view -bF 4 ${{bam_file}} -@ {threads} | {config[bin_singularity][bedtools]} bamtobed -i stdin; done < {input.bam} | {config[bin_singularity][python]} {config[sbin_singularity][alignment_cov.py]} {input.chr_sizes} > {output.bedGraph}
    {config[bin_singularity][bedGraphToBigWig]} {output.bedGraph} {input.chr_sizes} {output.bw}
    """
```
```
shell:
    """
    flye_dir=`dirname {config[bin_singularity][python]}`
    PATH=$flye_dir:$PATH
    {config[bin_singularity][python]} {config[bin_singularity][flye]} {params.flye_flags} -g {params.genome_size} -t {threads} {params.flye_preset} {input.seq} -o {params.out_prefix}
    mv {params.out_prefix}/assembly.fasta {output.fa}
    """
```

```
shell:
    """
    mkdir -p {params.out_prefix}
    {config[bin_singularity][wtdbg2]} {params.wtdbg2_flags} -x {params.wtdbg2_preset} -g {params.genome_size} -t {threads} -fo {params.out_prefix}/dbg -i {input.seq}
    {config[bin_singularity][wtpoa-cns]} -t {threads} -i {params.out_prefix}/dbg.ctg.lay.gz -fo {params.out_prefix}/dbg.raw.fa
    {config[bin_singularity][minimap2]} -t {threads} -ax map-ont -r2k {params.out_prefix}/dbg.raw.fa {input.seq} | {config[bin_singularity][samtools]} sort -@ 4 > {params.out_prefix}/dbg.bam
    samtools view -F0x900 {params.out_prefix}/dbg.bam | {config[bin_singularity][wtpoa-cns]} -t {threads} -d {params.out_prefix}/dbg.raw.fa -i - -fo {params.out_prefix}/dbg.cns.fa
    mv {params.out_prefix}/dbg.cns.fa {output.fa}
    """
```
```
shell:
    """
    mkdir -p raw
    {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.batch} raw/ {params.index} --output_format bulk
    {config[bin_singularity][guppy_basecaller]} -i raw/ --recursive --num_callers 1 --cpu_threads_per_caller {threads} -s workspace/ {params.guppy_config} {params.filtering} {params.guppy_flags} {params.guppy_server}
    FASTQ_DIR='workspace/pass'
    if [ \'{params.filtering}\' = '' ]; then
        FASTQ_DIR='workspace'
    fi
    find ${{FASTQ_DIR}} -regextype posix-extended -regex '^.*f(ast)?q' -exec cat {{}} \; | gzip > {output[0]}
    find ${{FASTQ_DIR}} -name 'sequencing_summary.txt' -exec mv {{}} {output[1]} \;
    if [ \'{params.mod_table}\' != '' ]; then
        {config[bin_singularity][python]} {config[sbin_singularity][basecalling_guppy_mod.py]} extract `find workspace/ -name '*.fast5'` {params.mod_table}
    fi
    """
```

```
shell:
    """
    export OPENBLAS_NUM_THREADS=1
    mkdir -p raw
    {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.batch} raw/ {params.index} --output_format single
    find raw/ -regextype posix-extended -regex '^.*fast5' -type f -exec du -h {{}} + | sort -r -h | cut -f2 > raw.fofn
    split -e -n r/{threads} raw.fofn raw.fofn.part.
    ls raw.fofn.part.* | xargs -n 1 -P {threads} -I {{}} $SHELL -c 'cat {{}} | shuf | xargs -n 1 {config[bin_singularity][flappie]} --model {config[basecalling_flappie_model]} {config[basecalling_flappie_flags]} > raw/{{}}.fastq'
    find ./raw -regextype posix-extended -regex '^.*f(ast)?q' -exec cat {{}} \; > {wildcards.batch}.fq
    cat {wildcards.batch}.fq | {config[bin_singularity][python]} {config[sbin_singularity][methylation_flappie.py]} split methyl_marks.tsv | gzip > {output.sequence}
    cat methyl_marks.tsv | gzip > {output.methyl_marks}
    """
```
```
run:
    with open(output[0], 'wb') as fp_out:
        for f in input:
            with open(f, 'rb') as fp_in:
                fp_out.write(fp_in.read())
```

```
run:
    with open(output[0], 'wb') as fp_out:
        for f in input:
            with open(f, 'rb') as fp_in:
                fp_out.write(fp_in.read())
```

```
run:
    import gzip
    import pandas as pd

    def fastq_iter(iterable):
        while True:
            try:
                title = next(iterable)
                assert title[0] == '@'
                seq = next(iterable)
                _ = next(iterable)
                qual = next(iterable)
            except StopIteration:
                return
            mean_q = sum([ord(x) - 33 for x in qual]) / len(qual) if qual else 0.0
            yield len(seq), mean_q

    line_iter = (line for f in input for line in gzip.open(f, 'rb').read().decode('utf-8').split('\n') if line)
    df = pd.DataFrame(fastq_iter(line_iter), columns=['length', 'quality'])
    df.to_hdf(output[0], 'stats')
```
```
shell:
    "rm -r {input}"
```

```
shell:
    "rm -r {input}"
```

```
shell:
    "rm -r {input}"
```

```
shell:
    "rm -r {input}"
```

```
shell:
    "rm -r {input}"
```

```
shell:
    "rm -r {input}"
```

```
shell:
    "rm {input}"
```
```
shell:
    """
    {config[bin_singularity][guppy_barcoder]} -i {params.seq_dir} -s {output.batches} -t {threads} -q {config[demux_batch_size]} --compress_fastq --{params.kits}
    """
```

```
run:
    import os, gzip, re
    import shutil
    import itertools
    barcodes = [f.name for f in os.scandir(input.batches) if f.is_dir()]

    def fastq_ID(iterable):
        fq_iter = itertools.islice(iterable, 0, None, 4)
        while True:
            try:
                header = next(fq_iter).decode('utf-8')
                yield header[1:].split()[0]
            except StopIteration:
                return

    def touch(fname, times=None):
        with open(fname, 'a'):
            os.utime(fname, times)

    for barcode in barcodes:
        os.makedirs(os.path.join(output.barcodes, barcode), exist_ok=True)
        batches = [f.name for f in os.scandir(os.path.join(input.batches, barcode)) if f.name.endswith('fastq.gz')]
        for batch in batches:
            # read IDs from fastq stream
            batch_file = os.path.join(input.batches, barcode, batch)
            IDs = []
            with gzip.open(batch_file, 'rb') as fp:
                IDs = [ID for ID in fastq_ID(fp)]
            # write IDs to batch file
            with open(os.path.join(output.barcodes, barcode, re.sub('fastq\.gz', 'txt', batch)), 'w') as fp:
                fp.write('\n'.join(IDs))
        # move fastq to sequences directory
        bc_map = (config['barcodes'][wildcards.runname] if wildcards.runname in config['barcodes'] else
                  config['barcodes']['__default__'] if '__default__' in config['barcodes'] else {})
        bc_tags = [key for key, value in bc_map.items() if value == barcode]
        # move if one target tag
        if len(bc_tags) == 1:
            target_path = os.path.join('sequences', config['demux_seq_workflow'], 'batches',
                (config['demux_seq_tag'] if 'demux_seq_tag' in config else 'default') + '.' + bc_tags[0],
                wildcards.runname)
            shutil.move(os.path.join('demux', 'guppy', 'batches', wildcards.runname, barcode), target_path)
            # touch sequences after creating batch ID files
            for f in os.scandir(target_path):
                if f.name.endswith('fastq.gz'):
                    touch(f.path)
        # copy if multiple tags per barcode
        elif len(bc_tags) > 1:
            for bc_tag in bc_tags:
                target_path = os.path.join('sequences', config['demux_seq_workflow'], 'batches',
                    (config['demux_seq_tag'] if 'demux_seq_tag' in config else 'default') + '.' + bc_tags[0],
                    wildcards.runname)
                if os.path.exists(target_path):
                    shutil.rmtree(target_path)
                shutil.copytree(os.path.join('demux', 'guppy', 'batches', wildcards.runname, barcode), target_path)
```
```
shell : ""

#rule demux:
#    input:
#        "bin/deepbinner"
```
```
shell:
    """
    mkdir -p src && cd src
    wget -q -nc https://dl.google.com/go/go1.17.3.linux-amd64.tar.gz
    tar -xzf go1.17.3.linux-amd64.tar.gz
    rm go1.17.3.linux-amd64.tar.gz
    """
```

```
shell:
    """
    install_prefix=`pwd`
    mkdir -p src && cd src
    if [ ! -d squashfs-tools ]; then
        git clone https://github.com/plougher/squashfs-tools --branch 4.4 --depth=1 && cd squashfs-tools
    else
        cd squashfs-tools && git fetch --all --tags --prune && git checkout tags/4.4
    fi
    cd squashfs-tools
    make
    cp *squashfs $install_prefix/bin/
    """
```

```
shell:
    """
    install_prefix=`pwd`
    mkdir -p src/gocode
    export GOPATH=$(pwd)/src/gocode
    export GOROOT=$(pwd)/src/go
    export PATH=$(pwd)/$(dirname {input.go}):$PATH
    cd src
    if [ ! -d singularity ]; then
        git clone https://github.com/sylabs/singularity.git --branch v3.3.0 --depth=1 && cd singularity
    else
        cd singularity && git fetch --all --tags --prune && git checkout tags/v3.3.0
    fi
    ./mconfig --without-suid --prefix=$install_prefix --localstatedir=$install_prefix/
    make -C ./builddir
    make -C ./builddir install
    """
```

```
shell:
    """
    install_prefix=`pwd`
    mkdir -p lib
    mkdir -p src && cd src
    if [ ! -d vbz_compression ]; then
        git clone https://github.com/nanoporetech/vbz_compression --recursive --branch v1.0.1 --depth=1 && cd vbz_compression
    else
        cd vbz_compression && git fetch --all --tags --prune && git checkout tags/v1.0.1
    fi
    rm -rf build
    mkdir build && cd build
    cmake -D CMAKE_BUILD_TYPE=Release -D ENABLE_CONAN=OFF -D ENABLE_PERF_TESTING=OFF -D ENABLE_PYTHON=OFF ..
    make
    cp bin/libvbz_hdf_plugin.so ${{install_prefix}}/{output.lib}
    """
```

```
shell:
    """
    wget -q -O {output} https://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/bedGraphToBigWig
    chmod 755 {output}
    """
```

```
shell:
    """
    mkdir -p src && cd src
    if [ ! -d bedtools2 ]; then
        git clone https://github.com/arq5x/bedtools2 --branch v2.27.1 --depth=1 && cd bedtools2
    else
        cd bedtools2 && git fetch --all --tags --prune && git checkout tags/v2.27.1
    fi
    make clean && make
    cp bin/bedtools ../../{output.bin}
    """
```

```
shell:
    """
    mkdir -p src && cd src
    if [ ! -d htslib ]; then
        git clone https://github.com/samtools/htslib --branch 1.9 --depth=1 && cd htslib
    else
        cd htslib && git fetch --all --tags --prune && git checkout tags/1.9
    fi
    autoheader && autoconf && ./configure && make -j{threads}
    """
```

```
shell:
    """
    mkdir -p src && cd src
    if [ ! -d samtools ]; then
        git clone https://github.com/samtools/samtools --branch 1.9 --depth=1 && cd samtools
    else
        cd samtools && git fetch --all --tags --prune && git checkout tags/1.9
    fi
    autoheader --warning=none && autoconf -Wno-syntax && ./configure && make -j{threads}
    cp samtools ../../{output.bin}
    """
```

```
shell:
    """
    mkdir -p src && cd src
    if [ ! -d minimap2 ]; then
        git clone https://github.com/lh3/minimap2 --branch v2.14 --depth=1 && cd minimap2
    else
        cd minimap2 && git fetch --all --tags --prune && git checkout tags/v2.14
    fi
    make clean && make -j{threads}
    cp minimap2 ../../{output.bin}
    """
```

```
shell:
    """
    mkdir -p src && cd src
    if [ ! -d graphmap2 ]; then
        git clone https://github.com/lbcb-sci/graphmap2 --branch v0.6.4 --depth=1 && cd graphmap2
    else
        cd graphmap2 && git fetch --all --tags --prune && git checkout tags/v0.6.4
    fi
    make modules -j{threads} && make -j{threads}
    cp bin/*/graphmap2 ../../bin/
    """
```

```
shell:
    """
    mkdir -p src && cd src
    if [ ! -d ngmlr ]; then
        git clone https://github.com/philres/ngmlr --branch v0.2.7 --depth=1 && cd ngmlr
    else
        cd ngmlr && git fetch --all --tags --prune && git checkout tags/v0.2.7
    fi
    mkdir -p build && cd build && rm -rf * && cmake -DCMAKE_BUILD_TYPE=Release -G {config[build_generator]} .. && cmake --build . --config Release -- -j {threads}
    cp ../bin/*/ngmlr ../../../bin
    """
```
```
shell:
    """
    mkdir -p src && cd src
    if [ ! -d nanopolish ]; then
        #git clone --recursive https://github.com/jts/nanopolish --branch v0.11.0 --depth=1 && cd nanopolish
        git clone --recursive https://github.com/jts/nanopolish && cd nanopolish && git reset --hard 46a029c
    else
        #cd nanopolish && git fetch --all --tags --prune && git checkout tags/v0.11.0
        cd nanopolish && git fetch --all --tags --prune && git checkout 46a029c
    fi
    make clean
    set +e
    make -j{threads} 2>&1 | tee log.txt | grep 'g++\|gcc'
    exitcode=$?
    if [ $exitcode -ne 0 ]; then tail -n 100 log.txt; exit 1; fi
    cp nanopolish ../../bin/
    """
```
```
shell:
    """
    mkdir -p src && cd src
    if [ ! -d Sniffles ]; then
        git clone https://github.com/fritzsedlazeck/Sniffles --branch v1.0.10 --depth=1 && cd Sniffles
    else
        cd Sniffles && git fetch --all --tags --prune && git checkout tags/v1.0.10
    fi
    mkdir -p build && cd build && rm -rf * && cmake -DCMAKE_BUILD_TYPE=Release -G{config[build_generator]} .. && cmake --build . --config Release -- -j {threads}
    cp ../bin/*/sniffles ../../../bin
    """
```

```
shell:
    """
    prefix=`pwd`
    mkdir -p src && cd src
    if [ ! -d svim ]; then
        git clone https://github.com/eldariont/svim.git --branch v1.4.0 --depth=1 && cd svim
    else
        cd svim && git fetch --all --tags --prune && git checkout tags/v1.4.0
    fi
    {config[python]} -m pip install .
    find {params.prefix}/bin {params.prefix}/local/bin -type f -name svim -exec cp {{}} ../../{output.bin} \; || true
    """
```

```
shell:
    """
    mkdir -p src/gocode
    export GOPATH=$(pwd)/src/gocode
    {input.go} env -w GO111MODULE=auto
    {input.go} get github.com/github/git-lfs
    cp src/gocode/bin/git-lfs bin/
    """
```

```
shell:
    """
    install_prefix=`pwd`
    mkdir -p src && cd src
    if [ ! -d OpenBLAS ]; then
        git clone https://github.com/xianyi/OpenBLAS --branch v0.3.4 --depth=1 && cd OpenBLAS
    else
        cd OpenBLAS && git fetch --all --tags --prune && git checkout tags/v0.3.4
    fi
    make clean && make NO_LAPACK=1 NOFORTRAN=1 -j{threads}
    make install PREFIX=$install_prefix
    """
```

```
shell:
    """
    install_prefix=`pwd`
    mkdir -p src && cd src
    if [ ! -d hdf5 ]; then
        git clone https://bitbucket.hdfgroup.org/scm/hdffv/hdf5.git --branch hdf5-1_8_20 --depth=1 && cd hdf5
    else
        cd hdf5 && git fetch --all --tags --prune && git checkout tags/hdf5-1_8_20
    fi
    mkdir -p build && cd build && rm -rf * && cmake -DCMAKE_BUILD_TYPE=Release -DHDF5_ENABLE_Z_LIB_SUPPORT=ON -DHDF5_BUILD_TOOLS=OFF -DBUILD_TESTING=OFF -DHDF5_BUILD_EXAMPLES=OFF -DCMAKE_INSTALL_PREFIX=$install_prefix -G{config[build_generator]} ../
    cmake --build . --config Release -- -j {threads}
    cmake --build . --config Release --target install
    """
```
```
shell:
    """
    # wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_2.3.1_linux64.tar.gz &&
    # wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_2.3.5_linux64.tar.gz &&
    # wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_2.3.7_linux64.tar.gz &&
    # wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_3.0.3_linux64.tar.gz &&
    # wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_3.1.5_linux64.tar.gz &&
    # wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_3.4.4_linux64.tar.gz &&
    mkdir -p src/guppy && cd src/guppy && rm -rf *
    wget -q https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_4.0.11_linux64.tar.gz
    tar -xzkf ont-guppy-cpu_4.0.11_linux64.tar.gz -C ./ --strip 1 && \
    rm ont-guppy-cpu_4.0.11_linux64.tar.gz
    # copy everything except toplevel softlinks e.g.
    #   skip libhdf5.so
    #   copy libhdf5.so.1.8.11
    rsync --files-from=<(find . ! \( -type l -and -regex '^.*so$' \) -print) --links . ../../
    rm -r *
    """
```
```
shell:
    """
    mkdir -p src && cd src
    if [ ! -d pychopper ]; then
        git clone https://github.com/nanoporetech/pychopper --branch v2.4.0 && cd pychopper
    else
        cd pychopper && git fetch --all --tags --prune && git checkout tags/v2.4.0
    fi
    {config[python]} -m pip install --upgrade incremental
    {config[python]} -m pip install --upgrade certifi
    {config[python]} -m pip install parasail --upgrade
    {config[python]} -m pip install "matplotlib<3.1" --upgrade
    {config[python]} setup.py install
    find {params.prefix}/bin {params.prefix}/local/bin -type f -name cdna_classifier.py -exec cp {{}} ../../{output.bin} \; || true
    """
```

```
shell:
    """
    mkdir -p src && cd src
    if [ ! -d racon ]; then
        git clone https://github.com/isovic/racon --recursive --branch 1.3.2 && cd racon
    else
        cd racon && git fetch --all --tags --prune && git checkout tags/1.3.2
    fi
    mkdir -p build && cd build && rm -rf * && cmake -DCMAKE_BUILD_TYPE=Release ..
    make
    cp bin/racon ../../../{output.bin}
    """
```

```
shell:
    """
    mkdir -p src/gocode
    export PATH={params.go_dir}:$PATH
    export GOPATH=$(pwd)/src/gocode
    {input.go} env -w GO111MODULE=auto
    {input.go} get github.com/biogo/biogo/...
    {input.go} get github.com/biogo/hts/...
    {input.go} get -u gonum.org/v1/gonum/...
    {input.go} get github.com/google/uuid
    cd src
    if [ ! -d pinfish ]; then
        git clone https://github.com/nanoporetech/pinfish --branch master && cd pinfish
    else
        cd pinfish && git fetch --all --tags --prune && git checkout master
    fi
    cd cluster_gff && rm -f cluster_gff && make && cd ..
    cd collapse_partials && rm -f collapse_partials && make && cd ..
    cd polish_clusters && rm -f polish_clusters && make && cd ..
    cd spliced_bam2gff && rm -f spliced_bam2gff && make && cd ..
    cp cluster_gff/cluster_gff ../../bin/
    cp collapse_partials/collapse_partials ../../bin/
    cp polish_clusters/polish_clusters ../../bin/
    cp spliced_bam2gff/spliced_bam2gff ../../bin/
    """
```

```
shell:
    """
    mkdir -p src && cd src
    if [ ! -d STRique ]; then
        git clone --recursive https://github.com/giesselmann/STRique --branch v0.3.0 && cd STRique
    else
        cd STRique && git fetch --all --tags --prune && git checkout tags/v0.3.0
    fi
    {config[python]} -m pip install -r requirements.txt --upgrade
    {config[python]} setup.py install
    cp scripts/STRique.py ../../bin/
    """
```

```
shell:
    """
    mkdir -p src && cd src
    if [ ! -d Flye ]; then
        git clone https://github.com/fenderglass/Flye --branch 2.8.3 && cd Flye
    else
        cd Flye && git fetch --all --tags --prune && git checkout tags/2.8.3
    fi
    make
    {config[python]} setup.py install
    find {params.prefix}/bin {params.prefix}/local/bin -name flye -exec cp {{}} ../../{output.bin} \; || true
    """
```

```
shell:
    """
    mkdir -p src && cd src
    if [ ! -d wtdbg2 ]; then
        git clone https://github.com/ruanjue/wtdbg2 --branch v2.5 && cd wtdbg2
    else
        cd wtdbg2 && git fetch --all --tags --prune && git checkout tags/v2.5
    fi
    make
    cp wtdbg2 ../../{output.asm}
    cp wtpoa-cns ../../{output.cns}
    """
```
```
shell:
    """
    mkdir -p raw
    {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.batch} raw/ {params.index} --output_format lazy
    zcat {input.sequences} > sequences.fastx
    {config[bin_singularity][nanopolish]} index -d raw/ sequences.fastx
    {config[bin_singularity][nanopolish]} call-methylation -t {threads} -r sequences.fastx -g {input.reference} -b {input.bam} > tmp.tsv
    cat tmp.tsv | {config[bin_singularity][python]} {config[sbin_singularity][methylation_nanopolish.py]} | sort -k1,1 -k2,2n | gzip > {output[0]}
    if [ \'{params.raw_out}\' != '' ]; then
        cat tmp.tsv | gzip > {params.raw_out};
    fi
    """
```
```
shell:
    """
    {config[bin_singularity][samtools]} view -F 4 {input.bam} | {config[bin_singularity][python]} {config[sbin_singularity][methylation_flappie.py]} align {input.reference} {input.seq} {input.tsv} | sort -k1,1 -k2,2n | gzip > {output}
    """
```

```
shell:
    """
    {config[bin_singularity][samtools]} view -F 4 {input.bam} | {config[bin_singularity][python]} {config[sbin_singularity][basecalling_guppy_mod.py]} align {input.hdf5} --reference {input.reference} | sort -k1,1 -k2,2n | gzip > {output}
    """
```
```
run:
    with gzip.open(output[0], 'wt') as fp_out:
        for i, f in enumerate(input):
            with gzip.open(f, 'rt') as fp_in:
                header, body = fp_in.read().split('\n', 1)
                # get header
                if i == 0:
                    print(header, file=fp_out)
                print(body, file=fp_out, end='')
```

```
run:
    with open(output[0], 'wb') as fp_out:
        for f in sorted(input):     # sort by file name, you never know
            with open(f, 'rb') as fp_in:
                fp_out.write(fp_in.read())
```

```
run:
    with open(output[0], 'w') as fp_out:
        print('\n'.join([re.sub('.tsv.gz$','',f) for f in sorted(input)]), file=fp_out)
```

```
run:
    with open(output[0], 'w') as fp_out:
        print(''.join(open(f, 'r').read() for f in sorted(input)), file=fp_out, end='')
```
```
shell:
    """
    cat {input} | while read line; do zcat ${{line}}.tsv.gz; done | perl -anle 'print $_ if abs($F[6]) > {params.threshold}' | cut -f1-3,5 | sort -k1,1 -k2,2n | {config[bin_singularity][bedtools]} groupby -g 1,2,3 -c 4 -o mean,count | gzip > {output}
    """
```

```
shell:
    """
    zcat {input} | perl -anle 'print $_ if $F[4] >= {params.methylation_min_coverage}' | cut -f1-4 > {output}
    """
```

```
shell:
    """
    {config[bin_singularity][bedGraphToBigWig]} {input.bedGraph} {input.chr_sizes} {output}
    """
```

```
shell:
    """
    {config[bin_singularity][samtools]} view -hF 4 {input.bam} | {config[bin_singularity][python]} {config[sbin_singularity][methylation_sr.py]} {params.reference} {input.seq} {input.tsv} --threshold {params.threshold} --mode IGV --polish | {config[bin_singularity][samtools]} view -b > {output.bam}
    {config[bin_singularity][samtools]} index {output.bam}
    """
```

```
shell:
    """
    cat {input.fofn} | while read line; do echo ${{line}}.bam; done | split -l $((`ulimit -n` -10)) - {params.input_prefix}.part_
    for f in {params.input_prefix}.part_*; do {config[bin_singularity][samtools]} merge ${{f}}.bam -b ${{f}} -p -@ {threads}; done
    for f in {params.input_prefix}.part_*.bam; do {config[bin_singularity][samtools]} index ${{f}} -@ {threads}; done
    {config[bin_singularity][samtools]} merge {output.bam} {params.input_prefix}.part_*.bam -p -@ {threads}
    {config[bin_singularity][samtools]} index {output.bam} -@ {threads}
    rm {params.input_prefix}.part_*
    """
```

```
shell:
    """
    cat {input.fofn} | while read line; do echo ${{line}}.bam; done | split -l $((`ulimit -n` -10)) - {params.input_prefix}.part_
    for f in {params.input_prefix}.part_*; do {config[bin_singularity][samtools]} merge ${{f}}.bam -b ${{f}} -p -@ {threads}; done
    for f in {params.input_prefix}.part_*.bam; do {config[bin_singularity][samtools]} index ${{f}} -@ {threads}; done
    {config[bin_singularity][samtools]} merge {output.bam} {params.input_prefix}.part_*.bam -p -@ {threads}
    {config[bin_singularity][samtools]} index {output.bam} -@ {threads}
    rm {params.input_prefix}.part_*
    """
```

```
shell:
    """
    {config[bin_singularity][python]} {config[sbin_singularity][methylation_1D2.py]} {input.pairs} {input.values} | gzip > {output}
    """
```
```
run:
    import os
    import pandas as pd
    from pathlib import Path

    def du(root_dir='.'):
        root_dir = Path(root_dir)
        return sum(f.stat().st_size for f in root_dir.glob('**/*') if f.is_file())

    raw = sum(du(os.path.join(config['storage_data_raw'], runname)) for runname in config['runnames'])
    modules = [du(m) for m in input.modules]
    df = pd.DataFrame(data={'name' : ['raw'] + input.modules, 'bytes' : [raw] + modules})
    df.to_hdf(output[0], 'disk_usage')
```
```
run:
    import os, glob
    import snakemake
    import numpy as np
    import pandas as pd
    import matplotlib as mpl
    mpl.use('Agg')
    from matplotlib import pyplot as plt
    import seaborn as sns
    configure_seaborn(sns)
    # stats per workflow and tag
    workflows = snakemake.utils.listfiles('sequences/{sequence_workflow, ((?!batches).)*}/batches/{tag, [^\/]*}')
    workflows = sorted(list(workflows), key=lambda x: x[1].sequence_workflow)
    run_id = {runname:i for i, runname in enumerate(config['runnames'])}
    os.makedirs(output.stats, exist_ok=True)
    os.makedirs(output.plot, exist_ok=True)
    # plots
    summary = []
    df_list = []

    def n50(df):
        return df[df.throughput > (df.throughput.iloc[-1] / 2)].length.iloc[0]

    for sequence_workflow, values in itertools.groupby(workflows, lambda x: x[1].sequence_workflow):
        f, ax = plt.subplots(ncols=2, figsize=(10,5), constrained_layout=True)
        values = list(values)
        n_groups = len(values)
        for tag_folder, tag_wildcards in values:
            tag_inputs = list(snakemake.utils.listfiles('sequences/{sequence_workflow}/batches/{tag}/{runname}.hdf5'.format(sequence_workflow=sequence_workflow, tag=tag_wildcards.tag, runname='{runname, [^\/]*}')))
            if not tag_inputs:
                continue
            df = pd.concat([pd.read_hdf(f).assign(
                    Flowcell=lambda x: run_id[wc.runname],
                    Tag=lambda x: tag_wildcards.tag,
                    Basecalling=lambda x: sequence_workflow)
                for f, wc in tag_inputs if wc.runname in config['runnames']])
            df_list.append(df)
            df.sort_values(by='length', inplace=True)
            df['throughput'] = df.length.cumsum()
            xlim = df[df.throughput > (0.90 * df.throughput.iloc[-1])].length.iloc[0]
            xlim *= 1.1
            label = '{tag}'.format(tag=tag_wildcards.tag)
            # read length histogramm up to 90% of throughput
            sns.distplot(df.length, kde=False,
                hist_kws={'range': (0, xlim), 'alpha': 0.5 + (0.5 / n_groups)},
                label=label, ax=ax[0])
            # cumulative throughput over read length
            sns.lineplot(x='length', y='throughput', label=label,
                data=df[df.length < xlim].iloc[np.linspace(0, df[df.length < xlim].shape[0]-1, 1000).astype(int)], ax=ax[1])
            summary.append((tag_wildcards.tag, sequence_workflow,
                df.length.sum(), df.length.mean(), df.length.median(), n50(df), df.length.max()))
        ax[0].set_xlabel('Read length')
        ax[0].set_ylabel('Reads')
        ax[0].legend()
        ax[1].set_xlabel('Read length')
        ax[1].set_ylabel('Throughput')
        ax[1].legend()
        f.suptitle("Read length distribution and cumulative throughput")
        f.savefig(os.path.join(output.plot, 'sequences1_{}.svg'.format(sequence_workflow)))
    if len(summary):
        df_total = pd.concat(df_list)
        df_total_agg = df_total.groupby(['Basecalling', 'Tag', 'Flowcell']).agg(
            Total=('length', 'sum')
        ).reset_index()
        for bc in df_total_agg.Basecalling.unique():
            f, ax = plt.subplots(nrows=2, figsize=(10,5), sharex=True, constrained_layout=True)
            # Barplot with throughput per flow cell
            sns.barplot(x='Flowcell', y='Total', hue='Tag', data=df_total_agg[df_total_agg.Basecalling == bc], errwidth=0, ax=ax[0])
            ax[0].set_ylabel('Throughput')
            ax[0].set_xlabel('')
            # Read length distribution per flow cell
            sns.boxplot(x="Flowcell", y="length", hue="Tag", showfliers=False, data=df_total[df_total.Basecalling == bc], ax=ax[1])
            ax[1].set_ylabel('Read length')
            ax[1].set_xlabel('Flow cell')
            f.suptitle("Total basepairs and read lengths per flow cell")
            f.savefig(os.path.join(output.plot, 'sequences2_{}.svg'.format(bc)))
    df_summary = pd.DataFrame(summary, columns=['Tag', 'Basecalling', 'Sum', 'Mean', 'Median', 'N50', 'Maximum'])
    df_summary.to_hdf(os.path.join(output.stats, 'sequences.hdf5'), 'sequences')
```
```
run:
    import itertools
    import snakemake
    import numpy as np
    import pandas as pd
    import matplotlib as mpl
    mpl.use('Agg')
    from matplotlib import pyplot as plt
    import seaborn as sns
    configure_seaborn(sns)
    os.makedirs(output.plot, exist_ok=True)
    run_id = {runname:i for i, runname in enumerate(config['runnames'])}
    run_files = sorted(list(snakemake.utils.listfiles('alignments/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/batches/{tag, [^\/]*}/{runname, [^.\/]*}.{reference, [^.]*}.hdf5')),
        key=lambda x: (x[1].sequence_workflow, x[1].tag))
    df_list = []
    for (sequence_workflow, tag), values in itertools.groupby(run_files, key=lambda x: (x[1].sequence_workflow, x[1].tag)):
        df = pd.concat([pd.read_hdf(f).assign(
                Basecaller=lambda x: sequence_workflow,
                Aligner=lambda x: w.aligner,
                Tag=lambda x: w.tag,
                Flowcell=lambda x: run_id[w.runname],
                Reference=lambda x: w.reference)
            for f, w in values])
        df['Mapping'] = df.flag.apply(lambda x : 'unmapped' if (x & 0x4) else 'secondary' if (x & 0x100) or (x & 0x800) else 'primary' if (x == 0) or (x == 0x10) else 'unknown')
        if 'identity' in df.columns:
            df_agg = df.groupby(['Basecaller', 'Tag', 'Aligner', 'Flowcell', 'Reference', 'Mapping', 'ID']).agg(
                length=('length', 'first'),
                mapped_length=('mapped_length', 'median'),
                Identity=('identity', 'median')
            ).reset_index().drop(columns=['ID'])
        else:
            df_agg = df.groupby(['Basecaller', 'Tag', 'Aligner', 'Flowcell', 'Reference', 'Mapping', 'ID']).agg(
                length=('length', 'first'),
                mapped_length=('mapped_length', 'median')
            ).reset_index().drop(columns=['ID'])
            df_agg['Identity'] = 0
        df_list.append(df_agg)
        # mapping rate by read numbers
        g = sns.catplot("Mapping", hue='Aligner', col="Reference", data=df_agg, kind="count", height=5, aspect=1.0)
        g.fig.subplots_adjust(wspace=.05, hspace=0.5)
        (g.set_axis_labels("Mapping", "Reads"))
        g.savefig(os.path.join(output.plot, 'alignments_counts_{}_{}.svg'.format(sequence_workflow, tag)))
    # mapping rate by cumulative length
    df_agg = pd.concat(df_list)
    df_agg['Mapped %'] = df_agg['mapped_length'] / df_agg['length']
    df_agg_primary = df_agg[df_agg.Mapping == 'primary'].copy()
    df_agg_primary.sort_values(by=['Tag', 'Basecaller', 'Aligner', 'Reference', 'length'], inplace=True)
    df_agg_primary_grouper = df_agg_primary.groupby(['Tag', 'Basecaller', 'Aligner', 'Reference'], sort=False)
    df_agg_primary = df_agg_primary.assign(**{'Sequenced Bases' : df_agg_primary_grouper['length'].cumsum(), 'Mapped Bases': df_agg_primary_grouper['mapped_length'].cumsum()})
    xlim = df_agg_primary[df_agg_primary['Mapped Bases'] > (0.95 * df_agg_primary['Mapped Bases'].max())].length.iloc[0]
    xlim *= 1.1
    df_agg_primary.set_index('Basecaller', inplace=True)
    for bc in df_agg_primary.index.unique():
        df_subset = df_agg_primary.loc[bc]
        g = sns.relplot(x="length", y="Mapped Bases", hue='Aligner', style='Tag', col="Reference",
            data=df_subset[df_subset.length < xlim].iloc[np.linspace(0, df_subset[df_subset.length < xlim].shape[0]-1, 1000).astype(int)],
            kind='line', height=5, aspect=1.0)
        g.fig.subplots_adjust(wspace=.05, hspace=0.5)
        (g.set_axis_labels("Read length", "Mapped Bases"))
        g.savefig(os.path.join(output.plot, 'alignments_bases_{}.svg'.format(bc)))
        # read identity if available
        # TODO this will fail if one aligner sets the NM-tag and the other not
        if df_subset.Identity.sum() > 0:
            g = sns.catplot(x="Flowcell", y="Identity", hue='Tag', row="Reference", data=df_subset, kind='violin', kwargs={'cut': 0}, height=2.5, aspect=4)
            g.fig.subplots_adjust(wspace=.05, hspace=0.5)
            (g.set_axis_labels("Flow cell", "BLAST identity")
                .set(ylim=(0,1)))
            g.savefig(os.path.join(output.plot, 'alignments_identity_{}.svg'.format(bc)))
```
```
run:
    import re
    import pandas as pd
    import matplotlib as mpl
    mpl.use('Agg')
    from matplotlib import pyplot as plt
    import seaborn as sns
    configure_seaborn(sns)
    os.makedirs(output.plot, exist_ok=True)
    for cov_file, cov_wc in snakemake.utils.listfiles('alignments/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^\/]*}.{reference, [^.]*}.bedGraph'):
        df = pd.read_csv(cov_file, sep='\t', header=None, names=['chr', 'begin', 'end', 'coverage'])
        df['bin'] = df['begin'] // (df.begin.max() / 1000)

        def sort_key(k):
            return ''.join(re.findall(r'\D+', k) + ['{:05d}'.format(int(m)) for m in re.findall(r'\d+', k)])

        df_agg = df.groupby(['chr', 'bin']).agg(
            pos=('begin', 'first'),
            end=('end', 'last'),
            coverage=('coverage', 'median')
        ).reset_index()
        df_agg['key'] = df_agg.chr.map(sort_key)
        df_agg = df_agg.sort_values(by=['key', 'pos']).drop(columns=['key'])
        # coverage per chromosome
        g = sns.FacetGrid(df_agg, col="chr", hue='chr', col_wrap=4, sharex=True, sharey=True, height=1, aspect=2.5)
        g = (g.map(plt.plot, "pos", "coverage", marker=".", lw=0, markersize=0.2)
            .set(ylim=(0, df_agg.coverage.mean() * 2.0))
            .set_titles("{col_name}"))
        g.fig.subplots_adjust(wspace=.05, hspace=0.5)
        g.savefig(os.path.join(output.plot, 'coverage_{}_{}_{}_{}.svg'.format(
            cov_wc.aligner, cov_wc.sequence_workflow, cov_wc.tag, cov_wc.reference)))
```
```
run:
    import os, itertools
    import snakemake
    import pandas as pd
    import matplotlib as mpl
    mpl.use('Agg')
    from matplotlib import pyplot as plt
    import seaborn as sns
    configure_seaborn(sns)
    os.makedirs(output.plot, exist_ok=True)
    workflows = snakemake.utils.listfiles('methylation/{methylation_caller, [^.\/]*}/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^\/]*}.{reference, [^.\/]*}.frequencies.tsv.gz')
    df_list = []
    df_agg_list = []
    for frequencies, wildcards in workflows:
        df = pd.read_csv(frequencies, sep='\t', header=None, compression='gzip', names=['chr', 'begin', 'end', 'level', 'coverage'])
        df.sort_values('coverage', inplace=True)
        df_agg = df.groupby('coverage', sort=False).agg(
            count=('coverage', 'count')
        ).reset_index().sort_values('coverage', ascending=False)
        df_agg['total'] = df_agg['count'].cumsum()
        df_agg['Methylation caller'] = wildcards.methylation_caller
        df_agg['Aligner'] = wildcards.aligner
        df_agg['Basecaller'] = wildcards.sequence_workflow
        df_agg['Tag'] = wildcards.tag
        df_agg['Reference'] = wildcards.reference
        df_agg_list.append(df_agg)
    # plots
    if df_agg_list:
        df_agg = pd.concat(df_agg_list)
        df_agg.set_index(['Basecaller', 'Aligner'], inplace=True)
        for basecaller, aligner in df_agg.index.unique():
            df_subset = df_agg.loc[basecaller, aligner]
            g = sns.relplot(x="coverage", y="total", col="Reference", hue="Methylation caller", style="Tag",
                kind="line", height=5, aspect=1.0,
                data=df_subset[df_subset.total > (df_subset.total.max() * 0.05)])
            g.fig.subplots_adjust(wspace=.05, hspace=0.5)
            (g.set_axis_labels("Coverage threshold", "CpGs"))
            g.savefig(os.path.join(output.plot, 'coverage_{}_{}.svg'.format(aligner, basecaller)))
```
```
run:
    import os
    import pandas as pd
    report = nanopype_report(os.getcwd(), output.pdf, version=config['version']['full-tag'])
    # summary
    summary_table = []
    summary_table.append(('{:d}'.format(len(config['runnames'])), 'Flow cells'))
    f_sequences_stats = 'report/stats/sequences/sequences.hdf5'
    df_sequences = pd.read_hdf(f_sequences_stats) if os.path.isfile(f_sequences_stats) else None
    if df_sequences is not None:
        for field, label, scale, unit in zip(['Sum', 'N50'], ['Total bases', 'N50'], [1e9, 1e3], ['GB', 'kB']):
            if df_sequences.shape[0] < 2:
                summary_table.append(('{:.1f} {}'.format(df_sequences[field][0] / scale, unit), label))
            else:
                summary_table.append(('{:.1f} to {:.1f} {}'.format(df_sequences[field].min() / scale, df_sequences[field].max() / scale, unit), label))
    df_disk_usage = pd.read_hdf(input.disk_usage[0])
    summary_table.append(('{:.2f} GB'.format(df_disk_usage.bytes.sum() / 1e9), 'Total disk usage'))
    report.add_summary(summary_table)
    # flow cells
    report.add_section_flowcells(config['runnames'])
    # sequences
    f_sequences_plots = snakemake.utils.listfiles('report/plots/sequences/sequences{i}_{basecalling}.svg')
    report.add_section_sequences(f_sequences_plots, df_sequences)
    # alignments
    f_alignments_counts = [(x[0], "Basecalling: {} ({})".format(x[1].sequence_workflow, x[1].tag))
        for x in snakemake.utils.listfiles('report/plots/alignments/alignments_counts_{sequence_workflow, [^_]*}_{tag}.svg')]
    f_alignments_bases = [(x[0], "Basecalling: {}".format(x[1].sequence_workflow))
        for x in snakemake.utils.listfiles('report/plots/alignments/alignments_bases_{sequence_workflow}.svg')]
    f_alignments_identity = [(x[0], "Basecalling: {}".format(x[1].sequence_workflow))
        for x in snakemake.utils.listfiles('report/plots/alignments/alignments_identity_{sequence_workflow}.svg')]
    f_alignments_coverage = [x[0] for x in snakemake.utils.listfiles('report/plots/coverage/coverage_{tag}.svg')]
    report.add_section_alignments(counts=f_alignments_counts, bases=f_alignments_bases, identity=f_alignments_identity, coverage=f_alignments_coverage)
    # methylation
    f_methylation_coverage = [(x[0], "Basecalling: {}, Alignment: {}".format(x[1].sequence_workflow, x[1].aligner))
        for x in snakemake.utils.listfiles('report/plots/methylation/coverage_{aligner}_{sequence_workflow}.svg')]
    report.add_section_methylation(coverage=f_methylation_coverage)
    # build report pdf
    report.build()
```
```
shell:
    """
    {config[bin][python]} {config[sbin][storage_fast5Index.py]} index {input.batch} --out_prefix reads --tmp_prefix $(pwd) > {output}
    """
```

```
run:
    with open(output[0], 'w') as fp:
        for f in input.batches:
            print(open(f, 'r').read(), end='', file=fp)
```

```
shell:
    """
    mkdir -p {output}
    {config[bin][python]} {config[sbin][storage_fast5Index.py]} extract {input.names} {output} --index {input.index} --output_format bulk
    """
```

```
run:
    print(input.files)
```
```
shell:
    """
    {config[bin_singularity][sniffles]} -m {input} -v {output} -t {threads} {config[sv_sniffles_flags]}
    """
```

```
shell:
    """
    {config[bin_singularity][svim]} alignment sample {input.alignment} {input.reference} {config[sv_svim_flags]}
    mv sample/variants.vcf {output}
    """
```

```
shell:
    """
    cat {input} | gzip > {output}
    """
```

```
shell:
    """
    export TMPDIR=$(pwd)
    {config[bin_singularity][samtools]} view -F 2308 {input.bam} | {config[bin_singularity][python]} {config[bin_singularity][strique]} count {input.index} {params.model} {input.config} {params.mod_model} --t {threads} > {output}
    """
```

```
run:
    with open(output[0], 'w') as fp_out:
        for i, f in enumerate(input):
            with open(f, 'r') as fp_in:
                if i == 0:
                    print(fp_in.read(), file=fp_out, end='')
                else:
                    next(fp_in)
                    print(fp_in.read(), file=fp_out, end='')
```

```
run:
    with open(output[0], 'w') as fp_out:
        for i, f in enumerate(input):
            with open(f, 'r') as fp_in:
                if i == 0:
                    print(fp_in.read(), file=fp_out, end='')
                else:
                    next(fp_in)
                    print(fp_in.read(), file=fp_out, end='')
```
```
shell:
    """
    mkfifo input_seq
    mkfifo output_seq
    zcat {input.seq} > input_seq &
    {config[bin_singularity][python]} {config[bin_singularity][cdna_classifier]} -b {input.prmr} input_seq output_seq &
    cat output_seq | gzip > {output.seq}
    """
```

```
shell:
    """
    racon_dir=`dirname {config[bin_singularity][racon]}`
    minimap_dir=`dirname {config[bin_singularity][minimap2]}`
    samtools_dir=`dirname {config[bin_singularity][samtools]}`
    PATH=$racon_dir:$minimap_dir:samtools_dir:$PATH
    TMPDIR=`pwd`
    echo 'splicing initial alignment'
    {config[bin_singularity][spliced_bam2gff]} {config[transcript_spliced_bam2gff_flags]} -M -t {threads} {input.bam} > raw_transcript.gff
    echo 'cluster gff'
    {config[bin_singularity][cluster_gff]} {config[transcript_cluster_gff_flags]} -t {threads} -a cluster_transcript.tsv raw_transcript.gff > cluster_transcript.gff
    echo 'collapse cluster'
    {config[bin_singularity][collapse_partials]} {config[transcript_collapse_partials]} -t {threads} cluster_transcript.gff > cluster_collapsed.gff
    echo 'polish collapsed cluster'
    {config[bin_singularity][polish_clusters]} {config[transcript_polish_clusters]} -d $(pwd) -a cluster_transcript.tsv -t {threads} -o cluster_polish.fasta {input.bam}
    echo 'map collapsed polished cluster to reference'
    {config[bin_singularity][minimap2]} -t {threads} {config[alignment_minimap2_flags]} {input.reference} cluster_polish.fasta | {config[bin_singularity][samtools]} view -Sb -F 2304 | {config[bin_singularity][samtools]} sort -o cluster_polish.{wildcards.reference}.bam -
    {config[bin_singularity][samtools]} index cluster_polish.{wildcards.reference}.bam
    {config[bin_singularity][spliced_bam2gff]} {config[transcript_spliced_bam2gff_flags]} -M -t {threads} cluster_polish.{wildcards.reference}.bam > cluster_polish.gff
    echo 'collapse mapped transcripts'
    {config[bin_singularity][collapse_partials]} {config[transcript_collapse_partials]} -t {threads} cluster_polish.gff > {output.gff}
    """
```
Support
- Future updates