Flattening annotation for bakR

public 1yr ago 0 bookmarks

View Workflow

Help improve this workflow!

This workflow has been published but could be further improved with some additional meta data:

Keyword(s) in categories input, output

You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .

Snakemake workflow for flattening annotations and creating merged annotations in the case of spike-ins.

Code Snippets

shell:
    "cat {input.ref} {input.spike} > {output.cref}"

SnakeMake From line 8 of rules/combine.smk

shell:
    "cat {input.ref} {input.spike} > {output.cref}"

SnakeMake From line 17 of rules/combine.smk

script:
    "../scripts/dexseq_prepare_annotation.py"

SnakeMake DESeq From line 11 of rules/flatten.smk

shell:
    """
    chmod +x {params.shellscript}
    {params.shellscript} {input} {output}
    """

SnakeMake oschmod From line 26 of rules/flatten.smk

wrapper:
    "v2.1.1/bio/samtools/sort"

SnakeMake From line 11 of rules/quantify.smk

shell:
    """
    htseq-count -t aggregate_gene -m intersection-strict -s {params.strand} \
    -r pos -p bam --add-chromosome-info -i gene_id \
    -c {output.counts} {input.bam} {input.gtf}
    """

SnakeMake htseqcount From line 28 of rules/quantify.smk

shell:
    """
    htseq-count -t exonic_part -m union -s {params.strand} \
    -r pos -p bam --add-chromosome-info -i exon_id \
    -c {output.counts} {input.bam} {input.gtf}
    """

SnakeMake htseqcount From line 49 of rules/quantify.smk

shell:
    """
    htseq-count -t exonic_part -m intersection-strict -s {params.strand} \
    -r pos -p bam --add-chromosome-info -i gene_id \
    -c {output.counts} {input.bam} {input.gtf}
    """

SnakeMake htseqcount From line 69 of rules/quantify.smk

wrapper:
    "v1.21.4/bio/reference/ensembl-sequence"

SnakeMake From line 11 of rules/ref.smk

wrapper:
    "v1.21.4/bio/reference/ensembl-annotation"

SnakeMake From line 26 of rules/ref.smk

shell:
    """
    chmod +x {params.shellscript}
    {params.shellscript} {input} {output}
    """

SnakeMake oschmod From line 40 of rules/ref.smk

shell:
    """
    chmod +x {params.shellscript}
    {params.shellscript} {input} {output.real_o}
    """

SnakeMake oschmod From line 56 of rules/ref.smk

wrapper:
    "v1.21.4/bio/reference/ensembl-sequence"

SnakeMake From line 13 of rules/spike.smk

wrapper:
    "v1.21.4/bio/reference/ensembl-annotation"

SnakeMake From line 28 of rules/spike.smk

shell:
    """
    chmod +x {params.shellscript}
    {params.shellscript} {input} {output} {params.species}
    """

SnakeMake oschmod From line 43 of rules/spike.smk

shell:
    """
    chmod +x {params.shellscript}
    {params.shellscript} {input} {output.temp_o} {output.real_o} {params.species}
    """

SnakeMake oschmod From line 61 of rules/spike.smk

import sys, collections, itertools, os.path, optparse

args = [snakemake.input.gtf, snakemake.output.flatgtf]

if len( args ) != 2:
   sys.stderr.write( "Script to prepare annotation for DEXSeq.\n\n" )
   sys.stderr.write( "Usage: python %s <in.gtf> <out.gff>\n\n" % os.path.basename(sys.argv[0]) )
   sys.stderr.write( "This script takes an annotation file in Ensembl GTF format\n" )
   sys.stderr.write( "and outputs a 'flattened' annotation file suitable for use\n" )
   sys.stderr.write( "with the count_in_exons.py script.\n" )
   sys.exit(1)

try:
   import HTSeq
except ImportError:
   sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" )   
   sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" )   
   sys.exit(1)



gtf_file = args[0]
out_file = args[1]

aggregateGenes = True

# Step 1: Store all exons with their gene and transcript ID 
# in a GenomicArrayOfSets

exons = HTSeq.GenomicArrayOfSets( "auto", stranded=True )
for f in HTSeq.GFF_Reader( gtf_file ):
   if f.type != "exon":
      continue
   f.attr['gene_id'] = f.attr['gene_id'].replace( ":", "_" )
   exons[f.iv] += ( f.attr['gene_id'], f.attr['transcript_id'] )


# Step 2: Form sets of overlapping genes

# We produce the dict 'gene_sets', whose values are sets of gene IDs. Each set
# contains IDs of genes that overlap, i.e., share bases (on the same strand).
# The keys of 'gene_sets' are the IDs of all genes, and each key refers to
# the set that contains the gene.
# Each gene set forms an 'aggregate gene'.

if aggregateGenes == True:
   gene_sets = collections.defaultdict( lambda: set() )
   for iv, s in exons.steps():
      # For each step, make a set, 'full_set' of all the gene IDs occuring
      # in the present step, and also add all those gene IDs, whch have been
      # seen earlier to co-occur with each of the currently present gene IDs.
      full_set = set()
      for gene_id, transcript_id in s:
         full_set.add( gene_id )
         full_set |= gene_sets[ gene_id ]
      # Make sure that all genes that are now in full_set get associated
      # with full_set, i.e., get to know about their new partners
      for gene_id in full_set:
         assert gene_sets[ gene_id ] <= full_set
         gene_sets[ gene_id ] = full_set


# Step 3: Go through the steps again to get the exonic sections. Each step
# becomes an 'exonic part'. The exonic part is associated with an
# aggregate gene, i.e., a gene set as determined in the previous step, 
# and a transcript set, containing all transcripts that occur in the step.
# The results are stored in the dict 'aggregates', which contains, for each
# aggregate ID, a list of all its exonic_part features.

aggregates = collections.defaultdict( lambda: list() )
for iv, s in exons.steps( ):
   # Skip empty steps
   if len(s) == 0:
      continue
   gene_id = list(s)[0][0]
   ## if aggregateGenes=FALSE, ignore the exons associated to more than one gene ID
   if aggregateGenes == False:
      check_set = set()
      for geneID, transcript_id in s:
         check_set.add( geneID )
      if( len( check_set ) > 1 ):
         continue
      else:
         aggregate_id = gene_id
   # Take one of the gene IDs, find the others via gene sets, and
   # form the aggregate ID from all of them   
   else:
      assert set( gene_id for gene_id, transcript_id in s ) <= gene_sets[ gene_id ] 
      aggregate_id = '+'.join( gene_sets[ gene_id ] )
   # Make the feature and store it in 'aggregates'
   f = HTSeq.GenomicFeature( aggregate_id, "exonic_part", iv )   
   f.source = os.path.basename( sys.argv[0] )
#   f.source = "camara"
   f.attr = {}
   f.attr[ 'gene_id' ] = aggregate_id
   transcript_set = set( ( transcript_id for gene_id, transcript_id in s ) )
   f.attr[ 'transcripts' ] = '+'.join( transcript_set )
   aggregates[ aggregate_id ].append( f )


# Step 4: For each aggregate, number the exonic parts

aggregate_features = []
for l in list(aggregates.values()):
   for i in range( len(l)-1 ):
      assert l[i].name == l[i+1].name, str(l[i+1]) + " has wrong name"
      assert l[i].iv.end <= l[i+1].iv.start, str(l[i+1]) + " starts too early"
      if l[i].iv.chrom != l[i+1].iv.chrom:
         raise ValueError("Same name found on two chromosomes: %s, %s" % ( str(l[i]), str(l[i+1]) ))
      if l[i].iv.strand != l[i+1].iv.strand:
         raise ValueError("Same name found on two strands: %s, %s" % ( str(l[i]), str(l[i+1]) ))
   aggr_feat = HTSeq.GenomicFeature( l[0].name, "aggregate_gene", 
      HTSeq.GenomicInterval( l[0].iv.chrom, l[0].iv.start, 
         l[-1].iv.end, l[0].iv.strand ) )
   aggr_feat.source = os.path.basename( sys.argv[0] )
   aggr_feat.attr = { 'gene_id': aggr_feat.name }
   for i in range( len(l) ):
      l[i].attr['exonic_part_number'] = "%03d" % ( i+1 )
   aggregate_features.append( aggr_feat )


# Step 5: Sort the aggregates, then write everything out

aggregate_features.sort( key = lambda f: ( f.iv.chrom, f.iv.start ) )

fout = open( out_file, "w" ) 
for aggr_feat in aggregate_features:
   fout.write( aggr_feat.get_gff_line() )
   for f in aggregates[ aggr_feat.name ]:
      fout.write( f.get_gff_line() )

fout.close()      

Python HTSeq From line 1 of scripts/dexseq_prepare_annotation.py

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "johannes.koester@uni-due.de"
__license__ = "MIT"

import subprocess
import sys
from pathlib import Path
from snakemake.shell import shell


log = snakemake.log_fmt_shell(stdout=False, stderr=True)


species = snakemake.params.species.lower()
build = snakemake.params.build
release = int(snakemake.params.release)
out_fmt = Path(snakemake.output[0]).suffixes
out_gz = (out_fmt.pop() and True) if out_fmt[-1] == ".gz" else False
out_fmt = out_fmt.pop().lstrip(".")


branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"
elif snakemake.params.get("branch"):
    branch = snakemake.params.branch + "/"


flavor = snakemake.params.get("flavor", "")
if flavor:
    flavor += "."


suffix = ""
if out_fmt == "gtf":
    suffix = "gtf.gz"
elif out_fmt == "gff3":
    suffix = "gff3.gz"
else:
    raise ValueError(
        "invalid format specified. Only 'gtf[.gz]' and 'gff3[.gz]' are currently supported."
    )


url = "ftp://ftp.ensembl.org/pub/{branch}release-{release}/{out_fmt}/{species}/{species_cap}.{build}.{release}.{flavor}{suffix}".format(
    release=release,
    build=build,
    species=species,
    out_fmt=out_fmt,
    species_cap=species.capitalize(),
    suffix=suffix,
    flavor=flavor,
    branch=branch,
)


try:
    if out_gz:
        shell("curl -L {url} > {snakemake.output[0]} {log}")
    else:
        shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
except subprocess.CalledProcessError as e:
    if snakemake.log:
        sys.stderr = open(snakemake.log[0], "a")
    print(
        "Unable to download annotation data from Ensembl. "
        "Did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)

Python Snakemake From line 1 of ensembl-annotation/wrapper.py

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "johannes.koester@uni-due.de"
__license__ = "MIT"

import subprocess as sp
import sys
from itertools import product
from snakemake.shell import shell

species = snakemake.params.species.lower()
release = int(snakemake.params.release)
build = snakemake.params.build

branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"
elif snakemake.params.get("branch"):
    branch = snakemake.params.branch + "/"

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

spec = ("{build}" if int(release) > 75 else "{build}.{release}").format(
    build=build, release=release
)

suffixes = ""
datatype = snakemake.params.get("datatype", "")
chromosome = snakemake.params.get("chromosome", "")
if datatype == "dna":
    if chromosome:
        suffixes = ["dna.chromosome.{}.fa.gz".format(chromosome)]
    else:
        suffixes = ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"]
elif datatype == "cdna":
    suffixes = ["cdna.all.fa.gz"]
elif datatype == "cds":
    suffixes = ["cds.all.fa.gz"]
elif datatype == "ncrna":
    suffixes = ["ncrna.fa.gz"]
elif datatype == "pep":
    suffixes = ["pep.all.fa.gz"]
else:
    raise ValueError("invalid datatype, must be one of dna, cdna, cds, ncrna, pep")

if chromosome:
    if not datatype == "dna":
        raise ValueError(
            "invalid datatype, to select a single chromosome the datatype must be dna"
        )

spec = spec.format(build=build, release=release)
url_prefix = f"ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}"

success = False
for suffix in suffixes:
    url = f"{url_prefix}.{suffix}"

    try:
        shell("curl -sSf {url} > /dev/null 2> /dev/null")
    except sp.CalledProcessError:
        continue

    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
    success = True
    break

if not success:
    if len(suffixes) > 1:
        url = f"{url_prefix}.[{'|'.join(suffixes)}]"
    else:
        url = f"{url_prefix}.{suffixes[0]}"
    print(
        f"Unable to download requested sequence data from Ensembl ({url}). "
        "Please check whether above URL is currently available (might be a temporal server issue). "
        "Apart from that, did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)

Python Snakemake From line 1 of ensembl-sequence/wrapper.py

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"


import tempfile
from pathlib import Path
from snakemake.shell import shell
from snakemake_wrapper_utils.snakemake import get_mem
from snakemake_wrapper_utils.samtools import get_samtools_opts


samtools_opts = get_samtools_opts(snakemake)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

mem_per_thread_mb = int(get_mem(snakemake) / snakemake.threads)

with tempfile.TemporaryDirectory() as tmpdir:
    tmp_prefix = Path(tmpdir) / "samtools_sort"

    shell(
        "samtools sort {samtools_opts} -m {mem_per_thread_mb}M {extra} -T {tmp_prefix} {snakemake.input[0]} {log}"
    )