A `snakemake` workflow for building gene trees

public 1yr ago 0 bookmarks

View Workflow

A snakemake workflow for phylogenomic analysis of genome clusters.

kizuchi is a snakemake workflow for building gene trees, starting from a collection of genomes and HMM profiles, and then analyzing the phylogenetic histories of alleles within genome-level ANI clusters. The aim of this workflow is to automate and document gene tree analysis in a reproducible way, with the ultimate objective of tracing recombination across timescale domains.

gene prediction using prodigal-gv
gene annotation using hmmer
ortholog scoring
amino acid alignment using clustalo
alignment trimming using trimal
tree inference using fasttree
genome clustring using fastANI
phylogenetic correlations using SuchTree

Rule Graph

Code Snippets

shell :
  'rm -rf aggregated_genomes proteins genes \
      hmm_hits hmmdb preani ani ani_clusters \
      scored_proteins cluster_data clusters \
      statistics benchmarks logs'

SnakeMake From line 85 of workflow/Snakefile

run :
  with open( log[0], 'w' ) as LOG :
    fasta_chunks = batched( input.fastas, size=len(input.fastas)//config['chunks'] )
    if not os.path.exists( 'aggregated_genomes' ) : os.mkdir( 'aggregated_genomes' )
    for n,chunk in enumerate( fasta_chunks ) :
      chunkpath = os.path.join( 'aggregated_genomes',
                                'chunk_{n}.fna'.format(n=str(n) ) ) 
      LOG.write( 'writing {chunk}...\n'.format( chunk=chunkpath ) )
      with open( chunkpath, 'w' ) as f :
        contigs = 0
        for i,genome in enumerate( chunk ) :
          for j,seq in enumerate( parse( open( genome ), 'fasta' ) ) :
            seq.id = Path(genome).stem + '_contig_' + str(j)
            seq.description = ''
            f.write( seq.format('fasta') )
          contigs = contigs + j
        LOG.write( '   wrote {i} genomes with {j} contigs\n'.format( i=str(i+1),
                                                                     j=str(contigs+1) ) )

SnakeMake From line 99 of workflow/Snakefile

wrapper :
  'file://wrappers/prodigal-gv'

SnakeMake prodigal-gv From line 131 of workflow/Snakefile

run :
  profiles = []
  for hmmfile in input.hmms :
    hmm = hmmreader.read_single( open( hmmfile ) )
    if Path(hmmfile).stem != hmm.metadata.model_name :
      raise Exception( 'file name ({hmmfile}) must match model name ({modelname}).'.format(
                        hmmfile=hmmfile, modelname=hmm.metadata.model_name ) )
    profiles.append( hmm ) 
  hmmwriter.save_many_to_file( hmms=profiles, output=output.db )

SnakeMake From line 144 of workflow/Snakefile

wrapper :
  'file://wrappers/hmmer/hmmsearch'

SnakeMake hmmsearch (genouest) From line 168 of workflow/Snakefile

run :
  with open( log[0], 'w' ) as LOG :
    prealignments = { Path(p).stem : { 'prealign' : p,
                                       'hits'     : [] } for p in output.prealign }
    # use trusted cutoffs if provided
    trusted_cutoffs = { m.metadata.model_name : m.metadata.trusted_cutoff
                        for m in hmmreader.read_all( open( input.profile ) ) }
    LOG.write( 'parsed {n} hmms, found {m} trusted cutoffs\n'.format( 
                  n=str(len(trusted_cutoffs)),
                  m=str(sum( [ bool(v) for v in trusted_cutoffs.values() ] ) ) ) )
    # load hmmer hits and protein sequences for each chunk
    for tblout,faa in zip( input.hits, input.faa ) :
      hits = polars.DataFrame( [ h for h in parse_hmmsearch( tblout ) ] )
      seqs = { seq.id : seq for seq in parse( open( faa ), 'fasta' ) }
      chunk_genomes = { name.rsplit('_',1)[0] for name in seqs.keys() }
      LOG.write( '{chunk} : found {h} hits for {p} proteins in {g} genomes\n'.format(
                    chunk=Path(faa).stem,
                    h=str(len(hits)),
                    p=str(len(seqs)),
                    g=str(len(chunk_genomes)) ) )
      if len( hits ) == 0 : continue
      # for each hmm, for each genome, write the proteins 
      # with the top hits in order of their score value
      for genome,hmm in product( hits['tname'].unique(),
                                 hits['qname'].unique() ) :
        if trusted_cutoffs[ hmm ] :
          top_hits = hits.filter( ( polars.col('tname') == genome ) &
                                  ( polars.col('qname') == hmm ) &
                                  ( polars.col('score') >= trusted_cutoff[hmm] ) )\
                         .sort( 'score', descending=True )
        else :
          top_hits = hits.filter( ( polars.col('tname') == genome ) &
                                  ( polars.col('qname') == hmm ) &
                                  ( polars.col('eval') <= EVALUE ) )\
                         .sort( 'score', descending=True )
        if len( top_hits ) == 0 : continue
        with open( prealignments[ hmm ]['prealign'], 'a' ) as f :
          for n,protein in enumerate( top_hits['tname'] ) :
            seq = deepcopy( seqs[ protein ] )
            if seq.seq[-1] == '*' : seq = seq[:-1]
            seq.id = seq.id + '_p' + str(n)
            prealignments[hmm]['hits'].append( seq.id )
            f.write( seq.format( 'fasta' ) )
    for k,p in prealignments.items() :
      genomes = { h.rsplit('_',4)[0] for h in p['hits'] }
      LOG.write( '{hmm} : \n'.format( hmm=k ) )
      LOG.write( '   genomes : {n}\n'.format( n=str(len(genomes)) ) )
      LOG.write( '   hits    : {n}\n'.format( n=str(len(p['hits'])) ) )
      if len( p['hits'] ) == 0 :
        LOG.write( '    NONE\n' )

SnakeMake hmmer From line 184 of workflow/Snakefile

run :
  with open( output.references, 'w' ) as rf :
    for fasta in input.references :
      rf.write( fasta + '\n' )
  with open( output.queries, 'w' ) as qf :
    for fasta in input.queries :
      qf.write( fasta + '\n' )

SnakeMake From line 247 of workflow/Snakefile

wrapper :
  'file://wrappers/fastani'

SnakeMake From line 267 of workflow/Snakefile

run :
  header = [ 'query', 'reference', 'ANI', 
             'bidirectional fragment mappings',
             'total query fragments' ]

  fastani = pandas.read_csv( input.ani, sep='\t', names=header )

  # file names -> genome names
  fastani['query']     = [ Path(p).stem for p in fastani['query'] ]
  fastani['reference'] = [ Path(p).stem for p in fastani['reference'] ]

  # drop self-hits
  fastani = fastani[ fastani['query'] != fastani['reference'] ]

  # de-duplicate (i.e., take the lower triangle of the matrix)
  fastani['hash'] = fastani.apply( lambda row : hash( 
                                                  tuple(
                                                    sorted( ( row['query'],
                                                              row['reference'] ) ) ) ), 
                                  axis=1 )

  fastani.drop_duplicates( subset='hash', keep='first', inplace=True )
  fastani.drop( ['hash'], axis=1, inplace=True )

  # convert % identity to a distance measure for clustering
  threshold = 1.0 - float( Path( output[0] ).stem.rsplit( '_', 1 )[1] )/100

  with open( output[0], 'w' ) as f :
    genomes = list( set( fastani['query'] ) | set( fastani['reference'] ) )

    D = zeros( ( len(genomes), len(genomes) ) )
    D.fill( 1.0 )

    for i in range(len(genomes)) :
      D[i,i] = 0.0

    for n,row in fastani.iterrows() :
      i = genomes.index( row['query'] )
      j = genomes.index( row['reference'] )
      D[i,j] = D[j,i] = 1.0 - row['ANI']/100

    ani_clusters = defaultdict(list)
    for n,c in enumerate( fcluster( single( squareform(D) ),
                                    threshold,
                                    criterion='distance' ) ) :
      ani_clusters[c].append( genomes[n] )

    for cid, cg in ani_clusters.items() :
      f.write( '{cid}\t{genomes}\n'.format( cid=str(cid), genomes=','.join(cg) ) )

SnakeMake From line 280 of workflow/Snakefile

run :
  def intersect( hmms, genomes, threshold ) :
    C = defaultdict( set )
    for hmm,genome in zip( hmms, genomes ) :
      C[hmm].add(genome)
    return sum( [ len( C[a] & C[b] ) > threshold
                  for a,b in combinations( C.keys(), 2 ) ] )

  with open( log[0], 'w' ) as LOG :
    LOG.write( 'loading sequences...\n' )

    sequences = { Path(fasta).stem : { rec.id : rec.seq
                                       for rec
                                       in parse( open( fasta ), 'fasta' ) } 
                  for fasta
                  in input.prealignments }

    LOG.write( 'Genes loaded   : {n}\n'.format( n=str(len(sequences)) ) )

    geneseqs = defaultdict(dict)
    for chunk in input.genes :
      for seq in parse( open( chunk ), 'fasta' ) :
        genome = seq.id.rsplit('_',3)[0] 
        geneseqs[ genome ][ seq.id ] = seq

    LOG.write( 'Genomes loaded : {n}\n'.format( n=str(len(geneseqs)) ) )

    LOG.write( 'creating gene table for genome clusters...\n' )

    records = []
    for ani,clusterfile in zip( params.ANIs, input.ani_clusters ) :
      for cid,names in [ line.split() for line in open( clusterfile ) ] :
        names = names.split(',')
        #if not len( names ) > 5 : continue
        for hmm in sequences.keys() :
          seq_ids = [ seq_id for seq_id in sequences[hmm].keys() if seq_id.rsplit('_',4)[0] in names ]
          for seq_id in seq_ids :
            records.append( { 'seq_id' : seq_id,
                              'genome' : seq_id.rsplit('_',4)[0],
                              'cid'    : int( cid ),
                              'hmm'    : hmm,
                              'ANI'    : float( ani ),
                              'id'     : cid + '_' + Path( clusterfile ).stem.split('_')[-1] } )

    df = polars.DataFrame( records )

    LOG.write( 'number of clusters by ANI...\n' )
    for ani,n in df.groupby('ANI').agg( polars.col('cid').unique().count() ).sort('ANI').iter_rows() :
      LOG.write( '    ANI={ani} : {n} clusters\n'.format( ani=ani, n=n ) )

    LOG.write( 'gene table description :\n\n' + str( df.describe() ) + '\n' )

    clusters = df.groupby('id').agg( ['hmm','genome'] ).apply(
                       lambda x : ( x[0],
                         intersect( x[1],
                                    x[2],
                                    MIN_CLUSTER_LINKS ) ) ).filter( polars.col('column_1') > 0 )['column_0']

    LOG.write( 'found {n} genome clusters with at least {T} genomes sharing at least two genes.\n'.format(
               n=len(clusters), T=MIN_CLUSTER_LINKS ) )

    cluster_table = df.filter( polars.col('id').is_in( clusters ) )

    LOG.write( 'filtered gene table description :\n\n' + str( cluster_table.describe() ) )

    cluster_table.write_csv( file=output.cluster_table, separator='\t' )

    # create cluster directory structure and write gene nucleotide
    # sequences as individual fastas per cluster

    LOG.write( 'found {n} clusters.\n'.format( n=len( cluster_table.groupby('id').agg('seq_id') ) ) )

    if not os.path.exists( 'clusters' ) : os.mkdir( 'clusters' )

    for ANI,cids in cluster_table.groupby('ANI').agg( polars.col('cid').unique() ).sort('ANI').iter_rows() :

      anipath = os.path.join( 'clusters', str(ANI) )
      if not os.path.exists( anipath ) : os.mkdir( anipath )

      for cluster_id in cids :

        clusterpath = os.path.join( anipath, str( cluster_id ) )
        if not os.path.exists( clusterpath ) : os.mkdir( clusterpath )
        genepath = os.path.join( clusterpath, 'genes' )
        if not os.path.exists( genepath ) : os.mkdir( genepath )

        for hmm,seq_ids,genomes in cluster_table.filter( ( polars.col('ANI') == ANI ) & ( polars.col('cid') == cluster_id ) ).groupby('hmm').agg( ['seq_id', 'genome'] ).iter_rows() :
          with open( os.path.join( genepath, hmm + '.fna' ), 'w' ) as f :
            for seq_id,genome in zip( seq_ids, genomes ) :
              orig_seq_id = seq_id.rsplit('_',1)[0] # remove hmmer evalue rank
              seq = geneseqs[genome][orig_seq_id] 
              seq.id = seq_id
              f.write( seq.format( 'fasta' ) )