Generating K-mer Count-Based Features with mlgenofeatures Snakemake Workflow
Help improve this workflow!
This workflow has been published but could be further improved with some additional meta data:- Keyword(s) in categories input, operation
You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .
Generating simulated and actual k-mer count-based features with mlgenofeatures
The mlgenofeatures Snakemake workflow can be used to create files of k-mer counts from actual or simulated short read datasets from a diverse set of mutated haplotype reference sequences.
These k-mer counts can then be used as input to scripts in the mlgenotype python package to train random forest classifiers to recognize large structural variants in real whole genome datasets.
Software dependencies
The mlgenofeatures pipeline requires:
-
Snakemake (https://snakemake.readthedocs.io/en/stable/)
-
ART version ART-MountRainer-2016-06-05 (https://www.niehs.nih.gov/research/resources/software/biostatistics/art/index.cfm)
-
meryl version 1.3 (https://github.com/marbl/meryl)
-
Python with the pysam library (https://pysam.readthedocs.io/en/latest/)
Code Snippets
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 | import os import re import gzip from collections import namedtuple def retrieve_genotype(match): genolist = [] genolist.append(match.group(2)) genolist.append(match.group(4)) return genolist with open(snakemake.log[0], "w") as f: sys.stderr = sys.stdout = f # kmer count file has kmer counts from the simulated fastq file kmer_count_file = snakemake.input[0] # all kmer file establishes the order of the columns, so all sequence sets will generate the same columns in their feature files all_kmer_file = snakemake.input[1] # feature file will contain a column for each kmer with counts feature_file = snakemake.output[0] match = re.search(r'^features/(\S{2})\_(\S{4})\_(\S{2})\_(\S{4})\.(\d+)x\.(\d+)\.features.txt$', feature_file) if match: genotypelist = retrieve_genotype(match) genotypestring = "_".join(genotypelist) print("Found genotype " + genotypestring) kmercounts = {} print("Opening " + kmer_count_file) with open(kmer_count_file, "r") as inkmercount_f: for line in inkmercount_f: line = line.strip() countmatch = re.search(r'^(\S+)\s(\d+)$', line) if countmatch: currentkmer = countmatch.group(1) currentcount = countmatch.group(2) kmercounts[currentkmer] = currentcount #print("Recording kmer " + currentkmer + " with count " + currentcount) else: print("Can\'t find kmer in line:\n" + line) print("Successfully read in simulated kmer counts from file " + kmer_count_file) # calculate total counts for this sample: # append normalized kmer counts to a list for printing: kmertotal = 0 numkmers = 0 countlist = [] with open(all_kmer_file, "r") as inallkmers_f: for line in inallkmers_f: line = line.strip() kmermatch = re.search(r'^(\S+)\s(\d+)$', line) if kmermatch: numkmers = numkmers + 1 printkmer = kmermatch.group(1) if kmercounts.get(printkmer) == None: countlist.append("0") else: countlist.append(str(kmercounts[printkmer])) kmertotal = kmertotal + int(kmercounts[printkmer]) else: print("Can\'t find kmer in line:\n" + line) print("Normalizing") normcountlist = [] for count in countlist: normcountlist.append(str(round(50.0*int(count)*numkmers/kmertotal))) normcountlist.append(genotypestring) with open(feature_file, "w") as outfeature_f: featurestring = "\t".join(normcountlist) print(featurestring, file=outfeature_f) else: print("Couldn\'t parse filename " + feature_file) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 | import os import re import gzip import pysam from collections import namedtuple def print60(seq, outfh): for i in range(0, len(seq), 60): if i+60 <= len(seq): print(seq[i:i+60], file=outfh) else: print(seq[i:len(seq)], file=outfh) def gatherdata(match): Samplegenodata = namedtuple("Samplegenodata", ["samples", "genotypes", "genomefiles", "mutationfiles"]) samplelist = [] genolist = [] genofilelist = [] mutfilelist = [] samplelist.append(match.group(1)) samplelist.append(match.group(3)) genolist.append(match.group(2)) genolist.append(match.group(4)) genofilelist.append(snakemake.config["genomefiles"][samplelist[0]]) genofilelist.append(snakemake.config["genomefiles"][samplelist[1]]) mutfilelist.append(snakemake.config["mutationfiles"][samplelist[0]]) mutfilelist.append(snakemake.config["mutationfiles"][samplelist[1]]) return Samplegenodata(samplelist, genolist, genofilelist, mutfilelist) def retrievemutdata(mutfile, geno): Mutationdata = namedtuple("Mutationdata", ["mutation", "mutentry", "mutstart", "mutend", "regionoffset", "regionstart", "regionend", "regionstrand"]) # Determine whether a mutation is needed for this genotype: mutation = "none" mutentry = "" mutstart = 0 mutend = 0 regionoffset = 0 regionstrand = '' inmuts = open(mutfile, "r") for line in inmuts: fields = line.split() if fields[3] == geno: mutation = fields[5] mutentry = fields[0] mutstart = int(fields[1]) mutend = int(fields[2]) if fields[3] == "REGION": mutentry = fields[0] regionoffset = int(fields[1]) regionstart = regionoffset regionend = int(fields[2]) regionstrand = fields[4] inmuts.close() # mutstart is zero-based start within AlphaThal region, regionoffset is zero-based # start of AlphaThal region in contig, so sum of mutstart and regionoffset is the # zero-based start of the deletion, in coordinates along the contig. First deleted # base is mutstart + 1 (one-based) or mutstart (zero-based) # mutend is one based, so after summing, mutend is one-based end of deletion in # coordinates within the contig mutstart = mutstart + regionoffset mutend = mutend + regionoffset # if there is a deletion or duplication, the regionend value will be altered: if mutation == "deletion": regionend = regionend - (mutend - mutstart) if mutation == "duplication": regionend = regionend + (mutend - mutstart) print("Found mutation " + mutation + " with position " + str(mutstart) + "-" + str(mutend) + " in region from " + str(regionstart) + "-" + str(regionend) ) return Mutationdata(mutation, mutentry, mutstart, mutend, regionoffset, regionstart, regionend, regionstrand) def truncateseq(inputseq, fiveprimeflank, threeprimeflank, capturestart, captureend, strand): print("Applying 5p buffer " + str(fiveprimeflank)) print("Applying 3p buffer " + str(threeprimeflank)) if strand == "+": startbuf = fiveprimeflank endbuf = threeprimeflank if strand == "-": startbuf = threeprimeflank endbuf = fiveprimeflank seqstart = capturestart - startbuf seqend = captureend + endbuf seqlength = len(inputseq) if seqstart < 0: seqstart = 0 print("Only able to include " + str(capturestart) + " of five prime flank to the region of interest") if seqend > seqlength: seqend = seqlength print("Only able to include " + str(seqend - captureend) + " of three prime flank to the region of interest") print("Attempting to extract from " + str(seqstart) + " to " + str(seqend)) truncatedseq = inputseq[seqstart:seqend] return truncatedseq with open(snakemake.log[0], "w") as f: sys.stderr = sys.stdout = f genome_file = snakemake.output[0] match = re.search(r'^genomes/(\S{2})\_(\S{4})\_(\S{2})\_(\S{4})\.fasta$', genome_file) if match: sampledata = gatherdata(match) samples = sampledata.samples genotypes = sampledata.genotypes genomefiles = sampledata.genomefiles mutationfiles = sampledata.mutationfiles fivepgenomeregionbuffer = int(snakemake.config["fivepgenomeregionbuffer"]) threepgenomeregionbuffer = int(snakemake.config["threepgenomeregionbuffer"]) outgenome = open(genome_file, "w") for genomeindex in range(2): inputgenomefile = snakemake.config["genomedir"] + "/" + genomefiles[genomeindex] mutationfile = snakemake.config["mutationdir"] + "/" + mutationfiles[genomeindex] thisgeno = genotypes[genomeindex] mutdata = retrievemutdata(mutationfile, thisgeno) mutation = mutdata.mutation mutentry = mutdata.mutentry mutstart = mutdata.mutstart mutend = mutdata.mutend regionstrand = mutdata.regionstrand regionoffset = mutdata.regionoffset regionstart = mutdata.regionstart regionend = mutdata.regionend currentseqid = '' currentseq = '' startbuf = 0 endbuf = 0 fastagenome = pysam.Fastafile(inputgenomefile) contigseq = fastagenome.fetch(mutentry) print("Length of entry " + mutentry + ":" + str(len(contigseq))) if mutation == "deletion": currentseq = contigseq[0:mutstart] + contigseq[mutend:len(contigseq)] elif mutation == "duplication": currentseq = contigseq[0:mutend] + contigseq[mutstart:mutend] + contigseq[mutend:len(contigseq)] else: currentseq = contigseq[0:len(contigseq)] print("Length of mutated entry " + mutentry + ":" + str(len(currentseq))) if snakemake.config["fivepgenomeregionbuffer"] != "None": currentseq = truncateseq(currentseq, fivepgenomeregionbuffer, threepgenomeregionbuffer, regionstart, regionend, regionstrand) print(">" + mutentry, file=outgenome) print60(currentseq, outgenome) #ingenome = gzip.open(inputgenomefile, "r") #for line in ingenome: #line = str(line,'utf-8') #match = re.search(r'^>\s*(\S+)', line) #if match: ## print current entry if there is one #if len(currentseq) > 0: ## handle case where entry contains our target region: #if currentseqid == mutentry: #print("Found mutation entry " + mutentry + " with length " + str(len(currentseq))) #if mutation == "deletion": #oneseq = currentseq.replace("\n", "") #print("Length of entry " + mutentry + ":" + str(len(oneseq))) #currentseq = oneseq[0:mutstart] + oneseq[mutend:len(oneseq)] #print("Length of mutated entry " + mutentry + ":" + str(len(currentseq))) #if mutation == "duplication": #oneseq = currentseq.replace("\n", "") #print("Length of entry " + mutentry + ":" + str(len(oneseq))) #currentseq = oneseq[0:mutend] + oneseq[mutstart:mutend] + oneseq[mutend:len(oneseq)] #print("Length of mutated entry " + mutentry + ":" + str(len(currentseq))) #if snakemake.config["fivepgenomeregionbuffer"] != "None": #currentseq = truncateseq(currentseq, fivepgenomeregionbuffer, threepgenomeregionbuffer, regionstart, regionend, regionstrand) # #print(">" + currentseqid, file=outgenome) #print60(currentseq, outgenome) #else: #if snakemake.config["fivepgenomeregionbuffer"] == "None": #print(">" + currentseqid, file=outgenome) #print(currentseq, file=outgenome, end="") # #currentseqid = match.group(1) ##print("Found", currentseqid) #currentseq = '' #else: #currentseq = currentseq + line.strip() #ingenome.close() #if len(currentseq) > 0: #if currentseqid == mutentry: #if mutation == "deletion": ## delete appropriate sequence #oneseq = currentseq.replace("\n", "") #print("Length of entry" + mutentry + ":" + str(len(oneseq))) #print("0 to " + str(mutstart) + ", " + str(len(oneseq))) #currentseq = oneseq[0:mutstart] + oneseq[mutend:len(oneseq)] #if mutation == "duplication": #oneseq = currentseq.replace("\n", "") #print("Length of entry" + mutentry + ":" + str(len(oneseq))) #currentseq = oneseq[0:mutend] + oneseq[mutstart:mutend] + oneseq[mutend:len(oneseq)] #print("Length of mutated entry" + mutentry + ":" + str(len(currentseq))) #if snakemake.config["fivepgenomeregionbuffer"] != "None": #currentseq = truncateseq(currentseq, fivepgenomeregionbuffer, threepgenomeregionbuffer, regionstart, regionend, regionstrand) #print(">" + currentseqid, file=outgenome) #print60(currentseq, outgenome) #else: #if snakemake.config["fivepgenomeregionbuffer"] == "None": #print(">" + currentseqid, file=outgenome) #print(currentseq, file=outgenome, end="") # outgenome.close() else: print("Couldn\'t parse filename " + genome_file) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 | import os import re import gzip from collections import namedtuple def retrieve_genotype(match): genolist = [] genolist.append(match.group(2)) genolist.append(match.group(4)) return genolist with open(snakemake.log[0], "w") as f: sys.stderr = sys.stdout = f # kmer count file has kmer counts from the simulated fastq file kmer_count_file = snakemake.input[0] # all kmer file establishes the order of the columns, so all sequence sets will generate the same columns in their feature files all_kmer_file = snakemake.input[1] # feature file will contain a column for each kmer with counts feature_file = snakemake.output[0] kmercounts = {} print("Opening " + kmer_count_file) with open(kmer_count_file, "r") as inkmercount_f: for line in inkmercount_f: line = line.strip() countmatch = re.search(r'^(\S+)\s(\d+)$', line) if countmatch: currentkmer = countmatch.group(1) currentcount = countmatch.group(2) kmercounts[currentkmer] = currentcount #print("Recording kmer " + currentkmer + " with count " + currentcount) else: print("Can\'t find kmer in line:\n" + line) print("Successfully read in simulated kmer counts from file " + kmer_count_file) # calculate total counts for this sample: # append normalized kmer counts to a list for printing: kmertotal = 0 numkmers = 0 countlist = [] with open(all_kmer_file, "r") as inallkmers_f: for line in inallkmers_f: line = line.strip() kmermatch = re.search(r'^(\S+)\s(\d+)$', line) if kmermatch: numkmers = numkmers + 1 printkmer = kmermatch.group(1) if kmercounts.get(printkmer) == None: countlist.append("0") else: countlist.append(str(kmercounts[printkmer])) kmertotal = kmertotal + int(kmercounts[printkmer]) else: print("Can\'t find kmer in line:\n" + line) print("Normalizing") normcountlist = [] for count in countlist: normcountlist.append(str(round(50.0*int(count)*numkmers/kmertotal))) normcountlist.append("Unknown") with open(feature_file, "w") as outfeature_f: featurestring = "\t".join(normcountlist) print(featurestring, file=outfeature_f) |
15 16 | script: "scripts/create_genome.py" |
33 34 35 36 37 | shell: """ for i in $(seq {wildcards.iteration}); do sleep 1; done art_illumina -ss HS25 -f {wildcards.coverage} -l {params.rl} -m {params.il} -s {params.ilsig} -i {input} -na -d {wildcards.samplegenotype}.{wildcards.coverage}x.{wildcards.iteration}. -o simreads/{wildcards.samplegenotype}.{wildcards.coverage}x.{wildcards.iteration}. > {log} 2>&1 """ |
51 52 53 54 55 56 | shell: """ meryl intersect [union-sum output features/{wildcards.samplegenotype}.{wildcards.coverage}x.{wildcards.iteration} [count simreads/{wildcards.samplegenotype}.{wildcards.coverage}x.{wildcards.iteration}.1.fq output features/{wildcards.samplegenotype}.{wildcards.coverage}x.{wildcards.iteration}.1] [count simreads/{wildcards.samplegenotype}.{wildcards.coverage}x.{wildcards.iteration}.2.fq output features/{wildcards.samplegenotype}.{wildcards.coverage}x.{wildcards.iteration}.2]] {params.kmerdb} output features/{wildcards.samplegenotype}.{wildcards.coverage}x.{wildcards.iteration}.targetkmercounts >>{log} 2>&1 meryl print features/{wildcards.samplegenotype}.{wildcards.coverage}x.{wildcards.iteration}.targetkmercounts>{output} 2>>{log} rm -rf features/{wildcards.samplegenotype}.{wildcards.coverage}x.{wildcards.iteration}.targetkmercounts features/{wildcards.samplegenotype}.{wildcards.coverage}x.{wildcards.iteration}.1 features/{wildcards.samplegenotype}.{wildcards.coverage}x.{wildcards.iteration}.2 features/{wildcards.samplegenotype}.{wildcards.coverage}x.{wildcards.iteration} """ |
67 68 | shell: "meryl print {params.kmerdb} | awk 'NR==30*int(NR/30) {{print}}' > {output} 2>>{log}" |
78 79 | script: "scripts/create_featurefile.py" |
88 89 | shell: "cat {input} > {output}" |
99 100 | shell: "(awk '{{print $1}}' features/all_feature_kmers.txt | tr \"\n\" \"\t\"; echo \"Genotype\"; cat features/combined_feature_file.{wildcards.genotype}.noheader.txt ) > {output}" |
113 114 115 116 117 118 | shell: """ meryl intersect [count testsamples/{wildcards.prefix}.fastq output testsamples/{wildcards.prefix}] {params.kmerdb} output testsamples/{wildcards.prefix}.targetkmercounts >>{log} 2>&1 meryl print testsamples/{wildcards.prefix}.targetkmercounts>{output} 2>>{log} rm -rf testsamples/{wildcards.prefix}.targetkmercounts testsamples/{wildcards.prefix} """ |
128 129 | script: "scripts/create_test_featurefile.py" |
Support
- Future updates
Related Workflows





