Data Cleaning and Quality Control Workflow for Viral Sequencing Data
Cleaning the data
- [ ] Make sure all required outputs are in the all_rule
- [ ] Convert RVHaplo viral accessions to viral names (+ accession?)
- [ ] Investigate Strainline errors
- [ ] Investigate RVHaplo errors (exit error 0)
- [ ] Check trimmed fastq data for exact-match adapters
  - Make sure the adapter pairs are together
  - Make sure the same barcode is present and is identical for every read from a given sample
- [ ] Check for chimeric reads (see the sketch after this list)
  - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5600009/ -> chimeric reads appear in roughly 1.7% of sequences even when samples are prepped separately
  - if a read has an exact match to an adapter, align it to the viral reference
    - if the read has a $\geq$ 80% match, it is not chimeric
    - otherwise the read is chimeric and is split at the adapter
  - this should happen after deduplication/host removal
    - but to align we need to have already selected the target viral strain (variant)
    - if a read is chimeric, the two resulting smaller reads may belong to different viral variants than the one selected at the start
    - it could also affect deduplication, because technically the two pieces are different reads
    - it could also affect trimming, because a chimeric read means the adapter sequence may not be at the beginning of the sequence as we might expect
  - so does it make more sense to remove chimeras during read trimming? Instead of comparing to a selected viral reference, we would just compare to the whole viral database
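As a concrete starting point for the chimera check sketched above, here is a minimal Python sketch. It is not the workflow's current implementation: the adapter sequence and file names are placeholders, the mappy bindings for minimap2 are an assumption (the workflow currently calls minimap2 on the command line), and the 80% cut-off is the figure from the note above.

```python
# Sketch only: flag and split putative chimeric reads based on internal adapter hits.
import gzip
import re

import mappy  # minimap2 Python bindings (assumption; the workflow calls minimap2 via the CLI)

ADAPTERS = ["AATGTACTTCGTTCAGTTACGTATTGCT"]  # placeholder; substitute the kit's adapter sequences
MATCH_FRACTION = 0.80                        # ">= 80% match -> not chimeric" from the note above


def read_fastq(path):
    """Yield (read_id, seq, qual) from a gzipped FASTQ file."""
    with gzip.open(path, "rt") as fq:
        for header in fq:
            seq = next(fq).strip()
            next(fq)                  # '+' separator line
            qual = next(fq).strip()
            yield header.split()[0][1:], seq, qual


def best_match_fraction(aligner, seq):
    """Fraction of the read covered by matching bases in its best primary hit."""
    fractions = [hit.mlen / len(seq) for hit in aligner.map(seq) if hit.is_primary]
    return max(fractions, default=0.0)


def check_reads(fastq, reference):
    """Exact adapter hit -> align to the selected reference; < 80% match -> split at the adapter."""
    aligner = mappy.Aligner(reference, preset="map-ont")  # selected target strain fasta (assumed)
    for read_id, seq, qual in read_fastq(fastq):
        hits = [m.start() for a in ADAPTERS for m in re.finditer(a, seq)]
        if not hits or best_match_fraction(aligner, seq) >= MATCH_FRACTION:
            yield read_id, seq, qual                      # no internal adapter, or aligns well: keep
        else:
            pos = hits[0]                                 # split at the first internal adapter hit
            yield f"{read_id}_A", seq[:pos], qual[:pos]
            yield f"{read_id}_B", seq[pos:], qual[pos:]


if __name__ == "__main__":
    # Hypothetical file names, for illustration only.
    for rid, s, q in check_reads("barcode01.trimmed.fastq.gz", "target_strain.fasta"):
        print(f"@{rid}\n{s}\n+\n{q}")
```

The _A/_B suffixes mirror the naming used by split_read in the trimming script below. A real implementation would also need to decide how to handle adapter hits at the very start or end of a read, reads with multiple internal hits, and whether the adapter itself should be dropped from the split pieces.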
Generate improved stats
- [ ] Write R script(s) for looking at the read QC data
Extra
- Command to make the rule graph image (dag.svg); use dot -Tpdf > dag.pdf for a PDF
  - snakemake --forceall --rulegraph | dot -Tsvg > dag.svg
- Putting haplotype generation on hold FOR NOW
- [ ] Add a check for the number of reads aligned to a single strain: if (for some reason) the read count falls below a selected threshold (check the RVHaplo docs to confirm a sensible value), don't run RVHaplo for that strain x barcode. A sketch of this guard follows below.
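A minimal sketch of that guard, assuming Python: it reuses the samtools view -c -F 260 count already used by the RVHaplo wrapper script under Code Snippets. The 200-read minimum and the example file name are placeholders, not values from the RVHaplo documentation.

```python
# Sketch only: decide whether RVHaplo should run for one strain x barcode SAM file.
import subprocess
import sys

MIN_READS = 200  # placeholder threshold; confirm a sensible minimum against the RVHaplo docs


def aligned_read_count(sam_file):
    """Count primary mapped reads, mirroring `samtools view -c -F 260` in the wrapper script."""
    result = subprocess.run(
        ["samtools", "view", "-c", "-F", "260", sam_file],
        check=True, capture_output=True, text=True,
    )
    return int(result.stdout.strip())


def should_run_rvhaplo(sam_file, min_reads=MIN_READS):
    count = aligned_read_count(sam_file)
    if count < min_reads:
        print(f"Skipping {sam_file}: only {count} aligned reads", file=sys.stderr)
        return False
    return True


if __name__ == "__main__":
    # e.g. python rvhaplo_guard.py barcode01.aligned.some_strain.sam
    sys.exit(0 if should_run_rvhaplo(sys.argv[1]) else 1)
```

The same check could equally be added as an extra condition inside the wrapper script's per-reference loop.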
Workflow Image
Code Snippets
```bash
outdir=$1
indir=$2
barcode=$3
reads=$4
threads=$5

mkdir -p ${outdir};

for r in ${indir}*; do
    file="$(basename -- $r)";
    vir=${file%.fasta};
    minimap2 \
        --secondary=no \
        -t ${threads} \
        -a ${r} ${reads} \
        -o ${outdir}${barcode}.aligned.${vir}.sam;
done
```
```python
# Bin reads by length: append each read (as FASTA) to a <length>.rl.bins.fasta.gz file in --outdir.
import argparse
import gzip

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--infile', required=True)
    parser.add_argument('--outdir', required=True)
    return parser.parse_args()

def gen_read_length_clusters(reads, outdir):
    for read in reads:
        read_id = read.split(b' ')[0][1:]
        seq = next(reads)
        seq_len = len(seq.strip())
        next(reads) # desc
        next(reads) # qual
        outfile = f'{outdir}/{seq_len}.rl.bins.fasta.gz'
        with gzip.open(outfile, "a") as o:
            o.write(b'>' + read_id + b'\n')
            o.write(seq)

def main():
    args = parse_args()
    reads = (line for line in gzip.open(args.infile, 'rb'))
    gen_read_length_clusters(reads, args.outdir)

if __name__ == "__main__":
    main()
```
```python
# Merge per-length read bins into clusters spanning ~9% of the smallest length, then concatenate them.
import argparse
from math import ceil
from os import listdir, system

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--indir', required = True)
    parser.add_argument('--outdir', required = True)
    return parser.parse_args()

def get_bins(indir):
    bins = [int(f.split('.')[0]) for f in listdir(indir) if f != '.DS_Store']
    bins.sort()
    return bins

def get_clusters(bins):
    clusters = []
    while bins:
        cluster_len = ceil(bins[0]*1.09)
        cluster = [x for x in bins if x <= cluster_len]
        clusters.append(cluster)
        bins = [x for x in bins if x not in cluster]
    return clusters

def cat_files(clusters, indir, outdir):
    for cluster in clusters:
        start = cluster[0]
        end = cluster[-1]
        files = ' '.join([f'{indir}/{x}.rl.bins.fasta.gz' for x in cluster])
        system(f'cat {files} > {outdir}/{start}.to.{end}.rl.clusters.fasta.gz')

def main():
    args = parse_args()
    bins = get_bins(args.indir)
    clusters = get_clusters(bins)
    cat_files(clusters, args.indir, args.outdir)

if __name__ == '__main__':
    main()
```
```python
# Split a gzipped FASTQ into deduplicated reads and duplicates, using a list of duplicate read IDs.
import argparse
import gzip

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--reads', required=True)
    parser.add_argument('--duplicates', required=True)
    parser.add_argument('--out_reads', required=True)
    parser.add_argument('--out_dupes', required=True)
    return parser.parse_args()

def get_duplicates(dupes_file):
    return [line.strip() for line in open(dupes_file)]

def get_reads(reads):
    return (line for line in gzip.open(reads, 'rb'))

def make_out_file(fq):
    open(fq, 'wb').close()

def write_out(outfile, read_id, seq, desc, qual):
    with gzip.open(outfile , "a") as o:
        o.write(read_id)
        o.write(seq)
        o.write(desc)
        o.write(qual)

def dup_check(read, reads, duplicates, out_reads, out_dupes):
    read_id = read.split(b' ')[0][1:].decode('utf-8')
    if read_id in duplicates:
        write_out(out_dupes, read, next(reads), next(reads), next(reads))
    else:
        write_out(out_reads, read, next(reads), next(reads), next(reads))

def gen_deduped_reads(reads, duplicates, out_reads, out_dupes):
    [dup_check(read, reads, duplicates, out_reads, out_dupes) for read in reads]

def main():
    args = parse_args()
    duplicates = get_duplicates(args.duplicates)
    reads = get_reads(args.reads)
    make_out_file(args.out_reads)
    make_out_file(args.out_dupes)
    gen_deduped_reads(reads, duplicates, args.out_reads, args.out_dupes)

if __name__ == "__main__":
    main()
```
```python
# Parse a BLAT self-alignment (.psl) and report read IDs that match another read above the similarity threshold.
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', help='BLAT alignment file (.psl)', required=True)
    parser.add_argument('-o', required=True)
    parser.add_argument('-s', help='Similarity threshold', type=float, required=True)
    return parser.parse_args()

def read_psl(psl):
    return (q.strip().split('\t') for q in open(psl))

def skip_header(qresults):
    [next(qresults) for _ in range(5)]

def high_match(q, threshold):
    return (int(q[0]) >= threshold * int(q[14]) # amount covered is 90% of target?
            and int(q[0]) >= threshold * int(q[10]) # amount covered is 90% of query?
            and q[9] != q[13]) # doesn't equal self

def find_dupes(qresults, threshold):
    dupes = {}
    [dupes.setdefault(q[9], []).append(q[13]) for q in qresults if high_match(q, threshold)]
    return dupes

def try_remove(dupes, x):
    try:
        dupes.pop(x)
    except KeyError:
        pass

def look_through_dupes(dupes, k):
    try:
        [try_remove(dupes, x) for x in dupes[k]]
    except KeyError:
        pass

def remove_doubles(dupes):
    results = []
    [look_through_dupes(dupes, k) for k in list(dupes)]
    [results.extend(dupes[s]) for s in dupes]
    return set(results)

def write_out(outfile, dup_set):
    with open(outfile, 'w') as o:
        [o.write(f'{dup}\n') for dup in dup_set]

def main():
    args = parse_args()
    qresults = read_psl(args.p)
    skip_header(qresults)
    dupes = find_dupes(qresults, args.s)
    cleaned_dupes = remove_doubles(dupes)
    write_out(args.o, cleaned_dupes)

if __name__ == "__main__":
    main()
```
```python
# Choose viral target references: keep those with >80% of positions covered, pick the best per strain name,
# write them out as a BED of targets, and log per-accession coverage stats.
from argparse import ArgumentParser
from pprint import pprint

def parse_args():
    parser = ArgumentParser()
    parser.add_argument('--mpileup', required = True)
    parser.add_argument('--bed', required = True)
    parser.add_argument('--strains', required = True)
    parser.add_argument('--logfile', required = True)
    parser.add_argument('--outfile', required = True)
    return parser.parse_args()

def read_pileup(pileup):
    return (row.split('\t') for row in open(pileup))

def parse_bed(bed_file):
    bed = (row.split('\t') for row in open(bed_file))
    return dict([(row[0], row[2]) for row in bed])

def parse_strains(strain_db):
    strains = (row.split('\t') for row in open(strain_db))
    return dict([(row[0], row[1]) for row in strains])

def parse_row(covered_counts_dict, strains, row):
    covered_counts_dict.setdefault(row[0], [strains[row[0]].strip(), 0, 0])
    covered_counts_dict[row[0]][1] += int(row[1])
    if int(row[1]) > 0:
        covered_counts_dict[row[0]][2] += 1

def get_counts(pileup, strains):
    covered_counts = {}
    [parse_row(covered_counts, strains, row) for row in pileup]
    return covered_counts

def genome_fraction(key, cc, tc):
    return (cc[key][2]/int(tc[key])) * 100

def mean_depth(key, cc, tc):
    return cc[key][1]/int(tc[key])

def high_match(key, cc, tc):
    return genome_fraction(key, cc, tc) > 80.0

def get_viral_targets(cc, tc):
    # write logfile here?
    global log_lines
    best_matches = {}
    log_lines = [(key, cc[key][0], genome_fraction(key, cc, tc), mean_depth(key, cc, tc), key) for key in cc.keys()]
    [best_matches.setdefault(cc[key][0], []).append((genome_fraction(key, cc, tc), mean_depth(key, cc, tc), key)) for key in cc.keys() if high_match(key, cc, tc)]
    [best_matches[key].sort(reverse=True) for key in best_matches.keys()]
    return [best_matches[key][0] for key in best_matches.keys()]

def write_viral_targets(targets, tc, outfile):
    with open(outfile, "w") as o:
        [o.write(f'{t[2]}\t0\t{tc[t[2]]}') for t in targets]

def write_log(logfile):
    with open(logfile,'w') as o:
        o.write(f'Accession\tStrain\tHorizontalCov\tMeanDepth\n')
        [o.write(f'{line[0]}\t{line[1]}\t{line[2]}\t{line[3]}\n') for line in log_lines]

def main():
    args = parse_args()
    pileup = read_pileup(args.mpileup)
    total_counts = parse_bed(args.bed) # length of the virus
    strains = parse_strains(args.strains)
    covered_counts = get_counts(pileup, strains)
    #pprint(covered_counts)
    targets = get_viral_targets(covered_counts, total_counts)
    write_viral_targets(targets, total_counts, args.outfile)
    write_log(args.logfile)

if __name__ == '__main__':
    main()
```
```python
# Look up the GenBank SOURCE organism name for each FASTA accession via NCBI Entrez (in chunks of 1000).
import argparse
from Bio import Entrez
from tqdm import tqdm

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--infile', required=True)
    parser.add_argument('--outfile', required=True)
    parser.add_argument('--email', required=True)
    return parser.parse_args()

def get_accessions(infile):
    return [line.strip().split(' ')[0][1:] for line in open(infile) if '>' in line]

def gen_chunk_list(accessions, n):
    return [accessions[i:i + n] for i in range(0, len(accessions), n)]

def get_sources(accessions):
    handle = Entrez.efetch(db='nucleotide', rettype='gb', id=','.join(accessions))
    sources = [line.strip().split(' ')[-1][1:] for line in handle if 'SOURCE' in line]
    handle.close()
    return sources

def process_chunks(accession_chunk_list):
    results = []
    [results.extend(el) for el in (get_sources(chunk) for chunk in tqdm(accession_chunk_list))]
    return results

def write_out(accessions, sources, outfile):
    with open(outfile, 'w') as o:
        [o.write(f'{a}\t{s}\n') for a,s in zip(*[accessions, sources])]

def main():
    args = parse_args()
    Entrez.email = args.email
    accessions = get_accessions(args.infile)
    accession_chunk_list = gen_chunk_list(accessions, 1000)
    sources = process_chunks(accession_chunk_list)
    write_out(accessions, sources, args.outfile)

if __name__ == '__main__':
    main()
```
```bash
wget https://github.com/gt1/daccord/releases/download/0.0.10-release-20170526170720/daccord-0.0.10-release-20170526170720-x86_64-etch-linux-gnu.tar.gz;
tar -zvxf daccord-0.0.10-release-20170526170720-x86_64-etch-linux-gnu.tar.gz;
rm daccord-0.0.10-release-20170526170720-x86_64-etch-linux-gnu.tar.gz;
mv daccord-0.0.10-release-20170526170720-x86_64-etch-linux-gnu/ scripts/;
mv scripts/daccord-0.0.10-release-20170526170720-x86_64-etch-linux-gnu/ scripts/daccord/;
chmod +x scripts/daccord/bin/daccord
```
```bash
git clone $1;
chmod +x RVHaplo/rvhaplo.sh;
chmod +x RVHaplo/src/*;
sed -i 's^./src/^scripts/RVHaplo/src/^' RVHaplo/rvhaplo.sh;
sed -i 's^./src/^scripts/RVHaplo/src/^' RVHaplo/src/out_haplotypes.py;
mv RVHaplo/ scripts/
```
```bash
git clone $1;
chmod +x Strainline/src/strainline.sh;
mv Strainline/ scripts/
```
```python
# Merge host and viral mapping stats into one table with host, on-target, and unmapped read percentages.
from argparse import ArgumentParser
import pandas as pd

def parse_args():
    parser = ArgumentParser()
    parser.add_argument('--host_stats', required = True)
    parser.add_argument('--viral_stats', required = True)
    parser.add_argument('--outfile', required = True)
    return parser.parse_args()

def prep_host_stats(host_stats):
    df = pd.read_table(host_stats)
    df.drop(columns=['Unmapped'], inplace=True)
    df.columns = ['Sample', 'NumberOfInputReads', 'MappedToHost']
    return df

def prep_viral_stats(viral_stats):
    df = pd.read_table(viral_stats)
    df.drop(columns=['NumberOfInputReads', 'Unmapped'], inplace=True)
    df.columns=['Sample', 'MappedToViralDB']
    return df

def on_target_stats(host_df, viral_df, outfile):
    final_df = pd.merge(host_df, viral_df, on=['Sample'])
    final_df['Unmapped'] = final_df['NumberOfInputReads'] - (final_df['MappedToHost'] + final_df['MappedToViralDB'])
    final_df['HostReadsPercent'] = (final_df['MappedToHost']/final_df['NumberOfInputReads'])*100
    final_df['OnTargetPercent'] = (final_df['MappedToViralDB']/final_df['NumberOfInputReads'])*100
    final_df.to_csv(outfile, index=False, sep='\t')

def main():
    args = parse_args()
    host_df = prep_host_stats(args.host_stats)
    viral_df = prep_viral_stats(args.viral_stats)
    on_target_stats(host_df, viral_df, args.outfile)

if __name__ == '__main__':
    main()
```
```python
# Collect RVHaplo haplotype FASTA outputs into a single summary table (one row per haplotype).
from argparse import ArgumentParser
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from os import listdir
import pandas as pd

def parse_args():
    parser = ArgumentParser()
    parser.add_argument('--indir')
    parser.add_argument('--strains', required=True)
    parser.add_argument('--outfile')
    return parser.parse_args()

def parse_strains(strain_db):
    strains = (row.split('\t') for row in open(strain_db))
    return dict([(row[0], row[1]) for row in strains])

def get_strain_name(strain):
    try:
        name = strains[strain]
    except KeyError:
        name = '*'
    return name.strip()

def get_haplotypes(fq):
    try:
        fasta = [seq for seq in SeqIO.parse(open(fq), 'fasta')]
    except:
        fasta = [SeqRecord(Seq("NAN"), id = "None")]
    return fasta

def make_df():
    global df
    columns = [
        'Strain',
        'Name',
        'Haplotype',
        'Length',
        'Abundance',
        'Reads#',
        'Depth',
        'Seq'
    ]
    df = pd.DataFrame(columns=columns)

def parse_haplotype(strain, hap):
    # Add name here after strain
    data = str(hap.id).split('_')
    df.loc[len(df.index)] = [
        strain,
        get_strain_name(strain),
        data[1],
        data[3],
        data[5],
        data[9],
        data[11],
        str(hap.seq)
    ]

def parse_rvhaplo_out(indir, directory):
    strain = directory.split('_')[-1]
    f = f'{indir}{directory}/rvhaplo_haplotypes.fasta'
    # some of the outputs don't have a fasta file... So I need to come up with a case for this
    fq = get_haplotypes(f)
    [parse_haplotype(strain, hap) for hap in fq]

def main():
    global strains
    args = parse_args()
    # haplotype_0_length_979_abundance_1_number_of_reads_94_depth_50.65580448065173
    make_df()
    strains = parse_strains(args.strains)
    # pulling data from haplotype fasta files
    [parse_rvhaplo_out(args.indir, directory) for directory in listdir(args.indir)]
    df.to_csv(args.outfile, sep='\t', index=False)

if __name__ == '__main__':
    main()
```
```python
# Write a ReadId/ReadLength table for a gzipped FASTQ file.
import argparse
import gzip

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--infile', required=True)
    parser.add_argument('--outfile', required=True)
    return parser.parse_args()

def skip_lines(reads):
    next(reads)
    next(reads)

def process_fastq_read(read, reads):
    read_id = read.split(' ')[0][1:]
    read_len = len(next(reads))
    skip_lines(reads)
    return f'{read_id}\t{read_len}\n'

def get_read_lengths(reads):
    return [process_fastq_read(read, reads) for read in reads]

def write_out(outfile, read_lengths):
    with open(outfile, 'w') as o:
        o.write('ReadId\tReadLength\n')
        [o.write(length) for length in read_lengths]

def main():
    args = parse_args()
    reads = (line.strip() for line in gzip.open(args.infile, 'rt'))
    read_lengths = get_read_lengths(reads)
    write_out(args.outfile, read_lengths)

if __name__ == '__main__':
    main()
```
```python
# Run blat -fastMap on each read-length cluster against itself, in parallel across clusters.
from argparse import ArgumentParser
from os import listdir, getcwd, system
from concurrent.futures import ProcessPoolExecutor as ppe

def parse_args():
    parser = ArgumentParser()
    parser.add_argument('--threads', type = int, required = True)
    parser.add_argument('--outdir', required = True)
    parser.add_argument('--read_clusters', required = True)
    return parser.parse_args()

def get_read_clusters(rc):
    return [f'{getcwd()}/{rc}/{f}' for f in listdir(rc)]

def get_out_list(outdir, l):
    return [outdir] * l

def run_blat(cluster, outdir):
    cluster_id = cluster.split('/')[-1].split('.rl.clusters.fasta.gz')[0]
    system(f'blat -fastMap {cluster} {cluster} {outdir}{cluster_id}.psl')

def thread(threads, rc, out_list):
    with ppe(threads) as p:
        p.map(run_blat, rc, out_list)

def main():
    args = parse_args()
    rc = get_read_clusters(args.read_clusters)
    out_list = get_out_list(args.outdir, len(rc))
    thread(args.threads, rc, out_list)

if __name__ == '__main__':
    main()
```
```bash
outdir=$1
psl_indir=$2
threshold=$3

mkdir -p ${outdir};

for f in ${psl_indir}*; do
    file="$(basename -- $f)";
    cluster=${file%.psl};
    scripts/find_duplicates.py \
        -p $f \
        -s ${threshold} \
        -o ${outdir}${cluster}.dupes.txt;
done
```
```bash
outdir=$1
vir_indir=$2
sam_indir=$3
barcode=$4
threads=$5
sub_graph=1

mkdir -p ${outdir};

for r in ${vir_indir}*; do
    file="$(basename -- $r)";
    vir=${file%.fasta};
    sam_file=${sam_indir}${barcode}.aligned.${vir}.sam;
    read_count=$(samtools view -c -F 260 ${sam_file});
    if [ ${read_count} -gt 50000 ]; then
        sub_graph=$(echo ${read_count}/25000 | bc)
    fi;
    scripts/RVHaplo/rvhaplo.sh \
        -i ${sam_file} \
        -r ${r} \
        -t ${threads} \
        -sg ${sub_graph} \
        -l 0 \
        -o ${outdir}rvhaplo_${barcode}_${vir}/;
done
```
```python
# Summarise samtools idxstats output into per-sample mapped/unmapped read counts.
import argparse

def parse_cmdline_params():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', nargs='+', required=True)
    parser.add_argument('-o', required=True)
    return parser.parse_args()

def write_out(idxstats, output_file):
    with open(output_file, 'w') as o:
        o.write('Sample\tNumberOfInputReads\tMapped\tUnmapped\n')
        [o.write(stat) for stat in idxstats]

def mapping_stats(infile):
    sample_name = infile.split('/')[-1].split('.', 1)[0]
    idxstats = ((int(row.strip().split('\t')[2]),int(row.strip().split('\t')[3])) for row in open(infile))
    mapped, unmapped = [sum(x) for x in zip(*idxstats)]
    number_of_reads = mapped + unmapped
    return f'{sample_name}\t{number_of_reads}\t{mapped}\t{unmapped}\n'

def main():
    args = parse_cmdline_params()
    idxstats = list(map(mapping_stats, args.i))
    write_out(idxstats, args.o)

if __name__ == "__main__":
    main()
```
```bash
mkdir $2

awk -v out=$2 'BEGIN {n=0;}
    /^>/ {
        if(n%1==0){file=sprintf(out"/chunk%d.fasta",n);}
        print >> file;
        n++;
        next;
    }
    { print >> file; }' < $1

for f in $2/chunk*; do
    line=$(head -n 1 $f | cut -d':' -f1);
    name=${line:1}.fasta;
    mv $f $2/$name;
done
```
```bash
input=$1
output=$2

echo $(zcat ${input} | wc -l)/4 | bc > ${output}
```
```python
# Summarise a samtools idxstats file into per-strain read counts, annotated with strain names.
import argparse

def parse_cmdline_params():
    parser = argparse.ArgumentParser()
    parser.add_argument('--infile', required=True)
    parser.add_argument('--strains', required=True)
    parser.add_argument('--outfile', required=True)
    return parser.parse_args()

def parse_strains(strain_db):
    strains = (row.split('\t') for row in open(strain_db))
    return dict([(row[0], row[1]) for row in strains])

def get_strain_name(strains, strain):
    try:
        name = strains[strain]
    except KeyError:
        name = '*'
    return name.strip()

def write_out(idxstats, strains, output_file):
    with open(output_file, 'w') as o:
        # o.write('Strain\tName\tNumberOfInputReads\tMapped\tUnmapped\tPercentMapped\n')
        # [o.write(f'{strain}\t{get_strain_name(strains, strain)}\t{idxstats[strain][0]}\t{idxstats[strain][1]}\t{idxstats[strain][2]}\t{idxstats[strain][3]}\n') for strain in idxstats]
        o.write('Strain\tName\tNumberOfMappedReads\n')
        [o.write(f'{strain}\t{get_strain_name(strains, strain)}\t{idxstats[strain][0]}\n') for strain in idxstats]

def mapping_stats(infile):
    idxstats = (row.strip().split('\t') for row in open(infile))
    stats = {}
    for line in idxstats:
        number_of_reads = int(line[2]) + int(line[3])
        l1 = [number_of_reads, int(line[2]), int(line[3])]
        try:
            l2 = stats[line[0]]
            stats[line[0]] = [sum(x) for x in zip(*[l1,l2])]
        except KeyError:
            stats[line[0]] = l1
    return stats

def main():
    args = parse_cmdline_params()
    strains = parse_strains(args.strains)
    idxstats = mapping_stats(args.infile) # dictionary
    [idxstats[strain].append((idxstats[strain][1]/idxstats[strain][0])*100) if idxstats[strain][0] != 0 else idxstats[strain].append(0) for strain in idxstats]
    # Add name here?
    write_out(idxstats, strains, args.outfile)

if __name__ == "__main__":
    main()
```
```python
# Trim a fixed number of bases from each read, adjust the crop around detected barcode hits,
# and split reads that contain an internal (mid-read) barcode match.
import argparse
import gzip
from datetime import date
from Bio import SeqIO, bgzf
import re

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--infile', required=True)
    parser.add_argument('--crop', type=int, required=True)
    parser.add_argument('--logfile', required=True)
    parser.add_argument('--outfile', required=True)
    parser.add_argument('--barcodes', required=True)
    return parser.parse_args()

def get_reads(fq):
    return (seq for seq in SeqIO.parse(gzip.open(fq, 'rt'), 'fastq'))

def get_barcodes(fq):
    return [str(fasta.seq) for fasta in SeqIO.parse(open(fq), 'fasta')]

def make_log(crop):
    # change this to reflect changes to code
    with open(logfile, 'w') as o:
        o.write(f'{date.today()}\n\n')
        o.write(f'Base crop length is {crop} bases from head and tail of all reads\n')
        o.write('Some reads may have this value adjusted, this will be listed per read\n\n')

def make_fq():
    open(outfile, 'wb').close()

def write_seq_log(read, head_crop, tail_crop):
    with open(logfile, 'a') as o:
        o.write('sequence too short with crop lengths:\n')
        o.write(f'head crop len: {head_crop}, tail crop len: {-tail_crop}\n')
        o.write(f'>{read.id}\n')
        o.write(f'{str(read.seq)}\n\n')

def write_crop_log(read, head_crop, tail_crop):
    with open(logfile, 'a') as o:
        o.write(f'>{read.id}\n')
        o.write(f'head crop len: {head_crop}, tail crop len: {-tail_crop}\n')
        o.write(f'cropped head seq\n')
        o.write(f'{str(read.seq)[:head_crop]}\n')
        o.write(f'cropped tail seq\n')
        o.write(f'{str(read.seq)[tail_crop:]}\n\n')

def write_read(read, head_crop, tail_crop):
    with bgzf.BgzfWriter(outfile, 'ab') as o:
        SeqIO.write(read[head_crop:tail_crop], o, 'fastq')

def write_read_info():
    with open(logfile, 'a') as o:
        o.write('ReadID\tReadLength\tNumMatches\t(Head,Tail,Neither)\n')
        [o.write(f'{i[0]}\t{i[1]}\t{i[2]}\t{i[3]}\n') for i in read_info]

def long_enough(seq, head_crop, tail_crop):
    # - tail_crop because it is negative
    return len(seq) >= (head_crop - tail_crop + 50)

def in_head(barcode):
    return 0 <= barcode[0] <= 50

def in_tail(barcode, seq_len):
    return seq_len - 50 <= barcode[0] <= seq_len

def find_matches(barcode, seq):
    # removed match.group() from tuple because it is the sequence of the barcode
    return [(m.start(), m.end()) for m in re.finditer(rf'{barcode}', seq)]

def barcode_check(seq):
    return [b for bar in barcodes for b in find_matches(bar, seq) if bar in seq]

def trim_read(read, head_crop, tail_crop):
    write_read(read, head_crop, tail_crop)
    write_crop_log(read, head_crop, tail_crop)

def point_read(read, head_crop, tail_crop):
    if long_enough(str(read.seq), head_crop, tail_crop):
        trim_read(read, head_crop, tail_crop)
    else:
        write_seq_log(read, head_crop, tail_crop)

def split_read(read, barcode):
    read_a = read[:barcode[0]]
    read_b = read[barcode[0]:]
    read_a.id = f'{read_a.id}_A'
    read_b.id = f'{read_b.id}_B'
    # read_a.description = read.description + f' new_read_id={read_a.id}_A'
    # read_b.description = read.description + f' new_read_id={read_b.id}_B'
    process_read(read_a)
    process_read(read_b)

def classify_match(match, seq_len):
    if in_head(match):
        return 'head'
    if in_tail(match, seq_len):
        return 'tail'
    return 'neither'

def bin_barcodes(matches, seq_len):
    # if this doesn't work I can use the below code
    bins = {'head': [], 'tail': [], 'neither': []}
    [bins[classify_match(m, seq_len)].append(m) for m in matches]
    head = bins['head']
    tail = bins['tail']
    neither = bins['neither']
    head.sort()
    tail.sort()
    neither.sort()
    return (head, tail, neither)

def process_read(read):
    head_crop = crop_len
    tail_crop = -crop_len
    matches = barcode_check(str(read.seq))
    head, tail, neither = bin_barcodes(matches, len(str(read.seq)))
    read_info.append((
        read.id,
        len(str(read.seq)),
        len(matches),
        (len(head), len(tail), len(neither))
    ))
    if len(neither) > 0:
        barcode = neither[0]
        split_read(read, barcode)
    else:
        if len(head) > 0:
            head_crop = head[-1][1] + 13
        if len(tail) > 0:
            tail_crop = tail[0][0] - 13 - len(str(read.seq))
        point_read(read, head_crop, tail_crop)

def main():
    # declaring globals
    global barcodes
    global crop_len
    global logfile
    global outfile
    global read_info

    # parsing input arguments
    args = parse_args()

    # getting all reads as generator object
    reads = get_reads(args.infile)

    # setting values for global variables
    barcodes = get_barcodes(args.barcodes)
    crop_len = args.crop
    logfile = args.logfile
    outfile = args.outfile
    read_info = []

    # make the initial log and out file
    make_log(args.crop)
    make_fq()

    # begin processing
    [process_read(read) for read in reads]
    write_read_info()

if __name__ == '__main__':
    main()
```