Pharmacogenomics Pipeline using Hydra for WGS Data Analysis
:speech_balloon: Introduction
A pharmacogenomics pipeline for whole-genome sequencing (WGS) data, implemented in Hydra.
:heavy_exclamation_mark: Dependencies
To run this workflow, the following tools need to be available at minimum (inferred from the test commands below):

- Snakemake
- Singularity (the workflow is run with `--use-singularity`)
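If you are setting these up yourself, a conda-based environment is one option. This is a sketch, not the workflow's documented setup; the environment name is arbitrary, and Singularity/Apptainer is usually provided system-wide on clusters:

```bash
# Hypothetical setup: install Snakemake into a fresh conda environment.
conda create -n pgx -c conda-forge -c bioconda snakemake
conda activate pgx
```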
:school_satchel: Preparations
Sample data

- Add all sample ids to `samples.tsv` in the column `sample`.
- Add all sample data information to `units.tsv`. Each row represents a `fastq` file pair with corresponding forward and reverse reads. Also indicate the sample id, run id, lane number, and adapter (see the sketch below).
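For illustration, the two files might look like this. Apart from `sample`, the exact column names are assumptions based on the description above; check the repository's schemas for the real headers:

```text
# samples.tsv (hypothetical content)
sample
NA12878

# units.tsv (hypothetical column set; tab-separated)
sample   run   lane  adapter        fq1                  fq2
NA12878  run1  L001  AGATCGGAAGAGC  NA12878_R1.fastq.gz  NA12878_R2.fastq.gz
```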
Reference data

- A BAM file with duplicates marked
- A reference genome
- A dbSNP database in VCF format
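These paths are typically wired in through the workflow configuration. A hypothetical sketch follows; the key names are assumptions, not the workflow's actual schema:

```yaml
# config/config.yaml (hypothetical keys)
samples: samples.tsv
units: units.tsv
reference:
  fasta: /path/to/reference.fasta  # reference genome
  dbsnp: /path/to/dbsnp.vcf.gz     # dbSNP database in VCF format
```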
:white_check_mark: Testing
The workflow repository contains a small test dataset under `.tests/integration`, which can be run like so:

```bash
cd .tests/integration
snakemake -s ../../Snakefile -j1 --use-singularity

# alternative command:
snakemake -s ../../workflow/Snakefile -j1 --use-singularity --configfile config/config.yaml

# generate DAG:
snakemake --cores 1 -s workflow/Snakefile --configfile config/config.yaml --rulegraph | dot -Tsvg > ./images/dag.svg
```
:rocket: Usage
The workflow is designed for WGS data, meaning large datasets that require substantial compute power. For HPC clusters, it is recommended to use a cluster profile and run something like:

```bash
snakemake -s /path/to/Snakefile --profile my-awesome-profile
```
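A Snakemake profile is a directory containing a `config.yaml` whose keys map to command-line options. A minimal hypothetical SLURM-flavoured example (scheduler arguments differ per cluster, and the `cluster` key applies to Snakemake versions before the executor-plugin system):

```yaml
# my-awesome-profile/config.yaml (hypothetical)
jobs: 100
use-singularity: true
cluster: "sbatch --mem {resources.mem_mb} --cpus-per-task {threads}"
```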
Code Snippets
Two rules invoke the GATK DepthOfCoverage wrapper (its source is reproduced at the end of this section):

```python
wrapper:
    "v1.14.1/bio/gatk/depthofcoverage"
```
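For context, a rule using this wrapper might look roughly like the sketch below. The rule name, file paths, and wildcards are assumptions; the `bam`, `intervals`, and `fasta` input names come from the wrapper source shown further down:

```python
rule depth_of_coverage:
    input:
        bam="results/{sample}.dedup.bam",
        intervals="results/{sample}.target_regions.interval_list",
        fasta="reference/genome.fasta",
    output:
        # The wrapper derives the GATK output prefix from the common
        # prefix of all declared outputs.
        "results/{sample}.coverage.sample_summary",
    log:
        "logs/{sample}.depthofcoverage.log",
    wrapper:
        "v1.14.1/bio/gatk/depthofcoverage"
```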
The analysis steps delegate to Python scripts via `script:` directives; `get_target_variants.py` is invoked by two rules and `reform_genomic_region.py` by three. Full sources follow below.

```python
script:
    "../scripts/generate_pgx_report.py"
```

```python
script:
    "../scripts/get_clinical_guidelines.py"
```

```python
script:
    "../scripts/get_interaction_guidelines.py"
```

```python
script:
    "../scripts/get_target_variants.py"
```

```python
script:
    "../scripts/reform_genomic_region.py"
```
Variant annotation shells out to GATK VariantAnnotator:

```python
shell:
    "gatk VariantAnnotator"
    " --variant {input.vcf}"
    " --input {input.aln}"
    " --reference {input.ref}"
    " --dbsnp {params.dbsnp}"
    " {params.extra}"
    " --output {output.vcf}"
    " >& {log}"
```
Soft filtering of variants is likewise a `script:` rule:

```python
script:
    "../scripts/variant_filtration.py"
```
`generate_pgx_report.py`:

```python
import pandas as pd
import numpy as np


def risk_duplication(x):
    # Is the allele frequency closer to a duplication-like ratio
    # (1/4, 1/3, 2/3, 3/4) than to a diploid one (0, 1/2, 1)?
    min_dup_var = min(abs(np.array([1 / 4, 1 / 3, 2 / 3, 3 / 4]) - x))
    min_diploid = min(abs(np.array([0, 1 / 2, 1]) - x))
    if (min_dup_var > min_diploid):
        return False
    return True


def get_haplotypes(id_, haplotype_definitions):
    haplo = haplotype_definitions[haplotype_definitions['ID'] == id_]["HAPLOTYPE"]
    return ('/'.join(haplo))


def get_interaction_haplotypes(interaction_guidelines):
    haplotypes = interaction_guidelines['haplotypes'].to_list()
    haplotypes = haplotypes[0].split(',')
    haplo_nudt = ''
    haplo_tpmt = ''
    for haplo in haplotypes:
        if 'NUDT' in haplo:
            haplo_nudt = f"{haplo_nudt}{haplo}/"
        if 'TPMT' in haplo:
            haplo_tpmt = f"{haplo_tpmt}{haplo}/"
    new_haplo = f"{haplo_nudt},{haplo_tpmt}"
    new_haplo = new_haplo.replace('/,', ',')
    new_haplo = new_haplo[:len(new_haplo) - 1]
    interaction_guidelines['haplotypes'] = new_haplo
    return interaction_guidelines


def get_recommendations(found_variants, haplotype_definitions,
                        clinical_guidelines_p, interaction_guidelines_p,
                        report, analyzed_variants):
    # NOTE: report_template is read from module scope (set in the
    # __main__ block below); the analyzed_variants parameter is unused.
    detected_variants = pd.read_csv(found_variants, sep='\t')
    haplotype_definitions = pd.read_csv(haplotype_definitions, sep='\t')
    clinical_guidelines = pd.read_csv(clinical_guidelines_p, sep='\t')
    interaction_guidelines = pd.read_csv(interaction_guidelines_p, sep='\t')

    zygosities = ['Hetero-', 'Homo-']
    if detected_variants.empty:
        detected_variants_present = []
        faulty_haplotypes = []
    else:
        detected_variants['Zygosity'] = detected_variants['GT'].map(
            lambda row: zygosities[int(row.split('/')[0])])
        detected_variants['Position'] = detected_variants[
            '#CHROM'] + ':' + detected_variants['POS'].astype(str)
        detected_variants[['Ref.reads', 'Alt.reads'
                           ]] = detected_variants['AD'].str.split(',', expand=True)
        detected_variants['Alt.reads'] = detected_variants['Alt.reads'].astype(int)
        detected_variants['Ref.reads'] = detected_variants['Ref.reads'].astype(int)
        detected_variants['Haplotype'] = detected_variants['ID'].map(
            lambda x: get_haplotypes(x, haplotype_definitions))
        columns = detected_variants.columns[2:].drop('PL')
        detected_variants_present = detected_variants[columns]
        detected_variants_present['Variantfrekvens'] = detected_variants_present['AF']
        detected_variants_present["Möjlig Duplikation"] = detected_variants_present[
            'Variantfrekvens'].apply(lambda x: risk_duplication(x))
        faulty_haplotypes = pd.Series(
            np.where(~detected_variants_present['Möjlig Duplikation'], '',
                     detected_variants_present['Haplotype']))
        faulty_haplotypes = faulty_haplotypes.map(
            lambda row: row.split("/")).explode().unique()
        order_columns = [
            "GENE", "ID", "Haplotype", "Position", "Zygosity",
            "Variantfrekvens", "Möjlig Duplikation"
        ]
        verbose_columns = [
            "Gen", "rsID", "Möjliga Haplotyper", "Position", "Zygositet",
            "Variantfrekvens", "Möjlig Duplikation"
        ]
        detected_variants_present = detected_variants_present[order_columns]
        detected_variants_present.columns = verbose_columns

    clin_columns = ["gene", "Haplotype1", "Haplotype2", "Guideline", "Activity"]
    verbose_columns = [
        "Gen", "Haplotyp 1", "Haplotyp 2", "Klinisk Rekommendation", "Aktivitet"
    ]
    clinical_guidelines_present = clinical_guidelines[clin_columns]
    clinical_guidelines_present.columns = verbose_columns

    faulty_haplotype_recommendation = \
        'Ingen rekommendation ges pga obalans i heterozygositet'
    clinical_guidelines_present.loc[
        clinical_guidelines_present['Haplotyp 1'].isin(faulty_haplotypes),
        'Klinisk Rekommendation'] = faulty_haplotype_recommendation
    clinical_guidelines_present.loc[
        clinical_guidelines_present['Haplotyp 2'].isin(faulty_haplotypes),
        'Klinisk Rekommendation'] = faulty_haplotype_recommendation

    interaction_guidelines = get_interaction_haplotypes(interaction_guidelines)
    for haplotype in faulty_haplotypes:
        if haplotype == "":
            continue
        interaction_guidelines.loc[
            interaction_guidelines['haplotypes'].str.contains(haplotype),
            'Guideline'] = faulty_haplotype_recommendation

    # Strip HTML tags from the guideline texts.
    clinical_guidelines_present['Klinisk Rekommendation'] = \
        clinical_guidelines_present['Klinisk Rekommendation'].str.replace(
            r'<[^>]+>', '', regex=True)
    interaction_guidelines['Guideline'] = interaction_guidelines[
        'Guideline'].str.replace(r'<[^>]+>', '', regex=True)

    dpyd_genotype = clinical_guidelines_present.loc[
        clinical_guidelines_present['Gen'] == 'DPYD']['Haplotyp 1'].values[0] \
        + '/' + clinical_guidelines_present.loc[
            clinical_guidelines_present['Gen'] == 'DPYD']['Haplotyp 2'].values[0]
    dpyd_recommendation = clinical_guidelines_present.loc[
        clinical_guidelines_present['Gen'] == 'DPYD'][
            'Klinisk Rekommendation'].values[0]
    tpmt_genotype = interaction_guidelines['haplotypes'].values[0]
    tpmt_recommendation = interaction_guidelines['Guideline'].values[0]

    with open(report_template, 'r') as reader:
        tmp = reader.read()
    tmp = tmp.replace('TPMT_genotype', tpmt_genotype)
    tmp = tmp.replace('DPYD_recommendation', dpyd_recommendation)
    tmp = tmp.replace('TPMT_recommendation', tpmt_recommendation)
    tmp = tmp.replace('DPYD_genotype', dpyd_genotype)

    with open(report, 'w') as writer:
        writer.write(tmp)


if __name__ == "__main__":
    found_variants = snakemake.input.found_variants
    missed_variants = snakemake.input.missed_variants
    clinical_guidelines = snakemake.input.clinical_guidelines
    interactions = snakemake.input.interactions
    haplotype_definitions = snakemake.params.haplotype_definitions
    report_template = snakemake.params.report_template
    report = snakemake.output.report

    get_recommendations(found_variants, haplotype_definitions,
                        clinical_guidelines, interactions, report,
                        report_template)
```
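The `risk_duplication` check flags allele frequencies that sit closer to duplication-like ratios (1/4, 1/3, 2/3, 3/4) than to diploid ones (0, 1/2, 1). A quick standalone sanity check of that logic:

```python
# Standalone check of the risk_duplication logic above.
import numpy as np

def risk_duplication(x):
    min_dup_var = min(abs(np.array([1 / 4, 1 / 3, 2 / 3, 3 / 4]) - x))
    min_diploid = min(abs(np.array([0, 1 / 2, 1]) - x))
    return min_dup_var <= min_diploid

print(risk_duplication(0.30))  # True: 0.30 is nearest to 1/3 -> possible duplication
print(risk_duplication(0.50))  # False: 0.50 is exactly the heterozygous ratio
print(risk_duplication(0.95))  # False: near 1, consistent with a homozygous call
```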
`get_clinical_guidelines.py`:

```python
import pandas as pd
import re
import numpy as np


class ArrangeHaplotype:
    """
    Get possible haplotypes from variant combinations detected in file.
    Add clinical guidelines based on the haplotypes detected
    """

    def __init__(self, detected_variants, haplotype_definitions,
                 activity_scores, clinical_guidelines):
        self.detected_variants = pd.read_csv(detected_variants, sep="\t")
        self.haplotype_definitions = pd.read_csv(haplotype_definitions, sep="\t")
        self.activity_scores = pd.read_csv(activity_scores, sep="\t")
        self.clinical_guidelines = pd.read_csv(clinical_guidelines, sep="\t")
        if not self.detected_variants.empty:
            self.merge_data()

    def merge_data(self):
        """
        Join possible haplotypes containing ids
        :return:
        """
        self.detected_variants["multival_haplotype"] = \
            self.detected_variants.ID.apply(
                lambda x: list(self.haplotype_definitions["HAPLOTYPE"][
                    self.haplotype_definitions.ID == x]))
        self.detected_variants["CN"] = self.detected_variants["GT"].apply(
            lambda x: sum(map(int, re.split('[/|]+', x))))

    def get_haplotypes(self):
        """
        Return a tree structure of possible haplotype combinations explaining
        the seen variants.
        Assumptions: the haplotype explaining the most variants goes first;
        if any variant's count in a haplotype reaches zero, all haplotypes
        containing that variant are removed from further choices.
        :return: Gene - haplotype-tree dict
        """

        def remove_hap(x, y):
            return x.remove(y) if y in x else x

        def _get_haplotypes(variant_subdf, current_haplotype, depth=2):
            idx = variant_subdf["multival_haplotype"].apply(
                lambda x: current_haplotype in x)
            variant_subdf.loc[idx, "CN"] -= 1
            if any(variant_subdf["CN"] == 0):
                variant_subdf["multival_haplotype"] = variant_subdf[
                    "multival_haplotype"].apply(
                        lambda x: remove_hap(x, current_haplotype))
                variant_subdf = variant_subdf[variant_subdf["CN"] != 0]
            if depth == 1:
                if len(variant_subdf) == 0:
                    return [current_haplotype, True]
                else:
                    return [current_haplotype, False]
            if len(variant_subdf) == 0 or not any(
                    variant_subdf["multival_haplotype"].apply(
                        lambda x: bool(x))):
                wt_haplotype = "WT"
                return [
                    current_haplotype,
                    [_get_haplotypes(variant_subdf.copy(), wt_haplotype,
                                     depth - 1)]
                ]
            remaining_haplo = set([
                element for haplolist in variant_subdf["multival_haplotype"]
                for element in haplolist
            ])
            return [
                current_haplotype,
                [
                    _get_haplotypes(variant_subdf.copy(), hap, depth - 1)
                    for hap in remaining_haplo
                ]
            ]

        genes = set(self.detected_variants["GENE"])
        full_mat = {}
        for gene in genes:
            gene_subset = self.detected_variants[
                self.detected_variants.GENE == gene]
            candidate_haplotypes = np.array(
                list(
                    set([
                        element
                        for haplolist in gene_subset["multival_haplotype"]
                        for element in haplolist
                    ])))
            # Try haplotypes that explain the most variants first.
            order = list(
                reversed(
                    np.argsort([
                        sum(gene_subset["multival_haplotype"].apply(
                            lambda y: x in y)) for x in candidate_haplotypes
                    ])))
            candidate_haplotypes = candidate_haplotypes[order]
            gene_haps = []
            for current_haplotype in candidate_haplotypes:
                gene_haps.append(
                    _get_haplotypes(gene_subset.copy(), current_haplotype))
                idx = gene_subset["multival_haplotype"].apply(
                    lambda x: current_haplotype in x)
                cn = gene_subset.loc[idx, "CN"]
                if any([(c - 1) == 0 for c in cn]):
                    gene_subset["multival_haplotype"] = gene_subset[
                        "multival_haplotype"].apply(
                            lambda x: remove_hap(x, current_haplotype))
            full_mat.update({gene: gene_haps})
        return full_mat

    def get_haplotype_dataframe(self):
        # Wow what a mess
        hap_mat = self.get_haplotypes()

        def _haplot_to_row(hap, gene):
            def prim_haplot_to_row(hap, gene):
                current_hap = (f"{gene}-1" if hap[0] == "WT" else hap[0])
                if not type(hap[1]) is list:
                    return [current_hap, gene, hap[1]]
                else:
                    next_hap = [
                        prim_haplot_to_row(c_hap, gene) for c_hap in hap[1]
                    ]
                    return [[current_hap] + c_hap for c_hap in next_hap][0]

            return [prim_haplot_to_row(c_hap, gene) for c_hap in hap]

        out = []
        for gene, hap in hap_mat.items():
            if len(hap) != 0:
                out += _haplot_to_row(hap, gene)

        hap_df = pd.DataFrame(
            out, columns=["Haplotype1", "Haplotype2", "gene", "pass"])
        hap_df = hap_df[hap_df["pass"]]
        return hap_df[["gene", "Haplotype1", "Haplotype2"]]

    def get_clinical_guidelines_table(self):
        if self.detected_variants.empty:
            columns = [
                "gene", "Haplotype1", "Haplotype2", "HAPLOTYPE1",
                "ACTIVITY_SCORE1", "HAPLOTYPE2", "ACTIVITY_SCORE2",
                "Genotype_activity", "Gene", "Activity", "Guideline"
            ]
            return pd.DataFrame(columns=columns)
        hap_df = self.get_haplotype_dataframe()
        hap_df = hap_df.merge(self.activity_scores,
                              how="left",
                              left_on="Haplotype1",
                              right_on="HAPLOTYPE")
        hap_df = hap_df.merge(self.activity_scores,
                              how="left",
                              left_on="Haplotype2",
                              right_on="HAPLOTYPE",
                              suffixes=("1", "2"))
        hap_df["Genotype_activity"] = hap_df["ACTIVITY_SCORE1"] + hap_df[
            "ACTIVITY_SCORE2"]
        hap_df = hap_df.merge(self.clinical_guidelines,
                              how="left",
                              left_on=["gene", "Genotype_activity"],
                              right_on=["Gene", "Activity"])
        return hap_df

    def get_wildtypes(self, hap_df):
        hap_genes = list(hap_df.gene.values)
        for gene in set(self.clinical_guidelines.Gene):
            if hap_df.empty or gene not in hap_genes:
                gene_df = pd.DataFrame({
                    "gene": [gene],
                    "Haplotype1": [gene + "-1"],
                    "Haplotype2": [gene + "-1"],
                    "HAPLOTYPE1": [gene + "-1"],
                    "ACTIVITY_SCORE1": [1],
                    "HAPLOTYPE2": [gene + "-1"],
                    "ACTIVITY_SCORE2": [1],
                    "Genotype_activity": [2.0]
                })
                gene_df = gene_df.merge(self.clinical_guidelines,
                                        how="left",
                                        left_on=["gene", "Genotype_activity"],
                                        right_on=["Gene", "Activity"])
                hap_df = pd.concat([hap_df, gene_df], ignore_index=True)
        return hap_df


def main():
    haplotype_definitions = snakemake.params["haplotype_definitions"]
    haplotype_activity = snakemake.params["haplotype_activity"]
    hidden_haplotypes = snakemake.params["hidden_haplotypes"]
    clinical_guidelines = snakemake.params["clinical_guidelines"]
    variant_csv = snakemake.input["found_variants"]
    output = snakemake.output["csv"]

    ah = ArrangeHaplotype(variant_csv, haplotype_definitions,
                          haplotype_activity, clinical_guidelines)
    df = ah.get_clinical_guidelines_table()
    df = ah.get_wildtypes(df)

    columns = [
        "gene", "Haplotype1", "Haplotype2", "HAPLOTYPE1", "ACTIVITY_SCORE1",
        "HAPLOTYPE2", "ACTIVITY_SCORE2", "Genotype_activity", "Gene",
        "Activity", "Guideline"
    ]
    if not df.empty:
        hidden_haplotypes = pd.read_csv(hidden_haplotypes, sep="\t")
        hidden_haplotypes["comb"] = hidden_haplotypes[[
            "Haplotype1", "Haplotype2"
        ]].apply(lambda x: "".join(sorted(x.tolist())), axis=1)
        df["comb"] = df[["Haplotype1", "Haplotype2"
                         ]].apply(lambda x: "".join(sorted(x.tolist())), axis=1)
        df = df[~df["comb"].isin(hidden_haplotypes["comb"])]
    df.to_csv(output, sep="\t", index=False, columns=columns)


if __name__ == '__main__':
    main()
```
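`get_haplotypes` recurses to depth 2 because a diploid sample carries at most two haplotypes per gene; each leaf boolean records whether that haplotype pair explains every detected variant. A hypothetical illustration of the returned shape (gene and allele names are made up):

```python
# Hypothetical shape of the structure returned by get_haplotypes():
# per gene, candidate first haplotypes, each paired with the second
# haplotypes tried after it; the leaf boolean = "all variants explained".
full_mat = {
    "TPMT": [
        ["TPMT-3A", [["TPMT-3B", False],   # *3A/*3B leaves variants unexplained
                     ["WT", True]]],       # *3A plus wildtype explains them all
    ]
}
```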
`get_interaction_guidelines.py`:

```python
import pandas as pd
import numpy as np
from itertools import product


class GetInteractions:
    def __init__(self, variants_file, guideline_file):
        self.variants_df = pd.read_csv(variants_file, sep="\t")
        self.variants_df = self.variants_df.sort_values(by=["gene"])
        self.interaction_guidelines_df = pd.read_csv(guideline_file, sep="\t")

    def get_possible_interactions(self):
        # TODO: refactor inefficient and ugly
        interacting_genes = set(
            self.interaction_guidelines_df[["gene1", "gene2"]].values.flat)
        if self.variants_df.empty:
            return self.interaction_guidelines_df[np.zeros(
                len(self.interaction_guidelines_df), dtype=bool)]
        self.variants_df = self.variants_df[self.variants_df.gene.isin(
            interacting_genes)]
        if self.variants_df.empty:
            return self.interaction_guidelines_df[np.zeros(
                len(self.interaction_guidelines_df), dtype=bool)]
        index_combinations = product(range(len(self.variants_df)),
                                     range(len(self.variants_df)))
        prev_idx = np.zeros(len(self.interaction_guidelines_df), dtype=bool)
        haplotypes = [""] * len(self.interaction_guidelines_df)
        for i, j in index_combinations:
            if not i == j:
                gene1 = self.variants_df.iloc[[i]].gene.values[0]
                gene2 = self.variants_df.iloc[[j]].gene.values[0]
                activity1 = int(self.variants_df.iloc[[i]].Genotype_activity)
                activity2 = int(self.variants_df.iloc[[j]].Genotype_activity)
                all_haplotypes = ",".join([
                    self.variants_df.iloc[[i]]["Haplotype1"].values.flat[0],
                    self.variants_df.iloc[[i]]["Haplotype2"].values.flat[0],
                    self.variants_df.iloc[[j]]["Haplotype1"].values.flat[0],
                    self.variants_df.iloc[[j]]["Haplotype2"].values.flat[0]
                ])
                idx = (self.interaction_guidelines_df.gene1 == gene1) & \
                    (self.interaction_guidelines_df.gene2 == gene2) & \
                    (self.interaction_guidelines_df.activity_1 == activity1) & \
                    (self.interaction_guidelines_df.activity_2 == activity2)
                if any(idx):
                    prev_idx += idx
                    # Record the haplotype combination at every matching
                    # guideline row.
                    for pos, hit in enumerate(idx):
                        if hit:
                            haplotypes[pos] = all_haplotypes
        haplotypes = [h for h in haplotypes if h != ""]
        interactions = self.interaction_guidelines_df[prev_idx]
        interactions["haplotypes"] = haplotypes
        return interactions

    def run_and_write(self, output):
        interactions = self.get_possible_interactions()
        interactions.to_csv(output, index=None, sep="\t")


def main():
    interaction_guidelines = snakemake.params["interacting_targets"]
    diploids = snakemake.input["diploids"]
    output = snakemake.output["csv"]

    get_interactions = GetInteractions(diploids, interaction_guidelines)
    get_interactions.run_and_write(output)


if __name__ == '__main__':
    main()
```
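The interaction lookup keys on exact (gene1, gene2, activity) pairs. A toy demonstration of the matching logic with in-memory tables, assuming the `GetInteractions` class above is in scope (column names follow the code; all values are made up):

```python
# Toy run of GetInteractions using in-memory TSV buffers.
import io

diploids = io.StringIO(
    "gene\tHaplotype1\tHaplotype2\tGenotype_activity\n"
    "TPMT\tTPMT-1\tTPMT-3A\t1\n"
    "NUDT15\tNUDT15-1\tNUDT15-1\t2\n"
)
guidelines = io.StringIO(
    "gene1\tgene2\tactivity_1\tactivity_2\tGuideline\n"
    "TPMT\tNUDT15\t1\t2\tHypothetical dosing guideline\n"
)
gi = GetInteractions(diploids, guidelines)
# The TPMT/NUDT15 row matches (activities 1 and 2), so it is returned
# with a "haplotypes" column of "TPMT-1,TPMT-3A,NUDT15-1,NUDT15-1".
print(gi.get_possible_interactions())
```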
`get_target_variants.py`:

```python
import gzip
import os
import re

import pandas as pd


class GDF:
    """
    We are assuming single sample GDF
    """

    def __init__(self, filename):
        self.data = pd.read_csv(filename, sep="\t")
        self.data_cols = self.data.columns.values
        pos_df = pd.DataFrame(
            self.data.Locus.apply(lambda x: x.split(":")).to_list(),
            columns=["CHROM", "POS"])
        pos_df.POS = pos_df.POS.astype('int64')
        self.data = pd.concat([self.data, pos_df], axis=1)

    def rsid_per_position(self, target_bed):
        def _annotate(x, targets):
            try:
                ids = targets[(targets.CHROM == x.CHROM)
                              & (targets.START <= x.POS)
                              & (x.POS <= targets.END)].ID.to_list()
                return ", ".join(ids)
            except IndexError:
                return "-"

        targets = pd.read_csv(target_bed,
                              sep="\t",
                              names=["CHROM", "START", "END", "ID", "GENE"])
        # Swap START/END where the interval is reversed.
        targets["save"] = targets.START
        idx_swap = targets.START > targets.END
        targets.loc[idx_swap, "START"] = targets.loc[idx_swap, "END"]
        targets.loc[idx_swap, "END"] = targets.loc[idx_swap, "save"]
        targets["CHROM"] = targets.CHROM.apply(lambda x: f"chr{x}")
        self.data["ID"] = self.data.apply(lambda x: _annotate(x, targets),
                                          axis=1)

    def write_processed_gdf(self, filename, annotate=True):
        if annotate:
            self.data.to_csv(filename, sep="\t", index=False)
        else:
            self.data.to_csv(filename,
                             sep="\t",
                             columns=self.data_cols,
                             index=False)


class VCF:
    """
    We are assuming single sample VCF
    """

    def __init__(self, filename):
        self.meta = []
        self.data = pd.DataFrame()
        self.original_header = []
        self.read_vcf(filename)

    def read_vcf(self, filename):
        if ".gz" in filename:
            f = gzip.open(filename, "rt")
        else:
            f = open(filename, "r")
        lines = f.readlines()
        lines = [lr.strip() for lr in lines]

        i = None
        for i, line in enumerate(lines):
            if re.search("^#CHROM", line):
                break
        if i is None:
            raise ImportError("No lines in: " + filename)

        self.meta = lines[:i]  # all header lines preceding #CHROM
        data = [lr.split("\t") for lr in lines[i:]]
        self.data = pd.DataFrame(data[1:], columns=data[0])
        self.original_header = self.data.columns.values

        if not self.data.empty:
            # Split the single sample column on the FORMAT keys.
            sample_column = self.data.columns.values[-1]
            # max_len_idx = self.data.FORMAT.str.len().idxmax
            # format_columns = self.data.FORMAT[max_len_idx].split(":")
            format_columns = self.data.FORMAT[0].split(":")
            format_split = pd.DataFrame(self.data[sample_column].apply(
                lambda x: x.split(":")).to_list(),
                                        columns=format_columns)
            self.data = pd.concat([self.data, format_split], axis=1)

    def filter_snp(self, filter_file, exclude=True, column="SNP"):
        filter_snps = pd.read_csv(filter_file, sep="\t")[column]
        if exclude:
            self.data = self.data[~self.data.ID.isin(filter_snps)]
        else:
            self.data = self.data[self.data.ID.isin(filter_snps)]

    def write_vcf(self, filename_out):
        with open(filename_out, "w+") as f:
            for line in self.meta:
                f.write(line + "\n")
            self.data.to_csv(f,
                             mode="a",
                             sep="\t",
                             columns=self.original_header,
                             index=False)


class VariantQCCollection:
    def __init__(self, target_bed, vcf_filename):
        self.bed_targets = pd.read_csv(target_bed, names=None, sep="\t")
        self.bed_targets.columns = ["#CHROM", "START", "END", "ID", "GENE"]
        self.sample = os.path.basename(vcf_filename)
        self.vcf = VCF(vcf_filename)
        self.detected_variants = []

    def detect_variants(self):
        """
        Select all variants within VCF with ID same as in target_bed
        """
        self.detected_variants = self.vcf.data.ID[
            (self.vcf.data.ID.isin(self.bed_targets.ID))
            & (self.vcf.data.FILTER == "PASS")].tolist()

    def write_detected_variant_qc(self, output_file):
        """
        Write detected variants to csv
        """
        if not self.detected_variants:
            self.detect_variants()
        current_variants = self.vcf.data[self.vcf.data.ID.isin(
            self.detected_variants)]
        current_variants = current_variants.merge(
            self.bed_targets[["ID", "GENE"]], on="ID")
        current_variants.to_csv(output_file, index=False, sep="\t")


def main():
    file_type = snakemake.params["file_type"]
    target_bed = snakemake.params["target_bed"]
    input_f = snakemake.input["input_f"]
    output_f = snakemake.output["output_f"]

    if file_type == "VCF":
        var_collect = VariantQCCollection(target_bed, input_f)
        var_collect.write_detected_variant_qc(output_f)
    elif file_type == "GDF":
        c_gdf = GDF(input_f)
        c_gdf.rsid_per_position(target_bed)
        c_gdf.write_processed_gdf(output_f)
    else:
        raise Exception("File type must be either VCF or GDF")


if __name__ == '__main__':
    main()
```
`reform_genomic_region.py`:

```python
import pandas as pd


def reform(target_bed, output_f, detected_variants, padding, file_format):
    targets = pd.read_csv(target_bed,
                          sep="\t",
                          names=["CHROM", "START", "END", "ID", "GENE"],
                          dtype={
                              "START": int,
                              "END": int
                          })
    if detected_variants is not None:
        detected_rsid = pd.read_csv(detected_variants, sep="\t").ID
        targets = targets[~targets.ID.isin(detected_rsid)]

    bed = file_format == "bed"
    with open(output_f, "w+") as f:
        for i, row in targets.iterrows():
            chrom, start, end, id = row[0:4]
            if start > end:
                end, start = start, end
            start -= padding
            end += padding
            if not str(chrom).startswith('chr'):
                chrom = f"chr{chrom}"
            if bed:
                # f.write(f"chr{chrom}\t{start}\t{end}\t{id}\n")
                f.write(f"{chrom}\t{start}\t{end}\t{id}\n")
            else:
                # f.write(f"chr{chrom}:{start}-{end}\n")
                f.write(f"{chrom}:{start}-{end}\n")


def main():
    target_bed = snakemake.input["target_bed"]
    output_file = snakemake.output["interval"]
    detected_variants = snakemake.input.get("detected_variants", None)
    padding = snakemake.params["padding"]
    file_format = snakemake.params["file_format"]

    reform(target_bed, output_file, detected_variants, int(padding),
           file_format)


if __name__ == '__main__':
    main()
```
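Given a target row and padding, `reform` emits either a padded BED line or a `chr:start-end` region string. For example, a row `1 1000 1010 rs0001 GENE1` with `padding=100` becomes:

```text
# file_format == "bed":
chr1	900	1110	rs0001
# otherwise (interval format):
chr1:900-1110
```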
`variant_filtration.py`:

```python
from pysam import VariantFile


def filter_variants(vcf, read_ratio, depth, output):
    """
    Soft filter all variants with suspicious read ratio and
    insufficient read-depth
    """
    vcf_in = VariantFile(vcf)
    new_header = vcf_in.header
    new_header.filters.add(f"AR{read_ratio}", None, None,
                           f"Ratio of ref/alt reads lower than {read_ratio}")
    new_header.filters.add(f"DP{depth}", None, None,
                           f"DP is lower than {depth}x")
    vcf_out = VariantFile(output, "w", header=new_header)

    for record in vcf_in.fetch():
        ad = record.samples[0]["AD"]
        # No multiallelic split
        if record.info["DP"] < depth:
            # Tag low-depth records with the DP filter declared in the header.
            record.filter.add(f"DP{depth}")
        elif len(ad) == 2:
            n_ref, n_alt = ad
            if n_alt / (n_ref + n_alt) < read_ratio:
                record.filter.add(f"AR{read_ratio}")
            else:
                record.filter.add("PASS")
        vcf_out.write(record)


def main():
    filter_variants(snakemake.input["vcf"], snakemake.params.read_ratio,
                    snakemake.params.read_depth,
                    snakemake.output["filtered_vcf"])


if __name__ == '__main__':
    main()
```
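With, say, `read_ratio=0.2` and `depth=100`, a record with `DP=80` is tagged `DP100`, and a biallelic record with `AD=(95, 5)` is tagged `AR0.2` since 5/100 < 0.2; everything else passes. The alt-fraction arithmetic in isolation:

```python
# The alt-fraction test used above, in isolation.
read_ratio = 0.2
n_ref, n_alt = 95, 5
flagged = n_alt / (n_ref + n_alt) < read_ratio
print(flagged)  # True -> record would receive the AR0.2 soft filter
```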
The `v1.14.1/bio/gatk/depthofcoverage` wrapper source:

```python
__author__ = "Lauri Mesilaakso"
__copyright__ = "Copyright 2022, Lauri Mesilaakso"
__email__ = "lauri.mesilaakso@gmail.com"
__license__ = "MIT"

import tempfile
from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts
from os import path

java_opts = get_java_opts(snakemake)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Extract basename from the output file names
out_basename = path.commonprefix(snakemake.output).rstrip(".")

with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "gatk --java-options '{java_opts}' DepthOfCoverage"
        " --input {snakemake.input.bam}"
        " --intervals {snakemake.input.intervals}"
        " --reference {snakemake.input.fasta}"
        " --output {out_basename}"
        " --tmp-dir {tmpdir}"
        " {extra}"
        " {log}"
    )
```
Support
- Future updates