cfMeDIP-seq Circulating Methylome Data Post-processing Pipeline
Help improve this workflow!
This workflow has been published but could be further improved with some additional meta data:- Keyword(s) in categories input, output, operation
You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .
cfmedipseq_pipeline
Post-processing pipeline for next-generation circulating methylome data generated by cfMeDIP-seq
Dependencies
The only up-front dependency is Anaconda.
Key Anaconda package dependencies:
-
pyyaml
-
mamba (conda-forge)
-
snakemake (bioconda)
-
picard (bioconda)
-
samtools (bioconda)
-
bwa (bioconda)
-
R packages: dplyr, data.table
CountsReg R package not handled by Conda and will have to be hacked into the conda environment through R install.packages()
Snakemake profiles
For a guide on how to create a Snakemake profile for your cluster setup, see https://www.sichong.site/2020/02/25/snakemake-and-slurm-how-to-manage-workflow-with-resource-constraint-on-hpc/
Create a slurm_log directory when running snakemake to place log files based on the out location of the config.yaml, example config.yaml is provided as slurm/config.yaml
place the config.yaml file withing /cluster/home/username/.config/snakemake/slurm
Running
Prior to running will require setting of environments on SLURM with
bash set_environment.sh
To run the pipeline on SLURM, submit the launch.sh file with
sbatch launch.sh
.
Code Snippets
177 178 | shell: 'touch {output}' |
184 185 | shell: 'touch {output}' |
191 192 | shell: 'touch {output}' |
198 199 | shell: 'touch {output}' |
205 206 | shell: 'touch {output}' |
212 213 | shell: 'touch {output}' |
225 226 | shell: 'gunzip -c {input} > {output}' |
239 240 | shell: 'fastqc --outdir {params.outdir} {input}' |
264 265 266 267 268 269 270 271 272 273 274 275 | run: if params.bpattern is None: if params.blist is None: shell("cp {input.R1} > {output.R1}") shell("cp {input.R2} > {output.R2}") shell("touch {output.stats}") else: shell("python {params.extract_barcodes} --read1 {input.R1} --read2 {input.R2} --outfile {params.outprefix} --blist {params.blist}") shell("cp {params.barcode_stats_tmp_path} {params.barcode_stats_out_path}") else: shell("python {params.extract_barcodes} --read1 {input.R1} --read2 {input.R2} --outfile {params.outprefix} --bpattern {params.bpattern} --blist {params.blist}") shell("cp {params.barcode_stats_tmp_path} {params.barcode_stats_out_path}") |
294 295 | shell: 'trim_galore --cores 4 --dont_gzip --paired {params.trimgalore_settings} --output_dir {params.outdir} {input.R1} {input.R2} && cp ' + path_to_data + '/{wildcards.cohort}/tmp/trim_fastq/{wildcards.sample}_lib{wildcards.lib}_extract_barcode_R*.fastq_trimming_report.txt ' + path_to_data + '/{wildcards.cohort}/results/qc/trim_report/' |
309 310 | shell: 'trim_galore --cores 4 --dont_gzip {params.trimgalore_settings} --output_dir {params.outdir} {input.R1} && cp ' + path_to_data + '/{wildcards.cohort}/tmp/trim_fastq/{wildcards.sample}_lib{wildcards.lib}_R1.fastq_trimming_report.txt ' + path_to_data + '/{wildcards.cohort}/results/qc/trim_report/' |
348 349 | shell: "bwa mem -M -t4 -R'{params.read_group}' {params.bwa_index} {input.R1} {input.R2} | samtools view -bSo {output}" |
365 366 | shell: "bwa mem -M -t4 -R'{params.read_group}' {params.bwa_index} {input.R1} | samtools view -bSo {output}" |
419 420 | shell: 'samtools merge {output.bam} {input} && samtools index {output}' |
433 434 | shell: 'samtools merge {output.bam} {input} && samtools index {output}' |
453 454 | shell: "samtools markdup -r {input} {output.bam} && samtools index {output.bam}" |
471 472 | shell: 'fastqc --outdir {params.outdir} {input}' |
486 487 | shell: "samtools flagstat {input.aligned} -O tsv > {output.aligned} && samtools flagstat {input.deduped} -O tsv > {output.deduped}" |
503 504 | shell: "samtools flagstat {input} -O tsv > {output}" |
543 544 | shell: 'Rscript src/R/get_read_count_info.R -i {params.in_path} -o {params.out_path}' |
596 597 | shell: 'Rscript src/R/row_bind_tables.R -p "{params.paths}" -o {output} --in-tsv --out-feather --omit-paths' |
612 613 | shell: 'Rscript src/R/cfmedip_nbglm.R -i {input} -o {output.fit} --modelout {output.model}' |
623 624 | shell: 'Rscript src/R/fit_cpg_bins.R -i {input} -o {output} --method {wildcards.method}' |
640 641 | shell: 'Rscript src/R/run_MEDIPS.R -b {input} -o {output.medips_count} -q {output.medips_qc} -p {params.paired_val}' |
658 659 | shell: 'Rscript src/R/consolidate_cohort_samples.R -i {params.in_path} -o {params.out_path} -d {params.data}' |
675 676 | shell: 'Rscript src/R/run_medestrand.R -b {input} -o {output} -p {params.paired_val} -m {params.medestrand}' |
692 693 | shell: 'Rscript src/R/consolidate_cohort_samples.R -i {params.in_path} -o {params.out_path} -d {params.data}' |
707 708 | shell: 'Rscript src/R/run_QSEA.R -s {wildcards.sample} -c {wildcards.chrom} -b {input} -o {params.out} --count {output.qsea_count} --beta {output.qsea_beta} --qc {output.qsea_qc}' |
716 717 718 719 720 721 722 | run: for i, input_file in enumerate(input): input_data = pd.read_csv(input_file, delimiter='\t', comment='#') if i == 0: input_data.to_csv(output[0], header=True, sep='\t', index=False) else: input_data.to_csv(output[0], header=False, sep='\t', index=False, mode='a') |
730 731 732 733 734 735 736 | run: for i, input_file in enumerate(input): input_data = pd.read_csv(input_file, delimiter='\t', comment='#') if i == 0: input_data.to_csv(output[0], header=True, sep='\t', index=False) else: input_data.to_csv(output[0], header=False, sep='\t', index=False, mode='a') |
744 745 746 747 748 749 750 | run: for i, input_file in enumerate(input): input_data = pd.read_csv(input_file, delimiter='\t', comment='#') if i == 0: input_data.to_csv(output[0], header=True, sep='\t', index=False) else: input_data.to_csv(output[0], header=False, sep='\t', index=False, mode='a') |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 | ' cfmedip_nbglm.R Fit bins from cfMeDIP-seq data using negative binomial regression. Usage: cfmedip_nbglm.R -i INPUT -o OUTPUT [ --method METHOD --modelout MODELOUT ] Options: -i --input INPUT Path to bin data output by bin_stats.R -o --output OUTPUT Output path for the table of --method METHOD Default is MCMC. Can specify LBFGS, BFGS, Newton, or VB. --modelout MODELOUT Path into which to dump a .Rds file containing the final model specifications and the coefficients used in each iteration. ' -> doc if (! interactive()) { library(docopt) args <- docopt(doc, version='cfMeDIP-seq negative binomial GLM v1.0') print(args) } else { message('Running in interactive mode. Be sure to specify args manually.') } library(tidyverse) library(MASS) library(flexmix) library(countreg) library(Rcpp) MAX_ITER = 20 PERCENT_CHANGE_THRESHOLD = 0.1 message('- Importing data.') bins <- read_tsv(args[['input']], comment = '#', col_types='ciiddddddi') %>% mutate(coverage_int = mean_coverage %>% round %>% as.integer) %>% filter(mean_coverage > 0 | gc_content > 0 | cpg_count > 0) %>% mutate( cpg_bin = factor(round(round(cpg_count * 20 / max(cpg_count)) * max(cpg_count) / 20)), gc_bin = factor(round(gc_content * 4, 1) / 4) ) ## ## Determining the profile of non-specific binding at zero-CPG regions ## ## message('- Determining the profile of bins with no CpGs') zero_profile_gc_model_output <- bins %>% filter(cpg_count == 0) %>% mutate( gc_bin = factor(round(gc_content * 4, 1) / 4) ) %>% group_by(gc_bin) %>% filter(n() > 50) %>% ungroup() %>% plyr::ddply(c('gc_bin'), function(z) { print(sprintf('Running for gc_bin = %s', z$gc_bin %>% unique)) if (all(z$coverage_int == 0)) { return(tibble( log_mu = -Inf, theta = Inf, log_likelihood = NA, count = nrow(z) )) } else { tryCatch({ zero_model <- flexmix(coverage_int ~ 1, data = z, k = 1, model = FLXMRnegbin()) return(tibble( log_mu = parameters(zero_model, component=1)[[1]], theta = parameters(zero_model, component=1)[[2]], log_likelihood = logLik(zero_model) %>% as.numeric, count = nrow(z) )) }, error = function(e) { message('Error: skipping this GC bin') NULL }) } }) %>% mutate(mu = exp(log_mu)) ## ## The zero profile to logistic functions, depending on gc_content. ## zero_mu_fit <- lm( log(mu) ~ log(gc_content), data = zero_profile_gc_model_output %>% mutate(gc_content = gc_bin %>% as.character %>% as.numeric) %>% filter(mu > 0, gc_content > 0) ) zero_theta_fit <- lm( log(theta) ~ log(gc_content), data = zero_profile_gc_model_output %>% mutate(gc_content = gc_bin %>% as.character %>% as.numeric) %>% filter(!is.infinite(theta), theta < 100, gc_content > 0) ) ## ## Begin fitting of CpG by making an initial estimate based on ## mean_coverage = exp(b_0 + b_1 * cpg_count + b_2 * gc_content) ## setting b_1 to 0.5, b_2 to 0, and allowing theta to vary as 0.5 * cpg_count. ## message('- Fitting CpG profile iteratively.\n') bin_methylation <- bins %>% mutate( unmethylated_mu = exp(predict(zero_mu_fit, newdata = .)), unmethylated_theta = exp(predict(zero_theta_fit, newdata = .)), unmethylated_likelihood = dnbinom(coverage_int, mu = unmethylated_mu, size = unmethylated_theta), methylated_mu = ifelse(cpg_count == 0, NA, cpg_count), methylated_theta = ifelse(cpg_count == 0, NA, 0.5 * cpg_count), methylated_likelihood = ifelse(is.na(methylated_mu), 0, dnbinom(coverage_int, mu = methylated_mu, size = methylated_theta)), unmethylated_posterior = unmethylated_likelihood / (methylated_likelihood + unmethylated_likelihood), methylated_posterior = methylated_likelihood / (methylated_likelihood + unmethylated_likelihood) ) %>% mutate(methylation_status = ifelse(methylated_posterior > unmethylated_posterior, 'methylated', 'unmethylated')) ## ## Iterate: For each iteration... ## 1. Separate out the presumed methylated bins as determined by the current model. ## 2. Fit a new negative binomial GLM regression model ## coverage = exp(b_0 + b_1 * cpg_count + b_2 * gc_content + b_3 * cpg_count * gc_content) ## 3. Re-estimate the posterior probabilities of each point being methylated or unmethylated ## based on the new negative binomial regression model. ## 4. Check for convergence, defined as < 1% change in all coefficients from the previous iteration. ## Stop if convergence or the maximum number of allowable iterations has been reached. ## methylated_fits <- list() for (i in 1:MAX_ITER) { ## ## 1. Separate out the presumed methylated bins ## message(sprintf('Running iteration %s', i)) bin_methylation_subset <- bin_methylation %>% filter(methylated_posterior > 0.5) ## ## 2. Fit NB regression to the methylated bins ## methylated_bins_nbfit <- glm.nb( coverage_int ~ cpg_count * gc_content, data = bin_methylation_subset ) methylated_fits[[i]] = methylated_bins_nbfit ## ## 3. Refit the points based on the regression ## refitted <- bins %>% mutate( unmethylated_mu = exp(predict(zero_mu_fit, newdata = .)), unmethylated_theta = exp(predict(zero_theta_fit, newdata = .)), unmethylated_likelihood = dnbinom(coverage_int, mu = unmethylated_mu, size = unmethylated_theta), methylated_mu = ifelse(cpg_count == 0, NA, exp(predict(methylated_bins_nbfit, newdata = .))), methylated_theta = methylated_bins_nbfit$theta, methylated_likelihood = ifelse(is.na(methylated_mu), 0, dnbinom(coverage_int, mu = methylated_mu, size = methylated_theta)), unmethylated_posterior = unmethylated_likelihood / (methylated_likelihood + unmethylated_likelihood), methylated_posterior = methylated_likelihood / (methylated_likelihood + unmethylated_likelihood) ) if (any(is.nan(refitted$unmethylated_posterior)) || any(is.nan(refitted$methylated_posterior))) { nan_rows = refitted %>% filter(is.nan(unmethylated_posterior) | is.nan(methylated_posterior)) message(sprintf('%s bins yielded NaNs - these may be outliers and have been removed. They are printed below.', nrow(nan_rows))) print(nan_rows) refitted <- refitted %>% filter(!is.nan(methylated_posterior), !is.nan(unmethylated_posterior)) } bin_methylation <- refitted %>% mutate( methylation_status = ifelse(methylated_posterior > unmethylated_posterior, 'methylated', 'unmethylated') ) ## ## 4. Check for convergence ## if (i > 1) { percent_changes = abs((methylated_fits[[i]]$coefficients - methylated_fits[[i-1]]$coefficients) / methylated_fits[[i-1]]$coefficients) * 100 message('Percent Changes:') message(sprintf('%s %s %s\n', names(percent_changes), signif(percent_changes, 3), ifelse(percent_changes < PERCENT_CHANGE_THRESHOLD, 'converged', '--'))) if (all(percent_changes < PERCENT_CHANGE_THRESHOLD)) { message('All coefficients converged.') methylated_fit <- methylated_bins_nbfit break } else if (i == MAX_ITER) { message('Maximum iterations hit. Some coefficients did not converge.') } } } ## ## Write output ## message('- Writing output to file.') bin_methylation %>% dplyr::select( bin_chr, bin_start, bin_end, methylation_status, coverage = coverage_int, cpg_count, gc_content, methylated_posterior, methylated_mu, mean_fragment_length ) %>% write_tsv(args[['output']]) message(sprintf('Output data written to %s', args[['output']])) if (!is.null(args[['modelout']])) { message('- Serializing model data to file.') model_data <- list( final_model = methylated_fit, iteration_models = methylated_fits, zero_model = list( model_output = zero_profile_gc_model_output, theta_fit = zero_theta_fit, mu_fit = zero_mu_fit ) ) saveRDS(model_data, args[['modelout']]) message(sprintf('Model data written to %s', args[['modelout']])) } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | ' consolidate_cohort_samples.R Consolidate the cohorts output samples into a single table of samples Usage: compile_cohort_samples.R -i INPUT -o OUTPUT -d DATA Options: -i --input INPUT Path to input -o --output OUTPUT Output path for the table of consolidated values -d --data DATA Data output to consolidate ' -> doc if (! interactive()) { library(docopt) args <- docopt(doc, version='consolidate cohort samples v 1.0') print(args) } else { message('Running in interactive mode. Be sure to specify args manually.') } library(tidyverse) cohort_dir = args[['input']] ifelse(!dir.exists(args[['output']]),dir.create(args[['output']]),FALSE) setwd(cohort_dir) file_list <- list.files(pattern = "_output.tsv") if(args[['data']] == "MEDIPS"){ i <- 0 for(file in file_list){ sample <- separate(data.frame(filename = file),col = filename, into = c("sample", "extra"), sep = "\\.")$sample medips_counts <- read_tsv(file, col_types='ciiid') if(i == 0){ complete_counts <- data.frame( window = paste(medips_counts$bin_chr,medips_counts$bin_start,medips_counts$bin_end, sep = "."), bin_counts = medips_counts$bin_counts ) colnames(complete_counts) <- c("window",sample) complete_cpm <- data.frame( window = paste(medips_counts$bin_chr,medips_counts$bin_start,medips_counts$bin_end, sep = "."), bin_counts = medips_counts$bin_cpm ) colnames(complete_cpm) <- c("window",sample) }else{ old_colnames <- colnames(complete_counts) complete_counts <- complete_counts %>% add_column(medips_counts$bin_counts) colnames(complete_counts) <- c(old_colnames,sample) complete_cpm <- complete_cpm %>% add_column(medips_counts$bin_cpm) colnames(complete_cpm) <- c(old_colnames,sample) } i = i + 1 rm(medips_counts) gc() } write_tsv(complete_counts, file = paste(args[['output']], "MEDIPS_Counts.tsv", sep = ""), col_names = TRUE) write_tsv(complete_cpm, file = paste(args[['output']], "MEDIPS_CPM.tsv", sep = ""), col_names = TRUE) } if(args[['data']] == "MeDEStrand"){ i <- 0 for(file in file_list){ medestrand_meth <- read_tsv(file, col_types='ciiid') sample <- separate(data.frame(filename = file),col = filename, into = c("sample", "extra"), sep = "\\.")$sample if(i == 0){ complete_meth <- data.frame( window = paste(medestrand_meth$bin_chr,medestrand_meth$bin_start,medestrand_meth$bin_end, sep = "."), bin_methyl = medestrand_meth$bin_methyl ) colnames(complete_meth) <- c("window",sample) }else{ old_colnames <- colnames(complete_meth) complete_meth <- complete_meth %>% add_column(medestrand_meth$bin_methyl) colnames(complete_meth) <- c(old_colnames,sample) } i = i + 1 rm(medestrand_meth) gc() } write_tsv(complete_meth, file = paste(args[['output']], "MeDEStrand_AbsMethyl.tsv", sep = ""), col_names = TRUE) } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 | ' get_read_count_info.R Obtain the count info of all samples into one table Usage: get_read_count_info.R -i INPUT -o OUTPUT Options: -i --input INPUT Path to input -o --output OUTPUT Output path for the table of consolidated values ' -> doc if (! interactive()) { library(docopt) args <- docopt(doc, version='get count info v 1.0') print(args) } else { message('Running in interactive mode. Be sure to specify args manually.') } library(tidyverse) cohort_qc_flagstat_dir = args[['input']] ifelse(!dir.exists(args[['output']]),dir.create(args[['output']]),FALSE) setwd(cohort_qc_flagstat_dir) aligned_files <- list.files(pattern = ".aligned.sorted.bam.flagstat.tsv") sample_list <- separate(data.frame(filename = aligned_files),col = filename, into = c("sample", "extra"), sep = "\\.")$sample lib_files_number <- list.files(pattern = paste(sample_list[1],"_lib",sep = "")) read_count_info = matrix(NA, nrow=1, ncol = (5 + 2*length(lib_files_number))) read_count_info <- data.frame(read_count_info) lib_data_cols <- c() for(i in 1:length(lib_files_number)){ lib_data_cols <- c(lib_data_cols,paste("All_Reads_lib", i, sep = ""),paste("Mapped_Percent_lib", i, sep = "")) } colnames(read_count_info) <- c("Sample","Dup_Removed_PrimaryMap_Reads","Dup_Removed_Reads","All_Mapped_Reads","Percent_Duplicates",lib_data_cols) for(sample in sample_list){ flagsDupRmoved <- read.delim(paste(cohort_qc_flagstat_dir,sample,".aligned.sorted.markdup.bam.flagstat.tsv", sep = ""), header = FALSE) flags <- read.delim(paste(cohort_qc_flagstat_dir,sample,".aligned.sorted.bam.flagstat.tsv", sep = ""), header = FALSE) lib_files_list <- list.files(pattern = paste(sample,"_lib",sep = "")) lib_file_data <- c() for(lib_file in lib_files_list){ lib_file_flag <- read.delim(paste(cohort_qc_flagstat_dir,lib_file, sep = ""), header = FALSE) lib_file_data <- c(lib_file_data, as.numeric(lib_file_flag$V1[1]), as.numeric(strsplit((lib_file_flag$V1[8]), split = "%")[[1]][1])) } sample_row <- c(sample, as.numeric(flagsDupRmoved$V1[2]), as.numeric(flagsDupRmoved$V1[1]), as.numeric(flags$V1[1]), (1-as.numeric(flagsDupRmoved$V1[1])/as.numeric(flags$V1[1]))*100, lib_file_data) read_count_info <- rbind(read_count_info, sample_row) } read_count_info <- read_count_info[-1,] write.table(read_count_info, file = paste(args[['output']], "flag_stats.tsv", sep = ""), col.names = T, row.names = F, quote = F, sep = "\t", append = F) rm(list = ls()) gc() |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | ' run_medestrand.R Run MeDEStrand. Usage: run_medestrand.R -b BAM -o OUTPUT -p PAIRED [ -m MEDESTRAND ] Options: -b --bam BAM Path to input BAM file -o --output OUTPUT Output path (RDS file) -p --paired PAIRED Sample is paired end or single end sqeuncing based on cohort -m --medestrand MEDESTRAND Path to MeDEStrand Package ' -> doc if (! interactive()) { library(docopt) args <- docopt(doc, version='Run MeDEStrand v 1.0') print(args) } else { message('Running in interactive mode. Be sure to specify args manually.') } if (is.null(args[['medestrand']])) { library(MeDEStrand) } else { devtools::load_all(args[['medestrand']]) } library(GenomicRanges) library(MEDIPS) library(BSgenome.Hsapiens.UCSC.hg38) library(tidyverse) BIN_WIDTH = 300 allmainchrs = paste0('chr', c(1:22)) BSgenome = 'BSgenome.Hsapiens.UCSC.hg38' paired_val = (args[['paired']] == "True") methylset <- MeDEStrand.createSet( file = args[['bam']], BSgenome = BSgenome, uniq = 1, extend = 0, shift = 0, window_size = BIN_WIDTH, chr.select = allmainchrs, paired = paired_val ) CS = MeDEStrand.countCG(pattern='CG', refObj=methylset) absolute_methylation <- MeDEStrand.binMethyl(MSetInput = methylset, CSet = CS, Granges = FALSE) MSet = methylset[[1]] chr.select = MSet@chr_names window_size = window_size(MSet) chr_lengths = unname( seqlengths(BSgenome.Hsapiens.UCSC.hg38)[ seqnames(BSgenome.Hsapiens.UCSC.hg38@seqinfo)%in%chr.select ] ) no_chr_windows = ceiling(chr_lengths/window_size) supersize_chr = cumsum(no_chr_windows) chromosomes=chr.select all.Granges.genomeVec = MEDIPS.GenomicCoordinates(supersize_chr, no_chr_windows, chromosomes, chr_lengths, window_size) all.Granges.genomeVec$CF = CS@genome_CF all.Granges.genomeVec$binMethyl= absolute_methylation absolute_methylation_df <- as.data.frame(all.Granges.genomeVec) colnames(absolute_methylation_df) <- c("bin_chr","bin_start","bin_end","bin_width","strand","cpg_count","bin_methyl") absolute_methylation_df = absolute_methylation_df[, c("bin_chr","bin_start","bin_end","cpg_count","bin_methyl")] write_tsv(absolute_methylation_df, file = args[['output']], col_names = TRUE) rm(list = ls()) gc() |
1
of
R/run_medestrand.R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | ' run_MEDIPS.R Run MEDIPS for counts and conduct MEDIPS QC. Usage: run_MEDIPS.R -b BAM -o OUTPUT -q QCOUT -p PAIRED Options: -b --bam BAM Path to input BAM file -o --output OUTPUT Output path (RDS file) -q --qcout QCOUT Path to output QC results of sample -p --paired PAIRED Sample is paired end or single end sqeuncing based on cohort ' -> doc if (! interactive()) { library(docopt) args <- docopt(doc, version='Run MEDIPS v 1.0') print(args) } else { message('Running in interactive mode. Be sure to specify args manually.') } library(GenomicRanges) library(BSgenome) library(BSgenome.Hsapiens.UCSC.hg38) library(IRanges) library(MEDIPS) library(tidyverse) BIN_WIDTH = 300 allmainchrs = paste0('chr', c(1:22)) BSgenome = 'BSgenome.Hsapiens.UCSC.hg38' paired_val = (args[['paired']] == "True") medips_set = MEDIPS.createSet(file = args[['bam']], BSgenome = BSgenome, extend = 0, shift = 0, uniq = 1, window_size = BIN_WIDTH, paired = paired_val, chr.select = allmainchrs) chr.select = medips_set@chr_names window_size = window_size(medips_set) chr_lengths = unname( seqlengths(BSgenome.Hsapiens.UCSC.hg38)[ seqnames(BSgenome.Hsapiens.UCSC.hg38@seqinfo)%in%chr.select ] ) no_chr_windows = ceiling(chr_lengths/window_size) supersize_chr = cumsum(no_chr_windows) chromosomes = chr.select all.Granges.genomeVec = MEDIPS.GenomicCoordinates(supersize_chr, no_chr_windows, chromosomes, chr_lengths, window_size) all.Granges.genomeVec$counts = medips_set@genome_count all.Granges.genomeVec$cpm = (medips_set@genome_count/medips_set@number_regions)*1000000 count_df <- as.data.frame(all.Granges.genomeVec) colnames(count_df) <- c("bin_chr","bin_start","bin_end","bin_width","strand","bin_counts","bin_cpm") count_df = count_df[, c("bin_chr","bin_start","bin_end","bin_counts","bin_cpm")] write_tsv(count_df, file = args[['output']], col_names = TRUE) medipsenrichment <- tryCatch({ medips_enrichment = MEDIPS.CpGenrich(file = args[['bam']], BSgenome = BSgenome, extend = 0, shift = 0, uniq = 1, paired = paired_val, chr.select = allmainchrs) return(TRUE) }, error = function(e){ message('Error: unable to create medips enrichment paramaters') return(FALSE) } ) medips_coverage = MEDIPS.seqCoverage(file = args[['bam']], pattern = "CG", BSgenome = BSgenome, extend = 0, shift = 0, uniq = 1, paired = paired_val, chr.select = allmainchrs) medips_saturation = MEDIPS.saturation(file= args[['bam']], BSgenome = BSgenome, extend = 0, shift = 0, uniq = 1, window_size = BIN_WIDTH, nit = 10, nrit = 1, empty_bins = TRUE, rank = FALSE, chr.select = allmainchrs, paired = paired_val) #generating the seqCoverage just on the unique reads cov.level = c(0, 1, 2, 3, 4, 5) cov.res = medips_coverage$cov.res numberReads = medips_coverage$numberReads numberReadsWO = medips_coverage$numberReadsWO numberReadsWO_percentage = round((numberReadsWO/numberReads * 100), digits = 2) results = NULL for (j in 1:length(cov.level)) { if (j == 1) { results = c(results, length(cov.res[cov.res <= cov.level[j]])/length(cov.res) * 100) } else { results = c(results, length(cov.res[cov.res > cov.level[j - 1] & cov.res <= cov.level[j]])/length(cov.res) * 100) } } results = c(results, length(cov.res[cov.res > cov.level[length(cov.level)]])/length(cov.res) * 100) if(medipsenrichment){ MEDIPS_EnrichmentScore_GoGe = medips_enrichment$enrichment.score.GoGe MEDIPS_EnrichmentScore_relH = medips_enrichment$enrichment.score.relH }else{ MEDIPS_EnrichmentScore_GoGe = NA MEDIPS_EnrichmentScore_relH = NA } QCstats = data.frame(numReads_Unique_MEDIPS = medips_coverage$numberReads, MEDIPS_Enrichment = medipsenrichment, EnrichmentScore_GoGe = MEDIPS_EnrichmentScore_GoGe, EnrichmentScore_relH = MEDIPS_EnrichmentScore_relH, Percent_CpG_Seq_Coverage_0x = results[1], Percent_CpG_Seq_Coverage_1x = results[2], Percent_CpG_Seq_Coverage_2x = results[3], Percent_CpG_Seq_Coverage_3x = results[4], Percent_CpG_Seq_Coverage_4x = results[5], Percent_CpG_Seq_Coverage_5x = results[6], Percent_CpG_Seq_Coverage_Over5x = results[7], Reads_do_not_cover_CpG = medips_coverage$numberReadsWO, Percent_Reads_do_not_cover_CpG = numberReadsWO_percentage, Estimated_Saturation_Correlation = medips_saturation$maxEstCor[2], True_Saturation_Correlation = medips_saturation$maxTruCor[2]) write_tsv(QCstats, file=args[['qcout']], col_names = TRUE) #save QC metrics rm(list = ls()) gc() |
1
of
R/run_MEDIPS.R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | ' run_QSEA.R Run QSEA for counts and beta value estimation and conduct MEDIPS QC. Usage: run_QSEA.R -s SAMPLE -c CHROM -b BAM -o OUTPUT --count Count --beta BETA --qc QCOut [ --group GROUP ] Options: -s --sample SAMPLE Name of sample -c --chrom CHROM Chromosome -b --bam BAM Path to input BAM file -o --output OUTPUT Output path --count Count Output path for count data --beta BETA Output path for beta methylation estimate --qc QCOut Output path for qc matrix --group GROUP Optional input of whether sample belongs to a group, such as "treatment" or "control" ' -> doc if (! interactive()) { library(docopt) args <- docopt(doc, version='Run QSEA v 1.0') print(args) } else { message('Running in interactive mode. Be sure to specify args manually.') } library(GenomicRanges) library(BSgenome) library(BSgenome.Hsapiens.UCSC.hg38) library(IRanges) library(qsea) library(tidyverse) library(BiocParallel) register(MulticoreParam(workers=4)) BIN_WIDTH = 300 chrom = args[['chrom']] BSgenome = 'BSgenome.Hsapiens.UCSC.hg38' mapq = 30 if (!is.null(args[['group']])) { sample_group = args[['group']] } else { sample_group = "unspecified" } sample_info <- data.frame( sample_name = args[['sample']], file_name = args[['bam']], group = sample_group ) qseaset <- createQseaSet( sampleTable = sample_info, BSgenome = BSgenome, window_size = BIN_WIDTH, chr.select = chrom, ) qseaset = addCoverage(qseaset, uniquePos = TRUE, paired = TRUE, parallel = TRUE, minMapQual = mapq) qseaset = addPatternDensity(qseaset, "CG", name = "CpG") qseaset = addLibraryFactors(qseaset) qseaset = addOffset(qseaset, enrichmentPattern = "CpG") wd = which(getRegions(qseaset)$CpG_density>1 & getRegions(qseaset)$CpG_density<15) signal = (15-getRegions(qseaset)$CpG_density[wd])*.55/15+.25 signal = matrix(signal,nrow=length(signal),ncol=length(getSampleNames(qseaset))) qseaenrichment <- tryCatch({ qseaset = addEnrichmentParameters( qseaset, enrichmentPattern="CpG", windowIdx=wd, signal=signal ) return(TRUE) }, error = function(e){ message('Error: unable to create enrichment paramaters') return(FALSE) } ) if(qseaenrichment){ output_beta <- makeTable( qseaset, norm_methods = c("beta"), samples = getSampleNames(qseaset) ) output_counts <- makeTable( qseaset, norm_methods = c("counts","rpm"), samples = getSampleNames(qseaset) ) }else{ output_counts <- makeTable( qseaset, norm_methods = c("counts","rpm"), samples = getSampleNames(qseaset) ) output_beta <- output_counts[,1:4] output_beta$beta <- rep(NA, nrow(output_beta)) colnames(output_beta) <- c("chr","window_start","window_end","CpG_density",paste(args[['sample']],"_beta",sep = "")) } qseaset_percentfragsbackground = getOffset(qseaset) * 100 QCstats = data.frame(numReads_Unique_QSEA = qseaset@libraries$file_name[1,"valid_fragments"], QSEA_Percent_Fragments_due_Background = qseaset_percentfragsbackground, QSEA_Enrichment = qseaenrichment) ## write out setwd(args[['output']]) write_tsv(output_counts, file = args[['count']], col_names = TRUE) write_tsv(output_beta, file = args[['beta']], col_names = TRUE) write_tsv(QCstats, file = args[['qc']], col_names = TRUE) #save QC metrics if(qseaenrichment){ png(file = paste("EnrichmentProfile",args[['chrom']],".png", sep = ""), width = 480, height = 480, units = "px") plotEPmatrix(qseaset) dev.off() } rm(list = ls()) gc() |
1
of
R/run_QSEA.R
Support
- Future updates
Related Workflows





