Help improve this workflow!
This workflow has been published but could be further improved with some additional meta data:- Keyword(s) in categories input, output, operation
You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .
Peaks Workflow
This workflow is designed to align, perform basic QC and call peaks for peak-based methods such as ChIP-seq, CUT&RUN and ATAC-seq.
Usage
-
git clone <repo> <new_directory_name>
-
Put the 'fastq.gz' files in the
raw_data/
subdirectory. You may use symlinks. -
Look over the config file to ensure that the correct indexes and other species-specific settings are used.
-
Set up the sample sheet (samples.tsv). You may find it helpful to run
./make_samples_template.sh
from inside thebin/
subdirectory to get a template file based on the files inraw_data/
. The columns are:i.
sample
-- Name of the sample. Fastqs will be renamed to this. You may use the same 'sample' name in multiple rows in this file to represent fastqs that should becat
together.ii.
control
-- 'sample' name for the control to be used, for example, as control for MACS2. If not applicable, use 'NA'. Leaving it blank should also be fine.iii.
fq1
-- R1 fileiv.
fq2
-- R2 file; fill in with NA if SE data.v.
sample_group
-- Grouping variable for replicates.vi.
enriched_factor
-- Samples with the sameenriched_factor
will be normalized together with CSAW, if alternative normalization is requested in the config file. -
From the root directory of the project, run
sbatch bin/run_snakemake.sh
.
Code Snippets
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | outCompNorm <- snakemake@output[["bkgrd"]] outEffNorm <- snakemake@output[["hiAbund"]] # cores for parallel processing bp_cores <- snakemake@threads-1 # read in mitochondrial chromosome name mito_chr <- snakemake@params[["mito_chr"]] # blacklist BED file blacklist_file <- snakemake@params[["blacklist"]] # file with the standard chromosome names # file should contain only 1 line, which is space delimited std_chroms_file <- snakemake@input[["std_chroms"]] # bam files to read in bam_samps <- snakemake@params[["bam_samp_names"]] bam.files <- snakemake@input[["bams"]] # before naming the bam files by sample name, we ensure that the sample names and the file names match stopifnot(identical(paste0("analysis/filt_bams/", bam_samps, "_filt_alns.bam"), bam.files)) names(bam.files) <- snakemake@params[["bam_samp_names"]] # output RDS objects binned_rds <- paste0(snakemake@params[["out_pref"]], "_binned.rds") small_wins_rds <- paste0(snakemake@params[["out_pref"]], "_small_wins.rds") filt_small_wins_rds <- paste0(snakemake@params[["out_pref"]], "_filt_small_wins.rds") # csaw params bkgd_bin_width <- 10000 hi_abund_win_width <- snakemake@params[["window_width"]] win_filter_fold <- 3 # fold change from background to keep as high abundance windows minq <- 20 dedup=TRUE ########## ########## ########## # load packages suppressPackageStartupMessages(library(csaw)) suppressPackageStartupMessages(library(rtracklayer)) suppressPackageStartupMessages(library(magrittr)) suppressPackageStartupMessages(library(BiocParallel)) suppressPackageStartupMessages(library(tibble)) # read in standard chromosomes file as a vector, and remove mitochondrial chromosome name chroms_keep_vec <- strsplit(readLines(std_chroms_file)[1], " ")[[1]] %>% grep(pattern=paste0("^", mito_chr, "$"), x=., perl=TRUE, value=TRUE, invert=TRUE) message("Keeping only these chromosomes: ", paste(chroms_keep_vec, collapse=", ")) # import blacklist as genomicranges blacklist_gr <- import(blacklist_file) # Set params for reading in BAM files param <- readParam(minq=minq, dedup=dedup, pe="both", restrict=chroms_keep_vec, discard=blacklist_gr) # Eliminate composition biases # TMM on binned counts binned <- windowCounts(bam.files, bin=TRUE, width=bkgd_bin_width, param=param, BPPARAM=MulticoreParam(bp_cores)) # note that windows with less than 'filter' number of reads summed across libraries are removed message("For TMM on background bins, ", length(binned), " bins were used.") binned <- normFactors(binned, se.out=TRUE) saveRDS(binned, binned_rds) # for faster debugging # code adapted from https://www.biostars.org/p/394434/ # 'se' is csaw generated ranged summarized experiment with norm.factors and totals in the coldata # 'outfile' is the output file containing the colData calc_and_write_final_norm_facs <- function(se, outfile){ se$final.factors <- ((se$norm.factors * colData(se)$totals) / 1000000)^-1 # Recall that the norm factors from edgeR/csaw must be multiplied by the library size to also correct for library size; this is different from and not need with the DESeq2 norm factors. write.table(x = as.data.frame(colData(se)) %>% tibble::rownames_to_column("sample"), file = outfile, sep="\t", quote = FALSE, col.names = TRUE, row.names = FALSE) } ## turn off exponential notation to avoid rounding errors options(scipen=999) calc_and_write_final_norm_facs(se=binned, outfile=outCompNorm) # Eliminate efficiency biases # TMM on high abundance regions small_wins <- windowCounts(bam.files, width=hi_abund_win_width, param=param, BPPARAM=MulticoreParam(bp_cores)) # note that windows with less than 'filter' number of reads summed across libraries are removed saveRDS(small_wins, small_wins_rds) # for faster debugging #binned <- windowCounts(bam.files, bin=TRUE, width=10000, param=param) filter_wins <- filterWindowsGlobal(small_wins, binned) keep <- filter_wins$filter > log2(win_filter_fold) # filter for greater than 'win_filter_fold' fold difference compared to background keep_num <- sum(keep) tot_wins <- length(keep) message("For TMM on high abundance normalization, ", keep_num, " out of ", tot_wins, " (", round(keep_num/tot_wins, 2), ")", " windows were kept.") filtered.wins <- small_wins[keep, ] filtered.wins <- normFactors(filtered.wins, se.out=TRUE) saveRDS(filtered.wins, filt_small_wins_rds) # for faster debugging calc_and_write_final_norm_facs(se=filtered.wins, outfile=outEffNorm) |
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 | options(warn=1) # print all warnings as they happen library(DiffBind) library(readr) library(dplyr) library(stringr) library(Rsamtools) # variables samplesheet <- snakemake@input[['samplesheet']] db_config <- list(cores=snakemake@threads) out_dir <- snakemake@params[["outdir"]] out_samplesheet <- snakemake@output[['samplesheet']] DB_summits <- snakemake@params[['DB_summits']] macs2_type <- snakemake@params[['macs2_type']] subtract_controls <- as.logical(snakemake@params[['subtract_controls']]) curr_enriched_factor <- snakemake@params[['enriched_factor']] is_atac <- as.logical(snakemake@params[['is_atac']]) # set up Diffbind samplesheet samples <- read_tsv(samplesheet) %>% dplyr::mutate(SampleID=sample, bamReads=str_glue("analysis/filt_bams/{sample}_filt_alns.bam"), bamControl=ifelse(!is.na(control) & !is_atac, str_glue("analysis/filt_bams/{control}_filt_alns.bam"), NA), # typically there is no control for ATAC-seq datasets Peaks=str_glue("analysis/{macs2_type}/rm_blacklist/{sample}_macs2_narrow_peaks.rm_blacklist.narrowPeak"), PeakCaller="bed", Factor=sample_group, out_pref=ifelse(is.na(enriched_factor), "peaks", enriched_factor)) %>% dplyr::select(SampleID, bamReads, bamControl, Peaks, PeakCaller, Factor, out_pref) %>% unique() # some samples represented twice if they had more than 1 set of fastq files if(any(!is.na(samples$bamControl))){ samples <- samples %>% dplyr::filter(!is.na(bamControl)) # if project has controls, then we remove all the rows for the controls } else{ samples <- samples %>% dplyr::select(-bamControl) } # filter for just the samples related to the current enriched factor stopifnot(curr_enriched_factor %in% samples$out_pref) samples <- samples %>% dplyr::filter(out_pref == curr_enriched_factor) # subtract controls? if (!subtract_controls){ samples$bamControl <- NA } write_tsv(samples, out_samplesheet) print(samples) message(str_glue("Making DBA for {curr_enriched_factor}")) # make DBA object diffbind <- dba(sampleSheet = samples %>% dplyr::select(-out_pref)) # set num cores and how many reads to store in memory diffbind$config$cores <- db_config$cores-1 diffbind$config$yieldSize <- 20000000 # get counts diffbind$config$scanbamparam <- ScanBamParam(flag = scanBamFlag(isDuplicate=FALSE, isSecondaryAlignment=FALSE, isSupplementaryAlignment=FALSE, isPaired=TRUE, isProperPair=TRUE, isNotPassingQualityControls=FALSE, isUnmappedQuery=FALSE, hasUnmappedMate=FALSE, isMinusStrand=NA, isMateMinusStrand=NA)) if (is_atac){ diffbind$config$singleEnd <- TRUE # if ATAC, each read represents a cut site } diffbind <- dba.count(diffbind, summits=DB_summits, bUseSummarizeOverlaps = TRUE, bParallel = TRUE) print(diffbind) # normalization diffbind <- dba.normalize(diffbind, normalize = DBA_NORM_NATIVE, background = TRUE) # print the normalization methods dba.normalize(diffbind, bRetrieve=TRUE) dir.create(out_dir, recursive=TRUE) # write the DBA to an RDS file saveRDS(diffbind, str_glue("{out_dir}/{curr_enriched_factor}.rds")) # session info sessionInfo() |
5 6 7 8 9 10 11 12 13 14 15 16 17 | suppressPackageStartupMessages(library(GenomeInfoDb)) suppressPackageStartupMessages(library(magrittr)) suppressPackageStartupMessages(library(Rsamtools)) # params from workflow ref_fasta <- snakemake@params[["ref_fasta"]]#"/varidata/research/projects/bbc/versioned_references/2022-03-08_14.47.50_v9/data/mm10_gencode_plus_viruses_and_cfmedips/sequence/mm10_gencode_plus_viruses_and_cfmedips.fa" outfile <- snakemake@output[[1]] #"output.txt" print(outfile) # make GenomicRanges from the genome ref_gr <- as(seqinfo(FaFile(ref_fasta)), "GRanges") # output the standard chromosome names as a space-separated text file standardChromosomes(ref_gr) %>% paste(., collapse=" ") %>% writeLines(., outfile) |
10 11 12 | # save start time for script start_tm <- Sys.time() start_tm |
16 | knitr::opts_chunk$set(echo=TRUE, warning=TRUE, message=TRUE, cache=FALSE, fig.width=8, fig.height=8) |
20 21 | outdir <- snakemake@params[["out_dir"]] dir.create(outdir, recursive=TRUE) |
27 28 29 30 31 | library(dplyr) library(stringr) library(rtracklayer) library(ChIPpeakAnno) library(viridis) |
37 38 39 40 41 42 43 44 45 46 47 48 49 50 | # a list containing vectors as elements, where each vector is a different group of peak files. peak_files <- snakemake@input # extract only the named elements peak_files <- peak_files[grep("^$", names(peak_files), value = TRUE, invert = TRUE)] sample_names <- snakemake@params[["sample_names"]] #save.image(file=paste0(outdir, "/Snakemake_vars.Rimage")) peaks <- lapply(rlang::set_names(names(peak_files), names(peak_files)), function(peak_group_name){ peak_group <- peak_files[[peak_group_name]] stopifnot(length(sample_names) == length(peak_group)) lapply(rlang::set_names(peak_group, sample_names), function(peak_file){ unique(rtracklayer::import(peak_file)) # use unique() to remove duplicate peaks resulting from using --call-summits in macs2 }) }) |
54 55 56 57 58 59 60 61 | # ChipSeeker venn produces different counts than Homer mergePeaks, ChipPeakAnno makeVennDiagram and my own manual checks with R findOverlaps (those were consistent with each other). ChipSeeker appears to be using a different definition of overlap than the defaults in the above methods. Unfortunately, ChipSeeker documentation is unclear as to how it determines overlaps and does not seem to use typical GenomicRanges findOverlaps etc fucntions like ChipPeakAnno does. for (i in 1:length(peaks)){ pdf(paste0(outdir, "/", names(peaks)[i], "_venn.pdf")) # 9 is maximum # of sets for vennerable ChIPseeker::vennplot(peaks[[i]][1:min(c(length(peaks[[i]]), 9))]) dev.off() } |
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 | for (i in 1:length(peaks)){ pdf(paste0(outdir, "/", names(peaks)[i], "_vennChipPeakAnno.pdf")) # 5 is maximum # of sets for VennDiagram # setting connectedPeaks="merge" seems to mimic the Homer approach in mergePeaks num_venn <- min(c(length(peaks[[i]]), 5)) makeVennDiagram(peaks[[i]][1:num_venn], connectedPeaks="merge", fill=viridis_pal()(num_venn), fontfamily="sans", cex=1.5, cat.fontfamily="sans", cat.cex=1.5) dev.off() y <- findOverlapsOfPeaks(peaks[[i]][1:num_venn], connectedPeaks="merge") peaklist <- y$peaklist names(peaklist) <- make.names(names(peaklist)) for (j in 1:length(peaklist)){ rtracklayer::export(peaklist[[j]], paste0(outdir, "/", names(peaklist)[j], "_", names(peaks)[i], ".bed")) } } #saveRDS(y, "analysis/peaks_venn/findOverlapsOfPeaks.RDS") |
92 | sessionInfo() |
98 99 100 101 | # output time taken to run script end_tm <- Sys.time() end_tm end_tm - start_tm |
109 110 111 112 113 114 115 116 117 118 | shell: """ if [ {params.num_input} -gt 1 ] then cat {params.input} > {output} else ln -sr {params.input} {output} fi """ |
139 140 141 142 | shell: """ fastqc --outdir {params.outdir} {input} """ |
162 163 164 165 | shell: """ fastq_screen --threads {threads} --outdir analysis/fastq_screen/ {input} """ |
186 187 188 189 | shell: """ trim_galore --paired {input} --output_dir analysis/trim_galore/ --cores {threads} --fastqc """ |
210 211 212 213 | shell: """ trim_galore {input} --output_dir analysis/trim_galore/ --cores {threads} --fastqc """ |
236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 | shell: """ bwa mem \ -t {threads} \ {params.bwa_idx} \ {input} | \ samblaster {params.samblaster_params} 2>{output.samblaster_err} | \ samtools sort \ -m 6G \ -@ {threads} \ -O "BAM" \ -o {output.outbam} \ - echo "END bwamem" echo "END bwamem" 1>&2 samtools index -@ {threads} {output.outbam} echo "END indexing" echo "END indexing" 1>&2 samtools idxstats {output.outbam} > {output.idxstat} echo "END idxstats" echo "END idxstats" 1>&2 """ |
282 283 284 285 | shell: """ samtools flagstat -@ {threads} {input} > {output} """ |
306 307 308 309 | shell: """ java -Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir={params.temp} -jar $PICARD CollectAlignmentSummaryMetrics I={input} O={output.out} R={params.reffasta} """ |
330 331 332 333 | shell: """ java -Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir={params.temp} -jar $PICARD CollectInsertSizeMetrics I={input} O={output.out} H={output.hist} """ |
351 352 353 354 | shell: """ Rscript --vanilla bin/scripts/atacseqqc.R {input.bam} {params.outpref} {params.knownGenesLib} """ |
380 381 382 383 384 385 386 387 388 389 390 391 392 | shell: """ samtools view \ -@ {threads} \ -q {params.mapq} \ -F {params.flags_to_exclude} \ {params.flags_to_include} \ -b -o {output.bam} {input} {params.keep_chroms} samtools index {output.bam} samtools idxstats {output.bam} > {output.idxstat} """ |
413 414 415 416 417 | shell: """ samtools view -b -@ {threads} -F 1024 -o {output.dedup_bam} {input.bam} samtools index {output.dedup_bam} """ |
443 444 445 446 447 448 449 450 451 452 | shell: """ export TMPDIR={params.temp} bamtools filter -insertSize "<={params.maxFragmentSize}" -in {input} -out {output.bam} samtools index {output.bam} samtools idxstats {output.bam} > {output.idxstat} """ |
476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 | shell: """ export TMPDIR={params.temp} bamCoverage \ -p {threads} \ --binSize {params.binsize} \ --bam {input.bam} \ {params.extend_reads} \ --blackListFileName {params.blacklist} \ -o {output.bigwig_rmdups} \ --normalizeUsing {params.norm_method} \ --samFlagExclude {params.sam_exclude} \ {params.sam_keep} """ |
507 508 | script: "bin/scripts/get_standard_chrom_names.R" |
532 533 | script: "bin/scripts/calc_norm_factors.R" |
553 554 555 556 557 558 559 560 561 562 563 | shell: """ printf "sample\\tecoli_frags\\tfinal.factors\\n" > {output} for bam in {input.bam} do sample=`basename $bam | perl -npe 'chomp; die unless /(_filt_alns.bam)/; s/$1//'` ecoli_frags=`samtools view -@{threads} -c -f 64 -F 1024 $bam {params.ecoli_chrom}` # count first read only, and ignore duplicates scale_factor=`echo "10000 / $ecoli_frags" | bc -l` printf "$sample\\t$ecoli_frags\\t$scale_factor\\n" >> {output} done """ |
603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 | shell: """ export TMPDIR={params.temp} mkdir -p {params.temp} bamCoverage \ -p {threads} \ --binSize {params.binsize} \ -b {input.bam} \ --extendReads \ --blackListFileName {params.blacklist} \ -o {output.bigwig_rmdups} \ --normalizeUsing {params.norm} \ --scaleFactor {params.scale_factor} \ --samFlagExclude {params.sam_exclude} \ --samFlagInclude 64 rm -r {params.temp} """ |
640 641 642 643 644 645 646 | shell: """ export TMPDIR={params.temp} multiBigwigSummary BED-file --BED {input.peaks} -p {threads} --blackListFileName {params.blacklist} -b {input.bws} -o {output} """ |
663 664 665 666 667 668 669 | shell: """ export TMPDIR={params.temp} plotCorrelation --corData {input} -c spearman -p heatmap --skipZeros -o {output.pdf} """ |
686 687 688 689 690 691 | shell: """ export TMPDIR={params.temp} plotPCA --transpose -in {input} -o {output.pdf} --log2 --ntop 5000 """ |
714 715 716 717 718 719 720 721 722 723 724 725 726 727 | shell: """ export TMPDIR={params.temp} bamCompare -b1 {input.bam1} -b2 {input.bam2} \ -p {threads} \ {params.extend_reads} \ --binSize {params.binsize} \ --blackListFileName {params.blacklist} \ --samFlagExclude {params.sam_exclude} \ {params.sam_keep} \ -o {output.bigwig} """ |
742 743 744 745 746 | shell: """ cut -f 1,2 {input.fai} > {output} """ |
788 789 790 791 | shell: """ {params} """ |
817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 | shell: """ export TMPDIR={params.temp} computeMatrix \ scale-regions \ -p {threads} \ -b {params.before} \ -a {params.after} \ --missingDataAsZero \ --samplesLabel {params.samp_labels} \ --binSize {params.binsize} \ -R {input.genes} \ -S {input.bw} \ -o {output.compmat} \ -bl {params.blacklist} \ --transcriptID gene \ --transcript_id_designator gene_id \ --outFileSortedRegions {output.compmatbed} echo "END computeMatrix" echo "END computeMatrix" 1>&2 plotHeatmap \ --heatmapWidth 6 \ --yAxisLabel {params.yaxislabel} \ -m {output.compmat} \ -out {output.heatmap} \ echo "END plotHeatmap" echo "END plotHeatmap" 1>&2 """ |
873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 | shell: """ export TMPDIR={params.temp} plotFingerprint \ -b {input} \ -p {threads} \ -o {output.plot} \ --outRawCounts {output.rawcts} \ {params.extend_reads} \ --labels {params.labels} \ --samFlagExclude {params.sam_exclude} \ {params.sam_keep} \ --blackListFileName {params.blacklist} """ |
929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 | shell: """ macs2 \ callpeak \ -t {input.trt} \ {params.control_param} \ {params.macs2_format} \ --outdir {params.outdir} \ -n {params.broad_name} \ -g {params.species} \ -q {params.q_cutoff} \ --broad \ --keep-dup 'all' \ --tempdir {params.temp_dir} echo "END broad peak calling" 1>&2 echo "END broad peak calling" macs2 \ callpeak \ -t {input.trt} \ {params.control_param} \ {params.macs2_format} \ --outdir {params.outdir} \ -n {params.narrow_name} \ -g {params.species} \ -q {params.q_cutoff} \ --keep-dup 'all' \ --tempdir {params.temp_dir} echo "END narrow peak calling" 1>&2 echo "END narrow peak calling" """ |
984 985 986 987 988 989 990 991 992 993 994 995 | shell: """ # name sort BAMs samtools sort -@ {threads} -n -o {output.bam} {input} # below adapted from https://github.com/ENCODE-DCC/atac-seq-pipeline/blob/master/src/encode_task_bam2ta.py (Latest commit 867cfe5) bedtools bamtobed -bedpe -mate1 -i {output.bam} | gzip -nc > {output.bedpe} zcat {output.bedpe} | awk 'BEGIN{{OFS="\\t"}}{{printf "%s\\t%s\\t%s\\tN\\t1000\\t%s\\n%s\\t%s\\t%s\\tN\\t1000\\t%s\\n",$1,$2,$3,$9,$4,$5,$6,$10}}' | gzip -nc > {output.bed} """ |
1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 | shell: """ # ENCODE uses -p 0.01 but we use -q 0.05 here to be consistent with the rest of this workflow macs2 \ callpeak \ -t {input.trt} \ -f 'BED' \ --outdir {params.outdir} \ -n {params.broad_name} \ -g {params.species} \ --shift -75 --extsize 150 \ --nomodel \ -q {params.q_cutoff} \ --keep-dup all \ --broad \ --tempdir {params.temp_dir} echo "END broad peak calling" 1>&2 echo "END broad peak calling" macs2 \ callpeak \ -t {input.trt} \ -f 'BED' \ --outdir {params.outdir} \ -n {params.narrow_name} \ -g {params.species} \ --shift -75 --extsize 150 \ --nomodel --call-summits \ -q {params.q_cutoff} \ --keep-dup all \ -B \ --SPMR \ --tempdir {params.temp_dir} echo "END narrow peak calling" 1>&2 echo "END narrow peak calling" # remove (-truncate leads to intervals where the end occurs before start) the intervals that extend past the end of a chromosome bedClip -verbose=2 {output.narrow_bedgraph} {input.chr_sizes} {output.narrow_bedgraph_clip} bedGraphToBigWig {output.narrow_bedgraph_clip} {input.chr_sizes} {output.narrow_bigwig} echo "END narrow peak calling" 1>&2 echo "END narrow peak calling" """ |
1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 | shell: """ # modified from https://github.com/LiuLabUB/HMMRATAC/blob/master/HMMRATAC_Guide.md # we remove the mitochondrial genome from the genominfo file in order to prevent it from being processed by HMMRATAC (see https://github.com/LiuLabUB/HMMRATAC/issues/50) samtools view -H {input} | perl -ne 'if(/^@SQ.*?SN:(\w+)\s+LN:(\d+)/){{print $1,"\\t",$2,"\\n"}}' | grep -Pv "^{params.mito_chr}\\t" > {output.genomeinfo} java -Xms80g -Xmx{resources.mem_gb}g -Djava.io.tmpdir={params.temp_dir} -jar $HMMRATAC \ --output {params.outpref} \ -b {input} \ -i {input}.bai \ -g {output.genomeinfo} \ --blacklist {params.blacklist} # copied from https://github.com/LiuLabUB/HMMRATAC/blob/master/HMMRATAC_Guide.md awk -v OFS="\\t" '$13>=10 {{print}}' {params.outpref}_peaks.gappedPeak > {output.gappedpeak_filt} # custom one-liner to get only the open regions perl -F"\\t" -lane 'print "$F[0]\\t$F[6]\\t$F[7]\\t$F[3]\\t$F[12]"' {output.gappedpeak_filt} > {output.gappedpeak_filt_open} # copied from https://github.com/LiuLabUB/HMMRATAC/blob/master/HMMRATAC_Guide.md awk -v OFS="\\t" '$5>=10 {{print}}' {params.outpref}_summits.bed > {output.summits_filt} """ |
1159 1160 1161 1162 | shell: """ bedops --merge {input} 1> {output} """ |
1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 | shell: """ # below note the '||' condition for the grep statement that allows a return code of 0 even when no matches are found (as in the case of an empty bed file) bedtools intersect -v \ -a {input.narrow} \ -b {params.blacklist} | {{ grep -P "$(cat {input.std_chroms} | perl -lane 'print q:^:.join(q:\\t|^:, @F).q:\\t:')" || [[ $? == 1 ]]; }} > {output.narrow} # subset the summits also based on the names of the retained narrow peaks Rscript -e 'library(magrittr); library(rtracklayer); keep_pks <- import("{output.narrow}")$name; gr <- import("{input.narrow_summits}"); gr[gr$name %in% keep_pks] %>% export(., "{output.narrow_summits}")' bedtools intersect -v \ -a {input.broad} \ -b {params.blacklist} | {{ grep -P "$(cat {input.std_chroms} | perl -lane 'print q:^:.join(q:\\t|^:, @F).q:\\t:')" || [[ $? == 1 ]]; }} > {output.broad} """ |
1237 1238 | script: "bin/scripts/diffbind_count.R" |
1260 1261 1262 1263 1264 1265 1266 | shell: """ findMotifsGenome.pl {input} {params.genome} {params.outdir} \ -size {params.size} -p {threads} \ -mask """ |
1302 1303 | script: "bin/scripts/peaks_venn.Rmd" |
1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 | shell: """ export TMPDIR={params.temp} plotEnrichment \ --bamfiles {input.bam} \ --BED {input.bed} {input.merged_beds} \ --smartLabels \ --variableScales \ --outRawCounts {output.rawcts} \ --blackListFileName {params.blacklist} \ {params.extend_reads} \ --samFlagExclude {params.sam_exclude} \ {params.sam_keep} \ -p {threads} \ -o {output.plot} """ |
1374 1375 1376 1377 | shell: """ samtools view -@ {threads} -F 2304 -b -o {output} {input} """ |
1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 | shell: """ # preseq doesn't process supplmentary alignments properly. Remove these using samtools. preseq c_curve \ -v \ {params.paired} \ -bam \ -o {output.ccurve} \ {input} echo "Finished c_curve." >&1 echo "Finished c_curve." >&2 preseq lc_extrap \ -v \ {params.paired} \ -bam \ -o {output.lcextrap} \ {input} echo "Finished lc_extrap." >&1 echo "Finished lc_extrap." >&2 """ |
1441 1442 1443 1444 1445 | shell: """ qualimap bamqc -bam {input} --java-mem-size={resources.mem_gb}G --paint-chromosome-limits -outdir analysis/qualimap/{wildcards.sample} -nt {threads} """ |
1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 | shell: """ multiqc \ --force \ --config bin/multiqc_config.yaml \ --outdir {params.workdir} \ --filename {params.outfile} \ {params.dirs} {params.PE_dirs} analysis/macs2/*narrow* """ |
Support
- Future updates
Related Workflows





