Run MedRemix with bedpe input as part of PLBR database workflow
Help improve this workflow!
This workflow has been published but could be further improved with some additional meta data:- Keyword(s) in categories input, output, operation, topic
You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .
Summary
-
Run MedRemix methylation profiler with bedpe input as part of PLBR database workflow.
-
Pipeline is an extension of the original MedRemix pipeline (https://github.com/pughlab/cfMeDIP-seq-analysis-pipeline)
-
Pipeline can take FASTQs, BAM or BEDPE as input.
-
For full specifications of BEDPE7+12 file format, please visit https://github.com/pughlab/bam2bedpe
-
Pipeline is written in Snakemake, designed to run on SLURM cluster, but can run locally as well.
-
Note:
-
.bam filtering is slightly different from original MedRemix pipeline, therefore results will differ slightly
-
however this pipeline will output same results whether using .bam or .bedpe as input.
-
Workflow overview
Please see wiki for tutorial on settings up and running pipeline.
Questions please contact ming.han@uhn.ca.
Code Snippets
18 19 | shell: 'bash src/bam2bedpe_scripts/bam2bedpe.sh -s {params.slurm_local} -c {params.chunks} -b {input} -o {params.out_dir} -a {params.conda_activate} -e {params.conda_env}' |
30 31 | shell: 'bash src/bam2bedpe_scripts/get_clean_bedpe4medremix_v4_getopts.sh -i {input} -s {wildcards.sample} -p {wildcards.species} -c {wildcards.chrom} -f {params.frag_len_limit} -o {params.out_dir}' |
47 48 | shell: 'Rscript src/R/row_bind_tables.R -p "{params.paths}" -o {output} --in-tsv --out-feather --omit-paths' |
85 86 | shell: 'Rscript src/R/row_bind_tables.R -p "{params.paths}" -o {output} --in-tsv --out-feather --omit-paths' |
78 79 | shell: 'samtools merge {output} {input} && samtools index {output}' |
26 27 | shell: 'ln -s {input} {output} && samtools index {output}' |
26 27 | shell: 'ln -s {input} {output}' |
31 32 | shell: 'gunzip -dc {input} > {output}' |
80 81 | shell: 'trim_galore --cores 4 --dont_gzip --paired {params.trimgalore_settings} --output_dir {params.outdir} {input.R1} {input.R2} && cp ' + path_to_data + '/{wildcards.cohort}/tmp/trim_fastq/{wildcards.sample}_lib{wildcards.lib}_extract_barcode_R*.fastq_trimming_report.txt ' + path_to_data + '/{wildcards.cohort}/qc/trim_galore' |
16 17 | shell: 'fastqc --outdir {params.outdir} {input}' |
27 28 | shell: 'samtools view {input} | grep -v "F19K16\|F24B22" | cat <(samtools view -H {input} | grep -v "F19K16\|F24B22") - | samtools view -b - > {output} && samtools index {output}' |
41 42 | shell: 'fastqc --outdir {params.outdir} {input}' |
52 53 | shell: 'samtools flagstat {input} > {output}' |
66 67 | shell: 'Rscript src/QC/QC_MEDIPS.R --bamFile {input} --outputDir {params.outdir} --windowSize {params.winsize}' |
84 85 | shell: 'picard CollectGcBiasMetrics -I {input} -O {output.metric} -CHART {output.chart} -S {output.summary} -R {params.fasta} && bash src/QC/parse_picard_QC.sh picard.CollectGcBiasMetrics {output.summary} {output.summary}.parsed' |
96 97 | shell: 'picard CollectInsertSizeMetrics -INCLUDE_DUPLICATES true -I {input} -O {output.metric} -H {output.histo} -M 0 -W 600 && bash src/QC/parse_picard_QC.sh picard.CollectInsertSizeMetrics {output.metric} {output.metric}.parsed' |
108 109 | shell: 'picard MarkDuplicates -I {input} -O {output.markdup_bam} -M {output.metric} && bash src/QC/parse_picard_QC.sh picard.MarkDuplicates {output.metric} {output.metric}.parsed' |
121 122 | shell: 'picard CollectAlignmentSummaryMetrics -I {input} -O {output.metric} -R {params.fasta} && bash src/QC/parse_picard_QC.sh picard.CollectAlignmentSummaryMetrics {output.metric} {output.metric}.parsed' |
132 133 | shell: 'picard CollectQualityYieldMetrics -I {input} -O {output.metric} && bash src/QC/parse_picard_QC.sh picard.CollectQualityYieldMetrics {output.metric} {output.metric}.parsed' |
163 164 | shell: 'bash src/QC/methyl_QC.sh {input} {output.methyl_counts} {output.methyl_summary}' |
190 191 | shell: 'cat {input.medips_qc} {input.F19K16_F24B22_methyl_summary}.parsed {input.picard_gcbias_summary}.parsed {input.picard_insertsize_metric}.parsed {input.picard_markdup_metric}.parsed {input.picard_alignment_metric}.parsed {input.picard_quality_metric}.parsed > {output.full_qc} && rm {input.F19K16_F24B22_methyl_summary}.parsed {input.picard_gcbias_summary}.parsed {input.picard_insertsize_metric}.parsed {input.picard_markdup_metric}.parsed {input.picard_alignment_metric}.parsed {input.picard_quality_metric}.parsed' |
24 25 | shell: 'touch {output}' |
31 32 | shell: 'touch {output}' |
38 39 | shell: 'touch {output}' |
45 46 | shell: 'touch {output}' |
52 53 | shell: 'touch {output}' |
59 60 | shell: 'touch {output}' |
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 | usage(){ echo echo "Usage: bash/sbatch bam2bedpe.sh -s [slurm|local] -c num_of_chunks -b bam_input_path -o output_dir -a CONDA -e ENV -t" echo } no_args="true" KEEP_TMP=false ## Help Help() { # Display Help echo echo "Processes bam to bedpe in chunks, preserving FLAG, TLEN and CIGAR fields." echo echo "Usage: bash/sbatch bam2bedpe.sh -s [slurm|local] -c num_of_chunks -b bam_input_path -o output_dir -a CONDA -e ENV -t" echo "options:" echo "-h [HELP] print help" echo "-s [REQUIRED] type either 'slurm' or 'local', local is with nohup" echo "-c [REQUIRED] number of chunks to process in parallel" echo "-b [REQUIRED] path to bam input (full path)" echo "-o [REQUIRED] output directory (full path)" echo "-a [REQUIRED] full path to conda activate (e.g. /cluster/home/[username]/bin/miniconda3/bin/activate)" echo "-e [REQUIRED] conda env with pysam (e.g. pipeline-medremixBEDPE)" echo "-t [OPTIONAL] keep tmp_dir" echo } ## Get the options while getopts ":hs:c:b:o:a:e:t" option; do case "${option}" in h) Help exit;; s) SLURMLOCAL=${OPTARG};; c) NUM_OF_CHUNKS=${OPTARG};; b) INPUT_BAM_PATH=${OPTARG};; o) OUT_DIR=${OPTARG};; a) CONDA_ACTIVATE=${OPTARG};; e) CONDA_ENV=${OPTARG};; t) KEEP_TMP=true;; \?) echo "Error: Invalid option" exit;; esac no_args="false" done [[ "$no_args" == "true" ]] && { usage; exit 1; } echo "Processing step8_bam2bedpe..." echo "number of chunks: $NUM_OF_CHUNKS" echo "input bam path: $INPUT_BAM_PATH" echo "output path: $OUT_DIR" echo "processing on: $SLURMLOCAL" # Main program ############################################## echo "Job started at "$(date) time1=$(date +%s) #PY_SCRIPT_DIR=${SRC_DIR} ## pwd is workflow/ if using Snakemake PY_SCRIPT_PATH="$(pwd)/src/bam2bedpe_scripts/bam2bedpe_pysam_v5_wCIGAR.py" INPUT_DIR="${INPUT_BAM_PATH%/*}" INPUT_BAM="${INPUT_BAM_PATH##*/}" TMP_DIR="${OUT_DIR}/tmp_dir/${INPUT_BAM%.*}" mkdir -p $TMP_DIR OUT_FRAG_NAMES="${INPUT_BAM%.*}.fragNames" OUT_MERGED_SORTD_BEDPE="${INPUT_BAM%.*}_coordSortd.bedpe" ## -------------------------------------- ## ## get fragNames samtools view ${INPUT_DIR}/${INPUT_BAM} \ | cut -f1 \ | awk '{ a[$1]++ } END { for (b in a) { print b } }' \ > ${TMP_DIR}/${OUT_FRAG_NAMES} ## -------------------------------------- ## ## split bam into chunks NUM_ROWS=$(cat ${TMP_DIR}/${OUT_FRAG_NAMES} | wc -l) CHUNK_SIZE=$(( $NUM_ROWS / $NUM_OF_CHUNKS )) ## won't have decimal, sometimes will be short, last chunk has to go to last row echo ".bam has $NUM_ROWS fragments." echo "Number of chunks was set to ${NUM_OF_CHUNKS}." echo "Each chunk will be $CHUNK_SIZE rows." for ((i=0; i<=$(( $NUM_OF_CHUNKS - 2 )); i++)); do CHUNK=$(( i + 1 )) ROW_START=$(( i*CHUNK_SIZE + 1)) ROW_END=$(( CHUNK*CHUNK_SIZE )) echo "Processing CHUNK${CHUNK}... starting with row ${ROW_START}, ending in row ${ROW_END}." sed -n "$ROW_START,$ROW_END p" ${TMP_DIR}/${OUT_FRAG_NAMES} \ > ${TMP_DIR}/${OUT_FRAG_NAMES}.CHUNK${CHUNK} done ## since chunk might not be evenly divided, last chunk will just be to last row ith=$(( $NUM_OF_CHUNKS - 1 )) ROW_START=$(( ith*CHUNK_SIZE + 1 )) echo "Processing CHUNK$NUM_OF_CHUNKS... starting with row ${ROW_START}, ending in row ${NUM_ROWS}." sed -n "$ROW_START,$NUM_ROWS p" ${TMP_DIR}/${OUT_FRAG_NAMES} \ > ${TMP_DIR}/${OUT_FRAG_NAMES}.CHUNK${NUM_OF_CHUNKS} ## make log dir mkdir -p ${OUT_DIR}/logs_slurm ## -------------------------------------- ## ## picard FilterSamReads on chunks for CHUNK in ${TMP_DIR}/${OUT_FRAG_NAMES}.CHUNK*; do echo "Picard FilterSamReads for ${CHUNK##*/}..." echo "Output is ${INPUT_BAM%.*}_${CHUNK##*.}.bam" echo "Writing out ${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.sh" ## write out sample sbatch script cat <<- EOF > "${TMP_DIR}/${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.sh" #!/bin/bash #SBATCH -t 3-00:00:00 #SBATCH -J bam2bedpe_chunks_${CHUNK##*.} #SBATCH -D ${OUT_DIR}/logs_slurm #SBATCH --mail-type=ALL #SBATCH --mail-user=ming.han@uhn.ca #SBATCH -p himem #SBATCH -c 1 #SBATCH --mem=16G #SBATCH -o ./%j-%x.out #SBATCH -e ./%j-%x.err source ${CONDA_ACTIVATE} ${CONDA_ENV} #PICARD_DIR=${PICARD_DIR} echo "Job started at "\$(date) time1=\$(date +%s) picard FilterSamReads \ -I ${INPUT_DIR}/${INPUT_BAM} \ -O ${TMP_DIR}/"${INPUT_BAM%.*}_${CHUNK##*.}.bam" \ -READ_LIST_FILE ${CHUNK} \ -FILTER includeReadList \ -WRITE_READS_FILES false \ -USE_JDK_DEFLATER true \ -USE_JDK_INFLATER true \ python ${PY_SCRIPT_PATH} \ --sort_bam_by_qname \ --bam_input ${TMP_DIR}/"${INPUT_BAM%.*}_${CHUNK##*.}.bam" \ --bedpe_output ${TMP_DIR}/"${INPUT_BAM%.*}_${CHUNK##*.}.bedpe" rm ${TMP_DIR}/"${INPUT_BAM%.*}_${CHUNK##*.}.bam" time2=\$(date +%s) echo "Job ended at "\$(date) echo "Job took \$(((time2-time1)/3600)) hours \$((((time2-time1)%3600)/60)) minutes \$(((time2-time1)%60)) seconds" EOF if [ $SLURMLOCAL == "slurm" ]; then sbatch "${TMP_DIR}/${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.sh" elif [ $SLURMLOCAL == "local" ]; then nohup bash "${TMP_DIR}/${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.sh" &> "${TMP_DIR}/${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.log" & fi done ## periodically check if chunks have been written to completely while true; do if [ $(find ${TMP_DIR} -maxdepth 1 -mmin +3 -type f -regex ".*_CHUNK[1-9][0-9]*\.bedpe" | wc -l) -eq $NUM_OF_CHUNKS ] then echo "All bedpe chunks have been written to, merging bedpe chunks..." find ${TMP_DIR} -maxdepth 1 -mmin +3 -type f -regex ".*_CHUNK[1-9][0-9]*\.bedpe" -exec cat {} + \ | sort -k1,1V -k2,2n -k3,3n -k5,5n -k6,6n \ | gzip -c \ > ${OUT_DIR}/${OUT_MERGED_SORTD_BEDPE}.gz break else echo "Sleeping for 5 minutes, then recheck if bedpe chunks were written to completely." sleep 5m fi done ## remove TMP_DIR if "$KEEP_TMP"; then echo "Keeping temp dir" else echo "Removed temp dir after completion" rm -rf ${TMP_DIR} fi time2=$(date +%s) echo "Job ended at "$(date) echo "Job took $(((time2-time1)/3600)) hours $((((time2-time1)%3600)/60)) minutes $(((time2-time1)%60)) seconds" echo "" |
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | usage(){ echo echo "Usage: bash get_clean_bedpe4medremix.sh -i INPUT_PATH -s SAMPLE_NAME -p SPECIES -c CHR -f FRAGMENT_LENGTH_LIMIT -o OUT_DIR" echo } no_args="true" ## Help Help() { # Display Help echo echo "Get clean bedpe for input into MedRemixBEDPE." echo echo "Usage: bash get_clean_bedpe4medremix.sh -i INPUT_PATH -s SAMPLE_NAME -p SPECIES -c CHR -f FRAGMENT_LENGTH_LIMIT -o OUT_DIR" echo "options:" echo "-h [HELP] print help" echo "-i [REQUIRED] input file path for bedpe.gz (full path)" echo "-s [REQUIRED] sample name" echo "-p [REQUIRED] species (e.g. human, arabidopsis_F19K16, or arabidopsis_F24B22)" echo "-c [REQUIRED] chromosome to process (e.g. chr2)" echo "-f [REQUIRED] fragment length limit (e.g. 500)" echo "-o [REQUIRED] bedpe_bin_stats output directory (full path)" echo } ## Get the options while getopts ":hi:s:p:c:f:o:" option; do case "${option}" in h) Help exit;; i) INPUT_PATH=${OPTARG};; s) SAMPLE_NAME=${OPTARG};; p) SPECIES=${OPTARG};; c) CHR=${OPTARG};; f) FRAGMENT_LENGTH_LIMIT=${OPTARG};; o) OUT_DIR=${OPTARG};; \?) echo "Error: Invalid option" exit;; esac no_args="false" done [[ "$no_args" == "true" ]] && { usage; exit 1; } # Main program ############################################## echo "Processing get_clean_bedpe4medremix... " echo "Job started at "$(date) time1=$(date +%s) INPUT_DIR=${INPUT_PATH%/*} INPUT_BEDPE_GZ=${INPUT_PATH##*/} BEDPE_CHR="${SAMPLE_NAME}_${SPECIES}_${CHR}.bedpe" BEDPE_NOAMBI_NO113n177_mapQge20_isMate="${BEDPE_CHR%.*}.noAmbiguous.no113n177.mapQge20.isMate.bedpe" BEDPE_lean="${BEDPE_NOAMBI_NO113n177_mapQge20_isMate%.*}.lean" BEDPE_lean_withEnd="${BEDPE_lean}.withEnd" BEDPE_lenFiltd="${BEDPE_lean_withEnd}.lenFiltd.bedpe4medremix" ## get specific chr whether as mate1 or mate2 zcat ${INPUT_DIR}/${INPUT_BEDPE_GZ} \ | awk -v CHR=$CHR '($1 == CHR) || ($4 == CHR) {print}' \ > ${OUT_DIR}/${BEDPE_CHR} ## 4 steps filtering: ## remove 'ambiguous reads' as defined by Rsamtools ## i.e. remove any .bedpe alignment if flag_mate1or2==SUPP && chr_mate1==chr_mate2 ## ( chr_mate1!=chr_mate2 supp reads dealt with separately ) ## remove 'FLAG 113,177' tandem reads ## remove fragments if either mate's MAPQ < 20 ## remove unmated alignments ## ( as defined by Rsamtools: read1-read2; both secondary, or none; no wrongly oriented alignments ) cat ${OUT_DIR}/${BEDPE_CHR} \ | awk '((and($14,0x800) || and($15,0x800)) && $1==$4) {next} {print}' \ | awk '(($14 == 113 && $15 == 177) || ($14 == 177 && $15 == 113)) {next} {print}' \ | awk '($8>=20 && $9>=20) {print}' \ | awk '(and($14,0x40) && and($15,0x80)) || (and($14,0x80) && and($15,0x40)) {print}' \ | awk '(!and($14,0x100) && !and($15, 0x100)) || (and($14,0x100) && and($15, 0x100)) {print}' \ | awk '(and($14,0x10) && and($15,0x20)) || (and($14,0x20) && and($15,0x10)) {print}' \ > ${OUT_DIR}/${BEDPE_NOAMBI_NO113n177_mapQge20_isMate} ## get lean bedpe for medremix and paste with qwidth for mate1 and mate2 ## { qwidth calculated ignoring DHNPXB in CIGAR, as defined by Rsamtools } paste \ <(cat ${OUT_DIR}/${BEDPE_NOAMBI_NO113n177_mapQge20_isMate} \ | awk 'BEGIN{OFS="\t"}{print $1, $7, $2+1, $5+1, ($8+$9)/2, $10, $11}') \ <(cat ${OUT_DIR}/${BEDPE_NOAMBI_NO113n177_mapQge20_isMate} \ | cut -f12 \ | awk '{gsub(/[1-9][0-9]*[D|H|N|P|X|B]/,"",$1); print $1}' \ | awk 'BEGIN{OFS=""}{gsub(/[A-Z]/,"+",$1); print $1,0}' \ | bc) \ <(cat ${OUT_DIR}/${BEDPE_NOAMBI_NO113n177_mapQge20_isMate} \ | cut -f13 \ | awk '{gsub(/[1-9][0-9]*[D|H|N|P|X|B]/,"",$1); print $1}' \ | awk 'BEGIN{OFS=""}{gsub(/[A-Z]/,"+",$1); print $1,0}' \ | bc) \ > ${OUT_DIR}/${BEDPE_lean} ## use qwidth to calculate 'end' cat ${OUT_DIR}/${BEDPE_lean} \ | awk 'BEGIN{OFS="\t"} ($6=="+") {print $1,$2, $3,$4, $5, $6,$7, $8,$9} ($6=="-") {print $1,$2, $4,$3, $5, $7,$6, $9,$8}' \ | awk 'BEGIN{OFS="\t"} {print $1,$2, $6":"$3"-"$3+$8","$7":"$4"-"$4+$9, $3, $4+$9, $4+$9-$3, $5}' \ > ${OUT_DIR}/${BEDPE_lean_withEnd} ## filter by FRAGMENT_LENGTH_LIMIT cat ${OUT_DIR}/${BEDPE_lean_withEnd} \ | awk -v FRAGMENT_LENGTH_LIMIT=$FRAGMENT_LENGTH_LIMIT 'BEGIN{OFS="\t"} {if(($6 > 0) && ($6 < FRAGMENT_LENGTH_LIMIT)) {print}}' \ > ${OUT_DIR}/${BEDPE_lenFiltd} ## remove intermediate files rm ${OUT_DIR}/${BEDPE_CHR} rm ${OUT_DIR}/${BEDPE_NOAMBI_NO113n177_mapQge20_isMate} rm ${OUT_DIR}/${BEDPE_lean} rm ${OUT_DIR}/${BEDPE_lean_withEnd} time2=$(date +%s) echo "Job ended at "$(date) echo "Job took $(((time2-time1)/3600)) hours $((((time2-time1)%3600)/60)) minutes $(((time2-time1)%60)) seconds" echo "" |
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | SEQMETH="F19K16" SEQUMETH="F24B22" BAM_PATH=$1 METH_COUNT=$2 METH_QC=$3 samtools view ${BAM_PATH} | \ cut -f 3 | sort | uniq -c | sort -nr | \ sed -e 's/^ *//;s/ /\t/' | \ awk 'OFS="\t" {print $2,$1}' | \ sort -n -k1,1 \ > ${METH_COUNT} total=$(samtools view -c ${BAM_PATH}) unmap=$(cat ${METH_COUNT} | grep '^\*' | cut -f2); if [[ -z $unmap ]]; then unmap="0"; fi methyl=$(cat ${METH_COUNT} | grep ${SEQMETH} | cut -f2); if [[ -z $methyl ]]; then methyl="0"; fi unmeth=$(cat ${METH_COUNT} | grep ${SEQUMETH} | cut -f2); if [[ -z $unmeth ]]; then unmeth="0"; fi pct_meth_ctrl=$(echo "scale=5; ($methyl + $unmeth)/$total * 100" | bc -l); if [[ -z $pct_meth_ctrl ]]; then pct_meth_ctrl="0"; fi pct_bac_meth=$(echo "scale=5; $methyl/($methyl + $unmeth) * 100" | bc -l); if [[ -z $pct_bac_meth ]]; then pct_bac_meth="0"; fi pct_bac_unmeth=$(echo "scale=5; $unmeth/($methyl + $unmeth) * 100" | bc -l); if [[ -z $pct_bac_unmeth ]]; then pct_bac_unmeth="0"; fi methyl_specificity=$(echo "scale=5; 100 - $pct_bac_unmeth/$pct_bac_meth" | bc -l); if [[ -z $methyl_specificity ]]; then methyl_specificity="0"; fi bet_meth_ctrl=$(echo "scale=5; $methyl/($methyl + $unmeth)" | bc -l); if [[ -z $bet_meth_ctrl ]]; then bet_meth_ctrl="0"; fi echo -e "Total_reads\tUnmapped_reads\tNum_reads_align_methyl_F19K16\tNum_reads_align_unmethyl_F24B22\tPctg_reads_aligned_to_BACs\tPctg_BACs_is_methyl_F19K16\tPctg_BACs_is_unmethyl_F24B22\tMethylation_specificity\tMethylation_beta" > ${METH_QC} echo -e "$total\t$unmap\t$methyl\t$unmeth\t$pct_meth_ctrl\t$pct_bac_meth\t$pct_bac_unmeth\t$methyl_specificity\t$bet_meth_ctrl" >> ${METH_QC} cat ${METH_QC} \ | awk -F "\t" '{for (i=1; i<=NF; i++) a[i,NR]=$i max=(max<NF?NF:max)} END {for (i=1; i<=max; i++) {for (j=1; j<=NR; j++) printf "%s%s", a[i,j], (j==NR?RS:FS) } }' \ | awk 'BEGIN {OFS="\t"} $1="F19K16_F24B22_methyl_qc\t"$1' \ > ${METH_QC}.parsed |
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 | METRIC=$1 cat $2 \ | grep -A10 "## METRICS CLASS" \ | grep -v "## METRICS CLASS" \ | sed '/## HISTOGRAM/,+10d' \ | awk -F "\t" '{for (i=1; i<=NF; i++) a[i,NR]=$i max=(max<NF?NF:max)} END {for (i=1; i<=max; i++) {for (j=1; j<=NR; j++) printf "%s%s", a[i,j], (j==NR?RS:FS) } }' \ | grep -v "^SAMPLE\|^LIBRARY\|^READ_GROUP" \ | grep -v "ACCUMULATION_LEVEL" \ | awk -v METRIC=$METRIC 'BEGIN {OFS="\t"} $1=METRIC"\t"$1' \ > $3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 | library(docopt) ## adapted from https://github.com/oicr-gsi/wf_cfmedip/blob/master/workflow/runMedips/runMedips.r doc <- "Get MEDIPS QC metrics. Usage: QC_MEDIPS.R --bamFile <FILE> --outputDir <DIR> --windowSize <SIZE> [ --genome <GENOME> ] Options: --bamFile FILE Aligned, sorted, filtered reads (bam) --outputDir DIR Path to output folder --windowSize SIZE Size of genomic windows (bp, e.g. 300) --genome GENOME Path to a folder containing a custom BSgenome as a package, which will be loaded using devtools::load_all(); or the name of BSgenome (usually BSgenome.Hsapiens.UCSC.hg38 or BSgenome.Athaliana.TAIR.TAIR9) --help show this help text " opt <- docopt(doc) library(tidyverse) library(gtools) library(MEDIPS) library(BSgenome.Hsapiens.UCSC.hg38) #if (file.exists(paste(opt[['genome']], 'DESCRIPTION', sep='/'))) { # devtools::load_all(opt[['genome']]) # bsgenome <- getBSgenome(basename(opt[['genome']])) #} else { # bsgenome <- getBSgenome(opt[['genome']]) #} if (!file.exists(opt$bamFile)){ stop(paste0("ERROR: bam file not found ", opt$bamFile), call.=FALSE) } ## get user parameters bam_file = opt$bamFile ws = as.numeric(opt$windowSize) out_dir = paste0(opt$outputDir, "/") chr.select=paste0("chr", c(1:22,"X","Y")) #chr.select=c("chr1","chr22") BSgenome="BSgenome.Hsapiens.UCSC.hg38" uniq = 0 ## WARNING: default settings normalize the data, must be set to 0 to disable this transformation extend = 0 ## relevant for single-end: https://support.bioconductor.org/p/81098/ shift = 0 paired = TRUE # disables the scientific notation to avoid powers in genomic coordinates (i.e. 1e+10) options(scipen = 999) ## create MEDIPS set ################################### message("Processing MEDIPS QC metrics: window size: ", ws) MeDIPset = MEDIPS.createSet(file = bam_file, BSgenome = BSgenome, uniq = uniq, extend = extend, shift = shift, paired = paired, window_size = ws, chr.select = chr.select) fname <- unlist(strsplit(basename(bam_file),split="\\."))[1] # fname ## coupling set: maps CG densities across the genome CS = MEDIPS.couplingVector(pattern="CG", refObj=MeDIPset) ## saturation analysis ################################# ## whether a given set of mapped reads is sufficient to generate a saturated and reproducible coverage profile ## calculates Pearson correlation of coverage profile between exclusive sets A and B from a sample sr = MEDIPS.saturation(file = bam_file, BSgenome = BSgenome, uniq = uniq, extend = extend, shift = shift, paired = paired, window_size = ws, chr.select = chr.select, nit = 10, nrit = 1, empty_bins = TRUE, rank = FALSE) print(paste0("Estimated correlation is: ", round(sr$maxEstCor[2], 5))) print(paste0("True correlation is: ", round(sr$maxTruCor[2],5))) pdf(paste0(out_dir, fname, ".MEDIPS.SaturationPlot.pdf"), width = 5, height = 4) MEDIPS.plotSaturation(sr) dev.off() ## sequence coverage analysis ########################## ## outputs #of CpGs covered/not covered in a sample cr = MEDIPS.seqCoverage(file = bam_file, pattern = "CG", BSgenome = BSgenome, uniq = uniq, extend = extend, shift = shift, paired = paired, chr.select = chr.select) print(paste0("Total number of reads: ", cr$numberReads)) print(paste0("Number of reads NOT covering a CpG: ", cr$numberReadsWO)) print(paste0("Fraction of reads NOT covering a CpG: ", round(cr$numberReadsWO / cr$numberReads, 5))) print(paste0("Number of CpGs in reference: ", length(cr$cov.res))) print(paste0("Number of CpG not covered by a read: ", length(cr$cov.res[cr$cov.res < 1]))) print(paste0("Number of CpG covered by 1 read: ", length(cr$cov.res[cr$cov.res == 1]))) print(paste0("Number of CpG covered by 2 reads: ", length(cr$cov.res[cr$cov.res == 2]))) print(paste0("Number of CpG covered by 3 reads: ", length(cr$cov.res[cr$cov.res == 3]))) print(paste0("Number of CpG covered by 4 reads: ", length(cr$cov.res[cr$cov.res == 4]))) print(paste0("Number of CpG covered by 5 reads: ", length(cr$cov.res[cr$cov.res == 5]))) print(paste0("Number of CpG covered by >5 reads: ", length(cr$cov.res[cr$cov.res > 5]))) print(paste0("Fraction of CpG not covered by a read: ", round(length(cr$cov.res[cr$cov.res < 1]) / length(cr$cov.res),5))) print(paste0("Fraction of CpG covered by 1 read: ", round(length(cr$cov.res[cr$cov.res == 1]) / length(cr$cov.res),5))) print(paste0("Fraction of CpG covered by 2 reads: ", round(length(cr$cov.res[cr$cov.res == 2]) / length(cr$cov.res),5))) print(paste0("Fraction of CpG covered by 3 reads: ", round(length(cr$cov.res[cr$cov.res == 3]) / length(cr$cov.res),5))) print(paste0("Fraction of CpG covered by 4 reads: ", round(length(cr$cov.res[cr$cov.res == 4]) / length(cr$cov.res),5))) print(paste0("Fraction of CpG covered by 5 reads: ", round(length(cr$cov.res[cr$cov.res == 5]) / length(cr$cov.res),5))) print(paste0("Fraction of CpG covered by >5 reads: ", round(length(cr$cov.res[cr$cov.res > 5]) / length(cr$cov.res),5))) pdf(paste0(out_dir, fname, ".MEDIPS.seqCovPie.pdf"), width = 5, height = 4) MEDIPS.plotSeqCoverage(seqCoverageObj=cr, type="pie", cov.level = c(0,1,2,3,4,5), main="Sequence pattern coverage, pie chart") dev.off() pdf(paste0(out_dir, fname, ".MEDIPS.seqCovHist.pdf"), width = 5, height = 4) MEDIPS.plotSeqCoverage(seqCoverageObj=cr, type="hist", t = 15, main="Sequence pattern coverage, histogram") dev.off() ## CpG enrichment ##################################### ## test CpG enrichment of given set of short reads covering a set of genomic regions vs reference genome ## regions.relH - relative freq of CpGs within a sample's immunoprecipitated regions ## genome.relH - relative freq of CpGs within reference genome ## enrichment.score.relH - regions.relH/genome.relH ## regions.GoGe - obs/exp ratio of CpGs within a sample's immunoprecipitated regions ## genome.GoGe - obs/exp ratio of CpGs within genomic regions ## enrichment.score.GoGe - regions.GoGe/genome.GoGe ## (relH and GoGe = 2 different ways of calculating enrichment) ## original MEDIPS.CpGenrich has IRanges issue ## this is adapted from script by Nick Cheng MEDIPS.CpGenrichNew <- function(file=NULL, BSgenome=NULL, extend=0, shift=0, uniq=1e-3, chr.select=NULL, paired=F){ ## Proof correctness.... if(is.null(BSgenome)){stop("Must specify a BSgenome library.")} ## Read region file fileName=unlist(strsplit(file, "/"))[length(unlist(strsplit(file, "/")))] path=paste(unlist(strsplit(file, "/"))[1:(length(unlist(strsplit(file, "/"))))-1], collapse="/") if(path==""){path=getwd()} if(!fileName%in%dir(path)){stop(paste("File", fileName, " not found in", path, sep =" "))} dataset = get(ls(paste("package:", BSgenome, sep = ""))[1]) if(!paired){GRange.Reads = getGRange(fileName, path, extend, shift, chr.select, dataset, uniq)} else{GRange.Reads = getPairedGRange(fileName, path, extend, shift, chr.select, dataset, uniq)} ## Sort chromosomes if(length(unique(seqlevels(GRange.Reads)))>1){chromosomes=mixedsort(unique(seqlevels(GRange.Reads)))} if(length(unique(seqlevels(GRange.Reads)))==1){chromosomes=unique(seqlevels(GRange.Reads))} ## Get chromosome lengths for all chromosomes within data set. cat(paste("Loading chromosome lengths for ",BSgenome, "...\n", sep="")) chr_lengths=as.numeric(seqlengths(dataset)[chromosomes]) ranges(GRange.Reads) <- restrict(ranges(GRange.Reads),+1) ##Calculate CpG density for regions total=length(chromosomes) cat("Calculating CpG density for given regions...\n") ## new code ################################## readsChars <- unlist(getSeq(dataset, GRange.Reads, as.character=TRUE)) regions.CG = sum(vcountPattern("CG",readsChars)) regions.C = sum(vcountPattern("C",readsChars)) regions.G = sum(vcountPattern("G",readsChars)) all.genomic= sum(width(readsChars)) nReads <- length(readsChars) ############################################### regions.relH=as.numeric(regions.CG)/as.numeric(all.genomic)*100 regions.GoGe=(as.numeric(regions.CG)*as.numeric(all.genomic))/(as.numeric(regions.C)*as.numeric(regions.G)) cat(paste("Calculating CpG density for the reference genome", BSgenome, "...\n", sep = " ")) CG <- DNAStringSet("CG") pdict0 <- PDict(CG) params <- new("BSParams", X = dataset, FUN = countPDict, simplify = TRUE, exclude = c("rand", "chrUn")) genome.CG=sum(bsapply(params, pdict = pdict0)) params <- new("BSParams", X = dataset, FUN = alphabetFrequency, exclude = c("rand", "chrUn"), simplify=TRUE) alphabet=bsapply(params) genome.l=sum(as.numeric(alphabet)) genome.C=as.numeric(sum(alphabet[2,])) genome.G=as.numeric(sum(alphabet[3,])) genome.relH=genome.CG/genome.l*100 genome.GoGe=(genome.CG*genome.l)/(genome.C*genome.G); ##Calculate CpG density for reference genome enrichment.score.relH=regions.relH/genome.relH enrichment.score.GoGe=regions.GoGe/genome.GoGe gc() return(list(genome=BSgenome, regions.CG=regions.CG, regions.C=regions.C, regions.G=regions.G, regions.relH=regions.relH, regions.GoGe=regions.GoGe, genome.C=genome.C, genome.G=genome.G, genome.CG=genome.CG, genome.relH=genome.relH, genome.GoGe=genome.GoGe, enrichment.score.relH=enrichment.score.relH, enrichment.score.GoGe=enrichment.score.GoGe)) } er = MEDIPS.CpGenrichNew(file = bam_file, BSgenome = BSgenome, uniq = uniq, extend = extend, shift = shift, paired = paired, chr.select = chr.select) ## medips.satr.est_cor and medips.satr.tru_cor involve randomness, will not give identical results on repeat runs ## rest of metrics should be identical on repeat runs message("Writing out MEDIPS QC metrics: saturation, CpG coverage and CpG enrichment.") QC_MEDIPS.df = data.frame(QC_type = rep("medips_QC", 33), metrics = c("ref_genome", "satr.est_cor", "satr.tru_cor", "CpG_cov.totalNumReads", "CpG_cov.numReadsWoCpG", "CpG_cov.fracReadsWoCpG", "CpG_cov.numCpGinRef", "CpG_cov.numCpGwoReads", "CpG_cov.numCpGw1read", "CpG_cov.numCpGw2Reads", "CpG_cov.numCpGw3Reads", "CpG_cov.numCpGw4Reads", "CpG_cov.numCpGw5Reads", "CpG_cov.numCpGgt5Reads", "CpG_cov.fracCpGwoReads", "CpG_cov.fracCpGw1read", "CpG_cov.fracCpGw2Reads", "CpG_cov.fracCpGw3Reads", "CpG_cov.fracCpGw4Reads", "CpG_cov.fracCpGw5Reads", "CpG_cov.fracCpGgt5Reads", "enrich.regions.C", "enrich.regions.G", "enrich.regions.CG", "enrich.genome.C", "enrich.genome.G", "enrich.genome.CG", "enrich.regions.relH", "enrich.genome.relH", "enrich.regions.GoGe", "enrich.genome.GoGe", "enrich.enrichment.score.relH", "enrich.enrichment.score.GoGe"), values = c(er$genome, round(sr$maxEstCor[2], 5), round(sr$maxTruCor[2], 5), cr$numberReads, cr$numberReadsWO, round(cr$numberReadsWO / cr$numberReads, 5), length(cr$cov.res), length(cr$cov.res[cr$cov.res < 1]), length(cr$cov.res[cr$cov.res == 1]), length(cr$cov.res[cr$cov.res == 2]), length(cr$cov.res[cr$cov.res == 3]), length(cr$cov.res[cr$cov.res == 4]), length(cr$cov.res[cr$cov.res == 5]), length(cr$cov.res[cr$cov.res > 5]), round(length(cr$cov.res[cr$cov.res < 1]) / length(cr$cov.res), 5), round(length(cr$cov.res[cr$cov.res == 1]) / length(cr$cov.res), 5), round(length(cr$cov.res[cr$cov.res == 2]) / length(cr$cov.res), 5), round(length(cr$cov.res[cr$cov.res == 3]) / length(cr$cov.res), 5), round(length(cr$cov.res[cr$cov.res == 4]) / length(cr$cov.res), 5), round(length(cr$cov.res[cr$cov.res == 5]) / length(cr$cov.res), 5), round(length(cr$cov.res[cr$cov.res > 5]) / length(cr$cov.res), 5), er$regions.C, er$regions.G, er$regions.CG, er$genome.C, er$genome.G, er$genome.CG, round(er$regions.relH, 5), round(er$genome.relH, 5), round(er$regions.GoGe, 5), round(er$genome.GoGe, 5), round(er$enrichment.score.relH, 5), round(er$enrichment.score.GoGe, 5))) names(QC_MEDIPS.df) = c("QC_type", "metrics", fname) # QC_MEDIPS.df write.table(QC_MEDIPS.df, paste0(out_dir, fname, "_QC_MEDIPS.csv"), row.names=F, quote=F, sep = '\t') |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 | ' row_bind_tables.R Usage: row_bind_tables.R ( -p PATHS | -i INPUT | -G GLOB ) ( --in-csv | --in-tsv | --in-feather | --in-parquet) (--out-csv | --out-tsv | --out-feather | --out-parquet ) -o OUTPUT [ --omit-paths --trim-paths --index-col-name ICN ] Options: -p --paths PATHS Comma separated list of paths to TSV files -i --input INPUT Path to a file containing paths to tables, one per line -G --glob GLOB Globstring matching paths -o --output OUTPUT Path to output --omit-paths Exclude column listing the path to file --trim-paths Includes only the file name instead of full path --index-col-name ICN Name of the index column containing the paths. Defaults to "paths" ' -> doc library(docopt) library(plyr) library(readr) library(doParallel) library(arrow) args <- docopt(doc) print(args) if ( ! is.null(args[['paths']]) ) { paths = strsplit(args[['paths']], ',')[[1]] } else if (! is.null(args[['input']])) { paths = readLines(args[['input']]) } else if ( ! is.null(args[['glob']]) ) { paths = Sys.glob(args[['glob']]) } else { stop('Must provide one of --paths, --input, or --glob.') } paths <- unique(paths) message('Merging files') registerDoParallel() options(readr.show_progress = FALSE) output <- ddply(data.frame(paths), 'paths', function(z) { path = as.character(z$paths) message(paste0('Reading file: ', path)) if (args[['--in-tsv']]) { infile=suppressMessages(read_tsv(path)) } else if (args[['--in-csv']]) { infile=suppressMessages(read_csv(path)) } else if (args[['--in-feather']]) { infile=read_feather(path) } else if (args[['--in-parquet']]) { infile=read_parquet(path) } else { stop('Must provide an input file format') } return(infile) }, .parallel = TRUE) if (args[['--trim-paths']]) { output <- dplyr::mutate(output, paths = sapply(as.character(paths), function(p) { tail(strsplit(p, '/')[[1]], 1) }) ) } if (args[['--omit-paths']]) { output <- dplyr::select(output, -paths) } if (is.null(args[['--index-col-name']])) { icn = 'paths' } else { icn = as.character(args[['--index-col-name']]) } colnames(output)[colnames(output) == 'paths'] <- icn message('Done merging') if (args[['--out-tsv']]) { write_tsv(output, args[['output']]) } else if (args[['--out-csv']]) { write_csv(output, args[['output']]) } else if (args[['--out-feather']]) { write_feather(output, args[['output']]) } else if (args[['--out-parquet']]) { write_feather(output, args[['output']]) } else { stop('Must provide an output file format') } message(paste0('Output written to ', args[['output']])) |
Support
- Future updates
Related Workflows





