Run MedRemix with bedpe input as part of PLBR database workflow

public 1yr ago Version: v0.2.0 0 bookmarks

View Workflow

Help improve this workflow!

This workflow has been published but could be further improved with some additional meta data:

Keyword(s) in categories input, output, operation, topic

You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .

Summary

Run MedRemix methylation profiler with bedpe input as part of PLBR database workflow.
Pipeline is an extension of the original MedRemix pipeline (https://github.com/pughlab/cfMeDIP-seq-analysis-pipeline)
Pipeline can take FASTQs, BAM or BEDPE as input.
For full specifications of BEDPE7+12 file format, please visit https://github.com/pughlab/bam2bedpe
Pipeline is written in Snakemake, designed to run on SLURM cluster, but can run locally as well.
Note:
- .bam filtering is slightly different from original MedRemix pipeline, therefore results will differ slightly
- however this pipeline will output same results whether using .bam or .bedpe as input.

Workflow overview

MedRemixBEDPE

Please see wiki for tutorial on settings up and running pipeline.

Questions please contact ming.han@uhn.ca.

Code Snippets

shell:
    'bash src/bam2bedpe_scripts/bam2bedpe.sh -s {params.slurm_local} -c {params.chunks} -b {input} -o {params.out_dir} -a {params.conda_activate} -e {params.conda_env}'

SnakeMake From line 18 of rules/bam2bedpe4medremix.smk

shell:
    'bash src/bam2bedpe_scripts/get_clean_bedpe4medremix_v4_getopts.sh -i {input} -s {wildcards.sample} -p {wildcards.species} -c {wildcards.chrom} -f {params.frag_len_limit} -o {params.out_dir}'

SnakeMake From line 30 of rules/bam2bedpe4medremix.smk

shell:
    'Rscript src/R/row_bind_tables.R -p "{params.paths}" -o {output} --in-tsv --out-feather --omit-paths'

SnakeMake From line 47 of rules/bin_stat.smk

shell:
    'Rscript src/R/row_bind_tables.R -p "{params.paths}" -o {output} --in-tsv --out-feather --omit-paths'

SnakeMake From line 85 of rules/bin_stat.smk

shell:
    'samtools merge {output} {input} && samtools index {output}'

SnakeMake SAMtools From line 78 of rules/bwa_align_sort_merge.smk

shell:
    'ln -s {input} {output} && samtools index {output}'

SnakeMake SAMtools From line 26 of rules/process_bam.smk

shell:
    'ln -s {input} {output}'

SnakeMake From line 26 of rules/process_bedpe.smk

shell:
    'gunzip -dc {input} > {output}'

SnakeMake From line 31 of rules/process_fastq.smk

shell:
    'trim_galore --cores 4 --dont_gzip --paired {params.trimgalore_settings} --output_dir {params.outdir} {input.R1} {input.R2} && cp ' + path_to_data + '/{wildcards.cohort}/tmp/trim_fastq/{wildcards.sample}_lib{wildcards.lib}_extract_barcode_R*.fastq_trimming_report.txt ' + path_to_data + '/{wildcards.cohort}/qc/trim_galore'

SnakeMake Trim_Galore From line 80 of rules/process_fastq.smk

shell:
    'fastqc --outdir {params.outdir} {input}'

SnakeMake FastQC From line 16 of rules/qc.smk

shell:
    'samtools view {input} | grep -v "F19K16\|F24B22" | cat <(samtools view -H {input} | grep -v "F19K16\|F24B22") - | samtools view -b - > {output} && samtools index {output}'

SnakeMake SAMtools From line 27 of rules/qc.smk

shell:
    'fastqc --outdir {params.outdir} {input}'

SnakeMake FastQC From line 41 of rules/qc.smk

shell:
    'samtools flagstat {input} > {output}'

SnakeMake SAMtools From line 52 of rules/qc.smk

shell:
    'Rscript src/QC/QC_MEDIPS.R --bamFile {input} --outputDir {params.outdir} --windowSize {params.winsize}'

SnakeMake From line 66 of rules/qc.smk

shell:
    'picard CollectGcBiasMetrics -I {input} -O {output.metric} -CHART {output.chart} -S {output.summary} -R {params.fasta} && bash src/QC/parse_picard_QC.sh picard.CollectGcBiasMetrics {output.summary} {output.summary}.parsed'

SnakeMake Picard From line 84 of rules/qc.smk

shell:
    'picard CollectInsertSizeMetrics -INCLUDE_DUPLICATES true -I {input} -O {output.metric} -H {output.histo} -M 0 -W 600 && bash src/QC/parse_picard_QC.sh picard.CollectInsertSizeMetrics {output.metric} {output.metric}.parsed'

SnakeMake Picard From line 96 of rules/qc.smk

shell:
    'picard MarkDuplicates -I {input} -O {output.markdup_bam} -M {output.metric} && bash src/QC/parse_picard_QC.sh picard.MarkDuplicates {output.metric} {output.metric}.parsed'

SnakeMake Picard From line 108 of rules/qc.smk

shell:
    'picard CollectAlignmentSummaryMetrics -I {input} -O {output.metric} -R {params.fasta} && bash src/QC/parse_picard_QC.sh picard.CollectAlignmentSummaryMetrics {output.metric} {output.metric}.parsed'

SnakeMake Picard From line 121 of rules/qc.smk

shell:
    'picard CollectQualityYieldMetrics -I {input} -O {output.metric} && bash src/QC/parse_picard_QC.sh picard.CollectQualityYieldMetrics {output.metric} {output.metric}.parsed'

SnakeMake Picard From line 132 of rules/qc.smk

shell:
    'bash src/QC/methyl_QC.sh {input} {output.methyl_counts} {output.methyl_summary}'

SnakeMake From line 163 of rules/qc.smk

shell:
    'cat {input.medips_qc} {input.F19K16_F24B22_methyl_summary}.parsed {input.picard_gcbias_summary}.parsed {input.picard_insertsize_metric}.parsed {input.picard_markdup_metric}.parsed {input.picard_alignment_metric}.parsed {input.picard_quality_metric}.parsed > {output.full_qc} && rm {input.F19K16_F24B22_methyl_summary}.parsed {input.picard_gcbias_summary}.parsed {input.picard_insertsize_metric}.parsed {input.picard_markdup_metric}.parsed {input.picard_alignment_metric}.parsed {input.picard_quality_metric}.parsed'

SnakeMake From line 190 of rules/qc.smk

shell:
    'touch {output}'

SnakeMake From line 24 of rules/update_conda_env.smk

shell:
    'touch {output}'

SnakeMake From line 31 of rules/update_conda_env.smk

shell:
    'touch {output}'

SnakeMake From line 38 of rules/update_conda_env.smk

shell:
    'touch {output}'

SnakeMake From line 45 of rules/update_conda_env.smk

shell:
    'touch {output}'

SnakeMake From line 52 of rules/update_conda_env.smk

shell:
    'touch {output}'

SnakeMake From line 59 of rules/update_conda_env.smk

usage(){
    echo 
    echo "Usage: bash/sbatch bam2bedpe.sh -s [slurm|local] -c num_of_chunks -b bam_input_path -o output_dir -a CONDA -e ENV -t"
    echo 
}
no_args="true"
KEEP_TMP=false

## Help 
Help()
{
    # Display Help
    echo 
    echo "Processes bam to bedpe in chunks, preserving FLAG, TLEN and CIGAR fields."
    echo
    echo "Usage: bash/sbatch bam2bedpe.sh -s [slurm|local] -c num_of_chunks -b bam_input_path -o output_dir -a CONDA -e ENV -t"
    echo "options:"
    echo "-h   [HELP]      print help"
    echo "-s   [REQUIRED]  type either 'slurm' or 'local', local is with nohup"
    echo "-c   [REQUIRED]  number of chunks to process in parallel"
    echo "-b   [REQUIRED]  path to bam input (full path)"
    echo "-o   [REQUIRED]  output directory (full path)"
    echo "-a   [REQUIRED]  full path to conda activate (e.g. /cluster/home/[username]/bin/miniconda3/bin/activate)"
    echo "-e   [REQUIRED]  conda env with pysam (e.g. pipeline-medremixBEDPE)"
    echo "-t   [OPTIONAL]  keep tmp_dir"
    echo
}

## Get the options
while getopts ":hs:c:b:o:a:e:t" option; do
    case "${option}" in
        h) Help
           exit;;
        s) SLURMLOCAL=${OPTARG};;
        c) NUM_OF_CHUNKS=${OPTARG};;
        b) INPUT_BAM_PATH=${OPTARG};;
        o) OUT_DIR=${OPTARG};;
        a) CONDA_ACTIVATE=${OPTARG};;
        e) CONDA_ENV=${OPTARG};;
        t) KEEP_TMP=true;;
       \?) echo "Error: Invalid option"
           exit;;
    esac
    no_args="false"
done

[[ "$no_args" == "true" ]] && { usage; exit 1; }

echo "Processing step8_bam2bedpe..."
echo "number of chunks: $NUM_OF_CHUNKS"
echo "input bam path:   $INPUT_BAM_PATH"
echo "output path:      $OUT_DIR"
echo "processing on:    $SLURMLOCAL"

# Main program ##############################################

echo "Job started at "$(date) 
time1=$(date +%s)

#PY_SCRIPT_DIR=${SRC_DIR}
## pwd is workflow/ if using Snakemake
PY_SCRIPT_PATH="$(pwd)/src/bam2bedpe_scripts/bam2bedpe_pysam_v5_wCIGAR.py"
INPUT_DIR="${INPUT_BAM_PATH%/*}"
INPUT_BAM="${INPUT_BAM_PATH##*/}"

TMP_DIR="${OUT_DIR}/tmp_dir/${INPUT_BAM%.*}"
mkdir -p $TMP_DIR

OUT_FRAG_NAMES="${INPUT_BAM%.*}.fragNames"
OUT_MERGED_SORTD_BEDPE="${INPUT_BAM%.*}_coordSortd.bedpe"

## -------------------------------------- ##
## get fragNames
samtools view ${INPUT_DIR}/${INPUT_BAM} \
    | cut -f1 \
    | awk '{ a[$1]++ } END { for (b in a) { print b } }' \
    > ${TMP_DIR}/${OUT_FRAG_NAMES}

## -------------------------------------- ##
## split bam into chunks
NUM_ROWS=$(cat ${TMP_DIR}/${OUT_FRAG_NAMES} | wc -l)
CHUNK_SIZE=$(( $NUM_ROWS / $NUM_OF_CHUNKS ))   ## won't have decimal, sometimes will be short, last chunk has to go to last row
echo ".bam has $NUM_ROWS fragments."
echo "Number of chunks was set to ${NUM_OF_CHUNKS}."
echo "Each chunk will be $CHUNK_SIZE rows."

for ((i=0; i<=$(( $NUM_OF_CHUNKS - 2 )); i++)); do
    CHUNK=$(( i + 1 ))
    ROW_START=$(( i*CHUNK_SIZE + 1))
    ROW_END=$(( CHUNK*CHUNK_SIZE ))
    echo "Processing CHUNK${CHUNK}... starting with row ${ROW_START}, ending in row ${ROW_END}."
    sed -n "$ROW_START,$ROW_END p" ${TMP_DIR}/${OUT_FRAG_NAMES} \
        > ${TMP_DIR}/${OUT_FRAG_NAMES}.CHUNK${CHUNK}

done

## since chunk might not be evenly divided, last chunk will just be to last row
ith=$(( $NUM_OF_CHUNKS - 1 ))
ROW_START=$(( ith*CHUNK_SIZE + 1 ))
echo "Processing CHUNK$NUM_OF_CHUNKS... starting with row ${ROW_START}, ending in row ${NUM_ROWS}."
sed -n "$ROW_START,$NUM_ROWS p" ${TMP_DIR}/${OUT_FRAG_NAMES} \
    > ${TMP_DIR}/${OUT_FRAG_NAMES}.CHUNK${NUM_OF_CHUNKS}

## make log dir
mkdir -p ${OUT_DIR}/logs_slurm

## -------------------------------------- ##
## picard FilterSamReads on chunks
for CHUNK in ${TMP_DIR}/${OUT_FRAG_NAMES}.CHUNK*; do
    echo "Picard FilterSamReads for ${CHUNK##*/}..."
    echo "Output is ${INPUT_BAM%.*}_${CHUNK##*.}.bam"

    echo "Writing out ${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.sh"
    ## write out sample sbatch script
    cat <<- EOF > "${TMP_DIR}/${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.sh" 
	#!/bin/bash
	#SBATCH -t 3-00:00:00
	#SBATCH -J bam2bedpe_chunks_${CHUNK##*.}
	#SBATCH -D ${OUT_DIR}/logs_slurm
	#SBATCH --mail-type=ALL
	#SBATCH --mail-user=ming.han@uhn.ca
	#SBATCH -p himem
	#SBATCH -c 1
	#SBATCH --mem=16G
	#SBATCH -o ./%j-%x.out
	#SBATCH -e ./%j-%x.err

	source ${CONDA_ACTIVATE} ${CONDA_ENV}
	#PICARD_DIR=${PICARD_DIR}

	echo "Job started at "\$(date) 
	time1=\$(date +%s)

	picard FilterSamReads \
	    -I ${INPUT_DIR}/${INPUT_BAM} \
	    -O ${TMP_DIR}/"${INPUT_BAM%.*}_${CHUNK##*.}.bam" \
	    -READ_LIST_FILE ${CHUNK} \
	    -FILTER includeReadList \
	    -WRITE_READS_FILES false \
	    -USE_JDK_DEFLATER true \
	    -USE_JDK_INFLATER true \

	python ${PY_SCRIPT_PATH} \
	    --sort_bam_by_qname \
	    --bam_input ${TMP_DIR}/"${INPUT_BAM%.*}_${CHUNK##*.}.bam" \
	    --bedpe_output ${TMP_DIR}/"${INPUT_BAM%.*}_${CHUNK##*.}.bedpe"

	rm ${TMP_DIR}/"${INPUT_BAM%.*}_${CHUNK##*.}.bam"

	time2=\$(date +%s)
	echo "Job ended at "\$(date) 
	echo "Job took \$(((time2-time1)/3600)) hours \$((((time2-time1)%3600)/60)) minutes \$(((time2-time1)%60)) seconds"
	EOF

    if [ $SLURMLOCAL == "slurm" ]; then
        sbatch "${TMP_DIR}/${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.sh"
    elif [ $SLURMLOCAL == "local" ]; then
        nohup bash "${TMP_DIR}/${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.sh" &> "${TMP_DIR}/${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.log" &
    fi

done

## periodically check if chunks have been written to completely
while true; do
    if [ $(find ${TMP_DIR} -maxdepth 1 -mmin +3 -type f -regex ".*_CHUNK[1-9][0-9]*\.bedpe" | wc -l) -eq $NUM_OF_CHUNKS ] 
    then
        echo "All bedpe chunks have been written to, merging bedpe chunks..."
        find ${TMP_DIR} -maxdepth 1 -mmin +3 -type f -regex ".*_CHUNK[1-9][0-9]*\.bedpe" -exec cat {} + \
                | sort -k1,1V -k2,2n -k3,3n -k5,5n -k6,6n \
                | gzip -c \
                > ${OUT_DIR}/${OUT_MERGED_SORTD_BEDPE}.gz
        break
    else
        echo "Sleeping for 5 minutes, then recheck if bedpe chunks were written to completely."
        sleep 5m
    fi
done

## remove TMP_DIR
if "$KEEP_TMP"; then
    echo "Keeping temp dir"
else
    echo "Removed temp dir after completion"
    rm -rf ${TMP_DIR}
fi


time2=$(date +%s)
echo "Job ended at "$(date) 
echo "Job took $(((time2-time1)/3600)) hours $((((time2-time1)%3600)/60)) minutes $(((time2-time1)%60)) seconds"
echo ""

Shell SAMtools Picard pysam From line 15 of bam2bedpe_scripts/bam2bedpe.sh

usage(){
    echo 
    echo "Usage: bash get_clean_bedpe4medremix.sh -i INPUT_PATH -s SAMPLE_NAME -p SPECIES -c CHR -f FRAGMENT_LENGTH_LIMIT -o OUT_DIR" 
    echo 
}
no_args="true"

## Help 
Help()
{ 
    # Display Help
    echo 
    echo "Get clean bedpe for input into MedRemixBEDPE."
    echo
    echo "Usage: bash get_clean_bedpe4medremix.sh -i INPUT_PATH -s SAMPLE_NAME -p SPECIES -c CHR -f FRAGMENT_LENGTH_LIMIT -o OUT_DIR"
    echo "options:"
    echo "-h   [HELP]      print help"
    echo "-i   [REQUIRED]  input file path for bedpe.gz (full path)"
    echo "-s   [REQUIRED]  sample name"
    echo "-p   [REQUIRED]  species (e.g. human, arabidopsis_F19K16, or arabidopsis_F24B22)"
    echo "-c   [REQUIRED]  chromosome to process (e.g. chr2)"
    echo "-f   [REQUIRED]  fragment length limit (e.g. 500)"
    echo "-o   [REQUIRED]  bedpe_bin_stats output directory (full path)"
    echo
}

## Get the options
while getopts ":hi:s:p:c:f:o:" option; do
    case "${option}" in
        h) Help
           exit;;
        i) INPUT_PATH=${OPTARG};;
        s) SAMPLE_NAME=${OPTARG};;
        p) SPECIES=${OPTARG};;
        c) CHR=${OPTARG};;
        f) FRAGMENT_LENGTH_LIMIT=${OPTARG};;
        o) OUT_DIR=${OPTARG};;
       \?) echo "Error: Invalid option"
           exit;;
    esac
    no_args="false"
done

[[ "$no_args" == "true" ]] && { usage; exit 1; }


# Main program ##############################################

echo "Processing get_clean_bedpe4medremix... " 
echo "Job started at "$(date) 
time1=$(date +%s)

INPUT_DIR=${INPUT_PATH%/*}
INPUT_BEDPE_GZ=${INPUT_PATH##*/}
BEDPE_CHR="${SAMPLE_NAME}_${SPECIES}_${CHR}.bedpe"
BEDPE_NOAMBI_NO113n177_mapQge20_isMate="${BEDPE_CHR%.*}.noAmbiguous.no113n177.mapQge20.isMate.bedpe"
BEDPE_lean="${BEDPE_NOAMBI_NO113n177_mapQge20_isMate%.*}.lean"
BEDPE_lean_withEnd="${BEDPE_lean}.withEnd"
BEDPE_lenFiltd="${BEDPE_lean_withEnd}.lenFiltd.bedpe4medremix"

## get specific chr whether as mate1 or mate2
zcat ${INPUT_DIR}/${INPUT_BEDPE_GZ} \
    | awk -v CHR=$CHR '($1 == CHR) || ($4 == CHR) {print}' \
    > ${OUT_DIR}/${BEDPE_CHR}

## 4 steps filtering:
## remove 'ambiguous reads' as defined by Rsamtools
  ## i.e. remove any .bedpe alignment if flag_mate1or2==SUPP && chr_mate1==chr_mate2
  ## ( chr_mate1!=chr_mate2 supp reads dealt with separately )
## remove 'FLAG 113,177' tandem reads
## remove fragments if either mate's MAPQ < 20
## remove unmated alignments 
  ## ( as defined by Rsamtools: read1-read2; both secondary, or none; no wrongly oriented alignments )
cat ${OUT_DIR}/${BEDPE_CHR} \
    | awk '((and($14,0x800) || and($15,0x800)) && $1==$4) {next} {print}' \
    | awk '(($14 == 113 && $15 == 177) || ($14 == 177 && $15 == 113)) {next} {print}' \
    | awk '($8>=20 && $9>=20) {print}' \
    | awk '(and($14,0x40) && and($15,0x80)) ||
           (and($14,0x80) && and($15,0x40)) {print}' \
    | awk '(!and($14,0x100) && !and($15, 0x100)) ||
           (and($14,0x100) && and($15, 0x100)) {print}' \
    | awk '(and($14,0x10) && and($15,0x20)) ||
           (and($14,0x20) && and($15,0x10)) {print}' \
    > ${OUT_DIR}/${BEDPE_NOAMBI_NO113n177_mapQge20_isMate}

## get lean bedpe for medremix and paste with qwidth for mate1 and mate2
## { qwidth calculated ignoring DHNPXB in CIGAR, as defined by Rsamtools }
paste \
<(cat ${OUT_DIR}/${BEDPE_NOAMBI_NO113n177_mapQge20_isMate} \
    | awk 'BEGIN{OFS="\t"}{print $1, $7, $2+1, $5+1, ($8+$9)/2, $10, $11}') \
<(cat ${OUT_DIR}/${BEDPE_NOAMBI_NO113n177_mapQge20_isMate} \
    | cut -f12 \
 	| awk '{gsub(/[1-9][0-9]*[D|H|N|P|X|B]/,"",$1); print $1}' \
    | awk 'BEGIN{OFS=""}{gsub(/[A-Z]/,"+",$1); print $1,0}' \
    | bc) \
<(cat ${OUT_DIR}/${BEDPE_NOAMBI_NO113n177_mapQge20_isMate} \
    | cut -f13 \
    | awk '{gsub(/[1-9][0-9]*[D|H|N|P|X|B]/,"",$1); print $1}' \
    | awk 'BEGIN{OFS=""}{gsub(/[A-Z]/,"+",$1); print $1,0}' \
    | bc) \
> ${OUT_DIR}/${BEDPE_lean}

## use qwidth to calculate 'end'
cat ${OUT_DIR}/${BEDPE_lean} \
    | awk 'BEGIN{OFS="\t"} ($6=="+") {print $1,$2, $3,$4, $5, $6,$7, $8,$9}
                           ($6=="-") {print $1,$2, $4,$3, $5, $7,$6, $9,$8}' \
    | awk 'BEGIN{OFS="\t"} {print $1,$2, $6":"$3"-"$3+$8","$7":"$4"-"$4+$9,
                                  $3, $4+$9, $4+$9-$3, $5}' \
    > ${OUT_DIR}/${BEDPE_lean_withEnd}

## filter by FRAGMENT_LENGTH_LIMIT
cat ${OUT_DIR}/${BEDPE_lean_withEnd} \
    | awk -v FRAGMENT_LENGTH_LIMIT=$FRAGMENT_LENGTH_LIMIT 'BEGIN{OFS="\t"} {if(($6 > 0) && ($6 < FRAGMENT_LENGTH_LIMIT)) {print}}' \
    > ${OUT_DIR}/${BEDPE_lenFiltd}

## remove intermediate files
rm ${OUT_DIR}/${BEDPE_CHR}
rm ${OUT_DIR}/${BEDPE_NOAMBI_NO113n177_mapQge20_isMate}
rm ${OUT_DIR}/${BEDPE_lean}
rm ${OUT_DIR}/${BEDPE_lean_withEnd}



time2=$(date +%s)
echo "Job ended at "$(date) 
echo "Job took $(((time2-time1)/3600)) hours $((((time2-time1)%3600)/60)) minutes $(((time2-time1)%60)) seconds"
echo ""

Shell From line 16 of bam2bedpe_scripts/get_clean_bedpe4medremix_v4_getopts.sh

SEQMETH="F19K16"
SEQUMETH="F24B22"

BAM_PATH=$1
METH_COUNT=$2
METH_QC=$3

samtools view ${BAM_PATH} | \
    cut -f 3 | sort | uniq -c | sort -nr | \
    sed -e 's/^ *//;s/ /\t/' | \
    awk 'OFS="\t" {print $2,$1}' | \
    sort -n -k1,1 \
    > ${METH_COUNT}

total=$(samtools view -c ${BAM_PATH})
unmap=$(cat ${METH_COUNT} | grep '^\*' | cut -f2); if [[ -z $unmap ]]; then unmap="0"; fi
methyl=$(cat ${METH_COUNT} | grep ${SEQMETH} | cut -f2); if [[ -z $methyl ]]; then methyl="0"; fi
unmeth=$(cat ${METH_COUNT} | grep ${SEQUMETH} | cut -f2); if [[ -z $unmeth ]]; then unmeth="0"; fi
pct_meth_ctrl=$(echo "scale=5; ($methyl + $unmeth)/$total * 100" | bc -l); if [[ -z $pct_meth_ctrl ]]; then pct_meth_ctrl="0"; fi
pct_bac_meth=$(echo "scale=5; $methyl/($methyl + $unmeth) * 100" | bc -l); if [[ -z $pct_bac_meth ]]; then pct_bac_meth="0"; fi
pct_bac_unmeth=$(echo "scale=5; $unmeth/($methyl + $unmeth) * 100" | bc -l); if [[ -z $pct_bac_unmeth ]]; then pct_bac_unmeth="0"; fi
methyl_specificity=$(echo "scale=5; 100 - $pct_bac_unmeth/$pct_bac_meth" | bc -l); if [[ -z $methyl_specificity ]]; then methyl_specificity="0"; fi
bet_meth_ctrl=$(echo "scale=5; $methyl/($methyl + $unmeth)" | bc -l); if [[ -z $bet_meth_ctrl ]]; then bet_meth_ctrl="0"; fi

echo -e "Total_reads\tUnmapped_reads\tNum_reads_align_methyl_F19K16\tNum_reads_align_unmethyl_F24B22\tPctg_reads_aligned_to_BACs\tPctg_BACs_is_methyl_F19K16\tPctg_BACs_is_unmethyl_F24B22\tMethylation_specificity\tMethylation_beta" > ${METH_QC}
echo -e "$total\t$unmap\t$methyl\t$unmeth\t$pct_meth_ctrl\t$pct_bac_meth\t$pct_bac_unmeth\t$methyl_specificity\t$bet_meth_ctrl" >> ${METH_QC}

cat ${METH_QC} \
    | awk -F "\t" '{for (i=1; i<=NF; i++) a[i,NR]=$i
                    max=(max<NF?NF:max)}
                    END {for (i=1; i<=max; i++)
                            {for (j=1; j<=NR; j++) 
                             printf "%s%s", a[i,j], (j==NR?RS:FS)
                            }
                        }' \
    | awk 'BEGIN {OFS="\t"} $1="F19K16_F24B22_methyl_qc\t"$1' \
    > ${METH_QC}.parsed

Shell SAMtools From line 3 of QC/methyl_QC.sh

METRIC=$1

cat $2 \
    | grep -A10 "## METRICS CLASS" \
    | grep -v "## METRICS CLASS" \
    | sed '/## HISTOGRAM/,+10d' \
    | awk -F "\t" '{for (i=1; i<=NF; i++) a[i,NR]=$i
                    max=(max<NF?NF:max)}
                    END {for (i=1; i<=max; i++)
                            {for (j=1; j<=NR; j++) 
                             printf "%s%s", a[i,j], (j==NR?RS:FS)
                            }
                        }' \
    | grep -v "^SAMPLE\|^LIBRARY\|^READ_GROUP" \
    | grep -v "ACCUMULATION_LEVEL" \
    | awk -v METRIC=$METRIC 'BEGIN {OFS="\t"} $1=METRIC"\t"$1' \
    > $3

Shell From line 3 of QC/parse_picard_QC.sh

library(docopt)
## adapted from https://github.com/oicr-gsi/wf_cfmedip/blob/master/workflow/runMedips/runMedips.r

doc <- "Get MEDIPS QC metrics.
Usage:
    QC_MEDIPS.R --bamFile <FILE> --outputDir <DIR> --windowSize <SIZE> [ --genome <GENOME> ]

Options:
    --bamFile FILE       Aligned, sorted, filtered reads (bam)
    --outputDir DIR      Path to output folder
    --windowSize SIZE    Size of genomic windows (bp, e.g. 300)
    --genome GENOME      Path to a folder containing a custom BSgenome as a package, 
                             which will be loaded using devtools::load_all();
                             or the name of BSgenome (usually BSgenome.Hsapiens.UCSC.hg38
                             or BSgenome.Athaliana.TAIR.TAIR9)
    --help               show this help text
"
opt <- docopt(doc)

library(tidyverse)
library(gtools)
library(MEDIPS)
library(BSgenome.Hsapiens.UCSC.hg38)

#if (file.exists(paste(opt[['genome']], 'DESCRIPTION', sep='/'))) {
#    devtools::load_all(opt[['genome']])
#    bsgenome <- getBSgenome(basename(opt[['genome']]))
#} else {
#    bsgenome <- getBSgenome(opt[['genome']])
#}

if (!file.exists(opt$bamFile)){
  stop(paste0("ERROR: bam file not found ", opt$bamFile), call.=FALSE)
}

## get user parameters
bam_file = opt$bamFile
ws = as.numeric(opt$windowSize)
out_dir = paste0(opt$outputDir, "/")
chr.select=paste0("chr", c(1:22,"X","Y"))
#chr.select=c("chr1","chr22")

BSgenome="BSgenome.Hsapiens.UCSC.hg38"
uniq = 0 ## WARNING: default settings normalize the data, must be set to 0 to disable this transformation
extend = 0 ## relevant for single-end: https://support.bioconductor.org/p/81098/
shift = 0
paired = TRUE

# disables the scientific notation to avoid powers in genomic coordinates (i.e. 1e+10)
options(scipen = 999)

## create MEDIPS set ###################################
message("Processing MEDIPS QC metrics: window size: ", ws)
MeDIPset =
  MEDIPS.createSet(file = bam_file,
                   BSgenome = BSgenome,
                   uniq = uniq,
                   extend = extend,
                   shift = shift,
                   paired = paired,
                   window_size = ws,
                   chr.select = chr.select)

fname <- unlist(strsplit(basename(bam_file),split="\\."))[1]
# fname

## coupling set: maps CG densities across the genome
CS = MEDIPS.couplingVector(pattern="CG", refObj=MeDIPset)

## saturation analysis #################################
## whether a given set of mapped reads is sufficient to generate a saturated and reproducible coverage profile
## calculates Pearson correlation of coverage profile between exclusive sets A and B from a sample

sr =
  MEDIPS.saturation(file = bam_file,
                    BSgenome = BSgenome,
                    uniq = uniq,
                    extend = extend,
                    shift = shift,
                    paired = paired,
                    window_size = ws,
                    chr.select = chr.select,
                    nit = 10, nrit = 1, empty_bins = TRUE, rank = FALSE)
print(paste0("Estimated correlation is: ", round(sr$maxEstCor[2], 5)))
print(paste0("True correlation is: ", round(sr$maxTruCor[2],5)))

pdf(paste0(out_dir, fname, ".MEDIPS.SaturationPlot.pdf"), width = 5, height = 4)
MEDIPS.plotSaturation(sr)
dev.off()

## sequence coverage analysis ##########################
## outputs #of CpGs covered/not covered in a sample

cr =
  MEDIPS.seqCoverage(file = bam_file,
                     pattern = "CG",
                     BSgenome = BSgenome,
                     uniq = uniq,
                     extend = extend,
                     shift = shift,
                     paired = paired,
                     chr.select = chr.select)
print(paste0("Total number of reads: ", cr$numberReads))
print(paste0("Number of reads NOT covering a CpG: ", cr$numberReadsWO))
print(paste0("Fraction of reads NOT covering a CpG: ", round(cr$numberReadsWO / cr$numberReads, 5)))

print(paste0("Number of CpGs in reference: ", length(cr$cov.res)))
print(paste0("Number of CpG not covered by a read: ", length(cr$cov.res[cr$cov.res < 1])))
print(paste0("Number of CpG covered by 1 read: ", length(cr$cov.res[cr$cov.res == 1])))
print(paste0("Number of CpG covered by 2 reads: ", length(cr$cov.res[cr$cov.res == 2])))
print(paste0("Number of CpG covered by 3 reads: ", length(cr$cov.res[cr$cov.res == 3])))
print(paste0("Number of CpG covered by 4 reads: ", length(cr$cov.res[cr$cov.res == 4])))
print(paste0("Number of CpG covered by 5 reads: ", length(cr$cov.res[cr$cov.res == 5])))
print(paste0("Number of CpG covered by >5 reads: ", length(cr$cov.res[cr$cov.res > 5])))

print(paste0("Fraction of CpG not covered by a read: ", round(length(cr$cov.res[cr$cov.res < 1]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by 1 read: ", round(length(cr$cov.res[cr$cov.res == 1]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by 2 reads: ", round(length(cr$cov.res[cr$cov.res == 2]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by 3 reads: ", round(length(cr$cov.res[cr$cov.res == 3]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by 4 reads: ", round(length(cr$cov.res[cr$cov.res == 4]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by 5 reads: ", round(length(cr$cov.res[cr$cov.res == 5]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by >5 reads: ", round(length(cr$cov.res[cr$cov.res > 5]) / length(cr$cov.res),5)))


pdf(paste0(out_dir, fname, ".MEDIPS.seqCovPie.pdf"), width = 5, height = 4)
MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
                       type="pie",
                       cov.level = c(0,1,2,3,4,5),
                       main="Sequence pattern coverage, pie chart")
dev.off()

pdf(paste0(out_dir, fname, ".MEDIPS.seqCovHist.pdf"), width = 5, height = 4)
MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
                       type="hist",
                       t = 15,
                       main="Sequence pattern coverage, histogram")
dev.off()

## CpG enrichment #####################################
## test CpG enrichment of given set of short reads covering a set of genomic regions vs reference genome
## regions.relH - relative freq of CpGs within a sample's immunoprecipitated regions
## genome.relH - relative freq of CpGs within reference genome
## enrichment.score.relH - regions.relH/genome.relH
## regions.GoGe - obs/exp ratio of CpGs within a sample's immunoprecipitated regions
## genome.GoGe - obs/exp ratio of CpGs within genomic regions
## enrichment.score.GoGe - regions.GoGe/genome.GoGe
## (relH and GoGe = 2 different ways of calculating enrichment)


## original MEDIPS.CpGenrich has IRanges issue
## this is adapted from script by Nick Cheng
MEDIPS.CpGenrichNew <-
  function(file=NULL, BSgenome=NULL, extend=0, shift=0, uniq=1e-3, chr.select=NULL, paired=F){

    ## Proof correctness....
    if(is.null(BSgenome)){stop("Must specify a BSgenome library.")}

    ## Read region file
    fileName=unlist(strsplit(file, "/"))[length(unlist(strsplit(file, "/")))]
    path=paste(unlist(strsplit(file, "/"))[1:(length(unlist(strsplit(file, "/"))))-1], collapse="/")
    if(path==""){path=getwd()}
    if(!fileName%in%dir(path)){stop(paste("File", fileName, " not found in", path, sep =" "))}

    dataset = get(ls(paste("package:", BSgenome, sep = ""))[1])

    if(!paired){GRange.Reads = getGRange(fileName, path, extend, shift, chr.select, dataset, uniq)}
    else{GRange.Reads = getPairedGRange(fileName, path, extend, shift, chr.select, dataset, uniq)}

    ## Sort chromosomes
    if(length(unique(seqlevels(GRange.Reads)))>1){chromosomes=mixedsort(unique(seqlevels(GRange.Reads)))}
    if(length(unique(seqlevels(GRange.Reads)))==1){chromosomes=unique(seqlevels(GRange.Reads))}

    ## Get chromosome lengths for all chromosomes within data set.
    cat(paste("Loading chromosome lengths for ",BSgenome, "...\n", sep=""))

    chr_lengths=as.numeric(seqlengths(dataset)[chromosomes])

    ranges(GRange.Reads) <- restrict(ranges(GRange.Reads),+1)

    ##Calculate CpG density for regions
    total=length(chromosomes)
    cat("Calculating CpG density for given regions...\n")

    ## new code ##################################
    readsChars <- unlist(getSeq(dataset, GRange.Reads, as.character=TRUE))

    regions.CG = sum(vcountPattern("CG",readsChars))
    regions.C  = sum(vcountPattern("C",readsChars))
    regions.G  = sum(vcountPattern("G",readsChars))
    all.genomic= sum(width(readsChars))

    nReads <- length(readsChars)
    ###############################################

    regions.relH=as.numeric(regions.CG)/as.numeric(all.genomic)*100
    regions.GoGe=(as.numeric(regions.CG)*as.numeric(all.genomic))/(as.numeric(regions.C)*as.numeric(regions.G))

    cat(paste("Calculating CpG density for the reference genome",
              BSgenome, "...\n", sep = " "))

    CG <- DNAStringSet("CG")
    pdict0 <- PDict(CG)
    params <- new("BSParams", X = dataset, FUN = countPDict, simplify = TRUE, exclude = c("rand", "chrUn"))
    genome.CG=sum(bsapply(params, pdict = pdict0))
    params <- new("BSParams", X = dataset, FUN = alphabetFrequency, exclude = c("rand", "chrUn"), simplify=TRUE)
    alphabet=bsapply(params)
    genome.l=sum(as.numeric(alphabet))
    genome.C=as.numeric(sum(alphabet[2,]))
    genome.G=as.numeric(sum(alphabet[3,]))
    genome.relH=genome.CG/genome.l*100
    genome.GoGe=(genome.CG*genome.l)/(genome.C*genome.G);

    ##Calculate CpG density for reference genome

    enrichment.score.relH=regions.relH/genome.relH
    enrichment.score.GoGe=regions.GoGe/genome.GoGe

    gc()
    return(list(genome=BSgenome,
                regions.CG=regions.CG,
                regions.C=regions.C,
                regions.G=regions.G,
                regions.relH=regions.relH,
                regions.GoGe=regions.GoGe,
                genome.C=genome.C,
                genome.G=genome.G,
                genome.CG=genome.CG,
                genome.relH=genome.relH,
                genome.GoGe=genome.GoGe,
                enrichment.score.relH=enrichment.score.relH,
                enrichment.score.GoGe=enrichment.score.GoGe))
  }

er =
  MEDIPS.CpGenrichNew(file = bam_file,
                   BSgenome = BSgenome,
                   uniq = uniq,
                   extend = extend,
                   shift = shift,
                   paired = paired,
                   chr.select = chr.select)

## medips.satr.est_cor and medips.satr.tru_cor involve randomness, will not give identical results on repeat runs
## rest of metrics should be identical on repeat runs

message("Writing out MEDIPS QC metrics: saturation, CpG coverage and CpG enrichment.")
QC_MEDIPS.df =
  data.frame(QC_type = rep("medips_QC", 33),
             metrics = c("ref_genome",
                         "satr.est_cor",
                         "satr.tru_cor",
                         "CpG_cov.totalNumReads",
                         "CpG_cov.numReadsWoCpG",
                         "CpG_cov.fracReadsWoCpG",
                         "CpG_cov.numCpGinRef",
                         "CpG_cov.numCpGwoReads",
                         "CpG_cov.numCpGw1read",
                         "CpG_cov.numCpGw2Reads",
                         "CpG_cov.numCpGw3Reads",
                         "CpG_cov.numCpGw4Reads",
                         "CpG_cov.numCpGw5Reads",
                         "CpG_cov.numCpGgt5Reads",
                         "CpG_cov.fracCpGwoReads",
                         "CpG_cov.fracCpGw1read",
                         "CpG_cov.fracCpGw2Reads",
                         "CpG_cov.fracCpGw3Reads",
                         "CpG_cov.fracCpGw4Reads",
                         "CpG_cov.fracCpGw5Reads",
                         "CpG_cov.fracCpGgt5Reads",
                         "enrich.regions.C",
                         "enrich.regions.G",
                         "enrich.regions.CG",
                         "enrich.genome.C",
                         "enrich.genome.G",
                         "enrich.genome.CG",
                         "enrich.regions.relH",
                         "enrich.genome.relH",
                         "enrich.regions.GoGe",
                         "enrich.genome.GoGe",
                         "enrich.enrichment.score.relH",
                         "enrich.enrichment.score.GoGe"),
             values = c(er$genome,
                        round(sr$maxEstCor[2], 5),
                        round(sr$maxTruCor[2], 5),
                        cr$numberReads,
                        cr$numberReadsWO,
                        round(cr$numberReadsWO / cr$numberReads, 5),
                        length(cr$cov.res),
                        length(cr$cov.res[cr$cov.res < 1]),
                        length(cr$cov.res[cr$cov.res == 1]),
                        length(cr$cov.res[cr$cov.res == 2]),
                        length(cr$cov.res[cr$cov.res == 3]),
                        length(cr$cov.res[cr$cov.res == 4]),
                        length(cr$cov.res[cr$cov.res == 5]),
                        length(cr$cov.res[cr$cov.res > 5]),
                        round(length(cr$cov.res[cr$cov.res < 1]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res == 1]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res == 2]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res == 3]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res == 4]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res == 5]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res > 5]) / length(cr$cov.res), 5),
                        er$regions.C,
                        er$regions.G,
                        er$regions.CG,
                        er$genome.C,
                        er$genome.G,
                        er$genome.CG,
                        round(er$regions.relH, 5),
                        round(er$genome.relH, 5),
                        round(er$regions.GoGe, 5),
                        round(er$genome.GoGe, 5),
                        round(er$enrichment.score.relH, 5),
                        round(er$enrichment.score.GoGe, 5)))
names(QC_MEDIPS.df) = c("QC_type", "metrics", fname)
# QC_MEDIPS.df

write.table(QC_MEDIPS.df, paste0(out_dir, fname, "_QC_MEDIPS.csv"), row.names=F, quote=F, sep = '\t')

R tidyverse BSgenome.Hsapiens.UCSC.hg38 docopt GTools MEDIPS From line 1 of QC/QC_MEDIPS.R

' row_bind_tables.R

Usage: row_bind_tables.R ( -p PATHS | -i INPUT | -G GLOB ) ( --in-csv | --in-tsv | --in-feather | --in-parquet) (--out-csv | --out-tsv | --out-feather | --out-parquet ) -o OUTPUT [ --omit-paths --trim-paths --index-col-name ICN ]

Options:
    -p --paths PATHS            Comma separated list of paths to TSV files
    -i --input INPUT            Path to a file containing paths to tables, one per line
    -G --glob GLOB              Globstring matching paths

    -o --output OUTPUT          Path to output

    --omit-paths                Exclude column listing the path to file
    --trim-paths                Includes only the file name instead of full path
    --index-col-name ICN        Name of the index column containing the paths. Defaults to "paths"
' -> doc

library(docopt)
library(plyr)
library(readr)
library(doParallel)
library(arrow)
args <- docopt(doc)
print(args)

if ( ! is.null(args[['paths']]) ) {
    paths = strsplit(args[['paths']], ',')[[1]]
} else if (! is.null(args[['input']])) {
    paths = readLines(args[['input']])
} else if ( ! is.null(args[['glob']]) ) {
    paths = Sys.glob(args[['glob']])
} else {
    stop('Must provide one of --paths, --input, or --glob.')
}

paths <- unique(paths)

message('Merging files')

registerDoParallel()

options(readr.show_progress = FALSE)

output <- ddply(data.frame(paths), 'paths', function(z) {
      path = as.character(z$paths)
      message(paste0('Reading file: ', path))

      if (args[['--in-tsv']]) {
          infile=suppressMessages(read_tsv(path))
      } else if (args[['--in-csv']]) {
          infile=suppressMessages(read_csv(path))
      } else if (args[['--in-feather']]) {
          infile=read_feather(path)
      } else if (args[['--in-parquet']]) {
          infile=read_parquet(path)
      } else {
          stop('Must provide an input file format')
      }
      return(infile)
}, .parallel = TRUE)

if (args[['--trim-paths']]) {
    output <- dplyr::mutate(output,
        paths = sapply(as.character(paths), function(p) { tail(strsplit(p, '/')[[1]], 1) })
    )
}

if (args[['--omit-paths']]) {
    output <- dplyr::select(output, -paths)
}

if (is.null(args[['--index-col-name']])) {
    icn = 'paths'
} else {
    icn = as.character(args[['--index-col-name']])
}
colnames(output)[colnames(output) == 'paths'] <- icn

message('Done merging')

if (args[['--out-tsv']]) {
    write_tsv(output, args[['output']])
} else if (args[['--out-csv']]) {
    write_csv(output, args[['output']])
} else if (args[['--out-feather']]) {
    write_feather(output, args[['output']])
} else if (args[['--out-parquet']]) {
    write_feather(output, args[['output']])
} else {
    stop('Must provide an output file format')
}

message(paste0('Output written to ', args[['output']]))