Bioinformatics pipeline for processing 16s marker gene metagenomic sequence data.
Help improve this workflow!
This workflow has been published but could be further improved with some additional meta data:- Keyword(s) in categories input, output, operation, topic
You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .
Bioinformatics pipeline for processing 16S rRNA amplicon sequence data.
Pipeline
Custom parameters stored in
config.yaml
. See below for a description of the parameters.
Installation
To use: navigate to project directory. Clone this repository using the following:
git clone https://github.com/alanaschick/magma.git magma
Note: you need to have conda and snakemake installed locally in order to run this. See instructions below.
Executing
Step 1. Enter location of sequences and list of file names in
config.yaml
file.
Step 2. To execute:
Test the pipeline:
snakemake -np
Run the pipeline:
snakemake --use-conda
Remove Adapters
- This pipeline uses the cutadapt software to remove adapters from the raw sequence reads.
Filtering
-
Remove low quality reads using dada2::filterAndTrim function.
-
Check quality post filtering with fastqc and multiqc.
Dada2 Workflow
-
Generate error model and denoise reads.
-
De-replicate and infer sequence variants.
-
Remove bimeras, assign taxonomy.
-
Track reads throughout processing, print results to table.
Conda and Snakemake
To install conda, see the instructions here .
To install snakemake using conda, run the following line:
conda install -c bioconda -c conda-forge snakemake
See the snakemake installation instructions for further details.
Code Snippets
30 31 32 33 34 | shell: "mkdir -p output/temp/cutadapt/logs; cutadapt -e 0 -O 10 -m 50 -n 2 -g {config[fwd_primer]} " "-G {config[rev_primer]} -a {config[rev_primer_rc]} " "-A {config[fwd_primer_rc]} -o {output.r1} -p {output.r2} " "{input.r1} {input.r2} > output/temp/cutadapt/logs/{params.pre}.cutadapt.log" |
46 | script: "utils/scripts/filter.R" |
56 | shell: "fastqc -o output/temp/qc/fastqc {input.r1} {input.r2}" |
64 | shell: "multiqc -f output/temp -o output -n multiqc_report.html" |
76 | script: "utils/scripts/run_dada2.R" |
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 | library(dada2) library(tidyverse) library(colorRamps) ## Set variables list_of_filenames <- snakemake@input$listfiles ## Set filtering parameters from config file trimleft <- c(snakemake@config$trimleft_forward, snakemake@config$trimleft_reverse) expected_errors <- c(snakemake@config$expected_errors_forward, snakemake@config$expected_errors_reverse) truncate <- c(snakemake@config$truncate_forward, snakemake@config$truncate_reverse) readlength <- snakemake@config$readlength ## Cutadapt setting trimmed <- snakemake@config$run_cutadapt path_to_raw <- snakemake@config$path ## Exploring parameter space options trunc_param <- snakemake@config$explore_truncation_params ee_param <- snakemake@config$explore_quality_params ## Make directory for quality plots dir.create("output/quality_plots/") ######################################################### ## Make a vector of sample names samples <- scan(list_of_filenames, what = "character") ## file names of forward and reverse reads, before quality filtering if (trimmed == T){ forward_reads <- file.path("output/temp/cutadapt", paste0(samples, "_r1_cutadapt.fastq.gz")) reverse_reads <- file.path("output/temp/cutadapt", paste0(samples, "_r2_cutadapt.fastq.gz")) } if (trimmed == F){ forward_reads <- file.path(path_to_raw, paste0(samples, "_R1_001.fastq.gz")) reverse_reads <- file.path(path_to_raw, paste0(samples, "_R2_001.fastq.gz")) } ## file names of forward and reverse reads, after quality filtering filtered_forward_reads <- file.path("output/temp/filtered", paste0(samples, "_r1_filtered.fastq.gz")) filtered_reverse_reads <- file.path("output/temp/filtered", paste0(samples, "_r2_filtered.fastq.gz")) ######################################################### ######## Step 0: Exploring filtering parameters if (trunc_param == T){ ## Explore truncation parameters results <- NULL ## Select a set of samples at random to inspect test <- sample(c(1:length(samples)), 12) for (i in seq(from = readlength-45, to = readlength, by = 5)){ for (j in seq(from = readlength-90, to = readlength, by = 10)){ truncparam <- c() out <- filterAndTrim(forward_reads[test], filtered_forward_reads[test], reverse_reads[test], filtered_reverse_reads[test], truncLen=c(i,j), maxEE=expected_errors, rm.phix=TRUE, compress=TRUE, multithread=TRUE, trimLeft = trimleft) res <- data.frame(Sample = rownames(out), perc = out[,2]/out[,1], for_trunc = i, rev_trunc = j) results <- rbind(results, res) } } results <- results %>% separate(Sample, c("Name", "Sample"), sep = "_S") gg <- ggplot(results, aes(x = for_trunc, y = perc, colour = as.factor(rev_trunc))) + geom_point(size = 2) + geom_line(size = 1) + scale_colour_manual(values = sample(primary.colors(20), 10), name = "Truncate Reverse") + xlab("Truncate Forward") + ylab("Percentage reads passed filtering") + ggtitle("Truncation parameters") + theme_minimal() + facet_wrap(~Name) pdf(file.path("output", "truncation_parameters.pdf")) print(gg) dev.off() } if (ee_param == T){ ## Explore expected error values results <- NULL for (i in 1:5){ for (j in 1:5){ out <- filterAndTrim(forward_reads[test], filtered_forward_reads[test], reverse_reads[test], filtered_reverse_reads[test], truncLen=truncate, maxEE=c(i,j), rm.phix=TRUE, compress=TRUE, multithread=TRUE, trimLeft = trimleft) res <- data.frame(Sample = rownames(out), perc = out[,2]/out[,1], for_error = i, rev_error = j) results <- rbind(results, res) } } results <- results %>% separate(Sample, c("Name", "Sample"), sep = "_S") gg <- ggplot(results, aes(x = for_error, y = perc, colour = as.factor(rev_error))) + geom_point(size = 2) + geom_line(size = 1) + scale_colour_manual(values = rainbow(5, v = 0.8), name = "Error Rate Reverse") + xlab("Error Rate Forward") + ylab("Percentage reads passed filtering") + ggtitle("Expected error parameters") + theme_minimal() + facet_wrap(~Name) pdf(file.path("output", "expected_error_parameters.pdf")) print(gg) dev.off() } ######################################################### ####### Step 1: Quality filtering out <- filterAndTrim(forward_reads, filtered_forward_reads, reverse_reads, filtered_reverse_reads, maxEE = expected_errors, multithread = TRUE, rm.phix=TRUE, trimLeft = trimleft, compress = TRUE, truncLen = truncate) saveRDS(out, file.path("output/temp", "filt_out.rds")) ####### Step 2: Plot quality profiles ## Select 10 samples at random to inspect/print toplot <- sample(c(1:length(samples)), 10) ## Plotting forward versus reverse quality for (i in 1:10){ pdf(paste("output/quality_plots/quality_", samples[toplot[i]], ".pdf", sep = "")) print(plotQualityProfile(c(filtered_forward_reads[toplot[i]], forward_reads[toplot[i]], filtered_reverse_reads[toplot[i]], reverse_reads[toplot[i]]))) dev.off() } |
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 | library(dada2) #packageVersion("dada2") library(tidyverse) ## Set variables list_of_filenames <- snakemake@input$listfiles tax_ref <- snakemake@config$dada2_tax_ref spe_ref <- snakemake@config$dada2_spe_ref bacterial <- snakemake@config$bacterial ## Make a vector of sample names samples <- scan(list_of_filenames, what = "character") out <- readRDS(snakemake@input$filt_out) ## file names of forward and reverse reads, after quality filtering filtered_forward_reads <- file.path("output/temp/filtered", paste0(samples, "_r1_filtered.fastq.gz")) filtered_reverse_reads <- file.path("output/temp/filtered", paste0(samples, "_r2_filtered.fastq.gz")) ####### Step 2: Generate error model of data err_forward <- learnErrors(filtered_forward_reads, multithread = TRUE) err_reverse <- learnErrors(filtered_reverse_reads, multithread = TRUE) ####### Step 3: Derepliate sequences derep_forward <- derepFastq(filtered_forward_reads, verbose = TRUE) derep_reverse <- derepFastq(filtered_reverse_reads, verbose = TRUE) names(derep_forward) <- samples names(derep_reverse) <- samples ####### Step 4: Infer sequence variants dadaF <- dada(derep_forward, err = err_forward, multithread = TRUE) dadaR <- dada(derep_reverse, err = err_reverse, multithread = TRUE) ####### Step 5: Merge forward and reverse reads merged <- mergePairs(dadaF, derep_forward, dadaR, derep_reverse, verbose = TRUE) ####### Step 6: Generate count table seqtab <- makeSequenceTable(merged) ####### Step 7: Remove chimeras seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus", multithread = TRUE) ## How are the reads concentrated in the merged sequence lengths readsbyseqlen <- tapply(colSums(seqtab.nochim), nchar(colnames(seqtab.nochim)),sum) pdf(file.path("output", "merged_sequence_lengths.pdf")) plot(as.integer(names(readsbyseqlen)),readsbyseqlen, xlab = "Merged length", ylab = "Total reads") dev.off() ####### Step 7b: Track reads throughout processing getN <- function(x) sum (getUniques(x)) summary_tab <- data.frame(row.names=samples, Input=out[,1], Filtered=out[,2], Denoised=sapply(dadaF, getN), Merged=sapply(merged, getN), Non.Chimeric=rowSums(seqtab.nochim), Total.Perc.Remaining = round(rowSums(seqtab.nochim)/out[,1]*100,1)) pdf(file.path("output", "reads_remaining.pdf")) hist(summary_tab$Total.Perc.Remaining, col = "blue", breaks = 50, main = "Total", xlab = "Percentage reads remaining") dev.off() ## Write this table to output write.table(summary_tab, file.path("output", "reads_tracked.txt")) summary_tab$Sample <- rownames(summary_tab) summary_tab <- summary_tab %>% separate(Sample, c("Sample", "temp"), sep = "_S") summary_tab$Sample <- factor(summary_tab$Sample, levels = summary_tab$Sample[order(summary_tab$Non.Chimeric)]) summary_tab_long <- summary_tab %>% gather("QC.Step", "Reads", Input:Non.Chimeric) summary_tab_long$QC.Step <- factor(summary_tab_long$QC.Step, levels = c("Input", "Filtered", "Denoised", "Merged", "Non.Chimeric")) gg <- ggplot(summary_tab_long, aes(x = Sample, y = Reads, color = QC.Step)) + geom_point(size = 2) + scale_color_manual(values = rainbow(5, v = 0.8)) + theme_bw() + theme(axis.text.x = element_text(angle = 90)) pdf(file.path("output", "read_tracking.pdf")) gg dev.off() ####### Step 8: Assign Taxonomy ################################################ ## To do: add parameter to config file to allow user to decide whether or not to allow multiples ################################################ taxa <- assignTaxonomy(seqtab.nochim, tax_ref, multithread = TRUE) ## Add Species (if 16S, not if ITS) if (bacterial == T){ taxa <- addSpecies (taxa, spe_ref, allowMultiple = TRUE) } ####### Step 9: Save output saveRDS(seqtab.nochim, file.path("output", "seqtab_final.rds")) saveRDS(taxa, file.path("output", "taxa_final.rds")) |
Support
- Future updates
Related Workflows





