Help improve this workflow!
This workflow has been published but could be further improved with some additional meta data:- Keyword(s) in categories input, output
You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .
RNAseq is becoming the one of the most prominent methods for measuring celluar responses. Not only does RNAseq have the ability to analyze differences in gene expression between samples, but can discover new isoforms and analyze SNP variations. This tutorial will cover the basic workflow for processing and analyzing differential gene expression data and is meant to give a general method for setting up an environment and running alignment tools. Be aware that is not meant to be used for all types of analyses and data-types, and the alignment tools are not for every analysis. Additionally, this tutorial is focused on giving a general sense of the flow when performing these analysis. For larger scale studies, it is highly reccomended to use a HPC environment for increased RAM and computational power.
Code Snippets
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 | library(knitr) knitr::opts_chunk$set(echo = TRUE) library(DESeq2) library(ggplot2) library(knitr) library(clusterProfiler) library(biomaRt) library(ReactomePA) library(DOSE) library(KEGG.db) library(org.Mm.eg.db) library(org.Hs.eg.db) library(pheatmap) library(genefilter) library(RColorBrewer) library(GO.db) library(topGO) library(dplyr) library(gage) library(ggsci) |
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 | # Download the Miniconda3 installer to your home directory (Only for macOS) <<<<<<< HEAD wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O ~/miniconda.sh # Download the Miniconda3 installer to your home directory (Only for LINUX or Cluster) wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh ======= wget https://repo.continuum.io/miniconda/Miniconda2-latest-MacOSX-x86_64.sh -O ~/miniconda.sh # Download the Miniconda3 installer to your home directory (Only for LINUX or Cluster) wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O ~/miniconda.sh >>>>>>> origin/master # Run the miniconda installation bash miniconda.sh -b -f -p ~/miniconda # Add miniconda to the system path echo 'PATH="$HOME/miniconda/bin:$PATH' >> ~/.bash_profile # Source system file to activate miniconda source ~/.bash_profile # Add bioinformatic channels for downloading required packages conda config --add channels conda-forge conda config --add channels defaults conda config --add channels r conda config --add channels bioconda |
74 75 76 77 78 79 80 81 | # Install git (if needed) conda install -c anaconda git wget --yes # Clone this repository with folder structure into the current working folder git clone https://github.com/twbattaglia/RNAseq-workflow new_workflow # Change directory into the new folder cd new_workflow |
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | ── new_workflow/ │ └── annotation/ <- Genome annotation file (.GTF/.GFF) │ │ └── genome/ <- Host genome file (.FASTA) │ │ └── input/ <- Location of input RNAseq data │ │ └── output/ <- Data generated during processing steps │ ├── 1_initial_qc/ <- Main alignment files for each sample │ ├── 2_trimmed_output/ <- Log from running STAR alignment step │ ├── 3_rRNA/ <- STAR alignment counts output (for comparison with featureCounts) │ ├── aligned/ <- Sequences that aligned to rRNA databases (rRNA contaminated) │ ├── filtered/ <- Sequences with rRNA sequences removed (rRNA-free) │ ├── logs/ <- logs from running SortMeRNA │ ├── 4_aligned_sequences/ <- Main alignment files for each sample │ ├── aligned_bam/ <- Alignment files generated from STAR (.BAM) │ ├── aligned_logs/ <- Log from running STAR alignment step │ ├── 5_final_counts/ <- Summarized gene counts across all samples │ ├── 6_multiQC/ <- Overall report of logs for each step │ │ └── sortmerna_db/ <- Folder to store the rRNA databases for SortMeRNA │ ├── index/ <- indexed versions of the rRNA sequences for faster alignment │ ├── rRNA_databases/ <- rRNA sequences from bacteria, archea and eukaryotes │ │ └── star_index/ <- Folder to store the indexed genome files from STAR |
123 124 125 126 127 128 129 130 131 | # Download genome fasta file into the genome/ folder wget -P genome/ ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_mouse/release_M12/GRCm38.p5.genome.fa.gz # Download annotation file into the annotation/ folder wget -P annotation/ ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_mouse/release_M12/gencode.vM12.annotation.gtf.gz # Decompress files for use with tools gunzip genome/GRCm38.p4.genome.fa.gz gunzip annotation/gencode.vM12.annotation.gtf.gz |
136 137 138 139 140 141 142 143 144 | # Download genome fasta file into the genome/ folder wget -p genome/ ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/GRCh38.p7.genome.fa.gz # Download annotation file into the annotation/ folder wget -P annotation/ ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/gencode.v25.annotation.gtf.gz # Decompress files for use with tools gunzip genome/GRCh38.p7.genome.fa.gz gunzip annotation/gencode.v25.annotation.gtf.gz |
155 156 157 158 | wget -P input/ ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR137/001/SRR1374921/SRR1374921.fastq.gz wget -P input/ ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR137/002/SRR1374922/SRR1374922.fastq.gz wget -P input/ ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR137/003/SRR1374923/SRR1374923.fastq.gz wget -P input/ ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR137/004/SRR1374924/SRR1374924.fastq.gz |
173 | conda install -c bioconda fastqc --yes |
178 179 180 181 182 183 184 185 | # Help fastqc -h # Run FastQC fastqc \ -o results/1_initial_qc/ \ --noextract \ input/sample.fastq |
190 191 192 | ── results/1_initial_qc/ └── sample_fastqc.html <- HTML file of FastQC fquality analysis figures └── sample_fastqc.zip <- FastQC report data |
213 | conda install -c bioconda trim-galore --yes |
218 219 220 221 222 223 224 225 226 227 | # Help trim_galore -h # Run Trim Galore! trim_galore \ --quality 20 \ --fastqc \ --length 25 \ --output_dir results/2_trimmed_output/ \ input/sample.fastq |
232 233 234 235 236 | ── results/2_trimmed_output/ └── sample_trimmed.fq <- Trimmed sequencing file (.fastq) └── sample_trimmed.html <- HTML file of FastQC fquality analysis figures └── sample_trimmed.zip <- FastQC report data └── sample.fastq.trimming_report.txt <- Cutadapt trimming report |
252 | conda install -c bioconda sortmerna --yes |
259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 | # Download the sortmerna package (~2min) into sortmerna_db folder wget -P sortmerna_db https://github.com/biocore/sortmerna/archive/2.1b.zip # Decompress folder unzip sortmerna_db/2.1b.zip -d sortmerna_db # Move the database into the correct folder mv sortmerna_db/sortmerna-2.1b/rRNA_databases/ sortmerna_db/ # Remove unnecessary folders rm sortmerna_db/2.1b.zip rm -r sortmerna_db/sortmerna-2.1b # Save the location of all the databases into one folder sortmernaREF=sortmerna_db/rRNA_databases/silva-arc-16s-id95.fasta,sortmerna_db/index/silva-arc-16s-id95:\ sortmerna_db/rRNA_databases/silva-arc-23s-id98.fasta,sortmerna_db/index/silva-arc-23s-id98:\ sortmerna_db/rRNA_databases/silva-bac-16s-id90.fasta,sortmerna_db/index/silva-bac-16s-id95:\ sortmerna_db/rRNA_databases/silva-bac-23s-id98.fasta,sortmerna_db/index/silva-bac-23s-id98:\ sortmerna_db/rRNA_databases/silva-euk-18s-id95.fasta,sortmerna_db/index/silva-euk-18s-id95:\ sortmerna_db/rRNA_databases/silva-euk-28s-id98.fasta,sortmerna_db/index/silva-euk-28s-id98 # Run the indexing command (~8 minutes) indexdb_rna --ref $sortmernaREF |
287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 | # Help sortmerna -h # Save variable of rRNA databases # Save the location of all the databases into one folder sortmernaREF=sortmerna_db/rRNA_databases/silva-arc-16s-id95.fasta,sortmerna_db/index/silva-arc-16s-id95:\ sortmerna_db/rRNA_databases/silva-arc-23s-id98.fasta,sortmerna_db/index/silva-arc-23s-id98:\ sortmerna_db/rRNA_databases/silva-bac-16s-id90.fasta,sortmerna_db/index/silva-bac-16s-id95:\ sortmerna_db/rRNA_databases/silva-bac-23s-id98.fasta,sortmerna_db/index/silva-bac-23s-id98:\ sortmerna_db/rRNA_databases/silva-euk-18s-id95.fasta,sortmerna_db/index/silva-euk-18s-id95:\ sortmerna_db/rRNA_databases/silva-euk-28s-id98.fasta,sortmerna_db/index/silva-euk-28s-id98 # Run SortMeRNA (~15min) sortmerna \ --ref $sortmernaREF \ --reads results/2_trimmed_output/sample_trimmed.fq \ --aligned results/3_rRNA/aligned/sample_aligned.fq \ --other results/3_rRNA/filtered/sample_filtered.fq \ --fastx \ --log \ -a 4 \ -v # Move logs into the correct folder mv -v results/3_rRNA/aligned//sample_aligned.log results/3_rRNA/logs |
316 317 318 319 | ── results/3_rRNA/ └── aligned/sample_aligned.fq <- sequences with rRNA contamination └── filtered/sample_filtered.fq <- sequences without any rRNA contamination └── logs/sample_aligned.log <- log from SortMeRNA analysis |
337 | conda install -c bioconda star --yes |
346 347 348 349 350 351 352 | # This can take up to 30 minutes to complete STAR \ --runMode genomeGenerate \ --genomeDir star_index \ --genomeFastaFiles genome/* \ --sjdbGTFfile annotation/* \ --runThreadN 4 |
358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 | # Help STAR -h # Run STAR (~10min) STAR \ --genomeDir star_index \ --readFilesIn filtered/sample_filtered.fq \ --runThreadN 4 \ --outSAMtype BAM SortedByCoordinate \ --quantMode GeneCounts # Move the BAM file into the correct folder mv -v results/4_aligned_sequences/sampleAligned.sortedByCoord.out.bam results/4_aligned_sequences/aligned_bam/ # Move the logs into the correct folder mv -v results/4_aligned_sequences/${BN}Log.final.out results/4_aligned_sequences/aligned_logs/ mv -v results/4_aligned_sequences/sample*Log.out results/4_aligned_sequences/aligned_logs/ |
379 380 381 382 | ── results/4_aligned_sequences/ └── aligned_bam/sampleAligned.sortedByCoord.out.bam <- Sorted BAM alignment fole └── aligned_logs/sampleLog.final.out <- Log of STAR alignment rate └── aligned_logs/sampleLog.out <- Log of steps take during STAR alignment |
397 | conda install -c bioconda subread --yes |
402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 | # Help featureCounts -h # Change directory into the aligned .BAM folder cd results/4_aligned_sequences/aligned_bam # Store list of files as a variable dirlist=$(ls -t ./*.bam | tr '\n' ' ') echo $dirlist # Run featureCounts on all of the samples (~10 minutes) featureCounts \ -a ../../annotation/* \ -o ../../results/5_final_counts/final_counts.txt \ -g 'gene_name' \ -T 4 \ $dirlist # Change directory back to main folder cd ../../../ |
426 427 428 | ── results/5_final_counts/ └── final_counts.txt <- Final gene counts across all samples └── final_counts.txt.summary <- Summary of gene summarization |
444 | conda install -c bioconda multiqc --yes |
449 450 451 452 453 454 | # Help multiqc -h # Run multiqc and output results into final folder multiqc results \ --outdir results/6_multiQC |
459 460 461 | ── results/6_multiQC/ └── multiqc_report.html <- Beautiful figures representing the logs from each step └── multiqc_data/ <- Folder of data that multiqc found from various log files |
472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 | source("https://bioconductor.org/biocLite.R") biocLite("DESeq2") ; library(DESeq2) biocLite("ggplot2") ; library(ggplot2) biocLite("clusterProfiler") ; library(clusterProfiler) biocLite("biomaRt") ; library(biomaRt) biocLite("ReactomePA") ; library(ReactomePA) biocLite("DOSE") ; library(DOSE) biocLite("KEGG.db") ; library(KEGG.db) biocLite("org.Mm.eg.db") ; library(org.Mm.eg.db) biocLite("org.Hs.eg.db") ; library(org.Hs.eg.db) biocLite("pheatmap") ; library(pheatmap) biocLite("genefilter") ; library(genefilter) biocLite("RColorBrewer") ; library(RColorBrewer) biocLite("GO.db") ; library(GO.db) biocLite("topGO") ; library(topGO) biocLite("dplyr") ; library(dplyr) biocLite("gage") ; library(gage) biocLite("ggsci") ; library(ggsci) |
499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 | # Import gene counts table # - skip first row (general command info) # - make row names the gene identifiers countdata <- read.table("example/final_counts.txt", header = TRUE, skip = 1, row.names = 1) # Remove .bam + '..' from column identifiers colnames(countdata) <- gsub(".bam", "", colnames(countdata), fixed = T) colnames(countdata) <- gsub(".bam", "", colnames(countdata), fixed = T) colnames(countdata) <- gsub("..", "", colnames(countdata), fixed = T) # Remove length/char columns countdata <- countdata[ ,c(-1:-5)] # Make sure ID's are correct head(countdata) |
518 519 520 521 522 523 524 525 526 527 528 529 | # Import metadata file # - make row names the matching sampleID's from the countdata metadata <- read.delim("example/metadata.txt", row.names = 1) # Add sampleID's to the mapping file metadata$sampleid <- row.names(metadata) # Reorder sampleID's to match featureCounts column order. metadata <- metadata[match(colnames(countdata), metadata$sampleid), ] # Make sure ID's are correct head(metadata) |
535 536 537 538 539 540 541 542 543 544 545 | # - countData : count dataframe # - colData : sample metadata in the dataframe with row names as sampleID's # - design : The design of the comparisons to use. # Use (~) before the name of the column variable to compare ddsMat <- DESeqDataSetFromMatrix(countData = countdata, colData = metadata, design = ~Group) # Find differential expressed genes ddsMat <- DESeq(ddsMat) |
550 551 552 553 554 555 556 557 558 559 560 | # Get results from testing with FDR adjust pvalues results <- results(ddsMat, pAdjustMethod = "fdr", alpha = 0.05) # Generate summary of testing. summary(results) # Check directionality of the log2 fold changes ## Log2 fold change is set as (LoGlu / HiGlu) ## Postive fold changes = Increased in LoGlu ## Negative fold changes = Decreased in LoGlu mcols(results, use.names = T) |
570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 | # Mouse genome database (Select the correct one) library(org.Mm.eg.db) # Add gene full name results$description <- mapIds(x = org.Mm.eg.db, keys = row.names(results), column = "GENENAME", keytype = "SYMBOL", multiVals = "first") # Add gene symbol results$symbol <- row.names(results) # Add ENTREZ ID results$entrez <- mapIds(x = org.Mm.eg.db, keys = row.names(results), column = "ENTREZID", keytype = "SYMBOL", multiVals = "first") # Add ENSEMBL results$ensembl <- mapIds(x = org.Mm.eg.db, keys = row.names(results), column = "ENSEMBL", keytype = "SYMBOL", multiVals = "first") # Subset for only significant genes (q < 0.05) results_sig <- subset(results, padj < 0.05) head(results_sig) |
604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 | # Write normalized gene counts to a .txt file write.table(x = as.data.frame(counts(ddsMat), normalized = T), file = 'normalized_counts.txt', sep = '\t', quote = F, col.names = NA) # Write significant normalized gene counts to a .txt file write.table(x = counts(ddsMat[row.names(results_sig)], normalized = T), file = 'normalized_counts_significant.txt', sep = '\t', quote = F, col.names = NA) # Write the annotated results table to a .txt file write.table(x = as.data.frame(results), file = "results_gene_annotated.txt", sep = '\t', quote = F, col.names = NA) # Write significant annotated results table to a .txt file write.table(x = as.data.frame(results_sig), file = "results_gene_annotated_significant.txt", sep = '\t', quote = F, col.names = NA) |
641 642 643 644 645 646 647 648 649 650 | # Convert all samples to rlog ddsMat_rlog <- rlog(ddsMat, blind = FALSE) # Plot PCA by column variable plotPCA(ddsMat_rlog, intgroup = "Group", ntop = 500) + theme_bw() + # remove default ggplot2 theme geom_point(size = 5) + # Increase point size scale_y_continuous(limits = c(-5, 5)) + # change limits to fix figure dimensions ggtitle(label = "Principal Component Analysis (PCA)", subtitle = "Top 500 most variable genes") |
655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 | # Convert all samples to rlog ddsMat_rlog <- rlog(ddsMat, blind = FALSE) # Gather 30 significant genes and make matrix mat <- assay(ddsMat_rlog[row.names(results_sig)])[1:40, ] # Choose which column variables you want to annotate the columns by. annotation_col = data.frame( Group = factor(colData(ddsMat_rlog)$Group), Replicate = factor(colData(ddsMat_rlog)$Replicate), row.names = colData(ddsMat_rlog)$sampleid ) # Specify colors you want to annotate the columns by. ann_colors = list( Group = c(LoGlu = "lightblue", HiGlu = "darkorange"), Replicate = c(Rep1 = "darkred", Rep2 = "forestgreen") ) # Make Heatmap with pheatmap function. ## See more in documentation for customization pheatmap(mat = mat, color = colorRampPalette(brewer.pal(9, "YlOrBr"))(255), scale = "row", # Scale genes to Z-score (how many standard deviations) annotation_col = annotation_col, # Add multiple annotations to the samples annotation_colors = ann_colors,# Change the default colors of the annotations fontsize = 6.5, # Make fonts smaller cellwidth = 55, # Make the cells wider show_colnames = F) |
688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 | # Gather Log-fold change and FDR-corrected pvalues from DESeq2 results ## - Change pvalues to -log10 (1.3 = 0.05) data <- data.frame(gene = row.names(results), pval = -log10(results$padj), lfc = results$log2FoldChange) # Remove any rows that have NA as an entry data <- na.omit(data) # Color the points which are up or down ## If fold-change > 0 and pvalue > 1.3 (Increased significant) ## If fold-change < 0 and pvalue > 1.3 (Decreased significant) data <- mutate(data, color = case_when(data$lfc > 0 & data$pval > 1.3 ~ "Increased", data$lfc < 0 & data$pval > 1.3 ~ "Decreased", data$pval < 1.3 ~ "nonsignificant")) # Make a basic ggplot2 object with x-y values vol <- ggplot(data, aes(x = lfc, y = pval, color = color)) # Add ggplot2 layers vol + ggtitle(label = "Volcano Plot", subtitle = "Colored by fold-change direction") + geom_point(size = 2.5, alpha = 0.8, na.rm = T) + scale_color_manual(name = "Directionality", values = c(Increased = "#008B00", Decreased = "#CD4F39", nonsignificant = "darkgray")) + theme_bw(base_size = 14) + # change overall theme theme(legend.position = "right") + # change the legend xlab(expression(log[2]("LoGlu" / "HiGlu"))) + # Change X-Axis label ylab(expression(-log[10]("adjusted p-value"))) + # Change Y-Axis label geom_hline(yintercept = 1.3, colour = "darkgrey") + # Add p-adj value cutoff line scale_y_continuous(trans = "log1p") # Scale yaxis due to large p-values |
724 | plotMA(results, ylim = c(-5, 5)) |
729 | plotDispEsts(ddsMat) |
734 735 736 737 738 739 740 741 742 743 744 745 | # Convert all samples to rlog ddsMat_rlog <- rlog(ddsMat, blind = FALSE) # Get gene with highest expression top_gene <- rownames(results)[which.min(results$log2FoldChange)] # Plot single gene plotCounts(dds = ddsMat, gene = top_gene, intgroup = "Group", normalized = T, transform = T) |
756 757 758 759 760 761 762 763 764 765 766 767 768 | # Remove any genes that do not have any entrez identifiers results_sig_entrez <- subset(results_sig, is.na(entrez) == FALSE) # Create a matrix of gene log2 fold changes gene_matrix <- results_sig_entrez$log2FoldChange # Add the entrezID's as names for each logFC entry names(gene_matrix) <- results_sig_entrez$entrez # View the format of the gene matrix ##- Names = ENTREZ ID ##- Values = Log2 Fold changes head(gene_matrix) |
773 774 775 776 777 778 779 780 781 782 783 | kegg_enrich <- enrichKEGG(gene = names(gene_matrix), organism = 'mouse', pvalueCutoff = 0.05, qvalueCutoff = 0.10) # Plot results barplot(kegg_enrich, drop = TRUE, showCategory = 10, title = "KEGG Enrichment Pathways", font.size = 8) |
788 789 790 791 792 793 794 795 796 797 798 799 800 | go_enrich <- enrichGO(gene = names(gene_matrix), OrgDb = 'org.Mm.eg.db', readable = T, ont = "BP", pvalueCutoff = 0.05, qvalueCutoff = 0.10) # Plot results barplot(go_enrich, drop = TRUE, showCategory = 10, title = "GO Biological Pathways", font.size = 8) |
809 810 811 812 813 814 815 816 | # Load pathview biocLite("pathview") ; library(pathview) # Plot specific KEGG pathways (with fold change) ## pathway.id : KEGG pathway identifier pathview(gene.data = gene_matrix, pathway.id = "04070", species = "mouse") |
Support
- Future updates
Related Workflows





