A pipeline for multi-trait genome-wide association studies (GWAS) using MANTA
A pipeline for multi-trait genome-wide association studies (GWAS) using MANTA . A method for joint analysis of summary statistics from genome-wide association studies (GWAS) of different traits, possibly from overlapping samples. MANTA used for robust comparative metatranscriptomics.
The pipeline performs the following analysis steps:
- Split genotype file
- Preprocess phenotype and covariate data
- Test for association between phenotypes and genetic variants
- Collect summary statistics
The pipeline uses Nextflow as the execution backend. Please check Nextflow documentation for more information.
Cite multi-trait genome-wide association studies (GWAS) using MANTA
If you find
mvgwas-nf
useful in your research please cite the related publication:
Garrido-Martín, D., Calvo, M., Reverter, F., Guigó, R. A fast non-parametric test of association for multiple traits. bioRxiv (2022). https://doi.org/10.1101/2022.06.06.493041
Code Snippets
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 | library(optparse) library(data.table) library(seqminer) library(manta) option_list <- list( make_option(c("-p", "--phenotypes"), type = "character", help = "Pre-processed phenotype file (as generated by 'preprocess.R')", metavar = "FILE"), make_option(c("-c", "--covariates"), type = "character", help = "Pre-processed covariate file (as generated by 'preprocess.R')", metavar = "FILE"), make_option(c("-g", "--genotypes"), type = "character", help = "Genotype file (indexed VCF)", metavar = "FILE"), make_option(c("-r", "--region"), type = "character", help = "Genomic region", metavar = "CHARACTER"), make_option(c("-t", "--transform"), type = "character", default = 'none', help = "Phenotype transformation: 'none', 'log', 'sqrt' [default %default]", metavar = "CHARACTER"), make_option(c("-i", "--interaction"), type = "character", default = 'none', help = "Test for interaction with a covariate [default %default]", metavar = "CHARACTER"), make_option(c("-n", "--min_nb_ind_geno"), type = "numeric", default = 10, help = "Minimum number of individuals per genotype group [default %default]", metavar = "NUMERIC"), make_option(c("-o", "--output"), type = "character", help = "Output summary stats", metavar = "FILE"), make_option(c("-s", "--seed"), type = "numeric", help = "Set seed for random processes [default %default]", metavar = "NUMERIC", default = 123), make_option(c("-v", "--verbose"), action = "store_true", help = "[default %default]", default = TRUE) ) opt_parser <- OptionParser(option_list = option_list) opt <- parse_args(opt_parser) tabix.read.table.nochecknames <- function (tabixFile, tabixRange, col.names = TRUE, stringsAsFactors = FALSE) { stopifnot(seqminer:::local.file.exists(tabixFile)) stopifnot(all(isTabixRange(tabixRange))) tabixFile <- path.expand(tabixFile) storage.mode(tabixFile) <- "character" storage.mode(tabixRange) <- "character" header <- .Call("readTabixHeader", tabixFile, PACKAGE = "seqminer") body <- .Call("readTabixByRange", tabixFile, tabixRange, PACKAGE = "seqminer") body <- do.call(rbind, strsplit(body, "\t")) body <- as.data.frame(body, stringsAsFactors = FALSE) if (ncol(body) > 0) { for (i in 1:ncol(body)) { body[, i] <- utils::type.convert(body[, i], as.is = !stringsAsFactors) } num.col <- ncol(body) header <- header[nchar(header) > 0] if (length(header) == 0 || !col.names) { colNames <- paste0("V", 1L:num.col) } else { hdrLine <- header[length(header)] hdrLine <- sub("^#", "", hdrLine) # colNames <- make.names(strsplit(hdrLine, "\t")[[1]]) colNames <- strsplit(hdrLine, "\t")[[1]] if (length(colNames) > ncol(body)) { colNames <- colNames[1:ncol(body)] } else if (length(colNames) < ncol(body)) { tmpNames <- paste0("V", 1L:num.col) tmpNames[1:length(colNames)] <- colNames colNames <- tmpNames } } colnames(body) <- colNames } body } check.genotype <- function(geno.df, min.nb.ind.geno = 10) { apply(geno.df, 1, function(geno.snp) { if (sum(is.na(geno.snp)) > 0.05*length(as.numeric(geno.snp))) { return("Missing genotype in > 5% individuals") } geno.snp.t <- table(geno.snp[!is.na(geno.snp)]) if (length(geno.snp.t) < 3) { return("One or two genotype groups") } if (sum(geno.snp.t >= min.nb.ind.geno) < length(geno.snp.t)) { return(sprintf("Not all the groups with >%s samples", min.nb.ind.geno)) } return("PASS") }) } ## 2. Input files pheno.f <- opt$phenotypes cov.f <- opt$covariates geno.f <- opt$genotypes out.f <- opt$output region <- opt$region set.seed(opt$seed) if (is.null(pheno.f) || is.null(geno.f) || is.null(cov.f) || is.null(region) || is.null(out.f)) { print_help(opt_parser) stop("Missing/not found I/O files", call.= FALSE) } pheno.df <- data.frame(fread(pheno.f, header = TRUE, sep = "\t"), row.names = 1) cov.df <- data.frame(fread(cov.f, header = TRUE, sep = "\t"), row.names = 1) geno.df <- tabix.read.table.nochecknames(geno.f, region) subset.ids <- rownames(pheno.df) cn <- c("chr", "pos", "variant", "REF", "ALT") colnames(geno.df)[1:5] <- cn geno.df <- geno.df[, colnames(geno.df) %in% c(cn, subset.ids)] geno.df[, -c(1:5)] <- apply(geno.df[, -c(1:5)], 2, function(x){ y <- gsub("([012.]/[012.]):.*","\\1", x) sapply(y, function(z){switch(z, # Unphased "./." = NA, "0/0" = "0", "1/0" = "1", "0/1" = "1", "1/1" = "2", # Phased ".|." = NA, "0|0" = "0", "1|0" = "1", "0|1" = "1", "1|1" = "2", # Haploid (X,Y,MT) "." = NA, "0" = "0", "1" = "1" )}) }) ## 3. Filter SNPs snps.to.keep <- check.genotype(geno.df[, subset.ids], min.nb.ind.geno = opt$min_nb_ind_geno) if (opt$verbose) { snps.to.keep.t <- table(snps.to.keep) message("\t", paste(names(snps.to.keep.t), snps.to.keep.t, sep = ": ", collapse=", ")) } ## 4. Test & write output if (any(snps.to.keep == "PASS")) { geno.df <- geno.df[snps.to.keep == "PASS", ] out.df <- c() Y <- as.matrix(pheno.df) if (opt$interaction == "none") { for (p in geno.df$pos) { snp <- subset(geno.df, pos == p) rec <- snp[, !colnames(snp)%in%subset.ids] snp <- as.numeric(snp[, subset.ids]) mvfit <- tryCatch(manta(Y ~ ., data = data.frame(cov.df, "GT" = snp), type = "I", subset = "GT", transform = opt$transform), error = function(e) NULL) if (is.null(mvfit)) { warning(sprintf("SNP %s skipped", subset(geno.df, pos == p)$variant)) next } out.df <- rbind(out.df, c(t(rec), mvfit$aov.tab[1, 4:6])) } } else { INT <- paste0(opt$interaction, ":GT") for (p in geno.df$pos) { snp <- subset(geno.df, pos == p) rec <- snp[, !colnames(snp)%in%subset.ids] snp <- as.numeric(snp[, subset.ids]) Data <- data.frame(cov.df, "GT" = snp) fm <- as.formula(paste("Y ~", paste0(c(colnames(Data), INT), collapse = "+"))) mvfit <- tryCatch(manta(fm, data = data.frame(cov.df, "GT" = snp), type = "II", transform = opt$transform, subset = c(opt$interaction, "GT", INT)), error = function(e) NULL) if (is.null(mvfit)) { warning(sprintf("SNP %s skipped", subset(geno.df, pos == p)$variant)) next } out.df <- rbind(out.df, c(t(rec), mvfit$aov.tab[1:3, 4:6])) } } fwrite(out.df, file = out.f, quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t") } |
105 106 107 108 | """ bcftools query -f '%CHROM\t%POS\n' $vcf > positions split -d -a 10 -l ${params.l} positions chunk """ |
127 128 129 | """ preprocess.R --phenotypes $pheno --covariates $cov --genotypes $vcf --out_pheno pheno_preproc.tsv.gz --out_cov cov_preproc.tsv.gz --verbose """ |
149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 | """ chunknb=\$(basename $chunk | sed 's/chunk//') if [[ \$(cut -f1 $chunk | sort | uniq -c | wc -l) -ge 2 ]]; then k=1 cut -f1 $chunk | sort | uniq | while read chr; do region=\$(paste <(grep -P "^\$chr\t" $chunk | head -1) <(grep -P "^\$chr\t" $chunk | tail -1 | cut -f2) | sed 's/\t/:/' | sed 's/\t/-/') test.R --phenotypes $pheno --covariates $cov --genotypes $vcf --region "\$region" --output sstats.\$k.tmp --min_nb_ind_geno ${params.ng} -t ${params.t} -i ${params.i} --verbose ((k++)) done cat sstats.*.tmp > sstats.\${chunknb}.txt else region=\$(paste <(head -1 $chunk) <(tail -1 $chunk | cut -f2) | sed 's/\t/:/' | sed 's/\t/-/') test.R --phenotypes $pheno --covariates $cov --genotypes $vcf --region "\$region" --output sstats.\${chunknb}.txt --min_nb_ind_geno ${params.ng} -t ${params.t} -i ${params.i} --verbose fi """ |
183 184 185 | """ sed -i "1 s/^/CHR\tPOS\tID\tREF\tALT\tF\tR2\tP\\n/" ${out} """ |
187 188 189 | """ sed -i "1 s/^/CHR\tPOS\tID\tREF\tALT\tF($params.i)\tF(GT)\tF(${params.i}:GT)\tR2($params.i)\tR2(GT)\tR2(${params.i}:GT)\tP($params.i)\tP(GT)\tP(${params.i}:GT)\\n/" ${out} """ |
Support
- Future updates
Related Workflows





