Snakemake workflow: Meta-analysis of GWAS of gestational duration, preterm and post-term deliveries (EGG Consortium)
Snakemake workflow: Meta-analysis of GWAS of gestational duration, preterm and post-term deliveries (EGG Consortium)
This repository contains the code for the paper "Genetic effects on the timing of parturition and links to fetal birth weight".
Citation
Contact
-
Pol Sole-Navais (@psnavais)
-
pol.sole.navais@gu.se
Code Snippets
# Colocalization (coloc.abf) between the GWAS in snakemake@input[[1]] and the
# second trait in snakemake@input[[2]], run once per region listed in
# snakemake@input[[3]]. Per-region summaries are appended to
# snakemake@output[[1]] and per-SNP results to snakemake@output[[2]].
library(data.table)
library(dplyr)
library(coloc)
library(parallel)

pph_outfile <- snakemake@output[[1]]
results_outfile <- snakemake@output[[2]]
gwas_path <- snakemake@input[[1]]
trait_path <- snakemake@input[[2]]

# Header lines are written once; every region appends its rows underneath.
# NOTE(review): the second header names its 2nd column 'V.df' while the rows
# carry V.df1 - left unchanged in case downstream code relies on the name.
cat('nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tlocus\n', file = pph_outfile)
cat('snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tlocus\n', file = results_outfile)

prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6

d <- fread(gwas_path, select = c('ID', 'CHR', 'POS', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue', 'EAF'))
d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)

x <- fread(trait_path, select = c('ID', 'N', 'BETA', 'SE', 'pvalue', 'EAF'))
x$MAF <- ifelse(x$EAF > 0.5, 1 - x$EAF, x$EAF)
names(x) <- c('ID', 'N', 'beta', 'se', 'p', 'eaf', 'maf')

d <- inner_join(d, x, by = 'ID')

# When the second trait has no allele frequencies at all, reuse the GWAS MAF.
if (sum(is.na(d$eaf)) == nrow(d)) {
  d$maf <- d$MAF
}

z <- fread(snakemake@input[[3]])
z$CHR <- as.numeric(gsub('chr', '', z$chr))
z$locus <- seq_len(nrow(z))

# Case fractions for the case-control second traits, keyed by the pattern
# searched for in the trait file name. Order matters: when several patterns
# match, the last one wins (same as the original chain of `if` statements).
case_fractions <- c(
  PCOS = (1184 + 670 + 157 + 658 + 984 + 485 + 462) /
    (1184 + 670 + 157 + 658 + 984 + 485 + 462 + 5799 + 1379 + 2807 + 6774 + 2963 + 407 + 96172),
  miscarriage = 49996 / (174109 + 49996),
  POP = 7053 / (57407 + 7053),
  endometriosis = 1496 / (192678 + 1496),
  Preeclampsia = 4630 / (4630 + 373345),
  leiomyoma_uterus = 14569 / (85792 + 14569)
)
pattern_hit <- vapply(
  names(case_fractions),
  function(pattern) grepl(pattern, trait_path),
  logical(1)
)
# NULL means the second trait is treated as quantitative.
s_pheno <- if (any(pattern_hit)) unname(tail(case_fractions[pattern_hit], 1)) else NULL

# coloc dataset for the GWAS; case-control for allPTD / postTerm file names.
gwas_dataset <- function(temp_df) {
  dataset <- list(
    beta = temp_df$BETA, varbeta = temp_df$SE^2, N = temp_df$TOTALSAMPLESIZE,
    type = 'quant', snp = temp_df$ID, MAF = temp_df$MAF
  )
  if (grepl('allPTD', gwas_path)) {
    dataset$type <- 'cc'
    dataset$s <- 0.067
  } else if (grepl('postTerm', gwas_path)) {
    dataset$type <- 'cc'
    dataset$s <- 0.122
  }
  dataset
}

# coloc dataset for the second trait.
trait_dataset <- function(temp_df) {
  dataset <- list(
    beta = temp_df$beta, varbeta = temp_df$se^2, N = temp_df$N,
    type = 'quant', snp = temp_df$ID, MAF = temp_df$maf
  )
  if (!is.null(s_pheno)) {
    dataset$type <- 'cc'
    dataset$s <- s_pheno
  }
  dataset
}

# Append one summary row and the matching per-SNP rows to the output files.
append_locus <- function(pph, res) {
  fwrite(pph, pph_outfile, sep = '\t', row.names = FALSE, col.names = FALSE, quote = FALSE, append = TRUE)
  fwrite(res, results_outfile, sep = '\t', row.names = FALSE, col.names = FALSE, quote = FALSE, append = TRUE)
}

# All-zero placeholder rows for regions without SNPs or where coloc failed.
append_placeholder <- function(locus) {
  pph <- data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0,
                    PP.H3.abf = 0, PP.H4.abf = 0, locus = locus)
  res <- data.frame(snp = 'none', V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0,
                    V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0,
                    internal.sum.lABF = 0, SNP.PP.H4 = 0, locus = locus)
  append_locus(pph, res)
  print('next')
}

# Run coloc for region `i` of `z` and append its rows to both output files.
#
# The previous version assigned into `pph_list` / `res_list`, which are never
# defined in this script, and called `next` outside of a loop; both raise an
# error inside the mclapply worker, so placeholder rows were never written.
# NOTE(review): workers append to the same two files concurrently, so row
# order across regions is not guaranteed.
funk <- function(i) {
  row <- z[i, ]
  locus <- paste0('locus_', i)
  temp_df <- filter(
    d,
    CHR == as.integer(row[['CHR']]),
    POS >= as.integer(row[['start']]),
    POS <= as.integer(row[['stop']])
  )

  if (nrow(temp_df) == 0) {
    append_placeholder(locus)
    return(invisible(NULL))
  }

  temp_df <- filter(temp_df, SE > 0, se > 0)
  data1 <- gwas_dataset(temp_df)
  data2 <- trait_dataset(temp_df)

  # Any coloc error is mapped to the sentinel 0 and reported as a placeholder.
  myres <- tryCatch(
    suppressWarnings(coloc.abf(data1, data2, p1 = prior1, p2 = prior2, p12 = prior12)),
    error = function(e) 0
  )
  if (length(myres) == 1) {
    append_placeholder(locus)
    return(invisible(NULL))
  }

  PPH <- data.frame(t(myres[[1]]))
  PPH$locus <- locus
  res <- myres[[2]]
  res$locus <- locus
  append_locus(PPH, res)
  invisible(NULL)
}

invisible(mclapply(seq_len(nrow(z)), funk, mc.cores = 3))
# Colocalization (coloc.abf) between a quantitative GWAS (snakemake@input[[1]])
# and a case-control GWAS (snakemake@input[[2]], case fraction 0.067), run once
# per region listed in snakemake@input[[3]]. Per-region summaries are appended
# to snakemake@output[[1]] and per-SNP results to snakemake@output[[2]].
library(data.table)
library(dplyr)
library(coloc)
library(parallel)

pph_outfile <- snakemake@output[[1]]
results_outfile <- snakemake@output[[2]]

# Header lines are written once; every region appends its rows underneath.
# NOTE(review): the second header names its 2nd column 'V.df' while the rows
# carry V.df1 - left unchanged in case downstream code relies on the name.
cat('nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tlocus\n', file = pph_outfile)
cat('snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tlocus\n', file = results_outfile)

prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6

d <- fread(snakemake@input[[1]], select = c('ID', 'CHR', 'POS', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue', 'EAF'))
d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)

x <- fread(snakemake@input[[2]], select = c('ID', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue', 'EAF'))
x$MAF <- ifelse(x$EAF > 0.5, 1 - x$EAF, x$EAF)
names(x) <- c('ID', 'N', 'beta', 'se', 'p', 'eaf', 'maf')

d <- inner_join(d, x, by = 'ID')

# When the second data set has no allele frequencies at all, reuse the MAF of
# the first one.
if (sum(is.na(d$eaf)) == nrow(d)) {
  d$maf <- d$MAF
}

z <- fread(snakemake@input[[3]])
z$CHR <- as.numeric(gsub('chr', '', z$chr))
z$locus <- seq_len(nrow(z))

# Append one summary row and the matching per-SNP rows to the output files.
append_locus <- function(pph, res) {
  fwrite(pph, pph_outfile, sep = '\t', row.names = FALSE, col.names = FALSE, quote = FALSE, append = TRUE)
  fwrite(res, results_outfile, sep = '\t', row.names = FALSE, col.names = FALSE, quote = FALSE, append = TRUE)
}

# All-zero placeholder rows for regions without SNPs or where coloc failed.
append_placeholder <- function(locus) {
  pph <- data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0,
                    PP.H3.abf = 0, PP.H4.abf = 0, locus = locus)
  res <- data.frame(snp = 'none', V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0,
                    V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0,
                    internal.sum.lABF = 0, SNP.PP.H4 = 0, locus = locus)
  append_locus(pph, res)
  print('next')
}

# Run coloc for region `i` of `z` and append its rows to both output files.
#
# The previous version assigned into `pph_list` / `res_list`, which are never
# defined in this script, and called `next` outside of a loop; both raise an
# error inside the mclapply worker, so placeholder rows were never written.
# NOTE(review): workers append to the same two files concurrently, so row
# order across regions is not guaranteed.
funk <- function(i) {
  row <- z[i, ]
  locus <- paste0('locus_', i)
  temp_df <- filter(
    d,
    CHR == as.integer(row[['CHR']]),
    POS >= as.integer(row[['start']]),
    POS <= as.integer(row[['stop']])
  )

  if (nrow(temp_df) == 0) {
    append_placeholder(locus)
    return(invisible(NULL))
  }

  temp_df <- filter(temp_df, SE > 0, se > 0)
  s_pheno <- 0.067
  data1 <- list(beta = temp_df$BETA, varbeta = temp_df$SE^2, N = temp_df$TOTALSAMPLESIZE,
                type = 'quant', snp = temp_df$ID, MAF = temp_df$MAF)
  data2 <- list(beta = temp_df$beta, varbeta = temp_df$se^2, N = temp_df$N,
                type = 'cc', snp = temp_df$ID, s = s_pheno, MAF = temp_df$maf)

  # Any coloc error is mapped to the sentinel 0 and reported as a placeholder.
  myres <- tryCatch(
    suppressWarnings(coloc.abf(data1, data2, p1 = prior1, p2 = prior2, p12 = prior12)),
    error = function(e) 0
  )
  if (length(myres) == 1) {
    append_placeholder(locus)
    return(invisible(NULL))
  }

  PPH <- data.frame(t(myres[[1]]))
  PPH$locus <- locus
  res <- myres[[2]]
  res$locus <- locus
  append_locus(PPH, res)
  invisible(NULL)
}

invisible(mclapply(seq_len(nrow(z)), funk, mc.cores = 3))
# Colocalization (coloc.abf) between a quantitative GWAS (snakemake@input[[1]])
# and a case-control GWAS (snakemake@input[[2]], case fraction 0.067) within
# the regions of snakemake@input[[3]] plus the LRP5 / SCML4 regions of
# snakemake@input[[4]]. Summaries go to snakemake@output[[1]], per-SNP results
# to snakemake@output[[2]].
library(data.table)
library(dplyr)
library(coloc)

prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6

d <- fread(snakemake@input[[1]])
d <- select(d, ID, CHR, POS, TOTALSAMPLESIZE, BETA, SE, pvalue, EAF)
d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)

x <- fread(snakemake@input[[2]])
# Flip the effect whenever REF sorts after EFF.
x$BETA <- ifelse(x$REF > x$EFF, -1 * x$BETA, x$BETA)
x <- select(x, ID, TOTALSAMPLESIZE, BETA, SE, pvalue, EAF)
x$MAF <- ifelse(x$EAF > 0.5, 1 - x$EAF, x$EAF)
names(x) <- c('ID', 'N', 'beta', 'se', 'p', 'eaf', 'maf')

d <- inner_join(d, x, by = 'ID')

# When the second data set has no allele frequencies at all, reuse the MAF of
# the first one.
if (sum(is.na(d$eaf)) == nrow(d)) {
  d$maf <- d$MAF
}

z <- fread(snakemake@input[[3]])
z$CHR <- ifelse(z$CHR == 'X', '23', z$CHR)
z$CHR <- as.integer(z$CHR)

z1 <- fread(snakemake@input[[4]])
z1$CHR <- ifelse(z1$CHR == 'X', '23', z1$CHR)
z1$CHR <- as.integer(z1$CHR)
z1 <- filter(z1, nearestGene == 'LRP5' | nearestGene == 'SCML4')

z <- rbind(z, z1)

# All-zero placeholder result for regions without SNPs or where coloc failed.
placeholder <- function(locus) {
  list(
    pph = data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0,
                     PP.H3.abf = 0, PP.H4.abf = 0, locus = locus),
    res = data.frame(snp = 'none', V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0,
                     V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0,
                     internal.sum.lABF = 0, SNP.PP.H4 = 0, locus = locus)
  )
}

# Run coloc for region `i` of `z`; returns list(pph = <summary>, res = <per-SNP>).
coloc_locus <- function(i) {
  row <- z[i, ]
  locus <- paste0('chr', row[['CHR']], '_', row[['nearestGene']])
  temp_df <- filter(
    d,
    CHR == as.integer(row[['CHR']]),
    POS >= as.integer(row[['pos1']]),
    POS <= as.integer(row[['pos2']])
  )
  if (nrow(temp_df) == 0) {
    return(placeholder(locus))
  }

  temp_df <- filter(temp_df, SE > 0, se > 0)
  s_pheno <- 0.067
  data1 <- list(beta = temp_df$BETA, varbeta = temp_df$SE^2, N = temp_df$TOTALSAMPLESIZE,
                type = 'quant', snp = temp_df$ID, MAF = temp_df$MAF)
  data2 <- list(beta = temp_df$beta, varbeta = temp_df$se^2, N = temp_df$N,
                type = 'cc', snp = temp_df$ID, s = s_pheno, MAF = temp_df$maf)

  # Any coloc error is mapped to the sentinel 0 and reported as a placeholder.
  myres <- tryCatch(
    suppressWarnings(coloc.abf(data1, data2, p1 = prior1, p2 = prior2, p12 = prior12)),
    error = function(e) 0
  )
  if (length(myres) == 1) {
    return(placeholder(locus))
  }

  PPH <- data.frame(t(myres[[1]]))
  PPH$locus <- locus
  res <- myres[[2]]
  res$locus <- locus
  list(pph = PPH, res = res)
}

# seq_len() keeps an empty region table from producing the bogus indices 1, 0
# that `1:nrow(z)` would yield.
results <- lapply(seq_len(nrow(z)), coloc_locus)

pph <- data.frame(do.call('rbind', lapply(results, `[[`, 'pph')))
res <- data.frame(do.call('rbind', lapply(results, `[[`, 'res')))

write.table(pph, snakemake@output[[1]], sep = '\t', row.names = FALSE, col.names = TRUE, quote = FALSE)
write.table(res, snakemake@output[[2]], sep = '\t', row.names = FALSE, col.names = TRUE, quote = FALSE)
# Colocalization (coloc.abf) between the GWAS in snakemake@input[[1]] and the
# second trait in snakemake@input[[2]], in a fixed window around every lead
# variant listed in snakemake@input[[3]]. Summaries go to
# snakemake@output[[1]], per-SNP results to snakemake@output[[2]].
library(data.table)
library(dplyr)
library(coloc)

prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6

# Half-width of the window around each lead variant, in base pairs.
# The previous code used POS - 250000 but POS + 25000; the asymmetric upper
# bound looks like a dropped zero, so both sides now use the same distance.
window_bp <- 250000

gwas_path <- snakemake@input[[1]]
trait_path <- snakemake@input[[2]]

d <- fread(gwas_path, select = c('ID', 'CHR', 'POS', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue', 'EAF'))
d <- select(d, ID, CHR, POS, TOTALSAMPLESIZE, BETA, SE, pvalue, EAF)
d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)

x <- fread(trait_path, select = c('ID', 'REF', 'EFF', 'BETA', 'EAF', 'SE', 'N', 'pvalue'))
# Flip the effect whenever REF sorts after EFF.
x$BETA <- ifelse(x$REF > x$EFF, -1 * x$BETA, x$BETA)
x <- select(x, ID, N, BETA, SE, pvalue, EAF)
x$MAF <- ifelse(x$EAF > 0.5, 1 - x$EAF, x$EAF)
names(x) <- c('ID', 'N', 'beta', 'se', 'p', 'eaf', 'maf')

d <- inner_join(d, x, by = 'ID')

# When the second trait has no allele frequencies at all, reuse the GWAS MAF.
if (sum(is.na(d$eaf)) == nrow(d)) {
  d$maf <- d$MAF
}

z <- fread(snakemake@input[[3]])
z$CHR <- ifelse(z$CHR == 'X', '23', z$CHR)
z$CHR <- as.integer(z$CHR)

# Case fractions for the case-control second traits, keyed by the pattern
# searched for in the trait file name. Order matters: when several patterns
# match, the last one wins (same as the original chain of `if` statements).
case_fractions <- c(
  PCOS = (1184 + 670 + 157 + 658 + 984 + 485 + 462) /
    (1184 + 670 + 157 + 658 + 984 + 485 + 462 + 5799 + 1379 + 2807 + 6774 + 2963 + 407 + 96172),
  miscarriage = 49996 / (174109 + 49996),
  POP = 7053 / (57407 + 7053),
  endometriosis = 1496 / (192678 + 1496),
  Preeclampsia = 4630 / (4630 + 373345),
  leiomyoma_uterus = 14569 / (85792 + 14569)
)
pattern_hit <- vapply(
  names(case_fractions),
  function(pattern) grepl(pattern, trait_path),
  logical(1)
)
# NULL means the second trait is treated as quantitative.
s_pheno <- if (any(pattern_hit)) unname(tail(case_fractions[pattern_hit], 1)) else NULL

# coloc dataset for the GWAS; case-control for allPTD / postTerm file names.
gwas_dataset <- function(temp_df) {
  dataset <- list(
    beta = temp_df$BETA, varbeta = temp_df$SE^2, N = temp_df$TOTALSAMPLESIZE,
    type = 'quant', snp = temp_df$ID, MAF = temp_df$MAF
  )
  if (grepl('allPTD', gwas_path)) {
    dataset$type <- 'cc'
    dataset$s <- 0.067
  } else if (grepl('postTerm', gwas_path)) {
    dataset$type <- 'cc'
    dataset$s <- 0.122
  }
  dataset
}

# coloc dataset for the second trait.
trait_dataset <- function(temp_df) {
  dataset <- list(
    beta = temp_df$beta, varbeta = temp_df$se^2, N = temp_df$N,
    type = 'quant', snp = temp_df$ID, MAF = temp_df$maf
  )
  if (!is.null(s_pheno)) {
    dataset$type <- 'cc'
    dataset$s <- s_pheno
  }
  dataset
}

# All-zero placeholder result for windows without SNPs or where coloc failed.
placeholder <- function(locus) {
  list(
    pph = data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0,
                     PP.H3.abf = 0, PP.H4.abf = 0, locus = locus),
    res = data.frame(snp = 'none', V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0,
                     V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0,
                     internal.sum.lABF = 0, SNP.PP.H4 = 0, locus = locus)
  )
}

# Run coloc for lead variant `i` of `z`; returns list(pph = ..., res = ...).
coloc_locus <- function(i) {
  row <- z[i, ]
  locus <- paste0('chr', row[['CHR']], '_', row[['nearestGene']])
  lead_pos <- as.integer(row[['POS']])
  temp_df <- filter(
    d,
    CHR == as.integer(row[['CHR']]),
    POS >= lead_pos - window_bp,
    POS <= lead_pos + window_bp
  )
  if (nrow(temp_df) == 0) {
    return(placeholder(locus))
  }

  temp_df <- filter(temp_df, SE > 0, se > 0)
  data1 <- gwas_dataset(temp_df)
  data2 <- trait_dataset(temp_df)

  # Any coloc error is mapped to the sentinel 0 and reported as a placeholder.
  myres <- tryCatch(
    suppressWarnings(coloc.abf(data1, data2, p1 = prior1, p2 = prior2, p12 = prior12)),
    error = function(e) 0
  )
  if (length(myres) == 1) {
    return(placeholder(locus))
  }

  PPH <- data.frame(t(myres[[1]]))
  PPH$locus <- locus
  res <- myres[[2]]
  res$locus <- locus
  list(pph = PPH, res = res)
}

# seq_len() keeps an empty lead-variant table from producing the bogus indices
# 1, 0 that `1:nrow(z)` would yield.
results <- lapply(seq_len(nrow(z)), coloc_locus)

pph <- data.frame(do.call('rbind', lapply(results, `[[`, 'pph')))
res <- data.frame(do.call('rbind', lapply(results, `[[`, 'res')))

write.table(pph, snakemake@output[[1]], sep = '\t', row.names = FALSE, col.names = TRUE, quote = FALSE)
write.table(res, snakemake@output[[2]], sep = '\t', row.names = FALSE, col.names = TRUE, quote = FALSE)
14 15 | script: 'coloc.R' |
23 24 25 26 27 28 29 30 31 | run: df_list= list() for i in input: d= pd.read_csv(i, sep= '\t', header= 0) x= i.split('pph_')[1].replace('.txt', '') d['trait']= x df_list.append(d) d= pd.concat(df_list) d.to_csv(output[0], sep= '\t', header= True, index= False) |
39 40 41 42 43 44 45 46 47 | run: df_list= list() for i in input: d= pd.read_csv(i, sep= '\t', header= 0) x= i.split('results_')[1].replace('.txt', '') d['trait']= x df_list.append(d) d= pd.concat(df_list) d.to_csv(output[0], sep= '\t', header= True, index= False) |
61 62 | script: 'coloc_GA_vs_PTD.R' |
75 76 | script: 'coloc_GA_vs_PTD_GW.R' |
89 90 | script: 'coloc_BW_GA_GW.R' |
import pandas as pd
import numpy as np


def flip_beta(df):
    'Flip EFF and REF allele if REF> EFF. Flip beta direction with same condition. Assumed column names: BETA, REF, EFF.'
    # Evaluate the condition once, before any column is overwritten.
    swap = df.REF > df.EFF
    new_ref = np.where(swap, df.EFF, df.REF)
    new_eff = np.where(swap, df.REF, df.EFF)
    df['BETA'] = np.where(swap, -1 * df.BETA, df.BETA)
    df['REF'] = new_ref
    df['EFF'] = new_eff
    return df


def add_ID(x):
    """Recode indels as I/D, add an allele-sorted ID column and align BETA.

    The longer allele of an indel becomes 'I' and the other one 'D'. ID is
    CHR:POS:<allele1>:<allele2> with the two alleles in sorted order, and
    flip_beta() then swaps REF/EFF (negating BETA) wherever REF > EFF.
    The frame is modified in place and also returned.
    """
    # Allele lengths must be taken before REF is overwritten: comparing against
    # the already recoded 'I' (length 1) mislabelled deletions whose shorter
    # allele has more than one base, and left their BETA unflipped.
    ref_len = x.REF.str.len()
    eff_len = x.EFF.str.len()
    x['REF'] = np.where(ref_len > eff_len, 'I', x.REF)
    x['EFF'] = np.where(ref_len < eff_len, 'I', x.EFF)
    x['REF'] = np.where(x.EFF == 'I', 'D', x.REF)
    x['EFF'] = np.where(x.REF == 'I', 'D', x.EFF)
    prefix = x.CHR.apply(str) + ':' + x.POS.apply(str) + ':'
    x['ID'] = np.where(x.REF > x.EFF, prefix + x.EFF + ':' + x.REF, prefix + x.REF + ':' + x.EFF)
    x = flip_beta(x)
    return x


def format_df(x, reg):
    """Split the haplotype results in CSV file `x` into one table per haplotype.

    Rows are kept when their position falls inside a region of `reg` (columns
    CHR, pos1, pos2) on the same chromosome. The h1/h2/h3 tables are written
    to snakemake.output[0..2].
    """
    out_cols = ['ID', 'CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']
    d = pd.read_csv(x, sep=',', header=0)
    d['chr'] = d.chr.apply(str)
    # Compare chromosomes as strings on both sides so the merge keys share a dtype.
    reg = reg.assign(CHR=reg.CHR.apply(str))
    d = pd.merge(d, reg, left_on='chr', right_on='CHR')
    d = d.loc[((d.pos >= d.pos1) & (d.pos <= d.pos2)), :]
    for idx, hap in enumerate(['h1', 'h2', 'h3']):
        # .copy(): add_ID() writes into the frame it receives.
        sub = d.loc[:, ['chr', 'pos', 'ref', 'alt', hap + '.coef', hap + '.se', hap + '.pval']].copy()
        sub.columns = ['CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']
        sub = add_ID(sub)
        sub.to_csv(snakemake.output[idx], sep='\t', header=True, index=False, columns=out_cols)
    print('Completed file:' + x)


# `snakemake` is injected by Snakemake when this file runs as a `script:`;
# the guard lets the helpers above be imported on their own (e.g. in tests).
if 'snakemake' in globals():
    regions = pd.read_csv(snakemake.input[0], sep='\t', header=0)
    format_df(snakemake.input[1], regions)
29 30 | script: 'format_CCHMC_haplotype.py' |
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 | run: d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['contig', 'position', 'testedAllele', 'otherAllele', 'h.Bmnt', 'h.Bmnt.se', 'h.Bmnt.p', 'h.Bmt', 'h.Bmt.se', 'h.Bmt.p', 'h.Bft', 'h.Bft.se', 'h.Bft.p'])[['contig', 'position', 'testedAllele', 'otherAllele', 'h.Bmnt', 'h.Bmnt.se', 'h.Bmnt.p', 'h.Bmt', 'h.Bmt.se', 'h.Bmt.p', 'h.Bft', 'h.Bft.se', 'h.Bft.p']] d.columns= ['CHR', 'POS', 'EFF', 'REF', 'h2_beta', 'h2_se', 'h2_pvalue', 'h1_beta', 'h1_se', 'h1_pvalue', 'h3_beta', 'h3_se', 'h3_pvalue'] h1= d.loc[:, ['CHR', 'POS', 'EFF', 'REF', 'h1_beta', 'h1_se', 'h1_pvalue']] h1.columns= ['CHR', 'POS', 'EFF', 'REF', 'BETA', 'SE', 'pvalue'] h2= d.loc[:, ['CHR', 'POS', 'EFF', 'REF', 'h2_beta', 'h2_se', 'h2_pvalue']] h2.columns= ['CHR', 'POS', 'EFF', 'REF', 'BETA', 'SE', 'pvalue'] h3= d.loc[:, ['CHR', 'POS', 'EFF', 'REF', 'h3_beta', 'h3_se', 'h3_pvalue']] h3.columns= ['CHR', 'POS', 'EFF', 'REF', 'BETA', 'SE', 'pvalue'] h1= add_ID(h1) h2= add_ID(h2) h3= add_ID(h3) h1.to_csv(output[0], sep= '\t', header= True, index= False, columns= ['ID', 'CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']) h2.to_csv(output[1], sep= '\t', header= True, index= False, columns= ['ID', 'CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']) h3.to_csv(output[2], sep= '\t', header= True, index= False, columns= ['ID', 'CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']) |
64 65 66 67 | run: d= pd.read_csv(input[0], sep= '\t', header= 0) d[['CHR', 'POS', 'REF', 'EFF']]= d.snp.str.split(':', expand= True) h1= d.loc[:, ['CHR', 'POS', 'EFF', 'REF', 'beta_h1', 'se_h1', 'pvalue_h1']] |
90 91 | shell: '/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}' |
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | run: h1= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['MarkerName', 'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value', 'Direction', 'HetISq', 'HetPVal']) h2= pd.read_csv(input[1], sep= '\t', header= 0, usecols= ['MarkerName', 'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value', 'HetISq', 'HetPVal']) h3= pd.read_csv(input[2], sep= '\t', header= 0, usecols= ['MarkerName', 'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value', 'HetISq', 'HetPVal']) h1['N_cohorts']= 6 - h1['Direction'].apply(lambda x: str.count(x, '?')) h1.columns= ['MarkerName', 'Allele1', 'Allele2', 'beta_h1', 'se_h1', 'pvalue_h1', 'Direction_h1', 'HetISq_h1', 'HetPval_h1', 'n_cohorts'] h1['beta_h1']= np.where(h1.Allele2> h1.Allele1, -1 * h1.beta_h1, h1.beta_h1) h2.columns= ['MarkerName', 'Allele1', 'Allele2', 'beta_h2', 'se_h2', 'pvalue_h2', 'HetISq_h2', 'HetPval_h2'] h2['beta_h2']= np.where(h2.Allele2> h2.Allele1, -1 * h2.beta_h2, h2.beta_h2) h3.columns= ['MarkerName', 'Allele1', 'Allele2', 'beta_h3', 'se_h3', 'pvalue_h3', 'HetISq_h3', 'HetPval_h3'] h3['beta_h3']= np.where(h3.Allele2> h3.Allele1, -1 * h3.beta_h3, h3.beta_h3) d= pd.merge(h1, h2[['MarkerName', 'beta_h2', 'se_h2', 'pvalue_h2', 'HetISq_h2', 'HetPval_h2']], on= ['MarkerName'], how= 'inner') d= pd.merge(d, h3[['MarkerName', 'beta_h3', 'se_h3', 'pvalue_h3', 'HetISq_h3', 'HetPval_h3']], on= ['MarkerName'], how= 'inner') d= d[['MarkerName', 'Allele1', 'Allele2', 'beta_h1', 'se_h1', 'pvalue_h1', 'Direction_h1', 'HetISq_h1', 'HetPval_h1', 'n_cohorts', 'beta_h2', 'se_h2', 'pvalue_h2', 'HetISq_h2', 'HetPval_h2', 'beta_h3', 'se_h3', 'pvalue_h3', 'HetISq_h3', 'HetPval_h3']] d['Allele1'], d['Allele2']= np.where(d.Allele2> d.Allele1, [d.Allele2, d.Allele1], [d.Allele1, d.Allele2]) d.to_csv(output[0], sep= '\t', header= True, index= False) |
9 10 11 12 13 14 15 16 | run: d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'RSID', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue']) d.drop_duplicates(['ID'], keep= 'first', inplace= True) d.sort_values('pvalue', inplace= True, ascending= True) d= d.iloc[0:99999, :] d= d[['RSID', 'CHR', 'POS', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue']] d.columns= ['CHR', 'POS', 'RSID', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'PVALUE'] d.to_csv(output[0], sep= '\t', header= True, index= False) |
25 26 | run: d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'RSID', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue']) |
45 46 47 | run: d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'RSID', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue']) d.drop_duplicates(['ID'], keep= 'first', inplace= True) |
65 66 | run: d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'RSID', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue']) |
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | run: d= pd.read_csv(input[0], sep= '\t', header= 0) top_list= list() non_top_list= list() for lname in set(d.locus): df_temp= d.loc[d.locus== lname, :] df_temp.sort_values(['PP'], ascending= False, inplace= True) df_temp['PPcum']= df_temp.PP.cumsum() top_vars= df_temp.loc[df_temp.PPcum< 0.95, :] non_top= df_temp.loc[df_temp.PPcum>= 0.95, :] top_list.append(top_vars) non_top_list.append(non_top) top= pd.concat(top_list) non_top= pd.concat(non_top_list) top.to_csv(output[0], sep= '\t', header= True, index= False) non_top.to_csv(output[1], sep= '\t', header= True, index= False) |
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | run: d= pd.read_csv(input[0], sep='\t', header= 0) pli= pd.read_csv(input[1], header= 0, sep= '\t', usecols= ['gene_id', 'gene', 'chromosome', 'start_position', 'end_position', 'pLI'])[['gene_id', 'gene', 'chromosome', 'start_position', 'end_position', 'pLI']] d= d.loc[d.nearestGene.isin(pli.gene.values), :] pli.columns= ['EID', 'gene', 'CHR', 'start', 'end', 'pLI'] pli.dropna(subset= ['pLI'], inplace= True) pli_genes= pli.loc[pli.pLI>= 0.9, 'gene'].values.tolist() df= d.loc[d.nearestGene.isin(pli_genes), :] b= len(pli_genes) - df.shape[0] c= d.shape[0] - df.shape[0] d= df.shape[0] a= pli.shape[0] - b - d - c oddsratio, pvalue = st.fisher_exact([[a, b],[c, d]], alternative= 'greater') z= ['pli', a, b, c, d, (d / (d+c)), (b / (a + b)), oddsratio, pvalue] with open(output[0], 'w') as file_handler: file_handler.write('\t'.join([str(item) for item in z]) + '\n') |
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 | run: d= pd.read_csv(input[0], sep= '\t', header= 0) rna= pd.read_csv(input[1], sep= '\t', header= 0) rna['GA']= np.where(rna['Gene name'].isin(d.nearestGene.values), 1, 0) rna['NX_rk']= rna.groupby('Gene name')['NX'].rank('average', ascending= True) df_list= list() for tissue in set(rna.Tissue): ilist= rna.loc[((rna.GA== 1) & (rna.Tissue == tissue)), 'NX_rk'] base= rna.loc[((rna.GA== 0) & (rna.Tissue == tissue)), 'NX_rk'] mannw_pvalue= st.mannwhitneyu(ilist, base, alternative= 'greater')[1] i_median= np.median(ilist) base_median= np.median(base) df_list.append([tissue, i_median, base_median, mannw_pvalue]) z= pd.DataFrame.from_records(df_list) z.to_csv(output[0], sep= '\t', header= ['tissue', 'i_listmedian', 'base_list_median', 'MannW_pvalue'], index= False) |
103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 | run: pheno= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'nearestGene']) x= pd.read_csv(input[1], sep= '\t', header= None, names= ['CHR', 'start', 'end', 'gene', 'EnsembleID']) add= [line.strip() for line in open(input[2], 'r')] df= pheno.loc[pheno.nearestGene.isin(add), :] b= len(add) - df.shape[0] c= pheno.shape[0] - df.shape[0] d= df.shape[0] a= x.shape[0] - b - d - c oddsratio, pvalue = st.fisher_exact([[a, b],[c, d]], alternative= 'greater') z= ['dominant', a, b, c, d, (d / (d+c)), (b / (a + b)), oddsratio, pvalue] with open(output[0], 'w') as file_handler: file_handler.write('\t'.join([str(item) for item in z]) + '\n') rec= [line.strip() for line in open(input[3], 'r')] df= pheno.loc[pheno.nearestGene.isin(rec), :] b= len(rec) - df.shape[0] c= pheno.shape[0] - df.shape[0] d= df.shape[0] a= x.shape[0] - b - d - c oddsratio, pvalue = st.fisher_exact([[a, b],[c, d]], alternative= 'greater') z= ['recessive', a, b, c, d, (d / (d+c)), (b / (a + b)), oddsratio, pvalue] with open(output[0], 'a') as file_handler: file_handler.write('\t'.join([str(item) for item in z]) + '\n') |
134 135 136 | run: d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'nearestGene']) stc= pd.read_csv(input[1], header= 0, sep= '\t', usecols= ['geneid', 'log2FoldChange', 'pvalue'])[['geneid', 'log2FoldChange', 'pvalue']] |
159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 | run: pheno= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'nearestGene']) x= pd.read_csv(input[1], sep= '\t', header= None, names= ['CHR', 'start', 'end', 'gene', 'EnsembleID']) lab= pd.read_csv(input[2], sep= '\t', header= 0) for i in set(lab.Cell_type): temp_df= lab.loc[lab.Cell_type== i, :] df= pheno.loc[pheno.nearestGene.isin(temp_df.gene_name.to_list()), :] b= len(temp_df.gene_name.to_list()) - df.shape[0] c= pheno.shape[0] - df.shape[0] d= df.shape[0] a= x.shape[0] - b - d - c oddsratio, pvalue = st.fisher_exact([[a, b],[c, d]], alternative= 'greater') z= [i, a, b, c, d, (d / (d+c)), (b / (a + b)), oddsratio, pvalue] with open(output[0], 'a') as file_handler: file_handler.write('\t'.join([str(item) for item in z]) + '\n') df= pheno.loc[pheno.nearestGene.isin(lab.gene_name.to_list()), :] |
# Colocalization (coloc.abf) between the GWAS in snakemake@input[[1]] and the
# per-gene association statistics in snakemake@input[[2]] (columns used here:
# SNP, Gene, Freq, b, se), one test per gene. Summaries are appended to
# snakemake@output[[1]] and per-SNP results to snakemake@output[[2]].
library(data.table)
library(dplyr)
library(coloc)
library(parallel)

gwas_path <- snakemake@input[[1]]

df <- fread(gwas_path, select = c('RSID', 'BETA', 'SE', 'TOTALSAMPLESIZE', 'EAF'))
df <- filter(df, !duplicated(RSID))
df$MAF <- ifelse(df$EAF > 0.5, 1 - df$EAF, df$EAF)

z <- fread(snakemake@input[[2]])
z$n <- 206
# Minor allele frequency: fold frequencies above 0.5. The previous condition
# (`Freq < 0.5`) was inverted and stored the major allele frequency instead.
z$maf <- ifelse(z$Freq > 0.5, 1 - z$Freq, z$Freq)

df <- inner_join(df, z, by = c('RSID' = 'SNP'))
rm(z)

pph_outfile <- snakemake@output[[1]]
results_outfile <- snakemake@output[[2]]

# Header lines are written once; every gene appends its rows underneath.
# NOTE(review): the second header names its 2nd column 'V.df' while the rows
# carry V.df1 - left unchanged in case downstream code relies on the name.
cat('nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tprotein\n', file = pph_outfile)
cat('snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tprotein\n', file = results_outfile)

prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6

df <- data.frame(df)

# Append one summary row and the matching per-SNP rows to the output files.
append_gene <- function(pph, res) {
  fwrite(pph, pph_outfile, sep = '\t', row.names = FALSE, col.names = FALSE, quote = FALSE, append = TRUE)
  fwrite(res, results_outfile, sep = '\t', row.names = FALSE, col.names = FALSE, quote = FALSE, append = TRUE)
}

# All-zero placeholder rows for genes without SNPs or where coloc failed.
append_placeholder <- function(protein) {
  pph <- data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0,
                    PP.H3.abf = 0, PP.H4.abf = 0, protein = protein)
  res <- data.frame(snp = 'none', V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0,
                    V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0,
                    internal.sum.lABF = 0, SNP.PP.H4 = 0, protein = protein)
  append_gene(pph, res)
  print('next')
}

# Run coloc on the rows of one gene (`temp_df`) and append the results.
# NOTE(review): workers append to the same two files concurrently, so row
# order across genes is not guaranteed.
colocalization_eqtl <- function(temp_df) {
  protein <- unique(temp_df$Gene)
  if (nrow(temp_df) == 0) {
    append_placeholder(protein)
    return(invisible(NULL))
  }

  temp_df <- filter(temp_df, SE > 0, se > 0)

  # The GWAS is case-control for allPTD / postTerm file names, else quantitative.
  data1 <- list(beta = temp_df$BETA, varbeta = temp_df$SE^2, N = temp_df$TOTALSAMPLESIZE,
                type = 'quant', snp = temp_df$RSID, MAF = temp_df$MAF)
  if (grepl('allPTD', gwas_path)) {
    data1$type <- 'cc'
    data1$s <- 0.067
  } else if (grepl('postTerm', gwas_path)) {
    data1$type <- 'cc'
    data1$s <- 0.122
  }
  data2 <- list(beta = temp_df$b, varbeta = temp_df$se^2, N = temp_df$n,
                type = 'quant', snp = temp_df$RSID, MAF = temp_df$maf)

  # Any coloc error is mapped to the sentinel 0 and reported as a placeholder.
  myres <- tryCatch(
    suppressWarnings(coloc.abf(data1, data2, p1 = prior1, p2 = prior2, p12 = prior12)),
    error = function(e) 0
  )
  if (length(myres) == 1) {
    append_placeholder(protein)
    return(invisible(NULL))
  }

  PPH <- data.frame(t(myres[[1]]))
  PPH$protein <- protein
  res <- myres[[2]]
  res$protein <- protein
  append_gene(PPH, res)
  invisible(NULL)
}

invisible(mclapply(split(df, df$Gene), colocalization_eqtl, mc.cores = 3))
# Colocalization (coloc.abf) between the GWAS in snakemake@input[[1]] and the
# eQTL statistics in snakemake@input[[2]], one test per gene_id. Summaries are
# appended to snakemake@output[[1]], per-SNP results to snakemake@output[[2]].
library(data.table)
library(dplyr)
library(coloc)
library(parallel)

df <- fread(snakemake@input[[1]], select = c('ID', 'BETA', 'SE', 'TOTALSAMPLESIZE', 'EAF'))
df <- filter(df, !duplicated(ID))
df$MAF <- ifelse(df$EAF > 0.5, 1 - df$EAF, df$EAF)

z <- fread(snakemake@input[[2]], select = c('gene_id', 'ID', 'maf', 'slope', 'slope_se'))
# eQTL sample size, chosen from the tissue named in the input path.
z$n <- if (grepl('Ovary', snakemake@input[[2]])) {
  167
} else if (grepl('Uterus', snakemake@input[[2]])) {
  269
} else {
  141
}

df <- inner_join(df, z, by = 'ID')
rm(z)

pph_outfile <- snakemake@output[[1]]
results_outfile <- snakemake@output[[2]]

# Header lines are written once; every gene appends its rows underneath.
cat('nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tprotein\n', file = snakemake@output[[1]])
cat('snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tprotein\n', file = snakemake@output[[2]])

prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6

df <- data.frame(df)

# Run coloc on the rows of one gene (`temp_df`) and append the results; genes
# without rows or with a failing coloc call get all-zero placeholder rows.
colocalization_eqtl <- function(temp_df) {
  protein <- unique(temp_df$gene_id)

  # Append a summary row and its per-SNP rows to the two output files.
  emit <- function(summary_row, snp_rows) {
    fwrite(summary_row, pph_outfile, sep = '\t', row.names = FALSE, col.names = FALSE, quote = FALSE, append = TRUE)
    fwrite(snp_rows, results_outfile, sep = '\t', row.names = FALSE, col.names = FALSE, quote = FALSE, append = TRUE)
  }
  emit_placeholder <- function() {
    emit(
      data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0,
                 PP.H3.abf = 0, PP.H4.abf = 0, protein = protein),
      data.frame(snp = 'none', V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0,
                 V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0,
                 internal.sum.lABF = 0, SNP.PP.H4 = 0, protein = protein)
    )
    print('next')
  }

  if (nrow(temp_df) == 0) {
    return(emit_placeholder())
  }

  temp_df <- filter(temp_df, SE > 0, slope_se > 0)

  # The GWAS is case-control for allPTD / postTerm file names, else quantitative.
  gwas <- list(beta = temp_df$BETA, varbeta = temp_df$SE^2, N = temp_df$TOTALSAMPLESIZE)
  if (grepl('allPTD', snakemake@input[[1]])) {
    gwas <- c(gwas, list(type = 'cc', snp = temp_df$ID, s = 0.067, MAF = temp_df$MAF))
  } else if (grepl('postTerm', snakemake@input[[1]])) {
    gwas <- c(gwas, list(type = 'cc', snp = temp_df$ID, s = 0.122, MAF = temp_df$MAF))
  } else {
    gwas <- c(gwas, list(type = 'quant', snp = temp_df$ID, MAF = temp_df$MAF))
  }
  eqtl <- list(beta = temp_df$slope, varbeta = temp_df$slope_se^2, N = temp_df$n,
               type = 'quant', snp = temp_df$ID, MAF = temp_df$maf)

  # Any coloc error is mapped to the sentinel 0.
  myres <- tryCatch(
    {
      suppressWarnings(coloc.abf(gwas, eqtl, p1 = prior1, p2 = prior2, p12 = prior12))
    },
    error = function(e) {
      return(0)
    }
  )

  if (length(myres) == 1) {
    emit_placeholder()
  } else {
    summary_row <- data.frame(t(myres[[1]]))
    summary_row$protein <- protein
    snp_rows <- myres[[2]]
    snp_rows$protein <- protein
    emit(summary_row, snp_rows)
  }
}

mclapply(split(df, df$gene_id), colocalization_eqtl, mc.cores = 3)
11 12 | script: 'coloc_endometrium.R' |
20 21 22 23 24 25 26 27 | run: df_list= list() for i in input: d= pd.read_csv(i, header= 0, sep= '\t', usecols= ['ID']) df_list.append(d) x= pd.concat(df_list) x.drop_duplicates('ID', keep= 'first', inplace= True) x.to_csv(output[0], sep= '\t', header= True, index= False) |
36 37 38 39 40 41 42 43 44 45 46 | run: x= ['ID', 'gene_id', 'maf', 'slope', 'slope_se'] with open(output[0], mode="w") as file: file.write("\t".join(x) + "\n") d= pd.read_csv(input[1], sep= '\t', header= 0) for chunk in pd.read_csv(input[0], sep= '\t', header= 0, chunksize= 500000, compression= 'gzip', usecols= ['gene_id', 'variant_id', 'maf', 'slope', 'slope_se']) : chunk[['CHR', 'POS', 'REF', 'EFF', 'build']]= chunk.variant_id.str.split('_', expand= True) chunk['ID']= np.where(chunk.REF> chunk.EFF, chunk.CHR + ':' + chunk.POS + ':' + chunk.EFF + ':' + chunk.REF, chunk.CHR + ':' + chunk.POS + ':' + chunk.REF + ':' + chunk.EFF) chunk= chunk[['ID', 'gene_id', 'maf', 'slope', 'slope_se']] chunk= pd.merge(chunk, d, on= 'ID') chunk.to_csv(output[0], sep= '\t', header= False, index= False, mode= 'a') |
60 61 | script: 'coloc_GTEx.R' |
import pandas as pd
import numpy as np

# Column order shared by the three haplotype output files.
OUT_COLUMNS = ['ID', 'CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']


def flip_beta(df):
    """Order alleles alphabetically and flip the effect direction to match.

    Rows where REF > EFF get REF and EFF swapped and BETA multiplied by -1.
    Assumed column names: BETA, REF, EFF. The frame is modified in place and
    also returned.
    """
    swap = df.REF > df.EFF
    ref, eff = df.REF.copy(), df.EFF.copy()
    df['BETA'] = np.where(swap, -1 * df.BETA, df.BETA)
    df['REF'] = np.where(swap, eff, ref)
    df['EFF'] = np.where(swap, ref, eff)
    return df


def add_ID(x):
    """Recode indels as I/D, add a CHR:POS:A1:A2 ID column and align BETA.

    The longer allele of a pair becomes 'I' and the shorter one 'D'; alleles
    of equal length are left untouched. The ID lists the two alleles in
    alphabetical order. The frame is modified in place and also returned.
    """
    # Both length comparisons are taken from the ORIGINAL alleles. Comparing
    # against an already recoded REF ('I', length 1) mislabels indels whose
    # shorter allele has more than one base.
    ref_len = x.REF.str.len()
    eff_len = x.EFF.str.len()
    ref_longer = ref_len > eff_len
    eff_longer = eff_len > ref_len
    new_ref = np.where(ref_longer, 'I', np.where(eff_longer, 'D', x.REF))
    new_eff = np.where(eff_longer, 'I', np.where(ref_longer, 'D', x.EFF))
    x['REF'] = new_ref
    x['EFF'] = new_eff

    prefix = x.CHR.apply(str) + ':' + x.POS.apply(str) + ':'
    x['ID'] = np.where(x.REF > x.EFF,
                       prefix + x.EFF + ':' + x.REF,
                       prefix + x.REF + ':' + x.EFF)
    x = flip_beta(x)
    return x


def format_df(x, chrom='2', pos=113521754):
    """Extract one variant from a haplotype results file and write h1/h2/h3.

    x:     path to a comma-separated file with columns chr, pos, ref, alt and
           h{1,2,3}.coef / .se / .pval.
    chrom: chromosome to keep, compared as a string.
    pos:   position to keep.

    The three haplotype tables are written tab-separated to
    snakemake.output[0], [1] and [2].
    """
    d = pd.read_csv(x, sep=',', header=0)
    d['chr'] = d.chr.apply(str)
    d = d.loc[(d.chr == chrom) & (d.pos == pos), :]

    for i, hap in enumerate(('h1', 'h2', 'h3')):
        h = d.loc[:, ['chr', 'pos', 'ref', 'alt',
                      hap + '.coef', hap + '.se', hap + '.pval']].copy()
        h.columns = ['CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']
        h = add_ID(h)
        h.to_csv(snakemake.output[i], sep='\t', header=True, index=False,
                 columns=OUT_COLUMNS)
    print('Completed file:' + x)


# Skipped only when the file is imported as a module without a `snakemake`
# object; a direct run still reaches the call.
if __name__ == '__main__' or 'snakemake' in globals():
    format_df(snakemake.input[0])
9 10 | script: 'format_CCHMC_haplotype.py' |
20 21 22 23 24 25 | run: d= pd.read_csv(input[0], sep= ' ', header= 0) d['CHR']= 2 d['POS']= 113521754 d['REF']= 'C' d['EFF']= 'T' |
47 48 49 50 51 52 53 | run: d= pd.read_csv(input[0], sep= ' ', header= 0) d[['CHR', 'POS', 'REF', 'EFF']]= d.snp.str.split(':', expand= True) h1= d.loc[:, ['CHR', 'POS', 'EFF', 'REF', 'beta_h1', 'se_h1', 'pvalue_h1']] h1.columns= ['CHR', 'POS', 'EFF', 'REF', 'BETA', 'SE', 'pvalue'] h2= d.loc[:, ['CHR', 'POS', 'EFF', 'REF', 'beta_h2', 'se_h2', 'pvalue_h2']] h2.columns= ['CHR', 'POS', 'EFF', 'REF', 'BETA', 'SE', 'pvalue'] |
73 74 | shell: '/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}' |
84 85 86 87 88 | run: h1= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['MarkerName', 'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value']) h1.columns= ['ID', 'EFF', 'REF', 'beta_MT', 'se_MT', 'pvalue_MT'] h1['beta_MT']= np.where(h1.REF > h1.EFF, -1 * h1.beta_MT, h1.beta_MT) h2= pd.read_csv(input[1], sep= '\t', header= 0, usecols= ['MarkerName', 'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value']) |
# Tile plots of phenotype z-scores for the two lead variants named in the
# `trait` labels below, restricted to phenotypes with PP.H4.abf > 0.75 in the
# corresponding colocalization table.
#
# snakemake@input[[1]], [[2]], [[3]]: first cohort summary statistics,
#   colocalization results (pheno_FINNGEN) and manifest (phenocode, name).
# snakemake@input[[4]], [[5]], [[6]]: second cohort summary statistics,
#   colocalization results (pheno_PAN_UKBB) and manifest.
# snakemake@output[[1]], [[3]]: the two figures; [[2]]: the plotted table.
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')

options(warn = -1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# ---- First cohort ----
finngen <- fread(snakemake@input[[1]])
finngen_coloc <- fread(snakemake@input[[2]])
finngen$beta <- as.numeric(finngen$beta)
finngen$se <- as.numeric(finngen$se)
finngen$pvalue <- as.numeric(finngen$pvalue)
finngen_coloc <- filter(finngen_coloc, PP.H4.abf > 0.75)
finngen <- filter(finngen, pheno %in% finngen_coloc$pheno_FINNGEN)

mani <- fread(snakemake@input[[3]], select = c('phenocode', 'name'))
names(mani) <- c('pheno', 'description')
finngen <- inner_join(finngen, mani, by = 'pheno')

# ---- Second cohort ----
ukbb <- fread(snakemake@input[[4]])
ukbb_coloc <- fread(snakemake@input[[5]])
ukbb_coloc <- filter(ukbb_coloc, PP.H4.abf > 0.75)
ukbb <- filter(ukbb, pheno %in% ukbb_coloc$pheno_PAN_UKBB)

mani <- fread(snakemake@input[[6]],
              select = c('phenocode', 'trait_type', 'description'))
mani$pheno <- paste(mani$trait_type, mani$phenocode, sep = '_')
ukbb <- inner_join(ukbb, mani, by = 'pheno')

# ---- Combine ----
finngen$zscore <- finngen$beta / finngen$se
ukbb$zscore <- ukbb$beta / ukbb$se

d <- bind_rows(
  select(finngen, pheno, description, zscore, pvalue, trait),
  select(ukbb, pheno, description, zscore, pvalue, trait)
)

# Clamp z-scores to [-10, 10].
d$zscore <- ifelse(d$zscore > 10, 10, ifelse(d$zscore < -10, -10, d$zscore))

trait_labels <- c('rs28654158 (gestational duration)', 'rs11708067 (birth weight)')
d$trait <- ifelse(d$trait == 'Gestational duration', trait_labels[1], trait_labels[2])
d$trait <- factor(d$trait, levels = rev(trait_labels))

# Shorten long phenotype descriptions: pattern -> label, applied in order.
short_labels <- c(
  'Other diabetes' = 'Other diabetes',
  'Non-insulin-dep' = 'Non-insulin dependent diabetes',
  'Diabetes, varying def' = 'Diabetes, wide',
  'Intestinal adhesions' = 'Intestinal adhesions',
  'Type 2 diabetes, strict' = 'Type 2 diabetes',
  'Type 2 diabetes with other specified/multiple/unspecified complications' = 'Type 2 diabetes with complications',
  'Diabetes, insuline treatment' = 'Diabetes, insuline treatment',
  'Creatinine' = 'Creatinine in urine'
)
for (pattern in names(short_labels)) {
  d$description <- ifelse(grepl(pattern, d$description),
                          short_labels[[pattern]], d$description)
}

# Order rows by hierarchical clustering of the z-scores; the x axis follows
# the resulting order of first appearance of each description.
ord <- hclust(dist(d$zscore, method = "euclidean"), method = "ward.D")$order
d <- d[ord, ]
d$description <- factor(d$description, levels = unique(d$description))

# Layers shared by both figures.
tiles <- ggplot(d, aes(x = description, y = trait, fill = round(zscore),
                       alpha = factor(as.numeric(pvalue < 5e-6)))) +
  geom_tile(colour = "white", size = 1) +
  theme_cowplot(font_size = 8)

p1 <- tiles +
  scale_alpha_discrete(guide = FALSE, range = c(0.3, 1)) +
  scale_fill_gradient2(low = colorBlindBlack8[3], high = colorBlindBlack8[8],
                       mid = 'white', guide = FALSE) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45),
        axis.text.y = element_text(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        plot.margin = unit(c(t = 0, r = 0, b = 0, l = 0), units = 'cm'),
        axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks = element_line(size = 0.3)) +
  coord_equal() +
  labs(x = NULL, y = NULL)

ggsave(snakemake@output[[1]], p1, height = 100, width = 127, units = 'mm', dpi = 300)

fwrite(d, snakemake@output[[2]], sep = '\t')

# Second figure: same tiles, legends kept, x axis drawn on top.
p1 <- tiles +
  scale_alpha_discrete(guide = FALSE) +
  scale_fill_gradient2(low = colorBlindBlack8[3], high = colorBlindBlack8[8],
                       mid = 'white') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        plot.margin = margin(t = 0, r = 0, l = 0, b = 0, unit = 'pt')) +
  scale_x_discrete(position = "top")

ggsave(snakemake@output[[3]], p1, height = 100, width = 140, units = 'mm', dpi = 300)
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
2
of
figures/ADCY5_effect_direction.R
# Fst of the two lead variants compared with the Fst distribution of their
# control variant sets, for three ancestry pairs.
#
# snakemake@input[[1]], [[2]], [[3]]: Fst tables (three columns each), named
#   here CHR, POS and FST_EUR_AFR / FST_EUR_EAS / FST_AFR_EAS.
# snakemake@input[[4]]: control sets, one row per Input_SNP and one column per
#   set (Set_1 ... Set_10000).
# snakemake@output[[1]], [[2]], [[3]]: one figure per ancestry pair;
#   [[4]]: control Fst values; [[5]]: summary statistics.
library(dplyr)
library(data.table)
library(ggplot2)
library(cowplot)
library(ggrepel)
library(tidyr)
library(showtext)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

label_size <- 8   # font size of variant and enrichment labels
title_size <- 9   # font size of the bold trait labels

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

ga_snp <- '3:123112292'   # labelled rs28654158 (gestational duration)
bw_snp <- '3:123065778'   # labelled rs11708067 (birth weight)

# ---- Fst tables ----
d <- fread(snakemake@input[[1]])
names(d) <- c('CHR', 'POS', 'FST_EUR_AFR')
d1 <- fread(snakemake@input[[2]])
names(d1) <- c('CHR', 'POS', 'FST_EUR_EAS')
d2 <- fread(snakemake@input[[3]])
names(d2) <- c('CHR', 'POS', 'FST_AFR_EAS')

d <- inner_join(d, d1, by = c('CHR', 'POS')) %>%
  inner_join(., d2, by = c('CHR', 'POS'))
rm(d1)
rm(d2)

d$v_ids <- paste(d$CHR, d$POS, sep = ':')

# ---- Control sets ----
z <- fread(snakemake@input[[4]])
# gather() is kept on purpose: the de-duplication below depends on row order.
zl <- gather(z, control_set, v_ids, Set_1:Set_10000)

zl <- inner_join(zl, d[, c('v_ids', 'FST_EUR_AFR', 'FST_EUR_EAS', 'FST_AFR_EAS')],
                 by = 'v_ids')
zl <- filter(zl, Input_SNP == bw_snp | Input_SNP == ga_snp)
zl$haplotype <- with(zl, ifelse(Input_SNP == bw_snp, 'Birth weight',
                                'Gestational duration'))
zl <- zl[!duplicated(zl$v_ids), ]

zl <- data.frame(zl)
d <- data.frame(d)

# One summary row: observed Fst of `snp_id`, mean and median Fst of the
# control variants of `haplotype`, and the one-sample Wilcoxon p-value
# (alternative 'less') of the controls against the observed value.
fst_summary <- function(zl, d, haplotype, snp_id, fst_col) {
  controls <- zl[zl$haplotype == haplotype, fst_col]
  observed <- d[d$v_ids == snp_id, fst_col]
  data.frame(haplotype = haplotype,
             ancestries = fst_col,
             FST = observed,
             FST_mean_controls = mean(controls, na.rm = TRUE),
             FST_median_controls = median(controls, na.rm = TRUE),
             pvalue = wilcox.test(controls, mu = observed,
                                  alternative = 'less')$p.value)
}

df_list <- lapply(c('FST_EUR_AFR', 'FST_AFR_EAS', 'FST_EUR_EAS'), function(i) {
  rbind(fst_summary(zl, d, 'Gestational duration', ga_snp, i),
        fst_summary(zl, d, 'Birth weight', bw_snp, i))
})
xp <- do.call('rbind', df_list)
xp$enrichment <- with(xp, FST / FST_median_controls)

bw <- filter(zl, haplotype == 'Birth weight') %>%
  select(FST_EUR_AFR, FST_EUR_EAS, FST_AFR_EAS)
ga <- filter(zl, haplotype == 'Gestational duration') %>%
  select(FST_EUR_AFR, FST_EUR_EAS, FST_AFR_EAS)
names(bw) <- c('FST_EUR_AFR_bw', 'FST_EUR_EAS_bw', 'FST_AFR_EAS_bw')

# NOTE(review): cbind() on data frames needs `bw` and `ga` to have compatible
# row counts - confirm this holds after the de-duplication above.
df1 <- cbind(bw, ga)

# Mirrored density plot for one ancestry pair: gestational-duration controls
# above the axis, birth-weight controls below it, with a vertical segment and
# label at each lead variant's own Fst.
#
# fst_col:        column of df1 / d with this pair's Fst.
# x_title:        x-axis title.
# title_x, ga_title_y, bw_title_y: position of the bold trait labels.
# ga_peak, bw_peak: end of the vertical segments; the variant labels are
#                 placed 0.5 beyond them.
# ga_rs_hjust, bw_rs_hjust: horizontal justification of the variant labels.
# enrichment_x:   x position of the two enrichment labels.
plot_fst_mirror <- function(df1, d, xp, fst_col, x_title, title_x, ga_title_y,
                            bw_title_y, ga_peak, bw_peak, ga_rs_hjust,
                            bw_rs_hjust, enrichment_x) {
  ga_col <- colorBlindBlack8[4]
  bw_col <- colorBlindBlack8[2]
  bw_fst_col <- paste0(fst_col, '_bw')

  ga_fst <- d[d$v_ids == ga_snp, fst_col]
  bw_fst <- d[d$v_ids == bw_snp, fst_col]
  ga_enrichment <- xp[xp$haplotype == 'Gestational duration' &
                        xp$ancestries == fst_col, 'enrichment']
  bw_enrichment <- xp[xp$haplotype == 'Birth weight' &
                        xp$ancestries == fst_col, 'enrichment']

  ggplot(df1) +
    geom_density(aes(x = .data[[fst_col]], y = ..density..),
                 fill = ga_col, colour = ga_col) +
    annotate('text', x = title_x, y = ga_title_y, label = "Gestational \nduration",
             color = ga_col, size = title_size / .pt, fontface = 'bold') +
    annotate('text', x = title_x, y = bw_title_y, label = "Birth weight",
             color = bw_col, size = title_size / .pt, fontface = 'bold') +
    annotate('text', x = ga_fst, y = ga_peak + 0.5, label = "rs28654158",
             color = ga_col, hjust = ga_rs_hjust, size = label_size / .pt) +
    annotate('text', x = bw_fst, y = bw_peak - 0.5, label = "rs11708067",
             color = bw_col, hjust = bw_rs_hjust, size = label_size / .pt) +
    annotate('text', x = enrichment_x, y = 1,
             label = paste0('Enrichment x', round(ga_enrichment, 1)),
             color = ga_col, size = label_size / .pt) +
    annotate('text', x = enrichment_x, y = -1,
             label = paste0('Enrichment x', round(bw_enrichment, 1)),
             color = bw_col, size = label_size / .pt) +
    geom_density(aes(x = .data[[bw_fst_col]], y = -..density..),
                 fill = bw_col, colour = bw_col) +
    theme_cowplot(font_size = 8) +
    scale_x_continuous(expand = c(0, 0)) +
    scale_y_continuous(limits = c(-11, 11), breaks = c(-10, -5, 0, 5, 10),
                       labels = c(10, 5, 0, 5, 10)) +
    xlab(x_title) +
    ylab('Density') +
    geom_segment(aes(x = ga_fst, y = 0, xend = ga_fst, yend = ga_peak)) +
    geom_segment(aes(x = bw_fst, y = 0, xend = bw_fst, yend = bw_peak)) +
    geom_hline(yintercept = 0, colour = 'grey') +
    theme(axis.line.x = element_line(size = 0.3),
          axis.line.y = element_line(size = 0.3),
          axis.ticks = element_line(size = 0.3))
}

p1 <- plot_fst_mirror(df1, d, xp, 'FST_EUR_AFR', "Fst Africans - Europeans",
                      title_x = 0.6, ga_title_y = 10, bw_title_y = -10 - 0.5,
                      ga_peak = 5, bw_peak = -10,
                      ga_rs_hjust = 0.5, bw_rs_hjust = 0, enrichment_x = 0.6)
ggsave(snakemake@output[[1]], p1, width = 63, height = 63, units = 'mm', dpi = 300)

p1 <- plot_fst_mirror(df1, d, xp, 'FST_EUR_EAS', "Fst East Asians - Europeans",
                      title_x = 0.57, ga_title_y = 9, bw_title_y = -10,
                      ga_peak = 10, bw_peak = -5,
                      ga_rs_hjust = 0, bw_rs_hjust = 0.5, enrichment_x = 0.6)
ggsave(snakemake@output[[2]], p1, width = 63, height = 63, units = 'mm', dpi = 300)

p1 <- plot_fst_mirror(df1, d, xp, 'FST_AFR_EAS', "Fst Africans - East Asians",
                      title_x = 0.72, ga_title_y = 7, bw_title_y = -7,
                      ga_peak = 5, bw_peak = -5,
                      ga_rs_hjust = 0.5, bw_rs_hjust = 0, enrichment_x = 0.75)
ggsave(snakemake@output[[3]], p1, width = 63, height = 63, units = 'mm', dpi = 300)

fwrite(df1, snakemake@output[[4]], sep = '\t')
fwrite(xp, snakemake@output[[5]], sep = '\t')
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
showtext
From
line
1
of
figures/ADCY5_FST_AFR_EUR.R
# Circular bar plot of PP.H4.abf for phenotypes from two cohorts, grouped by
# `preg_trait`, with labels on the bars whose PP.H4.abf exceeds 0.75.
#
# snakemake@input[[1]], [[2]]: colocalization results and manifest, cohort
#   labelled 'UKBB'.
# snakemake@input[[3]], [[4]]: colocalization results and manifest, cohort
#   labelled 'FINNGEN'.
# snakemake@output[[1]]: figure; snakemake@output[[2]]: plotted table.
library(dplyr)
library(data.table)
library(ggplot2)
library(cowplot)
library(ggrepel)
library(tidyr)
library(showtext)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# ---- First cohort ----
d <- fread(snakemake@input[[1]])
names(d)[8] <- 'phenocode'

mani <- fread(snakemake@input[[2]])
trait_list <- c('biomarkers', 'continuous', 'icd10')
mani <- mani[mani$trait_type %in% trait_list, ]
mani <- filter(mani, saige_heritability_EUR > 0.01)
# Sort by heritability first so the de-duplication keeps the highest one.
mani <- mani[order(mani$saige_heritability_EUR, decreasing = TRUE), ]
mani <- mani[!duplicated(mani$phenocode), ]
mani$phenocode <- paste(mani$trait_type, mani$phenocode, sep = '_')
mani <- mani[, c('phenocode', 'description')]
mani <- mani[!duplicated(mani$description), ]

d <- inner_join(d, mani[, c('description', 'phenocode')], by = 'phenocode')
d$cohort <- 'UKBB'

# ---- Second cohort ----
x <- fread(snakemake@input[[3]])
names(x)[8] <- 'phenocode'

mani <- fread(snakemake@input[[4]])
mani <- mani[, c('phenocode', 'name')]
names(mani) <- c('phenocode', 'description')
mani <- mani[!duplicated(mani$description), ]

x <- inner_join(x, mani, by = 'phenocode')
x$cohort <- 'FINNGEN'

# ---- Combine and filter ----
d <- rbind(d, x)
d <- d[order(d$PP.H4.abf, decreasing = FALSE), ]
d <- filter(d, PP.H4.abf > 0.01, PP.H4.abf + PP.H3.abf > 0.75)
d$preg_trait <- factor(d$preg_trait)

# Append `empty_bar` all-NA rows per group; they leave a gap after each group.
empty_bar <- 5
to_add <- data.frame(matrix(NA, empty_bar * nlevels(d$preg_trait), ncol(d)))
colnames(to_add) <- colnames(d)
to_add$preg_trait <- rep(levels(d$preg_trait), each = empty_bar)
d <- rbind(d, to_add)
d <- d %>% arrange(preg_trait)
d$id <- seq(1, nrow(d))

# Label angle and justification, flipped on the left half of the circle.
label_data <- d
number_of_bar <- nrow(label_data)
angle <- 90 - 360 * (label_data$id - 0.5) / number_of_bar
label_data$hjust <- ifelse(angle < -90, 1, 0)
label_data$angle <- ifelse(angle < -90, angle + 180, angle)

# Start and end id of the gap (NA rows) belonging to each group.
base_data <- d %>%
  group_by(preg_trait) %>%
  filter(is.na(PP.H4.abf)) %>%
  summarize(start = min(id), end = max(id)) %>%
  rowwise() %>%
  mutate(title = mean(c(start, end)))

# Shorten long phenotype descriptions: pattern -> label, applied in order.
short_labels <- c(
  'Other diabetes' = 'Other diabetes',
  'Non-insulin-dep' = 'Non-insulin dependent diabetes',
  'Diabetes, varying def' = 'Diabetes, wide',
  'Intestinal adhesions' = 'Intestinal adhesions',
  'Type 2 diabetes, strict' = 'Type 2 diabetes',
  'Type 2 diabetes with other specified/multiple/unspecified complications' = 'Type 2 diabetes with complications',
  'and lymph nodes, not elsewhere classified' = 'Diseases of veins',
  'Diabetes, insuline treatment' = 'Diabetes, insuline treatment',
  'Creatinine' = 'Creatinine in urine'
)
for (pattern in names(short_labels)) {
  label_data$description <- ifelse(grepl(pattern, label_data$description),
                                   short_labels[[pattern]],
                                   label_data$description)
}

# Grey reference arcs drawn across each gap at 1, 0.75, 0.50 and 0.25.
arc_layers <- lapply(c(1, 0.75, 0.50, 0.25), function(level) {
  arc <- rep(level, 2)
  geom_segment(data = base_data,
               aes(x = end, y = arc, xend = start, yend = arc),
               colour = "grey", alpha = 1, size = 0.3, inherit.aes = FALSE)
})

# Scale labels placed in the middle of gap number `i`.
arc_labels <- function(i) {
  annotate("text",
           x = ((base_data$end[i] + base_data$start[i]) / 2),
           y = c((0.25 + 0.075), (0.50 + 0.075), (0.75 + 0.075), (1 + 0.075)),
           label = c("0.25", "0.50", "0.75", "1"),
           color = "grey", size = 2.5, angle = 13, fontface = "bold", hjust = 0.5)
}

labelled <- filter(label_data, PP.H4.abf > 0.75)

p1 <- ggplot(d, aes(as.factor(id), PP.H4.abf, fill = preg_trait, alpha = PP.H4.abf)) +
  geom_bar(stat = "identity", colour = NA) +
  scale_alpha_continuous(range = c(0.4, 1), guide = FALSE) +
  arc_layers +
  arc_labels(1) +
  arc_labels(2) +
  # Limits of the plot = very important. The negative value controls the size
  # of the inner circle, the positive one is useful to add size over each bar
  ylim(-0.2, 2) +
  theme_cowplot() +
  scale_fill_manual(values = colorBlindBlack8[c(2, 4)], guide = FALSE) +
  scale_colour_manual(values = colorBlindBlack8[c(2, 4)], guide = FALSE) +
  labs(x = NULL, y = NULL) +
  coord_polar(start = 0) +
  geom_text(data = labelled,
            aes(x = factor(id), y = PP.H4.abf + 0.01, label = description,
                hjust = hjust),
            color = "black", fontface = "bold", alpha = 0.6, size = 6 / .pt,
            angle = labelled$angle, inherit.aes = FALSE) +
  theme(axis.line = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        legend.position = "none",
        panel.background = element_blank(),
        panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.background = element_blank(),
        axis.ticks.length = unit(0, "mm"))

ggsave(snakemake@output[[1]], plot = p1, width = 127, height = 127, dpi = 300, units = 'mm')

fwrite(d, snakemake@output[[2]], sep = '\t')
# Radar ("spider") charts of colocalization posterior probabilities per locus,
# one chart for the maternal and one for the fetal birth-weight effect.
#
# snakemake@input:       posterior-probability tables named pph_<trait>.txt.
# snakemake@output[[1]]: maternal chart (png); [[2]]: fetal chart (png).
library(scales)
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(tidyverse)
library(fmsb)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

flist <- snakemake@input

# Read one posterior-probability table.
#
# x: path to a file named pph_<trait>.txt.
# Returns a table with columns <trait> (PP.H4.abf), <trait>_sloc
# (PP.H4.abf + PP.H3.abf) and locus (the part of `locus` after the '_').
funk <- function(x) {
  d <- fread(x)
  # Drop rows whose five posterior probabilities are all zero.
  d <- filter(d, PP.H1.abf + PP.H2.abf + PP.H3.abf + PP.H4.abf + PP.H0.abf > 0)
  # The trait name comes from the file name itself, so it no longer depends
  # on how many directories the path contains.
  fname <- sub('.txt', '', sub('pph_', '', basename(x), fixed = TRUE), fixed = TRUE)
  d <- separate(d, locus, into = c('chrom', 'locus'), sep = '_')
  d$sloc <- d$PP.H4.abf + d$PP.H3.abf
  d <- select(d, PP.H4.abf, sloc, locus)
  names(d) <- c(fname, paste0(fname, '_sloc'), 'locus')
  d
}

# Draw one radar chart with two polygons (PP.H4 and PP.H4 + PP.H3) to a png.
#
# h4, sloc:   numeric vectors, one value per locus.
# loci:       axis labels, same length as h4.
# row_labels: two row names for the chart's data frame.
# outfile:    png path.
draw_radar <- function(h4, sloc, loci, row_labels, outfile) {
  n <- length(loci)
  x <- as.data.frame(matrix(h4, ncol = n))
  x <- rbind(x, as.data.frame(matrix(sloc, ncol = n)))
  names(x) <- loci
  rownames(x) <- row_labels
  # The first two rows are the axis limits (1, then 0), ahead of the data.
  x <- rbind(rep(1, n), rep(0, n), x)

  png(outfile, width = 60, height = 60, res = 300, units = 'mm')
  on.exit(dev.off(), add = TRUE)
  par(mar = c(0, 0, 0, 0))

  radarchart(x, axistype = 0,
             # custom polygon
             pcol = c(colorBlindBlack8[4], colorBlindBlack8[2]),
             pfcol = c(alpha(colorBlindBlack8[4], 0.4), alpha(colorBlindBlack8[2], 0.4)),
             plwd = 1, pty = 32, plty = 1,
             # custom the grid
             cglcol = "grey", cglty = 1, axislabcol = "#525252",
             caxislabels = seq(0, 1, 0.25), caxisoffset = 0.1, cglwd = 0.8,
             calcex = 0.4,
             # custom labels
             vlcex = 0.43)
  invisible(x)
}

d <- lapply(flist, funk)
d <- reduce(d, full_join, by = "locus")

needed <- c('BW_maternal_effect', 'BW_maternal_effect_sloc',
            'BW_fetal_effect', 'BW_fetal_effect_sloc')
if (!all(needed %in% names(d))) {
  stop('Missing column(s) derived from the input file names: ',
       paste(setdiff(needed, names(d)), collapse = ', '), call. = FALSE)
}

d <- arrange(d, BW_maternal_effect)

# Spider plot maternal
draw_radar(d$BW_maternal_effect, d$BW_maternal_effect_sloc, d$locus,
           c('BW maternal effect', 'BW maternal effect '),
           snakemake@output[[1]])

# Spider plot fetal
draw_radar(d$BW_fetal_effect, d$BW_fetal_effect_sloc, d$locus,
           c('BW fetal effect', 'BW fetal effect '),
           snakemake@output[[2]])
# Relative change in birth-weight effect sizes after conditioning, using the
# variant with the smallest p-value in each locus.
#
# snakemake@input[[1]]: locus table with columns chr, start and stop.
# snakemake@input (paths containing 'BW'): conditional-analysis result files.
# snakemake@output[[1]], [[3]]: figures; [[2]]: table behind the figures.
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')

options(warn = -1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

as <- 8
as1 <- 8

z <- fread(snakemake@input[[1]])
z$chr <- as.numeric(gsub('chr', '', z$chr))
z$chr <- as.character(z$chr)
z$locus <- seq_len(nrow(z))

# Read one conditional-analysis file and keep the top variant per locus.
#
# infile: path; a path containing 'BW_maternal_effect_GA' is labelled
#         'BW_maternal_GA', any other path 'BW_fetal_GA'.
# Returns one row per locus: the variant with the smallest p among those with
# p < 5e-6 lying in [start, stop) of that locus.
funk <- function(infile) {
  d <- fread(infile)
  # The header is shifted one column to the left and the first 11 columns kept.
  names(d)[1:11] <- names(d)[2:12]
  d <- d[, 1:11]
  d <- filter(d, p < 5e-6)

  # Make b positive and flip the conditional estimate bC along with it.
  negative <- d$b < 0
  d$bC <- ifelse(negative, -1 * d$bC, d$bC)
  d$b <- ifelse(negative, -1 * d$b, d$b)

  d <- separate(d, SNP, into = c('chr', 'POS', 'REF', 'EFF'), sep = ':')
  d$POS <- as.numeric(d$POS)
  d$chr <- as.character(d$chr)
  d$GWAS <- ifelse(grepl('BW_maternal_effect_GA', infile), 'BW_maternal_GA',
                   'BW_fetal_GA')

  # dplyr joins take the key through `by`; there is no `on` argument.
  d <- inner_join(d, z, by = 'chr')
  d <- d %>% filter(POS >= start, POS < stop)
  d <- group_by(d, locus) %>%
    arrange(p) %>%
    filter(row_number() == 1) %>%
    ungroup()
  d
}

df_list <- lapply(snakemake@input[grepl('BW', snakemake@input)], funk)

d <- do.call('rbind', df_list)

d$beta_dif <- with(d, (bC - b) / b)

mor <- filter(d, GWAS == 'BW_maternal_GA') %>% pull(beta_dif)
barn <- filter(d, GWAS == 'BW_fetal_GA') %>% pull(beta_dif)

thin_axes <- theme(axis.line.x = element_line(size = 0.3),
                   axis.line.y = element_line(size = 0.3),
                   axis.ticks = element_line(size = 0.3))

# Figure 1: mirrored densities, maternal above the axis and fetal below.
p1 <- ggplot() +
  geom_density(mapping = aes(x = mor, y = ..density..),
               fill = colorBlindBlack8[3], colour = colorBlindBlack8[3]) +
  annotate('text', x = 0.1, y = 3, label = "Maternal",
           color = colorBlindBlack8[3], size = as1 / .pt, fontface = 'bold') +
  annotate('text', x = 0.1, y = -15, label = "Fetal",
           color = colorBlindBlack8[8], size = as1 / .pt, fontface = 'bold') +
  geom_density(mapping = aes(x = barn, y = -..density..),
               fill = colorBlindBlack8[8], colour = colorBlindBlack8[8]) +
  theme_cowplot(font_size = 8) +
  scale_x_continuous(expand = c(0, 0)) +
  xlab("Relative difference in effect size on \nbirth weight after conditioning") +
  ylab('Density') +
  geom_hline(yintercept = 0, colour = 'grey') +
  thin_axes

ggsave(snakemake@output[[1]], plot = p1, width = 70, height = 70, units = 'mm', dpi = 300)

# Figure 2: both densities overlaid on the same side of the axis.
p1 <- ggplot(d, aes(beta_dif, group = GWAS, fill = GWAS)) +
  geom_hline(yintercept = 0, colour = 'black') +
  geom_density(color = NA) +
  annotate('text', x = -0.75, y = 1, label = "Maternal",
           color = colorBlindBlack8[3], size = as1 / .pt, fontface = 'bold') +
  annotate('text', x = 0.1, y = 15, label = "Fetal",
           color = colorBlindBlack8[8], size = as1 / .pt, fontface = 'bold') +
  theme_cowplot(font_size = 8) +
  scale_fill_manual(values = alpha(colorBlindBlack8[c(8, 3)], 0.5), guide = 'none') +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0.5)) +
  xlab("Relative difference in effect size on \nbirth weight after conditioning") +
  ylab('Density') +
  thin_axes

ggsave(snakemake@output[[3]], plot = p1, width = 70, height = 70, units = 'mm', dpi = 300)

fwrite(d, snakemake@output[[2]], sep = '\t')
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
figures/BW_conditioning.R
# Relative change in birth-weight effect sizes after conditioning, restricted
# to the listed top variants of matching origin.
#
# snakemake@input[[1]]: variant table with columns chr, pos, ref, eff, origin.
# snakemake@input (paths containing 'BW'): conditional-analysis result files.
# snakemake@output[[1]], [[3]]: figures; [[2]]: table behind the figures.
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')

options(warn = -1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

as <- 8
as1 <- 8

z <- fread(snakemake@input[[1]])
# Variant ID with the two alleles in alphabetical order.
z$SNP <- with(z, ifelse(ref > eff,
                        paste(chr, pos, eff, ref, sep = ':'),
                        paste(chr, pos, ref, eff, sep = ':')))

# Read one conditional-analysis file and keep the listed top variants.
#
# infile: path; a path containing 'BW_maternal_effect_GA' is labelled
#         'BW_maternal_GA' and matched against origin 'Maternal Only', any
#         other path is labelled 'BW_fetal_GA' and matched against
#         'Fetal Only'.
funk <- function(infile) {
  is_maternal <- grepl('BW_maternal_effect_GA', infile)

  res <- fread(infile)
  # The header is shifted one column to the left and the first 11 columns kept.
  names(res)[1:11] <- names(res)[2:12]
  res <- res[, 1:11]

  # Make b positive and flip the conditional estimate bC along with it.
  negative <- res$b < 0
  res$bC <- ifelse(negative, -1 * res$bC, res$bC)
  res$b <- ifelse(negative, -1 * res$b, res$b)

  res$GWAS <- ifelse(is_maternal, 'BW_maternal_GA', 'BW_fetal_GA')
  wanted_origin <- ifelse(is_maternal, 'Maternal Only', 'Fetal Only')
  top_variants <- z[z$origin == wanted_origin, ]
  filter(res, SNP %in% top_variants$SNP)
}

bw_files <- snakemake@input[grepl('BW', snakemake@input)]
d <- do.call('rbind', lapply(bw_files, funk))

d$beta_dif <- with(d, (bC - b) / b)

mor <- d %>% filter(GWAS == 'BW_maternal_GA') %>% pull(beta_dif)
barn <- d %>% filter(GWAS == 'BW_fetal_GA') %>% pull(beta_dif)

x_title <- "Relative difference in effect size on \nbirth weight after conditioning"
thin_axes <- theme(axis.line.x = element_line(size = 0.3),
                   axis.line.y = element_line(size = 0.3),
                   axis.ticks = element_line(size = 0.3))

# Figure 1: mirrored densities, maternal above the axis and fetal below.
p1 <- ggplot() +
  geom_density(mapping = aes(x = mor, y = ..density..),
               fill = colorBlindBlack8[3], colour = colorBlindBlack8[3]) +
  annotate('text', x = 0.1, y = 3, label = "Maternal",
           color = colorBlindBlack8[3], size = as1 / .pt, fontface = 'bold') +
  annotate('text', x = 0.1, y = -15, label = "Fetal",
           color = colorBlindBlack8[8], size = as1 / .pt, fontface = 'bold') +
  geom_density(mapping = aes(x = barn, y = -..density..),
               fill = colorBlindBlack8[8], colour = colorBlindBlack8[8]) +
  theme_cowplot(font_size = 8) +
  scale_x_continuous(expand = c(0, 0)) +
  xlab(x_title) +
  ylab('Density') +
  geom_hline(yintercept = 0, colour = 'grey') +
  thin_axes

ggsave(snakemake@output[[1]], plot = p1, width = 70, height = 70, units = 'mm', dpi = 300)

# Figure 2: both densities overlaid on the same side of the axis.
p1 <- ggplot(d, aes(beta_dif, group = GWAS, fill = GWAS)) +
  geom_hline(yintercept = 0, colour = 'black') +
  geom_density(color = NA) +
  annotate('text', x = -0.55, y = 1, label = "Maternal",
           color = colorBlindBlack8[3], size = as1 / .pt, fontface = 'bold') +
  annotate('text', x = 0.1, y = 10, label = "Fetal",
           color = colorBlindBlack8[8], size = as1 / .pt, fontface = 'bold') +
  theme_cowplot(font_size = 8) +
  scale_fill_manual(values = alpha(colorBlindBlack8[c(8, 3)], 0.5), guide = 'none') +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0.5)) +
  xlab(x_title) +
  ylab('Density') +
  thin_axes

ggsave(snakemake@output[[3]], plot = p1, width = 70, height = 70, units = 'mm', dpi = 300)

fwrite(d, snakemake@output[[2]], sep = '\t')
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
figures/BW_conditioning_top.R
# Point-range plot of genetic correlations (rg with 1.96 * se bars) between
# gestational duration (maternal and fetal inputs) and the birth-weight traits.
#
# Inputs  : snakemake@input[[1]], [[2]]  tables with columns p1, p2, rg, se
# Outputs : [[1]] figure, [[2]] the plotted table
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')

colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])
d <- filter(d, grepl('GAraw', p1), grepl('BW', p2))
d$p1 <- 'Gestational duration (maternal)'

x <- fread(snakemake@input[[2]])
x <- filter(x, grepl('GA_fetal', p1), grepl('BW', p2))
x$p1 <- 'Gestational duration (fetal)'

d <- rbind(d, x)

# Trait name = the part of the path after 'LDSC/' without the file suffix.
# vapply() keeps the result a character vector even for empty input, and
# fixed = TRUE stops the dots of the suffix from acting as regex wildcards.
path_parts <- strsplit(d$p2, 'LDSC/')
d$p2 <- vapply(path_parts, function(parts) parts[2], character(1))
d$p2 <- gsub('.txt.sumstats.gz', '', d$p2, fixed = TRUE)

# Display labels per trait; any trait not listed is labelled 'Endometriosis',
# which was the fall-through of the original nested ifelse() chain.
trait_labels <- c(
  miscarriage = 'Miscarriage',
  GA_fetal = 'GA fetal effect',
  BW_maternal = 'Maternal',
  AFB = 'Age at first birth',
  AMenarche = 'Age at menarche',
  AMenopause = 'Age at menopause',
  NLB = 'Number of live births',
  Testosterone_fem = 'Testosterone (women)',
  SHBG_fem = 'SHBG (women)',
  SHBG_male = 'SHBG (men)',
  CBAT_fem = 'CBAT (women)',
  CBAT_male = 'CBAT (men)',
  Oestradiol_fem = 'Oestradiol (women)',
  POP = 'Pelvic Organ Prolapse',
  Testosterone_male = 'Testosterone (men)',
  leiomyoma_uterus = 'Leiomyoma uterus',
  BW_fetal = 'Fetal',
  BW_fetal_effect = 'Fetal \nonly',
  Preeclampsia = 'Pre-eclampsia',
  BW_maternal_effect = 'Maternal \nonly',
  PCOS = 'Polycystic ovary syndrome'
)
trait <- unname(trait_labels[d$p2])
# Unknown (but non-missing) traits take the default; missing ones stay NA.
trait[is.na(trait) & !is.na(d$p2)] <- 'Endometriosis'
d$trait <- trait

p1 <- ggplot(d, aes(trait, rg, colour = p1)) +
  geom_pointrange(
    aes(ymin = rg - se * 1.96, ymax = rg + se * 1.96),
    position = position_dodge(0.3), width = 1 / 10, size = 0.4, fatten = 0.6
  ) +
  # 'none' replaces the deprecated guide = FALSE.
  scale_colour_manual(values = colorBlindBlack8[c(8, 3)], guide = 'none') +
  theme_cowplot(font_size = 8) +
  scale_y_continuous(limits = c(-0.2, 0.8), breaks = seq(-0.2, 0.8, 0.2)) +
  ylab('Genetic correlation') +
  xlab('Effect on birth weight') +
  geom_hline(yintercept = 0, size = 0.3) +
  geom_hline(
    yintercept = c(-0.2, seq(0.2, 0.8, 0.2)),
    colour = 'grey', linetype = 'dashed', alpha = 0.5, size = 0.2
  ) +
  theme(
    axis.line.x = element_line(size = 0.3),
    axis.line.y = element_line(size = 0.3),
    axis.ticks = element_line(size = 0.3)
  )

ggsave(snakemake@output[[1]], plot = p1, width = 60, height = 60, units = 'mm', dpi = 300)

fwrite(d, snakemake@output[[2]], sep = '\t')
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
figures/BW_genetic_correlations.R
# Scatter of -log10(Coefficient_P_value) per entry of `Name`, coloured by
# `Category`, with labels on the entries below p = 0.05.
#
# Inputs  : snakemake@input[[1]], [[2]]  tables joined on `Name`
# Outputs : [[1]] figure without legend, [[2]] same figure with legend
library(data.table)
library(dplyr)
library(cowplot)
library(ggrepel)
library('showtext')

colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])
x <- fread(snakemake@input[[2]])

d <- inner_join(d, x, by = 'Name')

# Shuffle rows, then sort by category: the x-axis order follows this row order.
# NOTE(review): no seed is set, so the order inside a category (and the label
# placement of geom_text_repel) differs between runs.
d <- d[sample(nrow(d)), ]
d <- d[order(d$Category, decreasing = FALSE), ]

d$Name <- factor(d$Name, levels = unique(d$Name))
# Display name: drop everything up to the last '.', underscores become spaces.
d$Name2 <- gsub('_', ' ', gsub("^.*\\.", "", d$Name))
d$Name2 <- factor(d$Name2, levels = unique(d$Name2))

# Everything the two figures have in common.
enrichment_plot <- ggplot(
  d,
  aes(Name2, -log10(Coefficient_P_value), colour = Category, fill = Category)
) +
  geom_point(size = 2, shape = 21, stroke = 0.1) +
  xlab('Tissues') +
  ylab('-log10(Enrichment)') +
  theme_cowplot(font_size = 8) +
  geom_hline(yintercept = -log10(0.05), colour = '#d9d9d9') +
  theme(
    axis.text.x = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_line(colour = 'grey', size = 0.05),
    panel.grid.major.x = element_blank()
  ) +
  # show.legend is the current name of the old show_guide argument; it has to
  # sit outside aes(), where it would otherwise be read as an aesthetic.
  geom_text_repel(
    data = filter(d, Coefficient_P_value < 0.05),
    aes(Name2, -log10(Coefficient_P_value), colour = Category, label = Name2),
    show.legend = FALSE
  )

p1 <- enrichment_plot + theme(legend.position = "none")

ggsave(snakemake@output[[1]], plot = p1, width = 120, height = 90, units = 'mm', dpi = 300)

p2 <- enrichment_plot

ggsave(snakemake@output[[2]], plot = p2, width = 120, height = 90, units = 'mm', dpi = 300)
# Ternary plots of colocalization posterior probabilities, one per input table.
#
# Inputs  : snakemake@input[[1]], [[2]]  tables with PP.H0.abf ... PP.H4.abf
#           (read into `shbg` and `testo`)
# Outputs : [[1]] figure for input 1, [[2]] figure for input 2
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(ggtern)

# NOTE(review): this silences every warning for the rest of the script.
options(warn = -1)

colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

shbg <- fread(snakemake@input[[1]])
testo <- fread(snakemake@input[[2]])

#shbg$locus= gsub("^.*\\_","", shbg$locus)
#testo$locus= gsub("^.*\\_","", testo$locus)

# Colours of the top, right and left axes.
colT <- colorBlindBlack8[4]
colR <- colorBlindBlack8[1]
colL <- colorBlindBlack8[2]

# Build the ternary plot for one table of posterior probabilities.
#
# pph: table with columns PP.H0.abf, PP.H1.abf, PP.H2.abf, PP.H3.abf, PP.H4.abf.
# Returns a ggtern plot with H0 + H1 + H2 on the left axis, H4 on the top axis
# and H3 on the right axis. Both inputs previously carried their own copy of
# this specification.
coloc_ternary <- function(pph) {
  pph$One_or_Other <- pph$PP.H0.abf + pph$PP.H1.abf + pph$PP.H2.abf
  pph$coloc <- pph$PP.H4.abf
  pph$shared_locus <- pph$PP.H3.abf

  ggtern(pph, aes(One_or_Other, coloc, shared_locus)) +
    geom_point(colour = colorBlindBlack8[8], fill = colorBlindBlack8[8], shape = 21) +
    # guide = 'none' replaces the deprecated guide = F.
    scale_alpha_continuous(range = c(0.6, 1), guide = 'none') +
    scale_size_continuous(range = c(.001, 10), guide = 'none') +
    theme_custom(
      tern.plot.background = NULL,
      tern.panel.background = 'white',
      col.T = colT, col.L = colL, col.R = colR,
      col.grid.minor = "white"
    ) +
    Tarrowlab("Probability of shared causal variant") +
    Larrowlab("Probability of locus not shared") +
    Rarrowlab("Probability of shared locus (distinct causal variant)") +
    theme_showarrows() +
    theme_notitles() +
    theme(
      text = element_text(family = "arial", size = 10),
      tern.axis.arrow.T = element_blank(),
      tern.axis.arrow.L = element_blank(),
      tern.axis.arrow.R = element_blank(),
      tern.axis.text.T = element_text(color = colT),
      tern.axis.text.L = element_text(color = colL),
      tern.axis.text.R = element_text(color = colR),
      tern.axis.arrow.text.T = element_text(color = colT),
      plot.margin = margin(0, 0, 0, 0, "cm"),
      tern.axis.arrow.text.L = element_text(color = colL),
      tern.axis.arrow.text.R = element_text(color = colR),
      tern.panel.grid.major = element_line(linetype = 6, size = 0.3)
    )
}

p1 <- coloc_ternary(shbg)
p2 <- coloc_ternary(testo)

ggsave(snakemake@output[[1]], plot = p1, width = 95, height = 95, units = 'mm', dpi = 300)
ggsave(snakemake@output[[2]], plot = p2, width = 95, height = 95, units = 'mm', dpi = 300)
R
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
ggtern
From
line
1
of
figures/coloc_sex_hormones.R
# Heat map of haplotype-specific effects (maternal transmitted, maternal
# non-transmitted, paternal transmitted) per variant, with the `probability`
# column printed next to each variant.
#
# Inputs  : snakemake@input[[1]]  per-variant table (rsid, beta_MT, beta_MNT,
#                                 beta_PT, class_name, probability)
#           snakemake@input[[2]]  annotation with nearestGene and RSID
# Outputs : [[1]] compact figure, [[2]] figure with colour legend,
#           [[3]] the long-format table behind both
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(ggdendro)
library(gridExtra)
library(dendextend)
# NOTE(review): plyr is attached after dplyr, so unqualified arrange() and
# desc() below presumably resolve to the plyr versions - confirm before
# changing the load order.
library(plyr)
library(ggtree)
library(scales)

colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])
gene_annot <- fread(snakemake@input[[2]], select = c('nearestGene', 'RSID'))

d <- inner_join(d, gene_annot, by = c('rsid' = 'RSID'))

# Loci shown with two gene names.
d$GENE <- d$nearestGene
d$GENE <- with(
  d,
  ifelse(GENE == 'CDC42', 'CDC42/ WNT4',
  ifelse(GENE == 'HIVEP3', 'HIVEP3/ EDN2',
  ifelse(GENE == 'TET3', 'TET3/ DGUOK-AS1',
  ifelse(GENE == 'TCEA2', 'TCEA2/ OPRL1', GENE))))
)

# Per-variant overrides of the gene label.
d$nearestGene <- d$GENE
d$nearestGene <- with(
  d,
  ifelse(rsid == 'rs3129768', 'HLA-DQA1',
  ifelse(rsid == 'rs5991030', 'AGTR2',
  ifelse(rsid == 'rs5930554', 'RAP2C', nearestGene)))
)
d$nearestGene <- with(d, ifelse(rsid == 'rs6780427', 'KCNAB1', nearestGene))
d$nearestGene <- with(d, ifelse(rsid == 'rs6879092', 'EBF1', nearestGene))

# Axis label "<rsid> (<gene>)"; the gene part must not contain spaces because
# the label is split on the space further down.
d$nearestGene <- gsub(' ', '', d$nearestGene)
d$nearestGene <- paste0("(", d$nearestGene, ")")
d$rsid_lab <- with(d, paste(rsid, nearestGene))

# Orient every variant so that beta_MT is non-negative. beta_MT itself is
# flipped last because the other two flips are decided by its sign.
d$beta_PT <- with(d, ifelse(beta_MT < 0, -1 * beta_PT, beta_PT))
d$beta_MNT <- with(d, ifelse(beta_MT < 0, -1 * beta_MNT, beta_MNT))
d$beta_MT <- with(d, ifelse(beta_MT < 0, -1 * beta_MT, beta_MT))

# One row per variant and haplotype.
d <- gather(d, haplotype, beta, c('beta_MT', 'beta_MNT', 'beta_PT'))

max_beta <- max(d$beta)
min_beta <- min(d$beta)

d$haplotype <- with(
  d,
  ifelse(haplotype == 'beta_MT', 'Maternal\ntransmitted',
  ifelse(haplotype == 'beta_MNT', 'Maternal\nnon-transmitted',
         'Paternal\ntransmitted'))
)

d$rsid_lab <- factor(d$rsid_lab, levels = unique(d$rsid_lab))
d$class_name <- factor(
  d$class_name,
  levels = c("Maternal", "MF SD", "MF OD", "Fetal MatT", "Fetal")
)

# Sort by class, then by decreasing probability, and let the axis follow.
d <- d %>%
  arrange(class_name, desc(probability)) %>%
  ungroup()
d$rsid_lab <- factor(d$rsid_lab, levels = unique(d$rsid_lab))

# Plotmath labels: rsid upright, gene in italics.
axis_labels <- sapply(
  strsplit(levels(d$rsid_lab), " "),
  function(parts) parse(text = paste0(parts[1], "~italic('", parts[2], "')"))
)

# Colour ramp anchored at zero, spanning the observed effect range.
ramp_colours <- c(colorBlindBlack8[4], 'white', colorBlindBlack8[2])
ramp_values <- rescale(c(min_beta, 0, max_beta))

# Rows used for the probability labels (one haplotype row per variant).
label_rows <- filter(d, haplotype == 'Paternal\ntransmitted')

# Layers common to both figures.
tiles <- ggplot(d, aes(rsid_lab, haplotype, fill = beta)) +
  theme_cowplot(8) +
  geom_tile()

p1 <- tiles +
  #scale_fill_gradient2(low= colorBlindBlack8[4], high= colorBlindBlack8[2], mid= 'white', limits= c(min_beta, max_beta), guide= 'none', midpoint= 0) +
  scale_fill_gradientn(
    colours = ramp_colours,
    values = ramp_values,
    limits = c(min_beta, max_beta),
    guide = 'none'
  ) +
  coord_equal() +
  scale_x_discrete(labels = axis_labels) +
  theme(
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    plot.margin = margin(0, 0, 0, 0, "mm"),
    text = element_text(size = 9 / .pt),
    axis.text.y = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.line = element_line(colour = 'black', size = 0.2)
  ) +
  geom_text_repel(
    data = label_rows,
    aes(x = rsid_lab, y = 4, label = round(probability, 2)),
    direction = 'y', size = 8 / .pt, box.padding = 0.01
  )

ggsave(snakemake@output[[1]], plot = p1, width = 180, height = 60, units = 'mm', dpi = 300)

p1 <- tiles +
  scale_fill_gradientn(
    colours = ramp_colours,
    values = ramp_values,
    limits = c(min_beta, max_beta),
    name = 'Effect size'
  ) +
  coord_equal() +
  scale_x_discrete(labels = axis_labels) +
  theme(
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    plot.margin = margin(0, 9, 0, 0, "mm"),
    text = element_text(size = 9 / .pt),
    axis.text.y = element_text(hjust = 0.5),
    axis.line = element_line(colour = 'black', size = 0.2),
    legend.position = 'bottom'
  ) +
  geom_text_repel(
    data = label_rows,
    aes(x = rsid_lab, y = -0.05, label = round(probability, 2)),
    direction = "y", size = 6.5 / .pt
  )

ggsave(snakemake@output[[2]], plot = p1, width = 180, height = 100, units = 'mm', dpi = 300)

fwrite(d, snakemake@output[[3]], sep = '\t')
# Ternary plot of the per-variant effect-origin columns: Maternal, fetal
# (Fetal_MatT + Fetal) and combined maternal-fetal (MF_OD + MF_SD).
#
# Inputs  : snakemake@input[[1]]  per-variant table with rsid, Maternal, MF_OD,
#                                 MF_SD, Fetal_MatT, Fetal
#           snakemake@input[[2]]  annotation with RSID, ID, nearestGene
# Outputs : [[1]] figure, [[2]] the joined table
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(ggtern)

# NOTE(review): this silences every warning for the rest of the script.
options(warn = -1)

# `header = TRUE` is spelled out; the original `h= T` relied on partial
# argument matching and on the reassignable alias T.
x <- fread(snakemake@input[[1]], header = TRUE)

colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Two chrX variants are identified by position in the input; give them rsids
# so that they join to the annotation.
x$rsid <- with(
  x,
  ifelse(rsid == 'chrX:116013571', 'rs5991030',
  ifelse(rsid == 'chrX:132178061', 'rs5930554', rsid))
)

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[2]], header = TRUE, select = c('RSID', 'ID', 'nearestGene'))

x <- inner_join(x, d, by = c('rsid' = 'RSID'))

# Per-variant overrides of the gene label.
x$nearestGene <- with(
  x,
  ifelse(rsid == 'rs3129768', 'HLA-DQA1',
  ifelse(rsid == 'rs5991030', 'AGTR2',
  ifelse(rsid == 'rs5930554', 'RAP2C', nearestGene)))
)
x$nearestGene <- with(x, ifelse(rsid == 'rs6780427', 'KCNAB1', nearestGene))
x$nearestGene <- with(x, ifelse(rsid == 'rs6879092', 'EBF1', nearestGene))

#d= gather(x, haplotype, beta, c('Paternal', 'MaternalT', 'MaternalNT'))
#d$rsid_label= with(d, paste0(RSID, ' (', nearestGene, ')'))
#max_beta= max(abs(d$beta))
#d$class= factor(d$class, levels= c("MF SD", "MF OD", "Maternal", "Fetal MatT", "Fetal"))
#d= arrange(d, class, desc(max_prob))
#d$rsid_label= factor(d$rsid_label, levels= unique(d$rsid_label))

# Colours of the top, right and left axes.
colT <- colorBlindBlack8[4]
colR <- colorBlindBlack8[1]
colL <- colorBlindBlack8[2]

# Collapse the five columns into the three plotted components.
x$MF <- x$MF_OD + x$MF_SD
x$Fet <- x$Fetal_MatT + x$Fetal

p1 <- ggtern(x, aes(Maternal, Fet, MF)) +
  geom_point(colour = colorBlindBlack8[8], fill = colorBlindBlack8[8], shape = 21) +
  # guide = 'none' replaces the deprecated guide = F.
  scale_alpha_continuous(range = c(0.6, 1), guide = 'none') +
  scale_size_continuous(range = c(.001, 10), guide = 'none') +
  theme_custom(
    tern.plot.background = NULL,
    tern.panel.background = 'white',
    col.T = colT, col.L = colL, col.R = colR,
    col.grid.minor = "white"
  ) +
  Tarrowlab("Fetal only effect") +
  Larrowlab("Maternal only effect") +
  Rarrowlab("Maternal and fetal effect") +
  theme_showarrows() +
  theme_notitles() +
  theme(
    text = element_text(family = "arial", size = 10),
    tern.axis.arrow.T = element_blank(),
    tern.axis.arrow.L = element_blank(),
    tern.axis.arrow.R = element_blank(),
    tern.axis.text.T = element_text(color = colT),
    tern.axis.text.L = element_text(color = colL),
    tern.axis.text.R = element_text(color = colR),
    tern.axis.arrow.text.T = element_text(color = colT),
    plot.margin = margin(0, 0, 0, 0, "cm"),
    tern.axis.arrow.text.L = element_text(color = colL),
    tern.axis.arrow.text.R = element_text(color = colR),
    tern.panel.grid.major = element_line(linetype = 6, size = 0.3)
  )

# A disabled alternative ggtern specification used to be passed to print() as
# one long string here, which only wrote dead code to stdout; it was removed.

ggsave(snakemake@output[[1]], plot = p1, width = 95, height = 95, units = 'mm', dpi = 300)

# NOTE(review): `d` is computed here but the full table `x` is what gets
# written. Confirm which one output [[2]] should contain; behaviour unchanged.
d <- select(x, rsid, ID, MF, Maternal, Fetal)
fwrite(x, snakemake@output[[2]], sep = '\t')
R
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
ggtern
From
line
1
of
figures/effect_origin_ternary.R
# Heat map of z-scores per annotation (columns) and locus gene (rows); cells
# with pvalue.x < 0.05 carry an asterisk.
#
# Inputs  : snakemake@input[[1]]  table with lead_snp, annotation, z_score
#           snakemake@input[[2]]  locus table with CHR, POS, nearestGene
# Outputs : [[1]] figure
library(data.table)
library(dplyr)
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library('showtext')

colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])

# Four lead SNPs are re-keyed to a different position before the join.
d$lead_snp <- with(
  d,
  ifelse(lead_snp == '1:50958027', '1:50959262',
  ifelse(lead_snp == '9:116929327', '9:116935764',
  ifelse(lead_snp == '5:157896786', '5:157895049',
  ifelse(lead_snp == '1:22511594', '1:22462111', lead_snp))))
)

loci <- fread(snakemake@input[[2]])
loci$lead_snp <- paste(loci$CHR, loci$POS, sep = ':')

d <- inner_join(d, loci, by = 'lead_snp')

# Cap z-scores at 3.5.
d$z_score <- pmin(d$z_score, 3.5)

# Loci shown with two gene names.
d$nearestGene <- with(
  d,
  ifelse(nearestGene == 'CDC42', 'CDC42/ WNT4',
  ifelse(nearestGene == 'HIVEP3', 'HIVEP3/ EDN2',
  ifelse(nearestGene == 'TET3', 'TET3/ DGUOK-AS1',
  ifelse(nearestGene == 'TCEA2', 'TCEA2/ OPRL1', nearestGene))))
)

d <- filter(d, !(annotation %in% c('B2', 'geva_allele_age')))

# Display names of the annotations. Anything not listed falls through to
# 'XPEHH EAS-EUR'.
# NOTE(review): the key 'linsigh' looks truncated; if the data spell it
# differently those rows take the fall-through label - verify against input.
d$annotation <- with(
  d,
  ifelse(annotation == 'argweave', 'ARGWEAVE',
  ifelse(annotation == 'betascore', 'Beta score',
  ifelse(annotation == 'B2', '',
  ifelse(annotation == 'fst_eas_afr', 'Fst AFR-EAS',
  ifelse(annotation == 'fst_eur_afr', 'Fst AFR-EUR',
  ifelse(annotation == 'fst_eur_eas', 'Fst EAS-EUR',
  ifelse(annotation == 'gerp', 'GERP',
  ifelse(annotation == 'geva_allele_age', 'Alelle age',
  ifelse(annotation == 'iES_Sabeti', 'iES',
  ifelse(annotation == 'linsigh', 'LINSIGHT',
  ifelse(annotation == 'phastCon100', 'phastCONS100',
  ifelse(annotation == 'phyloP100', 'PhyloP',
  ifelse(annotation == 'xpehh_afr2_eas', 'XPEHH AFR-EAS',
  ifelse(annotation == 'xpehh_afr2_eur', 'XPEHH AFR-EUR',
         'XPEHH EAS-EUR'))))))))))))))
)

significant_cells <- filter(d, pvalue.x < 0.05)

p1 <- ggplot(d, aes(annotation, nearestGene, fill = z_score)) +
  geom_tile(colour = "white", size = 1) +
  theme_cowplot(font_size = 9) +
  scale_fill_gradient2(
    low = colorBlindBlack8[2],
    high = colorBlindBlack8[4],
    mid = 'white',
    limits = c(-2, 4)
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 0),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  ) +
  scale_x_discrete(position = "top") +
  geom_text(
    data = significant_cells,
    aes(annotation, nearestGene, label = '*'),
    size = 8 / .pt
  ) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    panel.border = element_rect(colour = 'black', fill = NA, size = 1),
    plot.margin = unit(c(0, 1, 0, 0), "cm"),
    axis.line = element_blank(),
    axis.text.y = element_text(face = "italic")
  ) +
  coord_equal()

ggsave(snakemake@output[[1]], plot = p1, width = 140, height = 120, units = 'mm', dpi = 300)
# Forest plot (BETA with 1.96 * SE bars) of one locus across cohorts plus the
# meta-analysis row; the locus comes from the `prev_locus` wildcard.
#
# Inputs  : snakemake@input[[1]]  per-cohort estimates (SNP, CHR, POS, BETA,
#                                 SE, N, cohort, ...)
#           snakemake@input[[2]]  meta-analysis results
#           snakemake@input[[3]]  locus table (CHR, pos1, pos2, nearestGene)
# Outputs : [[1]] figure, whose height grows with the number of rows
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')

# NOTE(review): this silences every warning for the rest of the script.
options(warn = -1)

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

d <- fread(snakemake@input[[1]])
z <- fread(snakemake@input[[3]])

# Meta-analysis columns and the names they get in this script. Renaming by
# name (setnames) does not depend on the order in which fread returns them.
meta_cols <- c(
  'MarkerName', 'Effect', 'StdErr', 'HetISq', 'HetPVal',
  'TOTALSAMPLESIZE', 'P-value', 'Allele1', 'Allele2'
)
df <- fread(snakemake@input[[2]], select = meta_cols)
setnames(
  df,
  old = meta_cols,
  new = c('SNP', 'BETA', 'SE', 'HetISq', 'HetPval', 'N', 'pvalue', 'A1', 'A2')
)

df <- filter(df, SNP %in% d$SNP)
df <- separate(
  df, SNP,
  into = c('CHR', 'POS', 'Ax1', 'Ax2', 'ID'),
  sep = ':', remove = FALSE
)

# Flip the sign whenever A2 sorts after A1.
df$BETA <- ifelse(df$A2 > df$A1, -1 * df$BETA, df$BETA)

df$CHR <- ifelse(df$CHR == 'X', '23', df$CHR)
df$CHR <- as.integer(df$CHR)
df$POS <- as.integer(df$POS)
df <- select(df, -c(A1, A2, ID, Ax1, Ax2))
df$cohort <- 'Meta-analysis'

d <- bind_rows(d, df)

z$CHR <- ifelse(z$CHR == 'X', '23', z$CHR)
z$CHR <- as.integer(z$CHR)

# Attach every locus on the same chromosome, then keep the variants that lie
# strictly inside the locus window.
d <- inner_join(d, z, by = 'CHR') %>%
  filter(POS > pos1, POS < pos2)
d$locus <- paste0('Chr ', d$CHR, ': ', d$nearestGene)

d <- filter(d, !(cohort %in% c('PGPII', 'PGPIII', 'BIB', 'DNBCPTD', 'STORK', 'STORKGROR')))
d$cohort <- paste0(d$cohort, ' (n= ', d$N, ')')

prev_locus <- snakemake@wildcards[['prev_locus']]

temp_df <- d[d$nearestGene == prev_locus, ]
temp_df <- temp_df[order(temp_df$N, decreasing = TRUE), ]

# rsid printed in the title; any other locus gets the last rsid. The wildcard
# is a single string, so switch() replaces the nested scalar ifelse().
rsid <- switch(
  prev_locus,
  EEFSEC = 'rs2659685',
  WNT4 = 'rs12037376',
  EBF1 = 'rs2963463',
  AGTR2 = 'rs5991030',
  'rs28654158'
)

# Rows that have a HetISq value get their own colour and point shape.
# - levels = unique(cohort) keeps the sorted row order on the axis and, unlike
#   the former `level = factor(cohort)`, does not fail on a repeated label.
# - The former `alpha = 0.5` was an argument of ggplot(), where it is ignored,
#   so it was dropped without changing the figure.
p1 <- ggplot(
  temp_df,
  aes(
    x = factor(cohort, levels = unique(cohort)),
    y = BETA,
    ymin = BETA - 1.96 * SE,
    ymax = BETA + 1.96 * SE,
    colour = !is.na(HetISq),
    shape = !is.na(HetISq)
  )
) +
  geom_pointrange(size = 0.4) +
  scale_shape_manual(values = c(15, 18), guide = 'none') +
  geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(sec.axis = dup_axis()) +
  ggtitle(parse(text = paste0(rsid, ' - ', "~italic('", unique(temp_df$nearestGene), "')"))) +
  coord_flip() +
  scale_colour_manual(
    values = c(colorBlindBlack8[3], colorBlindBlack8[4]),
    guide = 'none'
  ) +
  theme_cowplot(8) +
  xlab('') +
  ylab('Beta [95% CI]') +
  geom_vline(xintercept = 0, linetype = "dotted", colour = 'grey')

ggsave(
  snakemake@output[[1]], plot = p1,
  width = 140, height = 30.5 + 50 / 13 * nrow(temp_df),
  units = 'mm', dpi = 300
)
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
figures/forest_plot_EEFSEC.R
# Faceted point-range plot of `estimate` (lo95 to up95) for the fetal and
# maternal score terms, one panel per outcome.
#
# Inputs  : snakemake@input[[1]]  table with term, outcome, estimate, lo95, up95
# Outputs : [[1]] figure
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')

# NOTE(review): this silences every warning for the rest of the script.
options(warn = -1)

colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])

# Short display labels: every term other than 'fetal_effect_PGS' is shown as
# 'Maternal'; the ' PGS' suffix is stripped from the outcome names.
d$term <- ifelse(d$term == 'fetal_effect_PGS', 'Fetal', 'Maternal')
d$outcome <- gsub(' PGS', '', d$outcome)

panel_frame <- theme(
  strip.background = element_blank(),
  panel.border = element_rect(colour = "black", fill = NA)
)

p1 <- ggplot(d, aes(term, estimate, colour = term)) +
  geom_pointrange(aes(ymin = lo95, ymax = up95)) +
  facet_wrap(vars(outcome)) +
  scale_colour_manual(guide = 'none', values = colorBlindBlack8[c(2, 4)]) +
  theme_cowplot(10) +
  geom_hline(yintercept = 0, colour = 'grey', size = 0.5, linetype = 'dashed') +
  panel_frame +
  ylab('Effect on gestational duration \ngenetic score (95% CI), days') +
  xlab('Birth weight genetic score')

ggsave(snakemake@output[[1]], plot = p1, width = 180, height = 100, units = 'mm', dpi = 300)
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
figures/GA_BW_PGS_correlations.R
# Scatter of the effect on gestational duration (x) against the effect on
# preterm delivery (y) for the top variants of either GWAS, with +/- 1 SE bars.
#
# Inputs  : snakemake@input[[1]]  top gestational-duration variants (ID)
#           snakemake@input[[2]]  top preterm-delivery variants (ID)
#           snakemake@input[[3]]  gestational-duration summary stats
#           snakemake@input[[4]]  preterm-delivery summary stats
# Outputs : [[1]] figure without legend, [[2]] same figure with the fill legend
library(MendelianRandomization)
library(data.table)
library(dplyr)
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library('showtext')

colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Top gestational-duration variants plus two IDs added by hand.
top_ga <- fread(snakemake@input[[1]])
top_ga <- c(pull(top_ga, ID), '5:158058432:G:T', '3:156697097:A:G')

top_ptd <- fread(snakemake@input[[2]])
top_ptd <- pull(top_ptd, ID)

top <- unique(c(top_ga, top_ptd))

ga <- fread(snakemake@input[[3]], select = c('ID', 'BETA', 'SE'))
ga <- filter(ga, ID %in% top)

ptd <- fread(snakemake@input[[4]], select = c('ID', 'BETA', 'SE'))
ptd <- filter(ptd, ID %in% top) %>%
  select(ID, BETA, SE)
names(ptd) <- c('ID', 'BETA_ptd', 'SE_ptd')

d <- inner_join(ga, ptd, by = 'ID')

# Which GWAS the variant is a top hit in; one ID is labelled as both.
d$GWAS <- with(
  d,
  ifelse(ID == '5:157895049:C:T', 'Both phenotypes',
  ifelse(ID %in% top_ptd, 'Preterm delivery', 'Gestational duration'))
)

gwas_colours <- colorBlindBlack8[c(4, 2, 1)]

# Everything the two figures share; they differ only in the fill guide. The
# error bars inherit colour from the plot mapping; the `fill` they used to map
# explicitly is not an aesthetic of those geoms and was dropped.
effect_scatter <- ggplot(d, aes(BETA, BETA_ptd, colour = GWAS, fill = GWAS)) +
  geom_errorbarh(
    aes(xmin = BETA - SE, xmax = BETA + SE),
    size = 0.1, alpha = 0.7
  ) +
  geom_errorbar(
    aes(ymin = BETA_ptd - SE_ptd, ymax = BETA_ptd + SE_ptd),
    size = 0.1, alpha = 0.7
  ) +
  geom_point(size = 2, shape = 21, stroke = 0.1, alpha = 0.7) +
  scale_colour_manual(values = gwas_colours, guide = 'none') +
  xlab('Maternal effect on gestational duration, days') +
  ylab('Maternal effect on preterm delivery, log(OR)') +
  theme_cowplot(font_size = 8) +
  geom_hline(yintercept = 0, size = 0.1) +
  geom_vline(xintercept = 0, size = 0.1) +
  theme(
    axis.line.x = element_blank(),
    axis.line.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_line(colour = 'grey', size = 0.05)
  )

p1 <- effect_scatter +
  scale_fill_manual(values = gwas_colours, guide = 'none')

ggsave(snakemake@output[[1]], plot = p1, width = 70, height = 70, units = 'mm', dpi = 300)

p1 <- effect_scatter +
  scale_fill_manual(values = gwas_colours)

ggsave(snakemake@output[[2]], plot = p1, width = 70, height = 70, units = 'mm', dpi = 300)
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
figures/GAraw_vs_allPTD.R
# Scatter of gene-based p-value (x, -log10) against the colocalization
# posterior PP.H4.abf (y), filled by whether the two z-scores of the top
# colocalizing SNP agree in sign.
#
# Inputs  : snakemake@input[[1]]  colocalization posteriors per `protein`
#           snakemake@input[[2]]  gene-based results (Gene, Pvalue)
#           snakemake@input[[3]]  gene dictionary (5 columns, renamed below)
#           snakemake@input[[4]]  summary stats (ID, BETA)
#           snakemake@input[[5]]  per-SNP colocalization results
# Outputs : [[1]] figure without legends, [[2]] plotted table,
#           [[3]] figure with legends, [[4]] copy of input 1
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')

# NOTE(review): this silences every warning for the rest of the script.
options(warn = -1)

colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Input 1 is read once; an earlier extra read into `d` was overwritten before
# it was ever used.
pph <- fread(snakemake@input[[1]])
supp_table <- pph

geneb <- fread(snakemake@input[[2]])

gene_dict <- fread(snakemake@input[[3]])
names(gene_dict) <- c('CHR', 'POS1', 'POS2', 'Gene', 'EnsembleID')
# Identifier without its version suffix (text before the first '.').
gene_dict$EID <- vapply(
  strsplit(as.character(gene_dict$EnsembleID), ".", fixed = TRUE),
  `[[`, character(1), 1
)

d <- inner_join(pph, gene_dict, by = c('protein' = 'EID')) %>%
  inner_join(., geneb, by = 'Gene')

#supp_table= full_join(pph, gene_dict, by= c('protein'= 'EID')) %>% full_join(., geneb, by= 'Gene') %>% filter(Pvalue< 0.05/ nrow(geneb) | PP.H4.abf>= 0.9)

# Per protein, keep the SNP with the highest SNP.PP.H4.
z <- fread(snakemake@input[[5]], select = c('z.df1', 'z.df2', 'SNP.PP.H4', 'protein', 'snp'))
z <- z %>%
  arrange(desc(SNP.PP.H4)) %>%
  group_by(protein) %>%
  filter(row_number() == 1) %>%
  ungroup()

d <- left_join(d, z, by = c('protein'))
d <- separate(d, snp, into = c('CHR', 'POS', 'REF', 'EFF'), sep = ':', remove = FALSE)

#aa= fread(snakemake@input[[6]])
#names(aa)= c('CHR', 'POS', 'REF', 'ALT', 'AA')
#aa= filter(aa, AA!= '.')
#aa= filter(aa, POS %in% d$POS)
#aa$ID= with(aa, ifelse(REF> ALT, paste(CHR, POS, ALT, REF, sep= ':'), paste(CHR, POS, REF, ALT, sep= ':')))
#d= left_join(d,aa[, c('ID', 'AA')], by= c('snp'= 'ID'))
#d$z.df1= with(d, ifelse(d$AA== d$EFF, -1 * d$z.df1, d$z.df1))
#d$z.df2= with(d, ifelse(d$AA== d$EFF, -1 * d$z.df2, d$z.df2))
#d$direction= with(d, ifelse(z.df1>0 & z.df2 > 0, 'Positive', ifelse(z.df1<0 & z.df2< 0, 'Negative', 'Opposite')))
#d$direction= with(d, ifelse(is.na(d$AA), 'Missing', d$direction))

d$direction <- with(d, ifelse((z.df1 * z.df2) > 0, 'Same direction', 'Opposite'))

# Bonferroni threshold over the gene-based tests.
sig_threshold <- 0.05 / nrow(geneb)

# Four-way classification. The third branch uses >= so that a p-value exactly
# at the threshold with PP.H4.abf > 0.9 counts as 'Colocalize'.
# NOTE(review): gene_group is neither plotted nor written out below.
d$gene_group <- with(
  d,
  ifelse(PP.H4.abf > 0.9 & Pvalue < sig_threshold, 'Colocalize and gene-based significant',
  ifelse(Pvalue < sig_threshold & PP.H4.abf <= 0.9, 'Gene based significant',
  ifelse(PP.H4.abf > 0.9 & Pvalue >= sig_threshold, 'Colocalize',
         'No colocalize and not significant')))
)

ga <- fread(snakemake@input[[4]], select = c('ID', 'BETA'))
d <- inner_join(d, ga, by = c('snp' = 'ID'))

# Layers shared by both figures; the caller adds the alpha/size/fill scales.
#
# dat: table with Gene, Pvalue, PP.H4.abf and direction.
# Genes passing either cut-off are labelled; dashed lines mark PP.H4 = 0.9 and
# the Bonferroni threshold.
coloc_scatter <- function(dat) {
  ggplot(
    dat,
    aes(-log10(Pvalue), PP.H4.abf, fill = direction,
        alpha = (1 + PP.H4.abf) * -log10(Pvalue))
  ) +
    geom_point(shape = 21, colour = 'black', size = 4) +
    theme_cowplot(font_size = 10) +
    geom_text_repel(
      data = filter(dat, PP.H4.abf > 0.9 | Pvalue < sig_threshold),
      aes(label = Gene),
      max.overlaps = 20, colour = 'black', size = 6 / .pt,
      max.time = 10, alpha = 1
    ) +
    geom_hline(
      yintercept = 0.9, colour = colorBlindBlack8[8],
      linetype = 'dashed', size = 0.2, alpha = 0.6
    ) +
    geom_vline(
      xintercept = -log10(sig_threshold), colour = colorBlindBlack8[8],
      linetype = 'dashed', size = 0.2, alpha = 0.6
    ) +
    scale_y_continuous(
      breaks = c(seq(0, 1, 0.25), 0.9),
      limits = c(0, 1),
      expand = expansion(mult = c(0.05, 0))
    ) +
    ylab('Posterior probability of colocalization') +
    xlab('-log10(Gene based p-value)')
}

# Figure without legends ('none' replaces the deprecated guide = F).
p1 <- coloc_scatter(d) +
  scale_alpha_continuous(guide = 'none') +
  scale_size_continuous(range = c(.001, 10), guide = 'none') +
  scale_fill_manual(values = c(colorBlindBlack8[c(2, 4)]), guide = 'none')

ggsave(snakemake@output[[1]], plot = p1, width = 95, height = 95, units = 'mm', dpi = 300)

d <- select(d, Gene, BETA, direction, Pvalue, PP.H4.abf, z.df1, z.df2)
fwrite(d, snakemake@output[[2]], sep = '\t')

# Same figure with its legends shown.
p1 <- coloc_scatter(d) +
  scale_alpha_continuous('Legend') +
  scale_size_continuous('Legend', range = c(.001, 10)) +
  scale_fill_manual('Legend', values = c(colorBlindBlack8[c(2, 4)]))

ggsave(snakemake@output[[3]], plot = p1, width = 90, height = 90, units = 'mm', dpi = 300)

fwrite(supp_table, snakemake@output[[4]], sep = '\t')
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
figures/gene_based_vs_coloc_iPSC.R
# Genetic correlation of gestational duration with preterm and post-term
# delivery, drawn as point estimates with 95% confidence intervals.
#
# Input  [[1]]: table with columns `p2`, `rg` and `se`.
# Output [[1]]: 60 x 80 mm figure.

library(scales)
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library("showtext")
library(tidyverse)
library(fmsb)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

rg_df <- fread(snakemake@input[[1]])
rg_df$p1 <- "Gestational\nduration"

# Turn the second phenotype into a display label; anything that is neither
# post-term nor preterm is tagged 'GAnrm' and removed.
rg_df$p2 <- with(
  rg_df,
  ifelse(grepl("postTerm", p2), "Post-term\ndelivery",
         ifelse(grepl("allPTD", p2), "Preterm\ndelivery", "GAnrm"))
)
rg_df <- filter(rg_df, p2 != "GAnrm")

# The y axis always spans at least [-1, 1] and widens to fit the intervals.
y_lower <- min(-1, min(rg_df$rg - 1.96 * rg_df$se))
y_upper <- max(1, max(rg_df$rg + 1.96 * rg_df$se))

rg_plot <- ggplot(rg_df, aes(p2, rg, colour = p2)) +
  geom_point() +
  geom_errorbar(
    aes(ymin = I(rg - 1.96 * se), ymax = (rg + 1.96 * se)),
    width = .2,
    position = position_dodge(.9)
  ) +
  theme_cowplot(font_size = 9) +
  scale_fill_manual(values = colorBlindBlack8[c(8, 3, 2)], guide = "none") +
  scale_colour_manual(guide = "none", values = colorBlindBlack8[c(8, 3, 2)]) +
  xlab("Phenotype") +
  ylab("Genetic correlation [95% CI]") +
  theme(legend.position = "none") +
  ylim(y_lower, y_upper) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey", size = 0.5)

ggsave(snakemake@output[[1]], plot = rg_plot, width = 60, height = 80,
       units = "mm", dpi = 300)
# Common-SNP heritability (with 95% CI) for each phenotype.
#
# Input  [[1]]: table with columns `pheno`, `h2` and `se`.
# Output [[1]]: 60 x 80 mm figure.

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library("showtext")

options(warn = -1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = "arial_bold.ttf")
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Read the input once (the original read it twice and discarded the first copy).
d <- fread(snakemake@input[[1]])

# Drop the 'GAnrm' phenotype and give the remaining ones display labels;
# anything other than 'GAraw' / 'allPTD' is labelled as post-term delivery.
d <- filter(d, pheno != "GAnrm")
d$pheno <- with(
  d,
  ifelse(pheno == "GAraw", "Gestational\nduration",
         ifelse(pheno == "allPTD", "Preterm\ndelivery", "Post-term\ndelivery"))
)

p1 <- ggplot(d, aes(pheno, h2, colour = pheno)) +
  geom_point() +
  geom_errorbar(
    aes(ymin = I(h2 - 1.96 * se), ymax = (h2 + 1.96 * se)),
    width = .2,
    position = position_dodge(.9)
  ) +
  theme_cowplot(font_size = 9) +
  scale_fill_manual(values = colorBlindBlack8[c(8, 3, 2)], guide = "none") +
  scale_colour_manual(guide = "none", values = colorBlindBlack8[c(8, 3, 2)]) +
  xlab("Phenotype") +
  ylab("Common SNP heritability [95% CI]") +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

ggsave(snakemake@output[[1]], plot = p1, width = 60, height = 80,
       units = "mm", dpi = 300)
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
figures/h2_allphenos.R
# Per-cohort common-SNP heritability (with 95% CI), one facet per trait.
#
# Input  [[1]]: per-cohort estimates, labelled 'Gestational\nduration'.
# Input  [[2]]: per-cohort estimates, labelled 'Preterm delivery'.
#               Both tables are used through `cohort`, `h2` and `se`.
# Output [[1]]: 60 x 120 mm figure.

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library("showtext")

options(warn = -1)

ga_h2 <- fread(snakemake@input[[1]], header = TRUE)
ptd_h2 <- fread(snakemake@input[[2]], header = TRUE)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
cohort_palette <- colorBlindBlack8[c(8, 3, 2, 6, 7, 4, 1)]

font_add("arial", "arial.ttf", bold = "arial_bold.ttf")
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Tag each table with its trait label and stack them.
ga_h2$trait <- "Gestational\nduration"
ptd_h2$trait <- "Preterm delivery"
h2_df <- rbind(ga_h2, ptd_h2)

h2_plot <- ggplot(h2_df, aes(cohort, h2, colour = cohort)) +
  geom_point() +
  geom_errorbar(
    aes(ymin = I(h2 - 1.96 * se), ymax = (h2 + 1.96 * se)),
    width = .2,
    position = position_dodge(.9)
  ) +
  theme_cowplot(font_size = 9) +
  facet_wrap(vars(trait), ncol = 1) +
  scale_fill_manual(values = cohort_palette, guide = "none") +
  scale_colour_manual(guide = "none", values = cohort_palette) +
  xlab("Cohort") +
  ylab("Common SNP heritability [95% CI]") +
  theme(legend.position = "none",
        strip.background = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1))

ggsave(snakemake@output[[1]], plot = h2_plot, width = 60, height = 120,
       units = "mm", dpi = 300)
# Circular bar plot of colocalization posterior probabilities (PP.H4.abf)
# between the pregnancy traits and phenotypes from two cohorts.
#
# Inputs (snakemake@input):
#   [[1]] colocalization results, tagged cohort 'UKBB' (column 8 = phenotype code)
#   [[2]] phenotype manifest for [[1]] (trait_type, phenocode, description,
#         saige_heritability_EUR)
#   [[3]] colocalization results, tagged cohort 'FINNGEN' (column 8 = phenotype code)
#   [[4]] phenotype manifest for [[3]] (phenocode, name)
# Outputs (snakemake@output):
#   [[1]] circular bar plot (8 x 8 in)
#   [[2]] plotted table, including the empty spacer rows and the bar `id`

library(dplyr)
library(data.table)
library(ggplot2)
library(cowplot)
library(ggrepel)
library(tidyr)
library(showtext)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = "arial_bold.ttf")
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# ---- First cohort ('UKBB') -------------------------------------------------
d <- fread(snakemake@input[[1]])
names(d)[8] <- "phenocode"

mani <- fread(snakemake@input[[2]])
trait_list <- c("biomarkers", "continuous", "icd10")
mani <- mani[mani$trait_type %in% trait_list, ]
mani <- filter(mani, saige_heritability_EUR > 0.01)
# Keep, per phenocode, the entry with the highest heritability.
mani <- mani[order(mani$saige_heritability_EUR, decreasing = TRUE), ]
mani <- mani[!duplicated(mani$phenocode), ]
mani$phenocode <- paste(mani$trait_type, mani$phenocode, sep = "_")
mani <- mani[, c("phenocode", "description")]
mani <- mani[!duplicated(mani$description), ]

d <- inner_join(d, mani[, c("description", "phenocode")], by = "phenocode")
d$cohort <- "UKBB"

# ---- Second cohort ('FINNGEN') ---------------------------------------------
x <- fread(snakemake@input[[3]])
names(x)[8] <- "phenocode"

mani <- fread(snakemake@input[[4]])
mani <- mani[, c("phenocode", "name")]
names(mani) <- c("phenocode", "description")
mani <- mani[!duplicated(mani$description), ]

x <- inner_join(x, mani, by = "phenocode")
x$cohort <- "FINNGEN"

# ---- Combine and filter ----------------------------------------------------
d <- rbind(d, x)
d <- d[order(d$PP.H4.abf, decreasing = FALSE), ]
d <- filter(d, PP.H4.abf > 0.01, PP.H4.abf + PP.H3.abf > 0.75)
d$preg_trait <- factor(d$preg_trait)

# Append `empty_bar` all-NA rows per pregnancy trait; they become the gaps
# between the groups of bars.
empty_bar <- 6
to_add <- data.frame(matrix(NA, empty_bar * nlevels(d$preg_trait), ncol(d)))
colnames(to_add) <- colnames(d)
to_add$preg_trait <- rep(levels(d$preg_trait), each = empty_bar)
d <- rbind(d, to_add)
d <- d %>% arrange(preg_trait)
d$id <- seq_len(nrow(d))

# Label angle and justification follow the bar position on the circle, so
# labels on the left half are flipped to stay readable.
label_data <- d
number_of_bar <- nrow(label_data)
angle <- 90 - 360 * (label_data$id - 0.5) / number_of_bar
label_data$hjust <- ifelse(angle < -90, 1, 0)
label_data$angle <- ifelse(angle < -90, angle + 180, angle)
# Only bars above 0.75 get a text label; filter once and reuse.
labelled <- filter(label_data, PP.H4.abf > 0.75)

# d$id= factor(d$id, levels= d$id[order(d$PP.H4.abf)])

# Start/end bar id of each trait's spacer block; the scale arcs are drawn there.
base_data <- d %>%
  group_by(preg_trait) %>%
  filter(is.na(PP.H4.abf)) %>%
  summarize(start = min(id), end = max(id)) %>%
  rowwise() %>%
  mutate(title = mean(c(start, end)))

# One grey arc per scale level across each spacer block. A scalar level is
# recycled by ggplot2, so this no longer depends on base_data having two rows.
arc_levels <- c(1, 0.75, 0.50, 0.25)
arc_segment <- function(level) {
  force(level)
  geom_segment(data = base_data,
               aes(x = end, y = level, xend = start, yend = level),
               colour = "grey", alpha = 1, size = 0.3, inherit.aes = FALSE)
}

scale_label_y <- c((0.25 + 0.05), (0.50 + 0.05), (0.75 + 0.05), (1 + 0.05))
scale_label_text <- c("0.25", "0.50", "0.75", "1")

p1 <- ggplot(d, aes(as.factor(id), PP.H4.abf, fill = preg_trait,
                    alpha = PP.H4.abf)) +
  geom_bar(stat = "identity", colour = NA) +
  scale_alpha_continuous(range = c(0.4, 1), guide = "none") +
  lapply(arc_levels, arc_segment) +
  # Scale labels inside the first two spacer blocks.
  annotate("text", x = ((base_data$end[1] + base_data$start[1]) / 2),
           y = scale_label_y, label = scale_label_text, color = "grey",
           size = 3, angle = 0, fontface = "bold", hjust = 0.5) +
  annotate("text", x = ((base_data$end[2] + base_data$start[2]) / 2),
           y = scale_label_y, label = scale_label_text, color = "grey",
           size = 3, angle = 15, fontface = "bold", hjust = 0.5) +
  # Limits of the plot = very important. The negative value controls the size
  # of the inner circle, the positive one is useful to add size over each bar.
  ylim(-0.2, 2) +
  theme_cowplot() +
  scale_fill_manual(values = colorBlindBlack8[c(2, 4)], guide = "none") +
  scale_colour_manual(values = colorBlindBlack8[c(2, 4)], guide = "none") +
  theme(axis.text = element_blank(),
        axis.title = element_blank(),
        panel.grid = element_blank(),
        plot.margin = unit(rep(-2, 4), "cm")) +
  coord_polar(start = 0) +
  geom_text(data = labelled,
            aes(x = factor(id), y = PP.H4.abf + 0.01, label = description,
                hjust = hjust),
            color = "black", fontface = "bold", alpha = 0.6, size = 2.5,
            angle = labelled$angle, inherit.aes = FALSE) +
  theme(panel.grid = element_blank(),
        axis.title = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank())

save_plot(snakemake@output[[1]], p1, base_width = 8, base_height = 8)

fwrite(d, snakemake@output[[2]], sep = "\t")
# Heritability-enrichment scatter plots per category / gene set.
#
# Input  [[1]]: table with columns `Category`, `Enrichment`, `Enrichment_p`
#               and `n_genes`.
# Output [[1]]: enrichment vs. -log10(P), 120 x 90 mm, no legend.
# Output [[2]]: gene-set size vs. -log10(P), 90 x 90 mm, no legend.
# Output [[3]]: same as [[2]] but with the point-size legend.

library(data.table)
library(dplyr)
library(cowplot)
library(ggrepel)
library("showtext")

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = "arial_bold.ttf")
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])
d$Category <- factor(d$Category, levels = unique(d$Category))

# Shared scatter: point size flags categories passing the Bonferroni
# threshold 0.05 / (nrow(d) - 1); categories with nominal P < 0.05 are
# labelled. The size expression is kept inline because ggplot2 uses it as the
# legend title in the version that shows the legend.
enrichment_scatter <- function(x_col, x_label) {
  ggplot(d, aes(.data[[x_col]], -log10(Enrichment_p))) +
    geom_point(aes(size = Enrichment_p < 0.05 / (nrow(d) - 1)),
               shape = 21, stroke = 0.1, fill = colorBlindBlack8[4]) +
    xlab(x_label) +
    # The y axis is -log10 of the enrichment P-value in all three plots; the
    # gene-set plots previously carried the label '-log10(Enrichment)'.
    ylab("-log10(P-value)") +
    theme_cowplot(font_size = 8) +
    geom_hline(yintercept = 0, size = 0.1) +
    geom_vline(xintercept = 0, size = 0.1) +
    theme(panel.grid.major = element_line(colour = "grey", size = 0.05)) +
    geom_text_repel(data = filter(d, Enrichment_p < 0.05),
                    aes(label = Category), size = 8 / .pt)
}

no_legend <- theme(legend.position = "none")

p1 <- enrichment_scatter("Enrichment", "Heritability enrichment") + no_legend
ggsave(snakemake@output[[1]], plot = p1, width = 120, height = 90,
       units = "mm", dpi = 300)

p3 <- enrichment_scatter("n_genes", "Size of gene set")
p2 <- p3 + no_legend
ggsave(snakemake@output[[2]], plot = p2, width = 90, height = 90,
       units = "mm", dpi = 300)
ggsave(snakemake@output[[3]], plot = p3, width = 90, height = 90,
       units = "mm", dpi = 300)
# Scatter plots of the maternal-genome effect (BETA) against the effects of
# the maternal non-transmitted (MNT), paternal transmitted (PT) and maternal
# transmitted (MT) alleles, coloured by effect-origin class.
#
# Inputs (snakemake@input):
#   [[1]] per-variant table with `rsid`, `class_name` and
#         beta_/se_ columns for MT, MNT and PT
#   [[2]] GWAS summary statistics (uses `RSID`, `BETA`)
# Outputs (snakemake@output):
#   [[1]] MNT vs. maternal genome (60 x 60 mm)
#   [[2]] PT vs. maternal genome (60 x 60 mm)
#   [[3]] MT vs. maternal genome, no legend (60 x 60 mm)
#   [[4]] MNT vs. maternal genome with horizontal legend below (120 x 60 mm)
#   [[5]] the plotted table

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library("showtext")

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = "arial_bold.ttf")
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])
x <- fread(snakemake@input[[2]], select = c("RSID", "BETA"))
d <- inner_join(d, x, by = c("rsid" = "RSID"))

# Orient every variant so that the maternal-genome effect is positive.
# BETA must be flipped last because the other flips test its sign.
d$beta_MNT <- with(d, ifelse(BETA < 0, -1 * beta_MNT, beta_MNT))
d$beta_PT <- with(d, ifelse(BETA < 0, -1 * beta_PT, beta_PT))
d$beta_MT <- with(d, ifelse(BETA < 0, -1 * beta_MT, beta_MT))
d$BETA <- with(d, ifelse(BETA < 0, -1 * BETA, BETA))

# 95% confidence limits (not plotted, but written to the output table).
d$lo95_MT <- d$beta_MT - 1.96 * d$se_MT
d$up95_MT <- d$beta_MT + 1.96 * d$se_MT
d$lo95_MNT <- d$beta_MNT - 1.96 * d$se_MNT
d$up95_MNT <- d$beta_MNT + 1.96 * d$se_MNT
d$lo95_PT <- d$beta_PT - 1.96 * d$se_PT
d$up95_PT <- d$beta_PT + 1.96 * d$se_PT

# Expand the class codes to display labels; unknown codes become ''.
d$class_name <- with(
  d,
  ifelse(class_name == "MF SD", "Maternal and fetal (same direction)",
  ifelse(class_name == "Fetal MatT", "Fetal effect, maternal transmitted only",
  ifelse(class_name == "Maternal", "Maternal",
  ifelse(class_name == "Fetal", "Fetal",
  ifelse(class_name == "MF OD", "Maternal and fetal (opposite direction)",
         "")))))
)

class_colours <- c("grey", colorBlindBlack8[c(8, 2, 4, 3)])

# Scatter of one haplotype effect (`x_col`) against the maternal-genome effect.
# `legend = FALSE` hides the colour legend.
origin_scatter <- function(x_col, x_label, legend = TRUE) {
  ggplot(d, aes(.data[[x_col]], BETA, colour = class_name)) +
    geom_point(size = 0.5) +
    theme_cowplot(font_size = 8) +
    scale_colour_manual(values = class_colours,
                        guide = if (legend) "legend" else "none") +
    geom_vline(xintercept = 0, colour = colorBlindBlack8[1],
               linetype = "dashed", size = 0.2, alpha = 0.6) +
    geom_hline(yintercept = 0, colour = colorBlindBlack8[1],
               linetype = "dashed", size = 0.2, alpha = 0.6) +
    xlab(x_label) +
    ylab("Effect size maternal genome, days")
}

mnt_label <- "Effect size maternal \nnon-transmitted alleles, days"

p1 <- origin_scatter("beta_MNT", mnt_label)
ggsave(snakemake@output[[1]], plot = p1, width = 60, height = 60,
       units = "mm", dpi = 300)
print("plot1")

p1 <- origin_scatter("beta_PT",
                     "Effect size paternal \ntransmitted alleles, days")
ggsave(snakemake@output[[2]], plot = p1, width = 60, height = 60,
       units = "mm", dpi = 300)
print("plot2")

p1 <- origin_scatter("beta_MT",
                     "Effect size maternal \ntransmitted alleles, days",
                     legend = FALSE)
ggsave(snakemake@output[[3]], plot = p1, width = 60, height = 60,
       units = "mm", dpi = 300)

# Wide version with the legend below the panel. The original lacked the `+`
# before theme(), so the legend settings were evaluated on their own and never
# reached the plot.
p1 <- origin_scatter("beta_MNT", mnt_label) +
  theme(legend.direction = "horizontal", legend.position = "bottom")
ggsave(snakemake@output[[4]], plot = p1, width = 120, height = 60,
       units = "mm", dpi = 300)

fwrite(d, snakemake@output[[5]], sep = "\t")
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
figures/lm_effect_origin.R
# Bar chart of gene-list enrichment P-values.
#
# Inputs [[1]], [[2]]: two nine-column tables that are stacked; the columns
#   are renamed to Name, no_no, no_yes, yes_no, yes_yes, candidate_gene,
#   rest_genes, OR, pvalue.
# Output [[1]]: horizontal bar chart of -log10(pvalue), 90 x 35 mm.
# Output [[2]]: the plotted table.

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library("showtext")

options(warn = -1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = "arial_bold.ttf")
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

enrich <- rbind(fread(snakemake@input[[1]]), fread(snakemake@input[[2]]))
names(enrich) <- c("Name", "no_no", "no_yes", "yes_no", "yes_yes",
                   "candidate_gene", "rest_genes", "OR", "pvalue")

enrich <- enrich %>%
  mutate(enrichment = candidate_gene / rest_genes) %>%
  # Largest P-value first; the factor levels below follow this row order.
  arrange(desc(pvalue)) %>%
  mutate(
    description = ifelse(Name == "pli", "Loss-of-function intolerant",
                         ifelse(Name == "dominant", "Dominant", "Recessive")),
    description = factor(description, levels = unique(description))
  )

# Dashed line marks the Bonferroni threshold for the number of gene lists.
bar_plot <- ggplot(data = enrich, aes(x = description, y = -log10(pvalue))) +
  geom_col(fill = colorBlindBlack8[2], alpha = 0.6) +
  theme_cowplot(font_size = 10) +
  ylab("Enrichment -log10(pvalue)") +
  theme(axis.title.y = element_blank()) +
  geom_hline(yintercept = -log10(0.05 / nrow(enrich)),
             linetype = "dashed", colour = "grey") +
  coord_flip()

ggsave(snakemake@output[[1]], plot = bar_plot, height = 35, width = 90,
       dpi = 300, units = "mm")

fwrite(enrich, snakemake@output[[2]], sep = "\t")
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
figures/MacArthurlab_enrichment.R
# Manhattan plot for a single phenotype (figures/manhattan_plot_postTerm.R).
#
# Inputs (snakemake@input):
#   [[1]] GWAS summary statistics (uses ID, CHR, POS, pvalue, nearestGene)
#   [[2]] top variants (uses ID, nearestGene, CHR, POS, BETA)
# Output [[1]]: Manhattan plot, 185 x 90 mm.

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library("showtext")

options(warn = -1)

d <- fread(snakemake@input[[1]], header = TRUE)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Hard-coded positions whose surrounding windows are flagged as
# 'Previous discovery'.
ge <- data.frame(CHR = c(5, 3, 1, 23, 3, 23),
                 pos_ge = c(157895049, 127881613, 22470407,
                            115164770, 123068359, 131300571))

df <- arrange(d, pvalue)

dg <- fread(snakemake@input[[2]])
dg$GENE <- dg$nearestGene

# Cumulative genomic position: offset each chromosome by the summed length
# (max POS) of the chromosomes before it.
don <- df %>%
  group_by(CHR) %>%
  summarise(chr_len = max(POS)) %>%
  mutate(tot = cumsum(as.numeric(chr_len)) - chr_len) %>%
  select(-chr_len) %>%
  left_join(df, ., by = "CHR") %>%
  arrange(CHR, POS) %>%
  mutate(BPcum = POS + tot) %>%
  ungroup()

# Mid-point of each chromosome on the cumulative axis, for the tick labels.
axisdf <- don %>%
  group_by(CHR) %>%
  summarize(center = (max(BPcum) + min(BPcum)) / 2)
names(axisdf) <- c("CHR", "center")

HC <- -log10(5 * 10^-8)

dg <- dg %>%
  ungroup() %>%
  select(ID, GENE, CHR, POS, BETA)

# disc: 0 = not significant, 1 = previous discovery, 2 = new discovery.
don$disc <- ifelse(don$pvalue > 5 * 10^-8, 0, 2)
don <- left_join(don, select(dg, ID, GENE), by = "ID")
names(dg) <- c("ID", "GENE", "CHR", "POS_new", "BETA")

lims <- 250000
don <- data.frame(don)
dg <- data.frame(dg)

# Set `disc` to `flag` for every variant within +/- `lims` bp of (chr, pos).
flag_window <- function(dat, chr, pos, flag) {
  chr <- as.integer(chr)
  pos <- as.integer(pos)
  hit <- dat$CHR == chr & dat$POS >= pos - lims & dat$POS <= pos + lims
  dat$disc <- ifelse(hit, flag, dat$disc)
  dat
}

for (i in seq_len(nrow(dg))) {
  don <- flag_window(don, dg[i, "CHR"], dg[i, "POS_new"], 2)
}
# Previous discoveries are applied last so they override 'new' flags.
for (i in seq_len(nrow(ge))) {
  don <- flag_window(don, ge[i, "CHR"], ge[i, "pos_ge"], 1)
}

# Sort so that flagged points are drawn on top of the grey background.
don <- don[order(don$disc, decreasing = FALSE, na.last = TRUE), ]
don$disc <- factor(don$disc, levels = c(0, 1, 2),
                   labels = c("Not significant", "Previous discovery",
                              "New discovery"))
cols <- c("Not significant" = "grey",
          "Previous discovery" = colorBlindBlack8[3],
          "New discovery" = colorBlindBlack8[8])

don$GENE <- ifelse(!is.na(don$GENE), don$nearestGene, don$GENE)

font_add("arial", "arial.ttf", bold = "arial_bold.ttf")
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Some loci are labelled with two gene names.
don$GENE <- with(
  don,
  ifelse(GENE == "CDC42", "CDC42/ WNT4",
  ifelse(GENE == "HIVEP3", "HIVEP3/ EDN2",
  ifelse(GENE == "TET3", "TET3/ DGUOK-AS1",
  ifelse(GENE == "TCEA2", "TCEA2/ OPRL1", GENE))))
)

don$logpval <- -log10(don$pvalue)

# Variants that carry a gene label (rows with NA GENE are dropped by filter()).
labelled <- filter(don, GENE != "")

p1 <- ggplot(data = don, aes(x = BPcum, y = logpval, colour = disc)) +
  geom_point(size = 0.07) +
  theme_cowplot(font_size = 9) +
  scale_colour_manual(values = cols, guide = "none") +
  # Tick labels are hard-coded for 23 chromosomes (20 and 22 left blank).
  scale_x_continuous(labels = c(1:19, "", 21, "", "X"),
                     breaks = axisdf$center, expand = c(0.03, 0.03)) +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(min(don$logpval) - 2, max(don$logpval) + 2),
                     breaks = seq(0, 10, 5),
                     labels = c(abs(seq(0, 10, 5)))) +
  ylab("-log10(pvalue)") +
  xlab("Chromosome") +
  geom_hline(yintercept = 0, size = 0.25, colour = "black") +
  geom_hline(yintercept = c(HC, -HC), size = 0.2, linetype = 2,
             colour = "#878787") +
  coord_cartesian(clip = "off") +
  geom_text_repel(
    data = labelled,
    aes(x = BPcum, y = logpval, label = GENE),
    size = 6 / .pt,
    force_pull = 0, # do not pull toward data points
    force = 0.1,
    nudge_y = ifelse(labelled$logpval > 0, 1, -1),
    direction = "both",
    hjust = 0,
    vjust = 0.5,
    box.padding = 0.1,
    angle = 0,
    segment.size = 0.1,
    segment.square = TRUE,
    segment.inflect = FALSE,
    segment.colour = colorBlindBlack8[8],
    colour = ifelse(labelled$disc == "New discovery",
                    colorBlindBlack8[8], colorBlindBlack8[3]),
    segment.linetype = 4,
    ylim = c(-Inf, 50),
    xlim = c(-Inf, Inf)
  ) +
  theme(legend.position = "none",
        plot.margin = unit(c(t = 0, r = 0, b = 0, l = 0), "cm"),
        text = element_text(family = "arial", size = 9),
        axis.line = element_line(size = 0.1))

save_plot(snakemake@output[[1]], plot = p1, base_height = 90,
          base_width = 185, units = "mm", dpi = 300)
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
figures/manhattan_plot_postTerm.R
# Mirrored Manhattan plot (figures/manhattan_plot.R): phenotype 'GAraw' is
# drawn upwards (-log10 P) and 'allPTD' downwards (log10 P).
#
# Inputs (snakemake@input):
#   [[1]] summary statistics tagged 'GAraw'
#   [[2]] top variants used to flag and label the 'GAraw' half
#   [[3]] summary statistics tagged 'allPTD'
#   [[4]] top variants used to flag and label the 'allPTD' half
# Output [[1]]: Manhattan plot, 180 x 90 mm.

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library("showtext")

options(warn = -1)

sumstat_cols <- c("ID", "CHR", "POS", "pvalue", "nearestGene")

d <- fread(snakemake@input[[1]], header = TRUE, select = sumstat_cols)
d$pheno <- "GAraw"

x <- fread(snakemake@input[[3]], header = TRUE, select = sumstat_cols)
x$pheno <- "allPTD"

d <- rbind(d, x)
rm(x)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Hard-coded positions whose surrounding windows are flagged as
# 'Previous discovery'.
ge <- data.frame(CHR = c(5, 3, 1, 23, 3, 23),
                 pos_ge = c(157895049, 127881613, 22470407,
                            115164770, 123068359, 131300571))

df <- arrange(d, pvalue)

dg <- fread(snakemake@input[[2]])
dg$GENE <- dg$nearestGene

ptd <- fread(snakemake@input[[4]])
ptd$GENE <- ptd$nearestGene

# Cumulative genomic position: offset each chromosome by the summed length
# (max POS) of the chromosomes before it.
don <- df %>%
  group_by(CHR) %>%
  summarise(chr_len = max(POS)) %>%
  mutate(tot = cumsum(as.numeric(chr_len)) - chr_len) %>%
  select(-chr_len) %>%
  left_join(df, ., by = "CHR") %>%
  arrange(CHR, POS) %>%
  mutate(BPcum = POS + tot) %>%
  ungroup()

# Mid-point of each chromosome on the cumulative axis, for the tick labels.
axisdf <- don %>%
  group_by(CHR) %>%
  summarize(center = (max(BPcum) + min(BPcum)) / 2)
names(axisdf) <- c("CHR", "center")

HC <- -log10(5 * 10^-8)

dg <- dg %>%
  ungroup() %>%
  select(ID, GENE, CHR, POS, BETA)
ptd <- ptd %>%
  ungroup() %>%
  select(ID, GENE, CHR, POS, BETA)

# disc: 0 = not significant, 1 = previous discovery, 2 = new discovery.
don$disc <- ifelse(don$pvalue > 5 * 10^-8, 0, 2)

# Attach gene labels per phenotype, each from its own top-variant table.
don1 <- filter(don, pheno == "GAraw") %>%
  left_join(., select(dg, ID, GENE), by = "ID")
don2 <- filter(don, pheno != "GAraw") %>%
  left_join(., select(ptd, ID, GENE), by = "ID")

names(dg) <- c("ID", "GENE", "CHR", "POS_new", "BETA")
names(ptd) <- c("ID", "GENE", "CHR", "POS_new", "BETA")

lims <- 250000
dg <- data.frame(dg)
ptd <- data.frame(ptd)

# Set `disc` to `flag` for every variant within +/- `lims` bp of (chr, pos).
flag_window <- function(dat, chr, pos, flag) {
  chr <- as.integer(chr)
  pos <- as.integer(pos)
  hit <- dat$CHR == chr & dat$POS >= pos - lims & dat$POS <= pos + lims
  dat$disc <- ifelse(hit, flag, dat$disc)
  dat
}

for (i in seq_len(nrow(dg))) {
  don1 <- flag_window(don1, dg[i, "CHR"], dg[i, "POS_new"], 2)
}
for (i in seq_len(nrow(ptd))) {
  don2 <- flag_window(don2, ptd[i, "CHR"], ptd[i, "POS_new"], 2)
}

don <- rbind(don1, don2)
rm(don1)
rm(don2)

# Previous discoveries are applied last so they override 'new' flags.
for (i in seq_len(nrow(ge))) {
  don <- flag_window(don, ge[i, "CHR"], ge[i, "pos_ge"], 1)
}

# Sort so that flagged points are drawn on top of the grey background.
don <- don[order(don$disc, decreasing = FALSE, na.last = TRUE), ]
don$disc <- factor(don$disc, levels = c(0, 1, 2),
                   labels = c("Not significant", "Previous discovery",
                              "New discovery"))
cols <- c("Not significant" = "grey",
          "Previous discovery" = colorBlindBlack8[3],
          "New discovery" = colorBlindBlack8[8])

don$GENE <- ifelse(!is.na(don$GENE), don$nearestGene, don$GENE)

font_add("arial", "arial.ttf", bold = "arial_bold.ttf")
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Some loci are labelled with two gene names.
don$GENE <- with(
  don,
  ifelse(GENE == "CDC42", "CDC42/ WNT4",
  ifelse(GENE == "HIVEP3", "HIVEP3/ EDN2",
  ifelse(GENE == "TET3", "TET3/ DGUOK-AS1",
  ifelse(GENE == "TCEA2", "TCEA2/ OPRL1", GENE))))
)

# Mirror the 'allPTD' half below zero.
don$logpval <- with(don, ifelse(pheno == "allPTD",
                                log10(pvalue), -log10(pvalue)))

# Variants that carry a gene label (rows with NA GENE are dropped by filter()).
labelled <- filter(don, GENE != "")

p1 <- ggplot(data = don, aes(x = BPcum, y = logpval, colour = disc)) +
  geom_point(size = 0.07) +
  theme_cowplot(font_size = 9) +
  scale_colour_manual(values = cols, guide = "none") +
  # Tick labels are hard-coded for 23 chromosomes (20 and 22 left blank).
  scale_x_continuous(labels = c(1:19, "", 21, "", "X"),
                     breaks = axisdf$center, expand = c(0.03, 0.03)) +
  # Axis labels show absolute values because the lower half is mirrored.
  scale_y_continuous(expand = c(0, 0),
                     limits = c(min(don$logpval) - 2, max(don$logpval) + 2),
                     breaks = seq(-30, 45, 10),
                     labels = c(abs(seq(-30, 45, 10)))) +
  ylab("-log10(pvalue)") +
  xlab("Chromosome") +
  geom_hline(yintercept = 0, size = 0.25, colour = "black") +
  geom_hline(yintercept = c(HC, -HC), size = 0.2, linetype = 2,
             colour = "#878787") +
  coord_cartesian(clip = "off") +
  geom_text_repel(
    data = labelled,
    aes(x = BPcum, y = logpval, label = GENE),
    size = 6 / .pt,
    force_pull = 0, # do not pull toward data points
    force = 0.1,
    # Push labels away from the zero line in both halves.
    nudge_y = ifelse(labelled$logpval > 0, 1, -1),
    direction = "both",
    hjust = 0,
    vjust = 0.5,
    box.padding = 0.1,
    angle = 0,
    segment.size = 0.1,
    segment.square = TRUE,
    segment.inflect = FALSE,
    segment.colour = colorBlindBlack8[8],
    colour = ifelse(labelled$disc == "New discovery",
                    colorBlindBlack8[8], colorBlindBlack8[3]),
    segment.linetype = 4,
    ylim = c(-Inf, 50),
    xlim = c(-Inf, Inf)
  ) +
  theme(legend.position = "none",
        plot.margin = unit(c(t = 0, r = 0, b = 0, l = 0), "cm"),
        text = element_text(family = "arial", size = 9),
        axis.line = element_line(size = 0.1))

save_plot(snakemake@output[[1]], plot = p1, base_height = 90,
          base_width = 180, units = "mm", dpi = 300)
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
figures/manhattan_plot.R
# Density of the relative change in effect size, (Beta2 - Beta1) / Beta1,
# shown separately for the two input tables. The axis text describes this as
# the birth-weight effect with vs. without adjustment for gestational duration.
#
# Inputs (snakemake@input):
#   [[1]] variants tagged 'fetal_effect' (uses rsID, Beta1, Beta2)
#   [[2]] variants tagged 'maternal_effect' (same columns)
# Outputs (snakemake@output):
#   [[1]] mirrored density plot (maternal up, fetal down), 70 x 70 mm
#   [[2]] the plotted table
#   [[3]] overlaid density plot, 70 x 70 mm

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library("showtext")

options(warn = -1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = "arial_bold.ttf")
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Annotation font size in points (converted with .pt below).
as1 <- 8

d <- fread(snakemake@input[[1]])
d$effect <- "fetal_effect"

x <- fread(snakemake@input[[2]])
x$effect <- "maternal_effect"

d <- rbind(d, x)
d <- filter(d, !(rsID %in% c("rs7819593", "rs41311445")))

# Orient each variant so Beta1 is positive; Beta2 is flipped first because the
# test uses the sign of the not-yet-flipped Beta1.
d$Beta2 <- ifelse(d$Beta1 < 0, -1 * d$Beta2, d$Beta2)
d$Beta1 <- ifelse(d$Beta1 < 0, -1 * d$Beta1, d$Beta1)

d$beta_dif <- with(d, (Beta2 - Beta1) / Beta1)

mor <- filter(d, effect == "maternal_effect") %>% pull(beta_dif)
barn <- filter(d, effect == "fetal_effect") %>% pull(beta_dif)

axis_lines <- theme(axis.line.x = element_line(size = 0.3),
                    axis.line.y = element_line(size = 0.3),
                    axis.ticks = element_line(size = 0.3))

# Mirrored densities: maternal above the axis, fetal below.
p1 <- ggplot() +
  geom_density(mapping = aes(x = mor, y = ..density..),
               fill = colorBlindBlack8[3], colour = colorBlindBlack8[3]) +
  annotate("text", x = 0.35, y = 0.6, label = "Maternal",
           color = colorBlindBlack8[3], size = as1 / .pt, fontface = "bold") +
  annotate("text", x = 0.35, y = -1, label = "Fetal",
           color = colorBlindBlack8[8], size = as1 / .pt, fontface = "bold") +
  geom_density(mapping = aes(x = barn, y = -..density..),
               fill = colorBlindBlack8[8], colour = colorBlindBlack8[8]) +
  theme_cowplot(font_size = 8) +
  scale_x_continuous(expand = c(0, 0)) +
  xlab("Relative difference in effect size on \nbirth weight with or without adjusting for gestational duration") +
  ylab("Density") +
  geom_hline(yintercept = 0, colour = "grey") +
  axis_lines

ggsave(snakemake@output[[1]], plot = p1, width = 70, height = 70,
       units = "mm", dpi = 300)

# Overlaid, semi-transparent densities.
p1 <- ggplot(d, aes(beta_dif, group = effect, fill = effect)) +
  geom_hline(yintercept = 0, colour = "black") +
  geom_density(color = NA) +
  annotate("text", x = -1.5, y = 0.8, label = "Maternal",
           color = colorBlindBlack8[3], size = as1 / .pt, fontface = "bold") +
  annotate("text", x = 1, y = 0.8, label = "Fetal",
           color = colorBlindBlack8[8], size = as1 / .pt, fontface = "bold") +
  theme_cowplot(font_size = 8) +
  scale_fill_manual(values = alpha(colorBlindBlack8[c(8, 3)], 0.5),
                    guide = "none") +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0.05)) +
  xlab("Relative difference in effect size on birth weight\nwith or without adjusting for gestational duration") +
  ylab("Density") +
  axis_lines +
  # `linetype` was misspelled `linetpye`, so the reference line was drawn solid.
  geom_vline(xintercept = 0, linetype = "dashed", colour = "grey")

ggsave(snakemake@output[[3]], plot = p1, width = 70, height = 70,
       units = "mm", dpi = 300)

fwrite(d, snakemake@output[[2]], sep = "\t")
# Density of the relative change in effect size, using the `h2` betas for
# 'maternal_effect' variants and the `h3` betas for 'fetal_effect' variants.
# The axis text describes this as the birth-weight effect with vs. without
# adjustment for gestational duration.
#
# Input  [[1]]: table with `effect`, `beta_h2`, `beta_h2_GA`, `beta_h3`,
#               `beta_h3_GA`.
# Outputs (snakemake@output):
#   [[1]] mirrored density plot (maternal up, fetal down), 70 x 70 mm
#   [[2]] long table with `beta_dif` and `effect`
#   [[3]] overlaid density plot, 70 x 70 mm

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library("showtext")

options(warn = -1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = "arial_bold.ttf")
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Annotation font size in points (converted with .pt below).
as1 <- 8

d <- fread(snakemake@input[[1]])

# Orient each variant so the reference beta is positive; the *_GA beta is
# flipped first because the test uses the sign of the not-yet-flipped beta.
d$beta_h2_GA <- ifelse(d$beta_h2 < 0, -1 * d$beta_h2_GA, d$beta_h2_GA)
d$beta_h2 <- ifelse(d$beta_h2 < 0, -1 * d$beta_h2, d$beta_h2)

d$beta_h3_GA <- ifelse(d$beta_h3 < 0, -1 * d$beta_h3_GA, d$beta_h3_GA)
d$beta_h3 <- ifelse(d$beta_h3 < 0, -1 * d$beta_h3, d$beta_h3)

d$beta_dif_h2 <- with(d, (beta_h2_GA - beta_h2) / beta_h2)
d$beta_dif_h3 <- with(d, (beta_h3_GA - beta_h3) / beta_h3)

mor <- filter(d, effect == "maternal_effect") %>% pull(beta_dif_h2)
barn <- filter(d, effect == "fetal_effect") %>% pull(beta_dif_h3)

axis_lines <- theme(axis.line.x = element_line(size = 0.3),
                    axis.line.y = element_line(size = 0.3),
                    axis.ticks = element_line(size = 0.3))

# Mirrored densities: maternal above the axis, fetal below.
p1 <- ggplot() +
  geom_density(mapping = aes(x = mor, y = ..density..),
               fill = colorBlindBlack8[3], colour = colorBlindBlack8[3]) +
  annotate("text", x = 0.35, y = 0.6, label = "Maternal",
           color = colorBlindBlack8[3], size = as1 / .pt, fontface = "bold") +
  annotate("text", x = 0.35, y = -1, label = "Fetal",
           color = colorBlindBlack8[8], size = as1 / .pt, fontface = "bold") +
  geom_density(mapping = aes(x = barn, y = -..density..),
               fill = colorBlindBlack8[8], colour = colorBlindBlack8[8]) +
  theme_cowplot(font_size = 8) +
  scale_x_continuous(expand = c(0, 0)) +
  xlab("Relative difference in effect size on \nbirth weight with or without adjusting for gestational duration") +
  ylab("Density") +
  geom_hline(yintercept = 0, colour = "grey") +
  axis_lines

ggsave(snakemake@output[[1]], plot = p1, width = 70, height = 70,
       units = "mm", dpi = 300)

# Long format: one `beta_dif` column, taken from h2 for maternal-effect rows
# and from h3 for fetal-effect rows.
moms <- filter(d, effect == "maternal_effect") %>%
  transmute(beta_dif = beta_dif_h2, effect)
fets <- filter(d, effect == "fetal_effect") %>%
  transmute(beta_dif = beta_dif_h3, effect)
d <- rbind(moms, fets)

# Overlaid, semi-transparent densities.
p1 <- ggplot(d, aes(beta_dif, group = effect, fill = effect)) +
  geom_hline(yintercept = 0, colour = "black") +
  geom_density(color = NA) +
  annotate("text", x = -2, y = 0.4, label = "Maternal",
           color = colorBlindBlack8[3], size = as1 / .pt, fontface = "bold") +
  annotate("text", x = 1, y = 0.8, label = "Fetal",
           color = colorBlindBlack8[8], size = as1 / .pt, fontface = "bold") +
  theme_cowplot(font_size = 8) +
  scale_fill_manual(values = alpha(colorBlindBlack8[c(8, 3)], 0.5),
                    guide = "none") +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0.05)) +
  xlab("Relative difference in effect size on birth weight\nwith or without adjusting for gestational duration") +
  ylab("Density") +
  axis_lines +
  # `linetype` was misspelled `linetpye`, so the reference line was drawn solid.
  geom_vline(xintercept = 0, linetype = "dashed", colour = "grey")

ggsave(snakemake@output[[3]], plot = p1, width = 70, height = 70,
       units = "mm", dpi = 300)

fwrite(d, snakemake@output[[2]], sep = "\t")
# Mendelian randomization of haplotype-specific effects (maternal transmitted
# MT, maternal non-transmitted MNT, paternal transmitted PT) against the
# BETA/SE read from snakemake@input[[3]]. Writes one scatter plot per
# haplotype (outputs 1-3), the merged variant table (output 4) and the
# stacked MR estimates (output 5).
library(MendelianRandomization)
library(data.table)
library(dplyr)
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library('showtext')

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# ---- helpers ----

# Negate `value` wherever the exposure effect is negative.
flip_if_negative <- function(exposure, value) {
  ifelse(exposure < 0, value * -1, value)
}

# Run all MendelianRandomization methods on one haplotype and return the
# estimates table with fixed column names.
fit_mr <- function(df, beta_col, se_col) {
  mr_data <- mr_input(bx= df[[beta_col]], bxse= df[[se_col]], by= df$BETA, byse= df$SE)
  estimates <- mr_allmethods(mr_data)$Values
  names(estimates) <- c('method', 'estimate', 'se', 'lo95', 'up95', 'pvalue')
  estimates
}

# Layers shared by the three scatter plots: IVW line, MR-Egger line (first
# '(intercept)' row), zero axes and the grid-only theme.
mr_reference_layers <- function(fit) {
  ivw_slope <- filter(fit, method== 'IVW') %>% pull(estimate)
  egger_slope <- filter(fit, method== 'MR-Egger') %>% pull(estimate)
  egger_intercept <- (filter(fit, method== '(intercept)') %>% pull(estimate))[1]
  list(
    geom_abline(intercept= 0, slope= ivw_slope, colour= '#d9d9d9'),
    geom_abline(intercept= egger_intercept, slope= egger_slope,
                colour= '#d9d9d9', linetype= 'dashed'),
    geom_hline(yintercept= 0, size= 0.1),
    geom_vline(xintercept= 0, size= 0.1),
    theme(axis.line.x = element_blank(),
          axis.line.y = element_blank(),
          axis.ticks= element_blank(),
          panel.grid.major= element_line(colour= 'grey', size= 0.05))
  )
}

# ---- data ----

d <- fread(snakemake@input[[1]])

# Two chrX variants are identified by position in the input; give them rsIDs
# so they can be joined on RSID below.
d$rsid <- with(d, ifelse(rsid== 'chrX:116013571', 'rs5991030',
                  ifelse(rsid== 'chrX:132178061', 'rs5930554', rsid)))

d$effect_origin <- with(d, ifelse(class_name== 'MF OD' | class_name== 'MF SD', 'Maternal and fetal',
                           ifelse(class_name== 'Fetal MatT' | class_name== 'Fetal', 'Fetal', 'Maternal')))

#d= filter(d, MarkerName!= '6:32595083:G:T')
#top= fread(snakemake@input[[2]])
#ids= pull(top, ID)
#ids= c('3:156697097:A:G', '5:158058432:G:T', ids)

x <- fread(snakemake@input[[2]], select= c('ID', 'RSID'))
#x= filter(x, ID %in% ids)

d <- inner_join(d, x, by= c('rsid'= 'RSID'))
d <- separate(d, ID, into= c('CHR', 'POS', 'REF', 'EFF'), sep= ':')

# Where REF sorts after EFF, negate the haplotype effects and rebuild the ID
# with the two alleles in alphabetical order.
swap_alleles <- d$REF > d$EFF
d$beta_MT <- ifelse(swap_alleles, -1 * d$beta_MT, d$beta_MT)
d$beta_MNT <- ifelse(swap_alleles, -1 * d$beta_MNT, d$beta_MNT)
d$beta_PT <- ifelse(swap_alleles, -1 * d$beta_PT, d$beta_PT)
d$ID <- with(d, ifelse(REF > EFF, paste(CHR, POS, EFF, REF, sep= ':'),
                                  paste(CHR, POS, REF, EFF, sep= ':')))

# Axis label prefix, taken from the outcome file name.
outcome <- if (grepl('fetal', snakemake@input[[3]])) 'Fetal' else 'Maternal'

x <- fread(snakemake@input[[3]], select= c('ID', 'BETA', 'SE', 'pvalue'))
d <- inner_join(d, x, by= 'ID')

# ---- MR per haplotype ----
# In each table the outcome effect is flipped first, while the exposure column
# still carries its original sign.

df_MT <- select(d, beta_MT, se_MT, BETA, SE, effect_origin)
df_MT$BETA <- flip_if_negative(df_MT$beta_MT, df_MT$BETA)
df_MT$beta_MT <- flip_if_negative(df_MT$beta_MT, df_MT$beta_MT)
MT <- fit_mr(df_MT, 'beta_MT', 'se_MT')

df_MNT <- select(d, beta_MNT, se_MNT, BETA, SE, effect_origin)
df_MNT$BETA <- flip_if_negative(df_MNT$beta_MNT, df_MNT$BETA)
df_MNT$beta_MNT <- flip_if_negative(df_MNT$beta_MNT, df_MNT$beta_MNT)
MNT <- fit_mr(df_MNT, 'beta_MNT', 'se_MNT')

df_PT <- select(d, beta_PT, se_PT, BETA, SE, effect_origin)
print(nrow(df_PT))
df_PT$BETA <- flip_if_negative(df_PT$beta_PT, df_PT$BETA)
df_PT$beta_PT <- flip_if_negative(df_PT$beta_PT, df_PT$beta_PT)
PT <- fit_mr(df_PT, 'beta_PT', 'se_PT')

# ---- plots ----
# Error bars have no fill aesthetic, so only colour is mapped on them.

y_label <- paste(outcome, 'only effect\non birth weight, z-score')

p1 <- ggplot(df_MT, aes(beta_MT, BETA, colour= effect_origin, fill= effect_origin)) +
  geom_errorbarh(aes(xmin= beta_MT - se_MT, xmax= beta_MT + se_MT, colour= effect_origin),
                 size= 0.1, alpha= 0.7) +
  geom_errorbar(aes(ymin= BETA - SE, ymax= BETA + SE, colour= effect_origin),
                size= 0.1, alpha= 0.7) +
  geom_point(size= 2, shape= 21, stroke= 0.1, alpha= 0.7) +
  scale_colour_manual(values= colorBlindBlack8[c(4, 2, 1)], guide= 'none') +
  scale_fill_manual(values= colorBlindBlack8[c(4, 2, 1)], guide= 'none') +
  xlab('Effect of maternal transmitted\nalleles on gestational duration, days') +
  ylab(y_label) +
  theme_cowplot(font_size= 8) +
  mr_reference_layers(MT)

p2 <- ggplot(df_MNT, aes(beta_MNT, BETA, colour= effect_origin, fill= effect_origin)) +
  geom_errorbarh(aes(xmin= beta_MNT - se_MNT, xmax= beta_MNT + se_MNT, colour= effect_origin),
                 size= 0.1) +
  geom_errorbar(aes(ymin= BETA - SE, ymax= BETA + SE, colour= effect_origin),
                size= 0.1) +
  geom_point(size= 2, shape= 21, stroke= 0.1) +
  scale_colour_manual(values= alpha(colorBlindBlack8[c(4, 2, 1)], 0.7), guide= 'none') +
  scale_fill_manual(values= alpha(colorBlindBlack8[c(4, 2, 1)], 0.7), guide= 'none') +
  xlab('Effect of maternal non-transmitted alleles\non gestational duration, days') +
  ylab(y_label) +
  theme_cowplot(font_size= 8) +
  mr_reference_layers(MNT)

p3 <- ggplot(df_PT, aes(beta_PT, BETA, colour= effect_origin, fill= effect_origin)) +
  geom_errorbarh(aes(xmin= beta_PT - se_PT, xmax= beta_PT + se_PT, colour= effect_origin),
                 size= 0.1) +
  geom_errorbar(aes(ymin= BETA - SE, ymax= BETA + SE, colour= effect_origin),
                alpha= 0.5, size= 0.1) +
  geom_point(size= 2, shape= 21, stroke= 0.1) +
  scale_colour_manual(values= alpha(colorBlindBlack8[c(4, 2, 1)], 0.7), guide= 'none') +
  scale_fill_manual(values= alpha(colorBlindBlack8[c(4, 2, 1)], 0.7), guide= 'none') +
  xlab('Effect of paternal transmitted alleles\non gestational duration, days') +
  ylab(y_label) +
  theme_cowplot(font_size= 8) +
  mr_reference_layers(PT)

ggsave(snakemake@output[[1]], plot= p1, width= 70, height= 70, units= 'mm', dpi= 300)
ggsave(snakemake@output[[2]], plot= p2, width= 70, height= 70, units= 'mm', dpi= 300)
ggsave(snakemake@output[[3]], plot= p3, width= 70, height= 70, units= 'mm', dpi= 300)

# ---- tables ----

MT$haplotype <- 'MT'
MNT$haplotype <- 'MNT'
PT$haplotype <- 'PT'

df <- bind_rows(MT, MNT, PT)

fwrite(d, snakemake@output[[4]], sep= '\t')
fwrite(df, snakemake@output[[5]], sep= '\t')
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
figures/MR_GA_BW_haplotype.R
# Scatter plots of instrument effects on SHBG and testosterone (x, columns
# beta/se) against the BETA/SE joined from snakemake@input[[2]] (y), with the
# IVW and MR-Egger lines taken from the MR table in snakemake@input[[3]].
library(MendelianRandomization)
library(data.table)
library(dplyr)
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library('showtext')

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Build one exposure-vs-outcome scatter plot.
#   instruments: rows for a single trait (columns beta, se, BETA, SE)
#   mr_fit:      MR estimates for that trait (columns method, estimate)
#   x_label:     x-axis title
plot_mr_scatter <- function(instruments, mr_fit, x_label) {
  ivw_slope <- filter(mr_fit, method== 'IVW') %>% pull(estimate)
  egger_slope <- filter(mr_fit, method== 'MR-Egger') %>% pull(estimate)
  # Only the first '(intercept)' row is used.
  egger_intercept <- (filter(mr_fit, method== '(intercept)') %>% pull(estimate))[1]

  ggplot(instruments, aes(beta, BETA)) +
    geom_errorbarh(aes(xmin= beta - se, xmax= beta + se),
                   size= 0.1, alpha= 0.7, color= colorBlindBlack8[2]) +
    geom_errorbar(aes(ymin= BETA - SE, ymax= BETA + SE),
                  size= 0.1, alpha= 0.7, color= colorBlindBlack8[2]) +
    geom_point(size= 2, shape= 21, stroke= 0.1, alpha= 0.7, fill= colorBlindBlack8[2]) +
    xlab(x_label) +
    ylab('Effect on gestational duration, days') +
    theme_cowplot(font_size= 8) +
    geom_abline(intercept= 0, slope= ivw_slope, colour= '#d9d9d9') +
    geom_abline(intercept= egger_intercept, slope= egger_slope,
                colour= '#d9d9d9', linetype= 'dashed') +
    geom_hline(yintercept= 0, size= 0.1) +
    geom_vline(xintercept= 0, size= 0.1) +
    theme(axis.line.x = element_blank(),
          axis.line.y = element_blank(),
          axis.ticks= element_blank(),
          panel.grid.major= element_line(colour= 'grey', size= 0.05))
}

d <- fread(snakemake@input[[1]])
x <- fread(snakemake@input[[2]], select= c('ID', 'BETA', 'SE'))
mr <- fread(snakemake@input[[3]])

d <- inner_join(d, x, by= 'ID')
# NOTE(review): duplicates are dropped by ID across all traits, so a variant
# that is an instrument for both traits is kept only for the first — confirm
# this is intended.
d <- filter(d, !duplicated(ID))

# Orient each instrument so its exposure effect is positive. The flip flag is
# taken before either column changes, so both are flipped together.
flip <- d$beta < 0
d$BETA <- ifelse(flip, -1 * d$BETA, d$BETA)
d$beta <- ifelse(flip, -1 * d$beta, d$beta)

shbg <- filter(d, trait== 'SHBG_fem_cluster')
testo <- filter(d, trait== 'Testosterone_fem_cluster')

p1 <- plot_mr_scatter(shbg,
                      filter(mr, trait== 'SHBG_fem_cluster'),
                      'Effect on SHBG (women), nmol/L')

p2 <- plot_mr_scatter(testo,
                      filter(mr, trait== 'Testosterone_fem_cluster'),
                      'Effect on testosterone (women), nmol/L')

ggsave(snakemake@output[[1]], plot= p1, width= 70, height= 70, units= 'mm', dpi= 300)
ggsave(snakemake@output[[2]], plot= p2, width= 70, height= 70, units= 'mm', dpi= 300)
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
figures/MR_sex_hormones_GA.R
# Bar charts of the annotation categories whose Enrichment_p passes a
# Bonferroni-style threshold (0.05 / (number of rows - 1)): -log10(p) on the
# left, enrichment on the right. Also writes the filtered table.
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])

# Row count before filtering; it fixes the significance threshold used for
# both the filter and the dashed reference line.
nr <- nrow(d)
p_threshold <- 0.05 / (nr - 1)

d <- filter(d, Enrichment_p < p_threshold)

# Display names for the categories. A category missing from this table keeps
# its raw name; previously it became NA, so several such bars would have been
# stacked under a single NA label.
category_labels <- c(
  'H3K27ac_HniszL2_0' = 'H3K27ac',
  'SuperEnhancer_HniszL2_0' = 'SuperEnhancer',
  'Backgrd_Selection_StatL2_0' = 'Background selection',
  'CpG_Content_50kbL2_0' = 'CpG content',
  'BLUEPRINT_DNA_methylation_MaxCPPL2_0' = 'DNA Methylation'
)
d$description <- unname(category_labels[as.character(d$Category)])
unlabelled <- is.na(d$description)
d$description[unlabelled] <- as.character(d$Category)[unlabelled]

# Order the bars by p-value; coord_flip() puts the smallest p-value on top.
d <- arrange(d, desc(Enrichment_p))
d$description <- factor(d$description, levels= unique(d$description))

p1 <- ggplot(data= d, aes(x= description, y= -log10(Enrichment_p))) +
  geom_col(fill= colorBlindBlack8[2], alpha= 0.6) +
  theme_cowplot(font_size= 10) +
  ylab('Enrichment -log10(pvalue)') +
  theme(axis.title.y= element_blank()) +
  geom_hline(yintercept= -log10(p_threshold), linetype= 'dashed', colour= 'grey') +
  coord_flip()

p2 <- ggplot(data= d, aes(x= description, y= Enrichment)) +
  geom_col(fill= colorBlindBlack8[4], alpha= 0.6) +
  theme_cowplot(font_size= 10) +
  ylab('Enrichment (h2 / proportion of SNPs)') +
  theme(axis.title.y= element_blank(),
        axis.text.y= element_blank(),
        axis.ticks.y= element_blank()) +
  geom_hline(yintercept= 1, linetype= 'dashed', colour= 'grey') +
  coord_flip()

x <- plot_grid(p1, p2)

ggsave(snakemake@output[[1]], plot= x, height= 50, width= 140, units= 'mm', dpi= 300)

fwrite(d, snakemake@output[[2]], sep= '\t')
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
2
of
figures/partitioned_h2.R
# QQ plot of observed against expected -log10(p-value), one point per unique
# ID; only p-values below 0.05 are drawn.
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Read only the columns that are used. The script used to read the whole file
# first and then immediately discard it for this selective read.
d <- fread(snakemake@input[[1]], header= TRUE, select= c('ID', 'pvalue', 'EAF'))

d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)

# Sort by p-value, then keep the first (smallest p-value) row per ID.
d <- arrange(d, pvalue)
d <- d[!duplicated(d$ID), ]

# MAF tertiles are only used by the commented-out colouring below.
d <- mutate(d, maf_tertiles = ntile(MAF, 3))

#m1= round(max(d[d$maf_tertiles== 1, 'MAF']), 3)
#m2= round(max(d[d$maf_tertiles== 2, 'MAF']), 3)
#d$maf_tertiles= factor(d$maf_tertiles, levels=c("1", "2", "3"), labels=c(paste('MAF<', m1), paste(m1,'< MAF >', m2), paste('MAF>', m2)))

# d is already sorted by p-value, so the expected quantile of the i-th row is
# i / n.
df <- mutate(d, exp1= -log10(seq_along(pvalue) / length(pvalue)))

p1 <- ggplot(filter(df, pvalue < 0.05), aes(exp1, -log10(pvalue))) +
  geom_point(size= 0.4, color= colorBlindBlack8[2]) +
  #scale_color_manual(values= colorBlindBlack8[c(2,4,8)])+
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
  labs(colour= "") +
  theme_cowplot(font_size= 12) +
  xlab('Expected (-log10(p-value))') +
  ylab('Observed (-log10(p-value))') +
  theme(legend.position= 'bottom')
#guides(colour = guide_legend(override.aes = list(size=3)))

ggsave(snakemake@output[[1]], plot= p1, width= 120, height= 120, units= 'mm', dpi= 300)
# Colocalization overview for reproductive traits:
#   output[[1]]: locus-by-trait heatmap plus per-locus support counts
#   output[[2]]: the same heatmap plus per-trait support counts
#   output[[3]]: genetic-correlation strip stacked on top of the heatmap
# Input 1 holds the posterior probabilities (PP.H3.abf, PP.H4.abf) per locus
# and trait; input 2 holds the genetic correlations (p1, p2, rg, p).
library(data.table)
library(dplyr)
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library('showtext')

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# ---- helpers ----

# Display names for the trait codes.
trait_labels <- c(
  miscarriage = 'Miscarriage',
  GA_fetal = 'GA fetal effect',
  BW_maternal = 'Maternal',
  AFB = 'Age at first birth',
  AMenarche = 'Age at menarche',
  AMenopause = 'Age at menopause',
  NLB = 'Number of live births',
  Testosterone_fem = 'Testosterone (women)',
  SHBG_fem = 'SHBG (women)',
  SHBG_male = 'SHBG (men)',
  CBAT_fem = 'CBAT (women)',
  CBAT_male = 'CBAT (men)',
  Oestradiol_fem = 'Oestradiol (women)',
  POP = 'Pelvic Organ Prolapse',
  Testosterone_male = 'Testosterone (men)',
  leiomyoma_uterus = 'Leiomyoma uterus',
  BW_fetal = 'Fetal',
  BW_fetal_effect = 'Fetal only',
  Preeclampsia = 'Pre-eclampsia',
  BW_maternal_effect = 'Maternal only',
  PCOS = 'Polycystic ovary syndrome'
)

# Translate trait codes to display names. As in the nested ifelse() this
# replaces, any code not listed above becomes 'Endometriosis' and NA stays NA.
label_trait <- function(trait) {
  trait <- as.character(trait)
  out <- unname(trait_labels[trait])
  out[is.na(out) & !is.na(trait)] <- 'Endometriosis'
  out
}

pregnancy <- c('Miscarriage', 'Pre-eclampsia')
uterus <- c('Leiomyoma uterus', 'Pelvic Organ Prolapse', 'Endometriosis', 'Polycystic ovary syndrome')
fitness <- c('Age at first birth', 'Number of live births')
# Not referenced below: every trait outside the three vectors above falls
# into 'Sex-hormone related'.
hormonal <- c('Age at menarche', 'Age at menopause', 'Testosterone (women)', 'SHBG (women)', 'CBAT (women)', 'Oestradiol (women)')

# Assign each display name to its cluster.
assign_cluster <- function(trait) {
  ifelse(trait %in% pregnancy, 'Pregnancy',
  ifelse(trait %in% uterus, 'Reproductive organs',
  ifelse(trait %in% fitness, 'Fitness', 'Sex-hormone related')))
}

# Colour per cluster; `organs_colour` differs between the two sections below.
cluster_colour <- function(cluster, organs_colour) {
  ifelse(cluster== 'Pregnancy', colorBlindBlack8[3],
  ifelse(cluster== 'Reproductive organs', organs_colour,
  ifelse(cluster== 'Fitness', colorBlindBlack8[7], colorBlindBlack8[8])))
}

# Vertical separators between clusters. They rely on arrange(d, cluster)
# ordering the clusters Fitness, Pregnancy, Reproductive organs, Sex-hormone
# related, and on every trait of the first three clusters being present.
cluster_breaks <- cumsum(c(length(fitness), length(pregnancy), length(uterus))) + 0.5

# ---- colocalization ----

d <- fread(snakemake@input[[1]])

d <- filter(d, !grepl('BW', trait), !grepl('GA_fetal', trait), !grepl('male', trait))

d$trait <- label_trait(d$trait)
d$cluster <- assign_cluster(d$trait)
d$colour <- cluster_colour(d$cluster, colorBlindBlack8[5])

# The gene name is the second '_'-separated field of `locus`.
d$GENE <- vapply(strsplit(d$locus, '_'), function(parts) parts[2], character(1))
d$GENE <- with(d, ifelse(GENE== 'CDC42', 'CDC42/ WNT4',
                  ifelse(GENE== 'HIVEP3', 'HIVEP3/ EDN2',
                  ifelse(GENE== 'TET3', 'TET3/ DGUOK-AS1',
                  ifelse(GENE== 'TCEA2', 'TCEA2/ OPRL1', GENE)))))

d$sig <- ifelse(d$PP.H4.abf > 0.5, '*', '')

d <- arrange(d, cluster)
d$trait <- factor(d$trait, levels= unique(d$trait))
# Trait order, reused for the genetic-correlation strip below.
traits <- unique(d$trait)

# Signed value for the diverging colour scale: PP.H4 where H4 exceeds H3,
# otherwise -(PP.H3 + PP.H4). PP2 (the larger of the two) sets the point size.
h4_wins <- d$PP.H4.abf > d$PP.H3.abf
d$PP <- ifelse(h4_wins, d$PP.H4.abf, -d$PP.H3.abf - d$PP.H4.abf)
d$PP2 <- ifelse(h4_wins, d$PP.H4.abf, d$PP.H3.abf)

n_traits <- length(unique(d$trait))
n_genes <- length(unique(d$GENE))

p1 <- ggplot(d, aes(trait, GENE, value= PP, fill= PP, colour= PP, size= PP2, stroke= 1 - PP)) +
  theme_cowplot(font_size= 9) +
  geom_point(shape= 15) +
  scale_fill_gradient2(low= colorBlindBlack8[4], mid= 'white', high= colorBlindBlack8[2], guide= 'none') +
  scale_colour_gradient2(low= colorBlindBlack8[4], mid= 'white', high= colorBlindBlack8[2], guide= 'none') +
  scale_size_continuous(range= c(1, 2.5), guide= 'none') +
  scale_x_discrete(position= 'top') +
  theme(axis.ticks= element_blank(),
        axis.title= element_blank(),
        axis.text.x= element_blank()) +
  geom_vline(xintercept= seq_len(n_traits - 1) + 0.5, size= 0.4, colour= 'grey') +
  geom_hline(yintercept= seq_len(n_genes - 1) + 0.5, size= 0.4, colour= 'grey') +
  geom_vline(xintercept= cluster_breaks, size= 0.8) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour= 'black', fill= NA, size= 1),
        plot.margin = unit(c(0, 0.1, 0.1, 0), "cm"),
        axis.line= element_blank())

# Per-trait counts: 'Coloc' = loci with PP.H4 > 0.8; 'Locus-level' = the
# additional loci with PP.H3 + PP.H4 > 0.8.
t_count_locus <- group_by(d, trait) %>%
  summarize(PP= sum(as.numeric(PP.H4.abf > 0.8)),
            PP_locus= sum(as.numeric(PP.H4.abf + PP.H3.abf > 0.8)))
t_count_locus$PP <- t_count_locus$PP_locus - t_count_locus$PP
t_count_locus$supp <- 'Locus-level'

t_count <- group_by(d, trait) %>% summarize(PP= sum(as.numeric(PP.H4.abf > 0.8)))
t_count$supp <- 'Coloc'

t_count <- bind_rows(t_count, t_count_locus)
t_count$trait <- factor(t_count$trait, levels= unique(d$trait))
t_count$supp <- factor(t_count$supp, levels= c('Locus-level', 'Coloc'))

# Counts are plotted as negative values so the bars hang below the heatmap;
# the axis labels show them as positive.
p2 <- ggplot(t_count, aes(trait, -PP, fill= supp)) +
  theme_cowplot(font_size= 8) +
  geom_col(alpha= 0.7) +
  geom_hline(yintercept= 0) +
  scale_fill_manual(values= c(colorBlindBlack8[4], colorBlindBlack8[2]), guide= 'none') +
  theme(axis.line= element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour= 'black', fill= NA, size= 1),
        axis.text.x= element_blank(),
        axis.ticks.x= element_blank(),
        axis.title= element_blank(),
        plot.margin = unit(c(0, 0, 0, 0.1), "cm")) +
  scale_y_continuous(limits= c(-10, 0), expand= c(0, 0),
                     labels= seq(0, 10, 2), breaks= seq(0, -10, -2)) +
  geom_vline(xintercept= cluster_breaks, size= 0.8) +
  geom_hline(yintercept= c(-4, -8), size= 0.3, linetype= 'dashed', colour= 'grey')

# Per-locus counts, same definitions as above.
l_count_locus <- group_by(d, GENE) %>%
  summarize(PP= sum(as.numeric(PP.H4.abf > 0.8)),
            PP_locus= sum(as.numeric(PP.H4.abf + PP.H3.abf > 0.8)))
l_count_locus$PP <- l_count_locus$PP_locus - l_count_locus$PP
l_count_locus$supp <- 'Locus-level'

l_count <- group_by(d, GENE) %>% summarize(PP= sum(as.numeric(PP.H4.abf > 0.8)))
l_count$supp <- 'Coloc'

l_count <- bind_rows(l_count, l_count_locus)
l_count$trait <- factor(l_count$GENE, levels= unique(d$GENE))
l_count$supp <- factor(l_count$supp, levels= c('Locus-level', 'Coloc'))

print('done')

p3 <- ggplot(l_count, aes(PP, GENE, fill= supp)) +
  theme_cowplot(font_size= 8) +
  geom_col(alpha= 0.7) +
  geom_hline(yintercept= 0) +
  scale_fill_manual(values= c(colorBlindBlack8[4], colorBlindBlack8[2]), guide= 'none') +
  theme(axis.line= element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour= 'black', fill= NA, size= 1),
        axis.text.y= element_blank(),
        axis.ticks.y= element_blank(),
        axis.title= element_blank(),
        plot.margin = unit(c(0, 0.1, 0, 0), "cm")) +
  scale_x_continuous(limits= c(0, 10), expand= c(0, 0),
                     labels= seq(0, 10, 2), breaks= seq(0, 10, 2))

x1 <- plot_grid(p1, p3, nrow= 1, align= 'h', rel_widths= c(2, 0.5))
x2 <- plot_grid(p1, p2, nrow= 2, align= 'v', rel_heights= c(2, 0.3))

ggsave(snakemake@output[[1]], plot= x1, width= 127 - 1, height= 127 - 25 - 1, units= 'mm', dpi= 300)
ggsave(snakemake@output[[2]], plot= x2, width= 103 - 1, height= 127 - 25 - 1, units= 'mm', dpi= 300)

################## Genetic correlations

# Read once, then split. Inside filter() and aes() the names `p1` and `p2`
# refer to columns of this table, not to the plot objects above.
rg_all <- fread(snakemake@input[[2]])

d <- filter(rg_all, grepl('GAraw', p1), !grepl('BW', p2), !grepl('male', p2))
#d$p1= 'Gestational duration (maternal)'
d$p1 <- 'Maternal'

x <- filter(rg_all, grepl('GA_fetal', p1), !grepl('BW', p2), !grepl('male', p2))
#x$p1= 'Gestational duration (fetal)'
x$p1 <- 'Fetal'

d <- rbind(d, x)

# Reduce the p2 path to a trait code: text after 'LDSC/', suffix removed.
d$p2 <- gsub('.txt.sumstats.gz', '',
             vapply(strsplit(d$p2, 'LDSC/'), function(parts) parts[2], character(1)))

d$trait <- label_trait(d$p2)

d <- filter(d, trait != 'GA fetal effect')

d$cluster <- assign_cluster(d$trait)
d$colour <- cluster_colour(d$cluster, colorBlindBlack8[1])

d <- arrange(d, cluster)
d$trait <- factor(d$trait, levels= traits)

# Axis-label colours, in factor-level order.
colors <- filter(d, !duplicated(trait)) %>% arrange(trait) %>% pull(colour)

# '**' = p below 0.05 divided by half the row count (maternal and fetal rows
# are both still present here); '*' = p below 0.05.
d$sig <- ifelse(d$p < 0.05 / (nrow(d) / 2), '**', ifelse(d$p < 0.05, '*', ''))

d <- filter(d, p1== 'Maternal')
d$p1 <- 'Gestational duration'

rg_plot <- ggplot(d, aes(trait, p1, fill= rg)) +
  geom_tile(colour = "white", size= 1) +
  theme_cowplot(font_size= 9) +
  scale_fill_gradient2(low= colorBlindBlack8[2], high= colorBlindBlack8[4], mid= 'white', guide= 'none') +
  theme(axis.text.x = element_text(angle = 45, hjust = 0),
        axis.title.x = element_blank(),
        axis.title.y = element_blank()) +
  scale_x_discrete(position = "top") +
  geom_text(data= d, aes(trait, p1, label= sig), size= 6 / .pt) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.ticks= element_blank(),
        panel.border = element_rect(colour= 'black', fill= NA, size= 1),
        plot.margin = unit(c(0, 1, 0, 0), "cm"),
        axis.line= element_blank(),
        axis.text.x= element_text(angle= 45, hjust= 0, colour= colors))

x2 <- plot_grid(rg_plot, p1, nrow= 2, align= 'v', rel_heights= c(0.85, 2))

ggsave(snakemake@output[[3]], plot= x2, width= 113 - 2.5, height= 127 - 25 - 1, units= 'mm', dpi= 300)
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
figures/repr_pheno_coloc.R
# Forest plot of genetic correlations (rg with 95% CI) between the traits in
# column p2 and the two phenotypes in column p1, without a legend
# (output[[1]]) and with one (output[[3]]). The plotted table goes to
# output[[2]].
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(tidyverse)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# ---- helpers ----

# Reduce a sumstats path to its trait code: keep the text after `marker`,
# then remove the file suffix.
strip_sumstats_path <- function(path, marker) {
  after_marker <- vapply(strsplit(path, marker), function(parts) parts[2], character(1))
  gsub('.txt.sumstats.gz', '', after_marker)
}

# Display names for the trait codes.
trait_labels <- c(
  GAraw = 'Maternal gestational duration',
  miscarriage = 'Miscarriage',
  GA_fetal = 'GA fetal effect',
  BW_maternal = 'Maternal BW',
  AFB = 'Age at first birth',
  AMenarche = 'Age at menarche',
  AMenopause = 'Age at menopause',
  NLB = 'Number of live births',
  Testosterone_fem = 'Testosterone (women)',
  SHBG_fem = 'SHBG (women)',
  SHBG_male = 'SHBG (men)',
  CBAT_fem = 'CBAT (women)',
  CBAT_male = 'CBAT (men)',
  Oestradiol_fem = 'Estradiol (women)',
  POP = 'Pelvic organ prolapse',
  Testosterone_male = 'Testosterone (men)',
  leiomyoma_uterus = 'Leiomyoma uterus',
  BW_fetal = 'Fetal',
  BW_fetal_effect = 'Fetal only',
  Preeclampsia = 'Pre-eclampsia',
  BW_maternal_effect = 'Maternal only',
  PCOS = 'Polycystic ovary syndrome'
)

# Translate trait codes to display names. As in the nested ifelse() this
# replaces, any code not listed above becomes 'Endometriosis' and NA stays NA.
label_trait <- function(trait) {
  trait <- as.character(trait)
  out <- unname(trait_labels[trait])
  out[is.na(out) & !is.na(trait)] <- 'Endometriosis'
  out
}

# ---- data ----

x <- fread(snakemake@input[[1]])
x$p1 <- strip_sumstats_path(x$p1, 'LDscore/')
x$p2 <- strip_sumstats_path(x$p2, 'LDSC/')

x1 <- fread(snakemake@input[[2]])
x1$p1 <- strip_sumstats_path(x1$p1, 'LDscore/')
x1$p2 <- strip_sumstats_path(x1$p2, 'LDSC/')

# The correlations from the second input are sign-flipped before plotting.
x1$rg <- -1 * x1$rg

d <- rbind(x, x1)

#traits= filter(d, p< 0.05/ 14, !grepl('BW', p2), !grepl('GA', p2)) %>% pull(p2)

d$trait <- label_trait(d$p2)

d <- filter(d, !grepl('BW', p2), !grepl('GA', p2), !grepl('_male', p2))

# Order the traits by p-value; levels are reversed so the smallest p-value is
# drawn at the top of the y axis.
traits <- unique(arrange(d, p) %>% pull(trait))
d$trait <- factor(d$trait, levels= rev(traits))

# ---- plots ----

# Everything except the colour scale is shared by the two figures. `p1` inside
# aes() is the phenotype column of d.
forest_base <- ggplot(d, aes(rg, trait, colour= p1)) +
  geom_pointrange(aes(xmax= rg + 1.96 * se, xmin= rg - 1.96 * se),
                  position = position_dodge(width = 0.3), fatten= 1) +
  theme_cowplot(font_size= 9) +
  scale_x_continuous(limits= c(-1, 1), breaks= seq(-1, 1, 0.5)) +
  xlab('Genetic correlation') +
  geom_vline(xintercept= 0, size= 0.3) +
  geom_vline(xintercept= c(seq(-1, 1, 0.25)), colour= 'grey',
             linetype= 'dashed', alpha= 0.5, size= 0.2) +
  theme(axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks= element_line(size= 0.3),
        axis.title.y= element_blank())

# Without a legend ('none' replaces the deprecated guide= FALSE).
forest_plain <- forest_base +
  scale_colour_manual(values= colorBlindBlack8[c(8, 3)], guide= 'none')

ggsave(snakemake@output[[1]], plot= forest_plain, width= 88, height= 120, units= 'mm', dpi= 300)

fwrite(d, snakemake@output[[2]], sep= '\t')

# With a legend titled 'Trait'.
forest_legend <- forest_base +
  scale_colour_manual(values= colorBlindBlack8[c(8, 3)], name= 'Trait')

ggsave(snakemake@output[[3]], plot= forest_legend, width= 88, height= 120, units= 'mm', dpi= 300)
# Radar chart of the LCV genetic causality proportion (gcp.pm) of reproductive
# traits on gestational duration and preterm delivery, plus a supplementary
# table with every LCV estimate.
#
# Inputs (snakemake@input):
#   [[1]], [[2]]  genetic-correlation tables with columns p1, p2 and p
#   [[3]], [[4]]  LCV result tables (columns used here: repr_pheno, gcp.pm,
#                 pval.gcpzero.2tailed); [[3]] is tagged 'Gestational duration'
#                 and [[4]] 'Preterm delivery' below
# Outputs (snakemake@output):
#   [[1]]  radar chart (PDF, 88 x 88 mm)
#   [[2]]  tab-separated supplementary table
library(scales)
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(tidyverse)
library(fmsb)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Display names for the trait identifiers used across the workflow.
trait_labels <- c(
  GAraw = 'Maternal gestational duration',
  miscarriage = 'Miscarriage',
  GA_fetal = 'GA fetal effect',
  BW_maternal = 'Maternal BW',
  AFB = 'Age at first birth',
  AMenarche = 'Age at menarche',
  AMenopause = 'Age at menopause',
  NLB = 'Number of live births',
  Testosterone_fem = 'Testosterone (women)',
  SHBG_fem = 'SHBG (women)',
  SHBG_male = 'SHBG (men)',
  CBAT_fem = 'CBAT (women)',
  CBAT_male = 'CBAT (men)',
  Oestradiol_fem = 'Oestradiol (women)',
  POP = 'Pelvic Organ Prolapse',
  Testosterone_male = 'Testosterone (men)',
  leiomyoma_uterus = 'Leiomyoma uterus',
  BW_fetal = 'Fetal',
  BW_fetal_effect = 'Fetal only',
  Preeclampsia = 'Pre-eclampsia',
  BW_maternal_effect = 'Maternal only',
  PCOS = 'Polycystic ovary syndrome'
)

# Translate trait identifiers into display names. Any identifier that is not
# listed above is labelled 'Endometriosis' (the fall-through value of the
# original nested ifelse); NA stays NA.
label_trait <- function(trait) {
  trait <- as.character(trait)
  labelled <- unname(trait_labels[trait])
  labelled[is.na(labelled) & !is.na(trait)] <- 'Endometriosis'
  labelled
}

# Keep what follows `marker` in each path and remove the sumstats suffix.
trait_from_path <- function(paths, marker) {
  pieces <- strsplit(as.character(paths), marker)
  after_marker <- vapply(pieces, function(parts) parts[2], character(1))
  gsub('.txt.sumstats.gz', '', after_marker)
}

# Read one genetic-correlation table and reduce p1/p2 to bare trait names.
read_rg <- function(path) {
  rg <- fread(path)
  rg$p1 <- trait_from_path(rg$p1, 'LDscore/')
  rg$p2 <- trait_from_path(rg$p2, 'LDSC/')
  rg
}

rg <- rbind(read_rg(snakemake@input[[1]]), read_rg(snakemake@input[[2]]))

# NOTE(review): 14 is presumably the number of genetic correlations tested
# (Bonferroni denominator) - confirm against the rule that produces the input.
n_rg_tests <- 14
traits <- unique(
  filter(rg, p < 0.05 / n_rg_tests, !grepl('BW', p2), !grepl('GA', p2)) %>%
    pull(p2)
)

# Set gcp.pm to 0 when the two-tailed test of gcp = 0 does not pass the
# Bonferroni threshold, then keep only the genetically correlated traits.
keep_significant <- function(lcv) {
  lcv$gcp.pm <- ifelse(lcv$pval.gcpzero.2tailed < 0.05 / length(traits),
                       lcv$gcp.pm, 0)
  filter(lcv, repr_pheno %in% traits)
}

ga <- fread(snakemake@input[[3]])
table_supp <- ga
table_supp$pheno <- 'Gestational duration'
ga <- arrange(keep_significant(ga), desc(gcp.pm))

ptd <- fread(snakemake@input[[4]])
table_supp2 <- ptd
table_supp2$pheno <- 'Preterm delivery'
table_supp <- rbind(table_supp, table_supp2)
ptd <- keep_significant(ptd)

# After the join, *.x columns come from input 3 and *.y columns from input 4.
lcv <- inner_join(ga, ptd, by = 'repr_pheno')
lcv$trait <- label_trait(lcv$repr_pheno)
lcv$repr_pheno <- lcv$trait

radar <- as.data.frame(matrix(lcv$gcp.pm.x, ncol = nrow(lcv)))
radar <- rbind(radar, as.data.frame(matrix(lcv$gcp.pm.y, ncol = nrow(lcv))))
names(radar) <- lcv$repr_pheno
# Row 1 holds gcp.pm.x (gestational duration), row 2 gcp.pm.y (preterm
# delivery); the previous labels were assigned the other way round.
rownames(radar) <- c('Gestational duration', 'Preterm delivery')
# fmsb::radarchart expects the axis maximum and minimum as the first two rows.
radar <- rbind(rep(1, nrow(lcv)), rep(0, nrow(lcv)), radar)

# Axis labels with hand-placed line breaks, looked up by trait name so that
# they always follow the column order of the data; traits without a wrapped
# version fall back to their plain display name.
wrapped_labels <- c(
  'Testosterone (women)' = 'Testosterone\n(women)',
  'Age at first birth' = 'Age at\nfirst birth',
  'Age at menopause' = 'Age at\nmenopause',
  'Number of live births' = 'Number of\nlive births',
  'SHBG (women)' = 'SHBG\n(women)',
  'CBAT (women)' = 'CBAT\n(women)'
)
axis_labels <- unname(ifelse(lcv$repr_pheno %in% names(wrapped_labels),
                             wrapped_labels[lcv$repr_pheno],
                             lcv$repr_pheno))

inches <- 25.4
pdf(snakemake@output[[1]], width = 88 / inches, height = 88 / inches)
par(mar = c(0, 0, 0, 0))

radarchart(abs(radar), axistype = 0,
  # custom polygon
  pcol = c(colorBlindBlack8[3], colorBlindBlack8[8]),
  pfcol = c(alpha(colorBlindBlack8[3], 0.4), alpha(colorBlindBlack8[8], 0.4)),
  plwd = 1, pty = 16, plty = 1, vlcex = 0.8,
  vlabels = axis_labels,
  # custom the grid
  cglcol = "grey", cglty = 1, axislabcol = "#525252",
  caxislabels = seq(0, 1, 0.25), cglwd = 0.8, calcex = 0.4
)

dev.off()

# Supplementary table: all LCV estimates for both phenotypes, with display
# names added in the `trait` column.
table_supp$trait <- label_trait(table_supp$repr_pheno)

fwrite(table_supp, snakemake@output[[2]], sep = '\t')
# Tissue-enrichment scatter plots: -log10(Mann-Whitney p-value) against the
# ratio i_listmedian / base_list_median, one point per tissue, coloured by
# organ group. Two versions are saved: one without a legend
# (snakemake@output[[1]]) and one with it (snakemake@output[[2]]).
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggplot2)  # previously attached only as a dependency of ggrepel
library(ggrepel)
library("data.table")
library('showtext')

# Kept from the original script: silences all warnings for the whole run.
options(warn=-1)

d <- fread(snakemake@input[[1]], header = TRUE)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

female_repr <- c('breast', "cervix, uterine", 'endometrium', 'ovary',
                 'placenta', 'vagina', 'fallopian tube')
male_repr <- c('ductus deferens', 'testis', 'seminal vesicle', 'prostate',
               'epididymis')
muscle <- c('smooth muscle', 'heart muscle', 'skeletal muscle')

d$organ <- with(d,
  ifelse(tissue %in% female_repr, 'Female reproductive',
  ifelse(tissue %in% male_repr, 'Male reproductive',
  ifelse(tissue %in% muscle, 'Muscle', 'Others'))))

# Build the enrichment plot.
#   font_size    base font size passed to theme_cowplot()
#   show_legend  TRUE draws the colour legend, FALSE hides it
# Tissues with MannW_pvalue < 0.05 are labelled; the text layer never
# contributes a legend key. Dashed lines mark p = 0.05 and the Bonferroni
# threshold 0.05 / number of tissues.
enrichment_plot <- function(font_size, show_legend) {
  ggplot(d, aes(-log10(MannW_pvalue), I(i_listmedian/ base_list_median),
                colour = organ)) +
    geom_point(size = 1.5) +
    theme_cowplot(font_size = font_size) +
    scale_colour_manual('Legend',
                        values = c(colorBlindBlack8[c(3, 2, 8)], 'grey'),
                        guide = if (show_legend) 'legend' else 'none') +
    # `show.legend` replaces the deprecated `show_guide` argument.
    geom_text_repel(data = filter(d, MannW_pvalue < 0.05),
                    aes(label = tissue), fontface = 'bold',
                    show.legend = FALSE) +
    geom_vline(xintercept = -log10(0.05), colour = colorBlindBlack8[8],
               linetype = 'dashed', size = 0.2, alpha = 0.6) +
    geom_vline(xintercept = -log10(0.05/nrow(d)), colour = colorBlindBlack8[8],
               linetype = 'dashed', size = 0.2, alpha = 0.6) +
    ylab('Enrichment') +
    xlab('-log10(pvalue)')
}

p1 <- enrichment_plot(font_size = 8, show_legend = FALSE)
ggsave(snakemake@output[[1]], plot = p1, width = 120, height = 90,
       units = 'mm', dpi = 300)

p1 <- enrichment_plot(font_size = 10, show_legend = TRUE)
ggsave(snakemake@output[[2]], plot = p1, width = 120, height = 90,
       units = 'mm', dpi = 300)
R
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
figures/RNA_enrichment.R
11 12 | script: 'manhattan_plot.R' |
25 26 | script: 'lm_effect_origin.R' |
37 38 | script: 'effect_origin_dendrogram.R' |
48 49 | script: 'effect_origin_ternary.R' |
64 65 | script: 'gene_based_vs_coloc_iPSC.R' |
74 75 | script: 'BW_coloc_spider.R' |
87 88 | script: 'KCNAB1_pheWAS.R' |
101 102 | script: 'ADCY5_pheWAS.R' |
118 119 | script: 'ADCY5_FST_AFR_EUR.R' |
129 130 | script: 'BW_genetic_correlations.R' |
143 144 | script: 'repr_pheno_correlations.R' |
153 154 | script: 'partitioned_h2.R' |
164 165 | script: 'MacArthurlab_enrichment.R' |
180 181 | script: 'ADCY5_effect_direction.R' |
193 194 | script: 'BW_conditioning.R' |
207 208 | script: 'BW_conditioning_top.R' |
218 219 | script: 'mediation_BW_GA_individual_level_data.R' |
230 231 | script: 'mediation_BW_GA_individual_level_data_decode.R' |
245 246 | script: 'MR_GA_BW_haplotype.R' |
257 258 | script: 'repr_pheno_coloc.R' |
270 271 | script: 'repr_pheno_LCV.R' |
282 283 | script: 'repr_pheno_correlations.R' |
292 293 | script: 'RNA_enrichment.R' |
305 306 | script: 'QQ_plot.R' |
316 317 | script: 'h2_allphenos.R' |
326 327 | script: 'h2_cohorts.R' |
335 336 | script: 'genet_correlations_meta.R' |
346 347 | script: 'manhattan_plot_postTerm.R' |
357 358 | script: 'manhattan_plot_postTerm.R' |
368 369 | script: 'forest_plot_EEFSEC.R' |
380 381 | script: 'MR_sex_hormones_GA.R' |
391 392 | script: 'cell_type_enrichment.R' |
402 403 | script: 'labor_deg.R' |
413 414 | script: 'coloc_sex_hormones.R' |
423 424 | script: 'evo.R' |
432 433 | script: 'GA_BW_PGS_correlations.R' |
445 446 | script: 'GAraw_vs_allPTD.R' |
11 12 13 14 15 | run: d= pd.read_csv(input[0], sep= '\t', header= None, names= ['CHR', 'start', 'end', 'geneSymbol', 'Ensembl_gene']) d= d.loc[~d.geneSymbol.str.contains(' '), :] d= d[['CHR', 'start', 'end', 'geneSymbol']] d.to_csv(output[0], sep= '\t', header= True, index= False) |
24 25 26 27 28 29 | run: d= pd.read_csv(input[0], sep='\t', header= 0) pop= ['CEU', 'TSI', 'FIN', 'GBR', 'IBS'] d= d.loc[d.Population.isin(pop)] d['IID']= d['Individual ID'] d.to_csv(output[0], sep= '\t', header= False, index= False, columns= ['IID']) |
37 38 39 | run: vcfs= [x for x in input if '1000g' in x] shell('/home/pol/software/bcftools-1.9/bcftools concat {input} -o {output[0]} -Oz') |
52 53 | shell: '/home/pol/software/plink2 --vcf {input[0]} --max-alleles 2 --keep {input[1]} --make-bed --out {params[0]}' |
65 66 67 68 69 70 71 72 73 74 75 | run: d= pd.read_csv(input[0], sep= '\t', header= None, names= ['CHR', 'RSID', 'cm', 'POS', 'A1', 'A2']) d['REF']= np.where(d.A1.str.len() > d.A2.str.len(), 'I', d.A1) d['EFF']= np.where(d.A2.str.len() > d.A1.str.len(), 'I', d.A2) d['REF']= np.where(d.EFF== 'I', 'D', d.REF) d['EFF']= np.where(d.REF== 'I', 'D', d.EFF) d['RSID']= np.where(d.REF > d.EFF, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.EFF + ':' + d.REF, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.REF + ':' + d.EFF) d= d[['CHR', 'RSID', 'cm', 'POS', 'A1', 'A2']] d.to_csv(output[0], sep= '\t', header= False, index= False) shell('mv {input[1]} {output[1]}') shell('mv {input[2]} {output[2]}') |
83 84 85 86 87 88 89 90 | run: d= pd.read_csv(input[0], header= 0, sep= '\t', compression= 'gzip', usecols= ['ID', 'pvalue']) d.dropna(subset= ['ID'], inplace= True) d= d.loc[d.ID != '-', :] d= d[['ID', 'pvalue']] d.columns= ['SNP', 'p'] d['SNP']= d.SNP.str.replace('^23:', 'X:') d.to_csv(output[0], sep= '\t', header= True, index= None, columns= ['SNP', 'p']) |
105 106 | shell: '/home/pol/software/gcta_1.93.2beta/gcta64 --bfile {params[0]} --maf 0.01 --fastBAT {input[1]} --fastBAT-gene-list {input[2]} --out {params[1]} --thread-num {threads}' |
12 13 14 15 | run: d=pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'RSID', 'pvalue'])[['RSID', 'CHR', 'POS', 'pvalue']] d.columns= ['SNP', 'CHR', 'POS', 'P'] d.to_csv(output[0], header= True, index= False, sep= '\t') |
23 24 25 26 | run: d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['CHR', 'POS']) d.sort_values(['CHR', 'POS'], inplace= True) d['pos2']= d.POS |
38 39 40 41 | run: d= pd.read_csv(input[0], sep= '\t', header= 0) d= d.loc[d.Relationship== 'unrel', :] pop= ['CEU', 'TSI', 'FIN', 'GBR', 'IBS'] |
54 55 56 | run: vcfs= [infile for infile in input if 'vcf' in infile] shell('/home/pol/software/bcftools-1.9/bcftools concat -a -O v -R {input[0]} {vcfs} -o {output[0]}') |
68 69 | shell: '/home/pol/software/plink --vcf {input[0]} --keep {input[1]} --make-bed -out {params[0]}' |
77 78 79 80 81 | run: d= pd.read_csv(input[0], sep= '\t', header= None, names= ['chr', 'snp', 'x1', 'pos', 'a1', 'a2']) d= d[d.duplicated(['snp'], keep=False)] d.drop_duplicates(subset= ['snp'], keep= 'first') d.to_csv(output[0], sep= '\t', header= False, index= False) |
94 95 | shell: '~/software/plink --bfile {params[0]} --clump {input[0]} --exclude {input[1]} --clump-r2 0.05 --clump-kb 1000 --clump-p1 5e-8 --clump-p2 1e-5 --out {params[1]}' |
# Colocalization (coloc.abf) between GWAS summary statistics
# (snakemake@input[[1]]) and a second association table
# (snakemake@input[[2]], must contain ID, gene, beta and se), run separately
# for every gene.
#
# Outputs:
#   snakemake@output[[1]]  one row of posterior probabilities per gene
#   snakemake@output[[2]]  per-SNP coloc results for every gene
library(data.table)
library(dplyr)
library(coloc)
library(parallel)

df <- fread(snakemake@input[[1]],
            select = c('ID', 'BETA', 'SE', 'TOTALSAMPLESIZE', 'EAF'))
df$MAF <- ifelse(df$EAF > 0.5, 1 - df$EAF, df$EAF)

z <- fread(snakemake@input[[2]])
z$n <- 716  # sample size assigned to every row of the second data set

df <- inner_join(df, z, by = 'ID')
rm(z)

pph_outfile <- snakemake@output[[1]]
results_outfile <- snakemake@output[[2]]

# Header lines.
# NOTE(review): the results header says 'V.df' where coloc names the column
# 'V.df1', and the sQTL headers declare a trailing 'gene' column that the rows
# written below do not contain - left unchanged because downstream readers may
# rely on the current header; confirm.
if (!grepl('sQTL', snakemake@output[[1]])) {
  cat('nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tprotein\n', file = snakemake@output[[1]])
  cat('snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tprotein\n', file= snakemake@output[[2]])
} else {
  cat('nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tprotein\tgene\n', file = snakemake@output[[1]])
  cat('snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tprotein\tgene\n', file= snakemake@output[[2]])
}

prior1 <- 1 * 10**-4
prior2 <- 1 * 10**-4
prior12 <- 5 * 10**-6

# The GWAS trait type depends only on the input file name, so decide it once:
# 'allPTD' and 'postTerm' are case-control with the case proportions below,
# anything else is treated as quantitative.
gwas_path <- snakemake@input[[1]]
if (grepl('allPTD', gwas_path)) {
  gwas_type <- 'cc'
  gwas_case_fraction <- 0.067
} else if (grepl('postTerm', gwas_path)) {
  gwas_type <- 'cc'
  gwas_case_fraction <- 0.122
} else {
  gwas_type <- 'quant'
  gwas_case_fraction <- NULL
}

df <- data.frame(df)

# All-zero rows reported for a gene when no colocalization could be computed.
placeholder_result <- function(protein) {
  list(
    pph = data.frame(nsnps= 0, PP.H0.abf= 0, PP.H1.abf= 0, PP.H2.abf= 0,
                     PP.H3.abf= 0, PP.H4.abf= 0, protein= protein),
    res = data.frame(snp= 'none', V.df1= 0, z.df1= 0, r.df1= 0, lABF.df1= 0,
                     V.df2= 0, z.df2= 0, r.df2= 0, lABF.df2= 0,
                     internal.sum.lABF= 0, SNP.PP.H4= 0, protein= protein)
  )
}

# Run coloc.abf for the rows of one gene.
#   temp_df  rows of `df` belonging to a single gene
# Returns list(pph = <one-row summary>, res = <per-SNP results>). Nothing is
# written here: several forked workers appending to the same files at once
# could interleave rows, so the parent process writes after all workers finish.
colocalization_eqtl <- function(temp_df) {
  protein <- unique(temp_df$gene)
  if (nrow(temp_df) == 0) {
    print('next')
    return(placeholder_result(protein))
  }

  # Rows with a zero standard error cannot be used by coloc.
  temp_df <- filter(temp_df, SE > 0, se > 0)

  data1 <- list(beta = temp_df$BETA, varbeta = temp_df$SE**2,
                N = temp_df$TOTALSAMPLESIZE, type = gwas_type,
                snp = temp_df$ID)
  if (!is.null(gwas_case_fraction)) {
    data1$s <- gwas_case_fraction
  }
  data2 <- list(beta = temp_df$beta, varbeta = temp_df$se**2, N = temp_df$n,
                type = 'quant', snp = temp_df$ID)

  # Any coloc failure (for instance no SNP left after filtering) is reported
  # as a placeholder row rather than aborting the whole run.
  myres <- tryCatch(
    suppressWarnings(coloc.abf(data1, data2, MAF = temp_df$MAF,
                               p1 = prior1, p2 = prior2, p12 = prior12)),
    error = function(e) NULL
  )
  if (is.null(myres)) {
    print('next')
    return(placeholder_result(protein))
  }

  pph <- data.frame(t(myres[[1]]))
  pph$protein <- protein
  res <- myres[[2]]
  res$protein <- protein
  list(pph = pph, res = res)
}

per_gene <- mclapply(split(df, df$gene), colocalization_eqtl, mc.cores = 3)

# Append the results gene by gene from the parent process only.
for (i in seq_along(per_gene)) {
  item <- per_gene[[i]]
  if (is.null(item) || inherits(item, 'try-error')) {
    warning('colocalization failed for gene ', names(per_gene)[i],
            call. = FALSE)
    next
  }
  fwrite(item$pph, pph_outfile, sep = '\t', row.names = FALSE,
         col.names = FALSE, quote = FALSE, append = TRUE)
  fwrite(item$res, results_outfile, sep = '\t', row.names = FALSE,
         col.names = FALSE, quote = FALSE, append = TRUE)
}
15 16 | script: 'coloc_iPSC.R' |
# Latent causal variable (LCV) analysis between two traits.
#
# snakemake@input[[1]], [[2]]  per-SNP tables with columns SNP, Z and N
# snakemake@input[[3]]         LD-score table with columns SNP and L2
# snakemake@params[[1]]        R file that defines RunLCV()
# snakemake@params[[2]]        directory to switch into before RunLCV() runs
# snakemake@output[[1]]        one-row, tab-separated table of LCV estimates
library(dplyr)
library(data.table)

trait_x <- filter(fread(snakemake@input[[1]]), !is.na(Z))
trait_y <- filter(fread(snakemake@input[[2]]), !is.na(Z))
ld <- fread(snakemake@input[[3]])

# After the join, *.x columns come from input 1 and *.y columns from input 2.
d <- inner_join(trait_x, trait_y, by = 'SNP')
d <- inner_join(d, ld, by = 'SNP')

source(snakemake@params[[1]])
# NOTE(review): the working directory is changed for the remainder of the
# script, presumably because the sourced code reads files by relative path -
# confirm.
setwd(snakemake@params[[2]])

LCV <- RunLCV(d$L2, d$Z.y, d$Z.x, ldsc.intercept = 1,
              n.1 = (d$N.y), n.2 = (d$N.x))

z <- data.frame(
  zscore = LCV$zscore,
  pval.gcpzero.2tailed = LCV$pval.gcpzero.2tailed,
  gcp.pm = LCV$gcp.pm,
  gcp.pse = LCV$gcp.pse,
  rho.est = LCV$rho.est,
  rho.err = LCV$rho.err,
  pval.fullycausal1 = LCV$pval.fullycausal[1],
  pval.fullycausal2 = LCV$pval.fullycausal[2],
  h2.zscore1 = LCV$h2.zscore[1],
  h2.zscore2 = LCV$h2.zscore[2],
  pheno = snakemake@wildcards[['pheno']],
  repr_pheno = snakemake@wildcards[['repr_pheno']]
)

# fwrite() is called without append, so it creates the file and writes the
# header from the column names itself; the separate cat() of the same header
# that used to precede it was overwritten here and has been removed.
fwrite(z, snakemake@output[[1]], sep = '\t')
13 14 | script: 'LCV.R' |
22 23 24 25 26 | shell: ''' head -1 {input[0]} > {output[0]} tail -n +2 -q {input} >> {output[0]} ''' |
import pandas as pd
import numpy as np
from scipy.special import chdtri
import gzip
import csv

# Column order of the harmonised output file.
OUTPUT_COLUMNS = ['ID', 'rsid', 'CHR', 'POS', 'EAF', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']

# Sample size per Ruth_* phenotype. The previous chain of np.where() calls
# contained the typo 'Ruth_SHBG_make' and tested 'Ruth_SHBG_male' twice, so
# male SHBG received the value now listed for male testosterone, and male
# testosterone and male CBAT fell through to the default.
# NOTE(review): the male assignments follow the female/male pairing implied by
# the order of the original chain - confirm against the source publication.
RUTH_SAMPLE_SIZE = {
    'Ruth_SHBG_female': 189473,
    'Ruth_SHBG_male': 180726,
    'Ruth_Testosterone_female': 230454,
    'Ruth_Testosterone_male': 194453,
    'Ruth_CBAT_female': 188507,
    'Ruth_CBAT_male': 178782,
}
# Used for every other phenotype routed to Ruth() (i.e. 'Ruth_oestradiol').
RUTH_DEFAULT_SAMPLE_SIZE = 206927


def _skip_row():
    'Sentinel row: format_list() drops any row whose CHR field equals 0.'
    return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


def _x_to_23(chrom):
    "Recode chromosome 'X' as 23; every other value is returned unchanged."
    return 23 if chrom == 'X' else chrom


def not_number(s):
    'Return True when s is None or cannot be converted with float().'
    if s is None:
        return True
    try:
        float(s)
        return False
    except ValueError:
        return True


def select_format(repr_pheno, row):
    '''Parse one input row with the parser that belongs to repr_pheno.

    Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]; a row that
    the parser rejects comes back as ten zeros.
    Raises ValueError for a phenotype without a parser (this used to surface
    as an UnboundLocalError).
    '''
    if repr_pheno in RUTH_PHENOTYPES:
        return Ruth(row, repr_pheno)
    try:
        parser = PARSERS[repr_pheno]
    except KeyError:
        raise ValueError('No formatting function for phenotype: ' + str(repr_pheno))
    return parser(row)


def AMenopause(row):
    'REPROGEN Age at menopause.'
    EAF = float(row['EAF'])
    CHR = int(_x_to_23(row['CHR']))
    POS = int(row['POS'])
    REF = row['Other_Allele'].upper()
    EFF = row['Effect_Allele'].upper()
    BETA = float(row['Effect'])
    pvalue = float(row['Pval'])
    SE = float(row['SE'])
    N = int(row['N'])
    rsid = ''
    return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]


def Ruth(row, repr_pheno):
    'Parse a row of the Ruth_* sex-hormone summary statistics; N is looked up by phenotype.'
    EAF = float(row['effect_allele_frequency'])
    CHR = int(_x_to_23(row['chromosome']))
    POS = int(row['base_pair_location'])
    REF = row['other_allele']
    EFF = row['effect_allele']
    BETA = float(row['beta'])
    pvalue = float(row['p_value'])
    SE = float(row['standard_error'])
    N = RUTH_SAMPLE_SIZE.get(repr_pheno, RUTH_DEFAULT_SAMPLE_SIZE)
    rsid = row['variant_id']
    return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]


def pritchard(row):
    '''Parse a row with #CHROM / POS / REF / ALT / A1_FREQ / OBS_CT columns.

    Rows with a non-numeric chromosome, BETA, SE or P are skipped.
    NOTE(review): EAF is read from A1_FREQ while the effect allele is taken
    from ALT; verify that A1 is always ALT in these files.
    '''
    EAF = float(row['A1_FREQ'])
    CHR = _x_to_23(row['#CHROM'])
    if not_number(CHR):
        return _skip_row()
    POS = int(row['POS'])
    CHR = int(CHR)
    REF = row['REF']
    EFF = row['ALT']
    N = int(row['OBS_CT'])
    if not_number(row['BETA']) or not_number(row['SE']) or not_number(row['P']):
        return _skip_row()
    BETA = float(row['BETA'])
    SE = float(row['SE'])
    pvalue = float(row['P'])
    rsid = row['ID']
    return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]


def leiomyoma_uterus(row):
    'Parse a row of the leiomyoma uterus summary statistics; rows with a non-numeric chromosome are skipped.'
    EAF = float(row['EAF'])
    CHR = _x_to_23(row['CHR'])
    if not_number(CHR):
        return _skip_row()
    POS = int(row['POS'])
    CHR = int(CHR)
    REF = row['REF']
    EFF = row['EFF']
    N = row['TOTALSAMPLESIZE']  # passed through as read, without conversion
    BETA = float(row['beta'])
    SE = float(row['se'])
    pvalue = float(row['pvalue'])
    rsid = ''
    return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]


def preeclampsia(row):
    'Parse a row of the pre-eclampsia summary statistics; N is the fixed sum 4630 + 373345.'
    CHR = _x_to_23(row['CHR'])
    if not_number(CHR):
        return _skip_row()
    POS = int(row['POS'])
    CHR = int(CHR)
    REF = row['REF'].upper()
    EFF = row['EFF'].upper()
    N = 4630 + 373345
    rsid = row['rsid']
    BETA = float(row['beta'])
    SE = float(row['se'])
    EAF = float(row['EAF'])
    pvalue = float(row['pvalue'])
    return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]


def _birth_weight_row(row, n_key, rsid_key, upper):
    '''Shared parser for the four birth-weight formats.

    n_key / rsid_key name the sample-size and rsid columns; upper=True
    upper-cases the alleles first. Allele code 'R' is rewritten as 'D'.
    '''
    EAF = float(row['eaf'])
    CHR = int(_x_to_23(row['chr']))
    POS = int(row['pos'])
    REF = row['nea']
    EFF = row['ea']
    if upper:
        REF = REF.upper()
        EFF = EFF.upper()
    if REF == 'R':
        REF = 'D'
    if EFF == 'R':
        EFF = 'D'
    BETA = float(row['beta'])
    pvalue = float(row['p'])
    SE = float(row['se'])
    N = int(row[n_key])
    rsid = row[rsid_key]
    return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]


def BW_fetal_adjusted_effect(row):
    'Define each header for Birth weight fetal effect (N from n_ownBW).'
    return _birth_weight_row(row, 'n_ownBW', 'RSID', upper=True)


def BW_maternal_adjusted_effect(row):
    'Define each header for Birth weight maternal effect (N from n_offBW).'
    return _birth_weight_row(row, 'n_offBW', 'RSID', upper=True)


def BW_maternal(row):
    'Define each header for the maternal birth weight GWAS (rsid from SNP).'
    return _birth_weight_row(row, 'n', 'SNP', upper=False)


def BW_fetal(row):
    'Define each header for the fetal birth weight GWAS (rsid from rsid).'
    return _birth_weight_row(row, 'n', 'rsid', upper=False)


def PCOS(row):
    'Define each header for PCOS excluding 23andme.'
    EAF = float(row['EAF'])
    CHR = int(_x_to_23(row['CHR']))
    POS = int(row['POS'])
    REF = row['REF']
    EFF = row['EFF']
    BETA = float(row['beta'])
    pvalue = float(row['pvalue'])
    SE = float(row['se'])
    N = int(round(float(row['TOTALSAMPLESIZE'])))
    rsid = ''
    return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]


def UKBB_traits(row):
    '''Define each header for UKBB traits (hormones).

    The `variant` field is split on ':' into chromosome, position, REF and
    EFF. Low-confidence variants and rows with a non-numeric field are skipped.
    EAF is minor_AF when the minor allele is the effect allele, else 1 - minor_AF.
    '''
    if row['low_confidence_variant'] == 'true':
        return _skip_row()
    variant = row['variant'].split(':')
    CHR = _x_to_23(variant[0])
    POS = variant[1]
    numeric_fields = [row['minor_AF'], CHR, POS, row['beta'], row['pval'], row['se'], row['n_complete_samples']]
    if any(not_number(field) for field in numeric_fields):
        return _skip_row()
    CHR = int(CHR)
    POS = int(POS)
    REF = variant[2]
    EFF = variant[3]
    BETA = float(row['beta'])
    pvalue = float(row['pval'])
    SE = float(row['se'])
    N = int(row['n_complete_samples'])
    if row['minor_allele'] == EFF:
        EAF = float(row['minor_AF'])
    else:
        EAF = 1 - float(row['minor_AF'])
    rsid = ''
    return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]


def AP_repr(row):
    'Define each header for BOLT-LMM sumstats.'
    EAF = float(row['EAF'])
    CHR = int(_x_to_23(row['CHR']))
    POS = int(row['POS'])
    REF = row['A2']
    EFF = row['A1']
    BETA = float(row['Beta'])
    pvalue = float(row['P'])
    SE = float(row['se'])
    N = row['N']
    rsid = row['SNP']
    return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]


def POP(row):
    '''Define each header for pelvic organ prolapse.

    Rows whose CHR is not made of digits only, or whose minor allele frequency
    is below 0.005, are skipped.
    NOTE(review): the digit check also drops chromosome 'X', which made the
    original 'X' -> 23 recoding after it unreachable; confirm that excluding X
    is intended.
    '''
    if not row['CHR'].isdigit():
        return _skip_row()
    EAF = float(row['EAF'])
    MAF = 1 - EAF if EAF > 0.5 else EAF
    if MAF < 0.005:
        return _skip_row()
    CHR = int(row['CHR'])
    POS = int(row['POS'])
    REF = row['REF']
    EFF = row['EFF']
    BETA = float(row['BETA'])
    pvalue = float(row['pvalue'])
    SE = float(row['SE'])
    N = float(row['N'])
    rsid = ''
    return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]


def fet_GA(row):
    'Define each header for Fetal gestational duration (EAF is left empty).'
    EAF = ''
    CHR = int(_x_to_23(row['Chr']))
    POS = int(row['Pos'])
    REF = row['Non_effect_allele'].upper()
    EFF = row['Effect_allele'].upper()
    BETA = float(row['Effect'])
    pvalue = float(row['P'])
    SE = float(row['StdErr'])
    N = int(row['N'])
    rsid = row['Rsid']
    return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]


def miscarriage(row):
    'Define each header for Miscarriage; N is the fixed sum 49996 + 174109.'
    EAF = row['Freq1']
    marker = row['MarkerName'].split(':')
    CHR = int(_x_to_23(marker[0]))
    POS = int(marker[1])
    REF = row['Allele2'].upper()
    EFF = row['Allele1'].upper()
    BETA = float(row['Effect'])
    pvalue = float(row['P-value'])
    SE = float(row['StdErr'])
    N = 49996 + 174109
    rsid = ''
    return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]


# Phenotypes handled by Ruth(), which also needs the phenotype name.
RUTH_PHENOTYPES = ('Ruth_CBAT_female', 'Ruth_CBAT_male', 'Ruth_SHBG_female', 'Ruth_SHBG_male',
                   'Ruth_Testosterone_female', 'Ruth_Testosterone_male', 'Ruth_oestradiol')

# Phenotype -> row parser, mirroring the original chain of `if` statements.
PARSERS = {
    'Preeclampsia': preeclampsia,
    'POP': POP,
    'miscarriage': miscarriage,
    'GA_fetal': fet_GA,
    'BW_maternal': BW_maternal,
    'BW_fetal': BW_fetal,
    'BW_maternal_effect': BW_maternal_adjusted_effect,
    'BW_fetal_effect': BW_fetal_adjusted_effect,
    'leiomyoma_uterus': leiomyoma_uterus,
    'AMenopause': AMenopause,
    'PCOS': PCOS,
}
for _pheno in ['Oestradiol_fem', 'NLB', 'AFB', 'AMenarche', 'endometriosis']:
    PARSERS[_pheno] = UKBB_traits
for _pheno in ['SHBG_fem', 'Testosterone_fem', 'Testosterone_male', 'SHBG_male', 'CBAT_fem', 'CBAT_male']:
    PARSERS[_pheno] = pritchard


def format_list(input, output, repr_pheno=None):
    '''Harmonise a gzipped summary-statistics file into a tab-separated table.

    input       path of the gzipped, space- or tab-delimited file with a header
    output      path of the table to write (columns: OUTPUT_COLUMNS)
    repr_pheno  phenotype that selects the row parser; defaults to the
                `repr_pheno` wildcard of the running Snakemake job

    Alleles longer than one character are recoded to 'I' with the other allele
    set to 'D'. The two alleles are then ordered alphabetically in the ID
    (CHR:POS:a1:a2); when that swaps them, BETA changes sign and EAF becomes
    1 - EAF.
    '''
    if repr_pheno is None:
        repr_pheno = snakemake.wildcards.repr_pheno
    with gzip.open(input, 'rt', newline='') as f:
        print(input)
        dialect = csv.Sniffer().sniff(f.readline(), delimiters=' \t')
        f.seek(0)
        input_file = csv.DictReader(f, dialect=dialect)
        # One handle for the whole file instead of reopening it in append
        # mode for every batch of 1000 rows.
        with open(output, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter='\t')
            writer.writerow(OUTPUT_COLUMNS)
            for row in input_file:
                rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue = select_format(repr_pheno, row)
                if CHR == 0:
                    continue
                if len(REF) > 1:
                    REF = 'I'
                if len(EFF) > 1:
                    EFF = 'I'
                if REF == 'I':
                    EFF = 'D'
                if EFF == 'I':
                    REF = 'D'
                if REF > EFF:
                    ID = str(CHR) + ':' + str(POS) + ':' + EFF + ':' + REF
                    BETA = -1 * float(BETA)
                    ref = EFF
                    eff = REF
                    # Some formats carry no allele frequency (fet_GA sets EAF
                    # to ''); float('') used to raise ValueError here.
                    if not not_number(EAF):
                        EAF = 1 - float(EAF)
                else:
                    ID = str(CHR) + ':' + str(POS) + ':' + REF + ':' + EFF
                    BETA = float(BETA)
                    eff = EFF
                    ref = REF
                writer.writerow([ID, rsid, CHR, POS, EAF, N, ref, eff, BETA, SE, pvalue])


# Snakemake defines `snakemake` when it runs this file as a script; the guard
# lets the module be imported without starting a conversion.
if 'snakemake' in globals():
    format_list(snakemake.input[0], snakemake.output[0])
8 9 | script: 'format_sumstats.py' |
21 22 23 24 | run: d= pd.read_csv(input[0], sep= '\t', header= 0) d.columns= ['ID', 'SNP', 'CHR', 'POS', 'EAF', 'N', 'A2', 'A1', 'BETA', 'SE', 'pvalue'] d.dropna(subset= ['pvalue'], axis= 0, inplace= True) |
53 54 55 56 57 58 59 60 61 62 63 64 65 | shell: """ set +eu source /home/pol/miniconda3/etc/profile.d/conda.sh conda activate ldsc python2 /home/pol/software/ldsc/munge_sumstats.py \ --out {params[0]} \ --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \ --sumstats {input[0]} \ --chunksize 500000 conda deactivate set -eu """ |
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 | run: allfiles= [infile for infile in input if wildcards.pheno not in infile] allfiles= ','.join(allfiles) outfile= params[0] + wildcards.pheno + '_rg' infile= input[0] shell(""" set +eu source /home/pol/miniconda3/etc/profile.d/conda.sh conda activate ldsc python2 /home/pol/software/ldsc/ldsc.py \ --rg {infile},{allfiles} \ --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \ --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \ --out {outfile} conda deactivate set -eu """) |
101 102 103 104 105 106 | run: with open(input[0], 'r') as f: x= f.readlines() x= x[x.index('Summary of Genetic Correlation Results\n')+1:-3] with open(output[0], 'w') as f: f.write(''.join(x)) |
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 | run: allfiles= [infile for infile in input if wildcards.repr_pheno not in infile] allfiles= ','.join(allfiles) outfile= params[0] + wildcards.repr_pheno + '_rg' infile= input[0] shell(""" set +eu source /home/pol/miniconda3/etc/profile.d/conda.sh conda activate ldsc python2 /home/pol/software/ldsc/ldsc.py \ --rg {infile},{allfiles} \ --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \ --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \ --out {outfile} conda deactivate set -eu """) |
141 142 143 144 145 146 | run: with open(input[0], 'r') as f: x= f.readlines() x= x[x.index('Summary of Genetic Correlation Results\n')+1:-3] with open(output[0], 'w') as f: f.write(''.join(x)) |
154 155 156 157 158 159 160 | run: df_list= list() for i in input: d= pd.read_csv(i, delim_whitespace= True, header= 0) df_list.append(d) d= pd.concat(df_list) d.to_csv(output[0], sep= '\t', header= True, index= False) |
172 173 | run: d= pd.read_csv(input[0], sep= '\t', header= 0) |
211 212 213 214 215 216 217 218 219 220 221 222 223 | shell: """ set +eu source /home/pol/miniconda3/etc/profile.d/conda.sh conda activate ldsc python2 /home/pol/software/ldsc/munge_sumstats.py \ --out {params[0]} \ --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \ --sumstats {input[0]} \ --chunksize 500000 conda deactivate set -eu """ |
234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | run: allfiles= [infile for infile in input if 'BW_maternal_effect' not in infile] allfiles= ','.join(allfiles) outfile= params[0] + 'BW_maternal_effect_rg' infile= input[0] shell(""" set +eu source /home/pol/miniconda3/etc/profile.d/conda.sh conda activate ldsc python2 /home/pol/software/ldsc/ldsc.py \ --rg {infile},{allfiles} \ --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \ --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \ --out {outfile} conda deactivate set -eu """) |
261 262 263 264 265 266 267 268 269 270 271 272 273 | shell: """ set +eu source /home/pol/miniconda3/etc/profile.d/conda.sh conda activate ldsc python2 /home/pol/software/ldsc/ldsc.py \ --h2 {input[0]} \ --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \ --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \ --out {params[0]} conda deactivate set -eu """ |
281 282 283 284 285 286 | run: with open(input[0], 'r') as f: x= f.readlines() x= x[x.index('Summary of Genetic Correlation Results\n')+1:-3] with open(output[0], 'w') as f: f.write(''.join(x)) |
297 298 299 300 301 302 303 304 305 306 307 308 309 | shell: """ set +eu source /home/pol/miniconda3/etc/profile.d/conda.sh conda activate ldsc python2 /home/pol/software/ldsc/ldsc.py \ --h2 {input[0]} \ --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \ --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \ --out {params[0]} conda deactivate set -eu """ |
317 318 319 320 321 322 323 324 325 326 327 328 | run: df_list= list() for infile in input: with open(infile, 'r') as f: lines= [line.strip() for line in f if line.startswith('Total Observed')] h2= float(lines[0].split(' ')[4]) se= float(lines[0].split('(')[1].replace(')', '')) cohort= infile.split('/')[10].replace('_h2.log', '') d= pd.DataFrame({'cohort': cohort, 'h2': h2, 'se': se}, index= [0]) df_list.append(d) d= pd.concat(df_list) d.to_csv(output[0], sep= '\t', header= True, index= False) |
17 18 19 20 | run: d= pd.read_csv(input[0], sep= '\t', header= 0, compression= 'gzip', usecols= ['RSID', 'CHR', 'POS', 'TOTALSAMPLESIZE', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']) d.columns= ['CHR', 'POS', 'A1', 'A2', 'N', 'BETA', 'SE', 'pvalue', 'SNP'] d.dropna(axis= 0, inplace= True) |
40 41 42 43 44 45 46 47 48 49 50 51 52 | shell: """ set +eu source /home/pol/miniconda3/etc/profile.d/conda.sh conda activate ldsc python2 /home/pol/software/ldsc/munge_sumstats.py \ --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \ --out {params[0]} \ --sumstats {input[0]} \ --chunksize 500000 conda deactivate set -eu """ |
# Comma-separated list of every input whose path does not contain the
# current phenotype wildcard.
run:
    allfiles = ','.join([infile for infile in input if wildcards.pheno not in infile])
# Copy the table that follows the "Summary of Genetic Correlation Results"
# line of the log in input[0], minus the last three lines, to output[0].
run:
    marker = 'Summary of Genetic Correlation Results\n'
    with open(input[0], 'r') as log:
        rows = log.readlines()
    first_row = rows.index(marker) + 1
    with open(output[0], 'w') as out:
        out.write(''.join(rows[first_row:-3]))
# LDSC --h2 with the baselineLD annotations (--overlap-annot plus 1000G EUR frequencies).
# Some arguments end directly in "\"; the indentation of the following line is
# what keeps consecutive arguments separated once the continuation is joined.
shell:
    '''
    set +eu
    source /home/pol/miniconda3/etc/profile.d/conda.sh
    conda activate ldsc
    python2 /home/pol/software/ldsc/ldsc.py \
    --h2 {input[0]}\
    --ref-ld-chr /home/pol/software/ldsc/baseline/baseline/baselineLD. \
    --w-ld-chr /home/pol/software/ldsc/baseline/weights_hm3_no_hla/weights.\
    --overlap-annot\
    --frqfile-chr /home/pol/software/ldsc/baseline/1000G_Phase3_frq/1000G.EUR.QC.\
    --out {params[0]}
    conda deactivate
    set -eu
    '''
# LDSC --h2-cts on {input[0]} using the .ldcts list in {params[1]}.
# The command is run from the ldsc/cts/ directory (presumably because the
# .ldcts file holds relative paths - confirm).
shell:
    '''
    set +eu
    source /home/pol/miniconda3/etc/profile.d/conda.sh
    conda activate ldsc
    cd /home/pol/software/ldsc/cts/
    python2 /home/pol/software/ldsc/ldsc.py \
    --h2-cts {input[0]}\
    --ref-ld-chr-cts {params[1]} \
    --w-ld-chr /home/pol/software/ldsc/baseline/weights_hm3_no_hla/weights.\
    --ref-ld-chr /home/pol/software/ldsc/baseline/baseline/baselineLD.\
    --out {params[0]}
    conda deactivate
    set -eu
    '''
# Read the nine summary-statistic columns and put them in a fixed order.
run:
    sumstat_cols = ['SNP', 'CHR', 'POS', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']
    d = pd.read_csv(input[0], sep='\t', header=0, usecols=sumstat_cols)[sumstat_cols]
# Run LDSC's munge_sumstats.py on {input[0]}, passing w_hm3.snplist as --merge-alleles.
# errexit/nounset are switched off around the conda activation and restored afterwards.
shell:
    """
    set +eu
    source /home/pol/miniconda3/etc/profile.d/conda.sh
    conda activate ldsc
    python2 /home/pol/software/ldsc/munge_sumstats.py \
    --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
    --out {params[0]} \
    --sumstats {input[0]} \
    --chunksize 500000
    conda deactivate
    set -eu
    """
# Run LDSC --h2 on {input[0]} with the eur_w_ld_chr reference/weight LD scores.
# `set +eu` relaxes errexit/nounset while conda is sourced and activated
# (presumably because the conda scripts are not strict-mode safe); it is restored at the end.
shell:
    """
    set +eu
    source /home/pol/miniconda3/etc/profile.d/conda.sh
    conda activate ldsc
    python2 /home/pol/software/ldsc/ldsc.py \
    --h2 {input[0]} \
    --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
    --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
    --out {params[0]}
    conda deactivate
    set -eu
    """
# Build a cohort / h2 / se table from the "Total Observed ..." line of each log.
run:
    per_cohort = []
    for log_path in input:
        with open(log_path, 'r') as log:
            h2_line = [ln.strip() for ln in log if ln.startswith('Total Observed')][0]
        # 5th space-separated token is the estimate; the SE is the text inside "(...)".
        estimate = float(h2_line.split(' ')[4])
        std_err = float(h2_line.split('(')[1].replace(')', ''))
        # NOTE(review): cohort name is taken from a fixed path depth (component 9) - confirm layout.
        cohort = log_path.split('/')[9].replace('_h2.log', '')
        per_cohort.append(pd.DataFrame({'cohort': cohort, 'h2': estimate, 'se': std_err}, index=[0]))
    pd.concat(per_cohort).to_csv(output[0], sep='\t', header=True, index=False)
# Read the cohort summary statistics, strip the ':SNP' / ':INDEL' suffix from
# the variant identifier and rename the columns (REF -> A2, EFF -> A1).
run:
    sumstat_cols = ['SNP', 'CHR', 'POS', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']
    d = pd.read_csv(input[0], sep='\t', header=0, usecols=sumstat_cols)[sumstat_cols]
    d['SNP'] = d.SNP.str.replace(':SNP', '').str.replace(':INDEL', '')
    d['CHR'] = d.CHR.apply(str)
    d.columns = ['ID', 'CHR', 'POS', 'N', 'A2', 'A1', 'BETA', 'SE', 'pvalue']
    d.dropna(axis=0, inplace=True)
    d['CHR'] = d.CHR.apply(str)
# Run LDSC's munge_sumstats.py on {input[0]}, passing w_hm3.snplist as --merge-alleles.
# errexit/nounset are switched off around the conda activation and restored afterwards.
shell:
    """
    set +eu
    source /home/pol/miniconda3/etc/profile.d/conda.sh
    conda activate ldsc
    python2 /home/pol/software/ldsc/munge_sumstats.py \
    --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
    --out {params[0]} \
    --sumstats {input[0]} \
    --chunksize 500000
    conda deactivate
    set -eu
    """
# Run LDSC --h2 on {input[0]} with the eur_w_ld_chr reference/weight LD scores.
# `set +eu` relaxes errexit/nounset while conda is sourced and activated
# (presumably because the conda scripts are not strict-mode safe); it is restored at the end.
shell:
    """
    set +eu
    source /home/pol/miniconda3/etc/profile.d/conda.sh
    conda activate ldsc
    python2 /home/pol/software/ldsc/ldsc.py \
    --h2 {input[0]} \
    --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
    --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
    --out {params[0]}
    conda deactivate
    set -eu
    """
# Build a cohort / h2 / se table from the "Total Observed ..." line of each log.
run:
    per_cohort = []
    for log_path in input:
        with open(log_path, 'r') as log:
            h2_line = [ln.strip() for ln in log if ln.startswith('Total Observed')][0]
        # 5th space-separated token is the estimate; the SE is the text inside "(...)".
        estimate = float(h2_line.split(' ')[4])
        std_err = float(h2_line.split('(')[1].replace(')', ''))
        # NOTE(review): cohort name is taken from a fixed path depth (component 10) - confirm layout.
        cohort = log_path.split('/')[10].replace('_allPTD.log', '')
        per_cohort.append(pd.DataFrame({'cohort': cohort, 'h2': estimate, 'se': std_err}, index=[0]))
    pd.concat(per_cohort).to_csv(output[0], sep='\t', header=True, index=False)
# Run LDSC --h2 on {input[0]} with the eur_w_ld_chr reference/weight LD scores.
# `set +eu` relaxes errexit/nounset while conda is sourced and activated
# (presumably because the conda scripts are not strict-mode safe); it is restored at the end.
shell:
    """
    set +eu
    source /home/pol/miniconda3/etc/profile.d/conda.sh
    conda activate ldsc
    python2 /home/pol/software/ldsc/ldsc.py \
    --h2 {input[0]} \
    --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
    --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
    --out {params[0]}
    conda deactivate
    set -eu
    """
# Accumulator for per-file data frames (the rest of the rule is not shown here).
run:
    df_list = []
# Load both tables, keep variants whose total sample size exceeds half of the
# maximum, and split MarkerName on ':' into its five components.
run:
    x = pd.read_csv(input[0], sep='\t', header=0)
    d = pd.read_csv(input[1], sep='\t', header=0)
    half_max_n = d['TOTALSAMPLESIZE'].max() / 2
    d = d.loc[d.TOTALSAMPLESIZE > half_max_n, :]
    marker_parts = d['MarkerName'].str.split(':', expand=True)
    d[['CHR', 'POS', 'REF', 'EFF', 'INDELS']] = marker_parts
# Run LDSC's munge_sumstats.py on {input[0]}, passing w_hm3.snplist as --merge-alleles.
# errexit/nounset are switched off around the conda activation and restored afterwards.
shell:
    """
    set +eu
    source /home/pol/miniconda3/etc/profile.d/conda.sh
    conda activate ldsc
    python2 /home/pol/software/ldsc/munge_sumstats.py \
    --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
    --out {params[0]} \
    --sumstats {input[0]} \
    --chunksize 500000
    conda deactivate
    set -eu
    """
# Genetic correlation (LDSC --rg) of input[0] against every input whose path
# does not contain the PTD_metas wildcard; results go to params[0] + wildcard + '_rg'.
run:
    infile = input[0]
    outfile = params[0] + wildcards.PTD_metas + '_rg'
    allfiles = ','.join([path for path in input if wildcards.PTD_metas not in path])
    shell("""
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/ldsc.py \
        --rg {infile},{allfiles} \
        --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --out {outfile}
        conda deactivate
        set -eu
        """)
# Write the rg table of input[0] (with its header row) to output[0], then
# append the table of input[1] without its header row.
run:
    marker = 'Summary of Genetic Correlation Results\n'
    # (log file, rows to skip after the marker, open mode for output[0])
    plan = ((input[0], 1, 'w'), (input[1], 2, 'a'))
    for log_path, offset, mode in plan:
        with open(log_path, 'r') as log:
            rows = log.readlines()
        rows = rows[rows.index(marker) + offset:-3]
        with open(output[0], mode) as out:
            out.write(''.join(rows))
# Genetic correlation (LDSC --rg) of input[0] against every input that is not
# under 'individual_cohorts/<big5 wildcard>'.
run:
    own_prefix = 'individual_cohorts/' + wildcards.big5
    allfiles = ','.join([path for path in input if own_prefix not in path])
    print(allfiles)
    # NOTE(review): `outfile` is computed but the command below writes to {params[0]} - confirm intent.
    outfile = input[0].replace('.txt.sumstats.gz', '_rg')
    infile = input[0]
    shell("""
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/ldsc.py \
        --rg {infile},{allfiles} \
        --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --out {params[0]}
        conda deactivate
        set -eu
        """)
# Concatenate the rg tables of all logs into output[0]: the first log
# contributes its header row, later logs only their data rows.
run:
    marker = 'Summary of Genetic Correlation Results\n'
    for position, log_path in enumerate(input):
        with open(log_path, 'r') as log:
            rows = log.readlines()
        offset, mode = (1, 'w') if position == 0 else (2, 'a')
        rows = rows[rows.index(marker) + offset:-3]
        with open(output[0], mode) as out:
            out.write(''.join(rows))
# Harmonise the meta-analysis table (column names look like METAL output - confirm):
# keep well-covered variants, orient alleles so that REF sorts before EFF
# (flipping BETA and EAF accordingly), build CHR:POS:REF:EFF IDs, drop p-values of 0 or 1.
run:
    d = pd.read_csv(input[0], sep='\t', header=0)
    for allele_col in ('Allele1', 'Allele2'):
        d[allele_col] = d[allele_col].str.upper()
    d = d.loc[d.TOTALSAMPLESIZE > d['TOTALSAMPLESIZE'].max() / 2, :]
    d[['CHR', 'POS', 'REF', 'EFF', 'SNP']] = d['MarkerName'].str.split(':', expand=True)
    d['CHR'] = d['CHR'].astype(str).astype(int)
    d['POS'] = d['POS'].astype(str).astype(int)
    d = d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']]
    # Allele1 becomes the effect allele, Allele2 the reference allele.
    d.columns = ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']
    # REF/EFF are untouched until the swap below, so one mask serves all three steps.
    needs_swap = d.REF > d.EFF
    d['BETA'] = np.where(needs_swap, -1 * d.BETA, d.BETA)
    d['EAF'] = np.where(needs_swap, 1 - d.EAF, d.EAF)
    d['CHR'] = d['CHR'].astype(str).astype(int)
    d['POS'] = d['POS'].astype(str).astype(int)
    d['pvalue'] = d['pvalue'].astype(str).astype(float)
    d.loc[needs_swap, ['REF', 'EFF']] = d.loc[needs_swap, ['EFF', 'REF']].values
    d['ID'] = d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF
    d = d.loc[(d.pvalue > 0) & (d.pvalue < 1), :]
    rs = pd.read_csv(input[1], sep='\t', header=0)
    rs.columns = ['ID', 'RSID']
# Prepare summary statistics for LDSC: keep variants whose CHR/POS match the
# table in input[0], drop chr6:28477797-33448354, and write one row per
# CHR/POS/allele pair with RSID as the SNP identifier.
run:
    x = pd.read_csv(input[0], sep='\t', header=0)
    d = pd.read_csv(input[1], sep='\t', header=0)
    d['CHR'] = np.where(d['CHR'] == 'X', '23', d['CHR'])
    d['POS'] = d['POS'].astype(str).astype(int)
    d['CHR'] = d['CHR'].astype(str).astype(int)
    d.dropna(axis=0, inplace=True)
    d = pd.merge(d, x[['CHR', 'SNP', 'BP']], left_on=['CHR', 'POS'], right_on=['CHR', 'BP'])
    # Equivalent to ~(chr 6 and strictly inside the window).
    outside_window = (d.CHR != 6) | (d.POS <= 28477797) | (d.POS >= 33448354)
    d = d.loc[outside_window, :]
    d = d[['CHR', 'POS', 'RSID', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']]
    d.columns = ['CHR', 'POS', 'SNP', 'A1', 'A2', 'N', 'EAF', 'BETA', 'SE', 'pvalue']
    d = d.drop_duplicates(['CHR', 'POS', 'A1', 'A2'], keep='first')
    out_cols = ['SNP', 'CHR', 'POS', 'N', 'A2', 'A1', 'BETA', 'SE', 'pvalue']
    d.to_csv(output[0], sep='\t', header=True, index=False, columns=out_cols)
# Run LDSC's munge_sumstats.py on {input[0]}, passing w_hm3.snplist as --merge-alleles.
# errexit/nounset are switched off around the conda activation and restored afterwards.
shell:
    """
    set +eu
    source /home/pol/miniconda3/etc/profile.d/conda.sh
    conda activate ldsc
    python2 /home/pol/software/ldsc/munge_sumstats.py \
    --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
    --out {params[0]} \
    --sumstats {input[0]} \
    --chunksize 500000
    conda deactivate
    set -eu
    """
# Genetic correlation (LDSC --rg) between input[0] and input[1].
run:
    shell("""
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/ldsc.py \
        --rg {input[0]},{input[1]} \
        --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --out {params[0]}
        conda deactivate
        set -eu
        """)
# Write one kbid list per cell type (params[0] + <cell type> + '.txt') and a
# de-duplicated list of all kbids to the last output.
run:
    d = pd.read_table(input[0], sep='\t', header=0)
    # keep the part of kbid before the first '.'
    d['kbid'] = d.kbid.str.split('.', expand=True)[0]
    d['Cell_type'] = d.Cell_type.str.replace(' ', '-')
    known_types = set(d.Cell_type)
    for cell_type, genes in d[d['Cell_type'].isin(known_types)].groupby('Cell_type'):
        genes.to_csv(params[0] + cell_type + '.txt', header=False, sep='\t', columns=['kbid'], index=False)
    d.drop_duplicates('kbid', inplace=True, keep='first')
    d.to_csv(output[-1], sep='\t', header=False, index=False, columns=['kbid'])
655 656 657 | run: shell(""" set +eu |
681 682 683 | run: shell(""" set +eu |
# LDSC --h2 with baselineLD plus the annotation prefix in {params[1]}
# (--overlap-annot, --thin-annot, 1000G EUR frequencies).
# Arguments ending directly in "\" rely on the next line's indentation as separator.
shell:
    '''
    set +eu
    source /home/pol/miniconda3/etc/profile.d/conda.sh
    conda activate ldsc
    python2 /home/pol/software/ldsc/ldsc.py \
    --h2 {input[0]}\
    --ref-ld-chr /home/pol/software/ldsc/baseline/baseline/baselineLD.,{params[1]}. \
    --w-ld-chr /home/pol/software/ldsc/baseline/weights_hm3_no_hla/weights.\
    --overlap-annot \
    --frqfile-chr /home/pol/software/ldsc/baseline/1000G_Phase3_frq/1000G.EUR.QC. \
    --out {params[0]} \
    --thin-annot
    conda deactivate
    set -eu
    '''
# Keep the 'L2_1' row(s) of the results table, relabel Category with the
# cell type and add the number of rows in the gene list as n_genes.
run:
    n_genes = pd.read_csv(input[1], sep='\t', header=None, names=['Gene']).shape[0]
    d = pd.read_csv(input[0], sep='\t', header=0)
    d = d.loc[d.Category == 'L2_1', :].assign(n_genes=n_genes, Category=wildcards.cell_types)
    d.to_csv(output[0], sep='\t', header=True, index=False)
# Concatenate all inputs, keeping only the header line of the first file.
shell:
    '''
    head -1 {input[0]} > {output[0]}
    tail -n +2 -q {input} >> {output[0]}
    '''
766 767 | run: shell(""" |
# Compute LD scores (LDSC --l2) for the annotation in input[0], restricted to
# the SNPs listed in input[1].
run:
    shell("""
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/ldsc.py \
        --l2 \
        --bfile {params[0]} \
        --ld-wind-cm 1 \
        --annot {input[0]} \
        --out {params[1]} \
        --print-snps {input[1]} \
        --thin-annot
        """)
# Emit one "<cell type>\t<comma-joined params>" row, or an empty file for the
# 'overall' cell type.
run:
    if wildcards.cell_types == 'overall':
        # create the file if missing without writing anything
        open(output[0], 'a').close()
    else:
        row = pd.DataFrame({'V1': [wildcards.cell_types], 'V2': ','.join(params)})
        row.to_csv(output[0], sep='\t', header=False, index=False)
# Concatenate every input file into output[0].
shell:
    'cat {input} > {output[0]}'
# LDSC --h2-cts on {input[0]} with the .ldcts list in {input[1]} and the
# baselineLD model. Arguments ending directly in "\" rely on the next line's
# indentation as separator.
shell:
    '''
    set +eu
    source /home/pol/miniconda3/etc/profile.d/conda.sh
    conda activate ldsc
    python2 /home/pol/software/ldsc/ldsc.py \
    --h2-cts {input[0]}\
    --ref-ld-chr /home/pol/software/ldsc/baseline/baseline/baselineLD. \
    --ref-ld-chr-cts {input[1]} \
    --w-ld-chr /home/pol/software/ldsc/baseline/weights_hm3_no_hla/weights.\
    --overlap-annot \
    --frqfile-chr /home/pol/software/ldsc/baseline/1000G_Phase3_frq/1000G.EUR.QC. \
    --out {params[0]} \
    --thin-annot
    conda deactivate
    set -eu
    '''
# Drop every line of input[0] that mentions the cohort wildcard and replace
# the 'to_replace' placeholder with params[0].
shell:
    'grep -v {wildcards.allPTD_coh} {input[0]} | sed -e "s/to_replace/{params[0]}/g" > {output[0]}'
# Load only the marker name, first allele and p-value columns.
run:
    d = pd.read_csv(
        input[0],
        sep='\t',
        header=0,
        usecols=['MarkerName', 'Allele1', 'P-value'],
    )
# For each feature in input[0], report the closest feature(s) of input[1]
# (-t all keeps every tie).
shell:
    'bedtools closest -t all -a {input[0]} -b {input[1]} > {output[0]}'
# Harmonise the meta-analysis table (column names look like METAL output - confirm):
# keep well-covered variants, orient alleles so that REF sorts before EFF
# (flipping BETA and EAF accordingly), build CHR:POS:REF:EFF IDs, drop p-values of 0 or 1.
run:
    d = pd.read_csv(input[0], sep='\t', header=0)
    for allele_col in ('Allele1', 'Allele2'):
        d[allele_col] = d[allele_col].str.upper()
    d = d.loc[d.TOTALSAMPLESIZE > d['TOTALSAMPLESIZE'].max() / 2, :]
    d[['CHR', 'POS', 'REF', 'EFF', 'SNP']] = d['MarkerName'].str.split(':', expand=True)
    d['CHR'] = d['CHR'].astype(str).astype(int)
    d['POS'] = d['POS'].astype(str).astype(int)
    d = d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value', 'HetISq', 'HetPVal']]
    # Allele1 becomes the effect allele, Allele2 the reference allele.
    d.columns = ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue', 'HetISq', 'HetPVal']
    # REF/EFF are untouched until the swap below, so one mask serves all three steps.
    needs_swap = d.REF > d.EFF
    d['BETA'] = np.where(needs_swap, -1 * d.BETA, d.BETA)
    d['EAF'] = np.where(needs_swap, 1 - d.EAF, d.EAF)
    d['CHR'] = d['CHR'].astype(str).astype(int)
    d['POS'] = d['POS'].astype(str).astype(int)
    d['pvalue'] = d['pvalue'].astype(str).astype(float)
    d.loc[needs_swap, ['REF', 'EFF']] = d.loc[needs_swap, ['EFF', 'REF']].values
    d['ID'] = d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF
    d = d.loc[(d.pvalue > 0) & (d.pvalue < 1), :]
# Pick independent genome-wide significant loci (p < 5e-8): per chromosome,
# walk variants from smallest to largest p-value, keep a variant if it has
# not been removed yet, then remove everything within +/- 1.5 Mb of it.
# Writes CHR, pos1, pos2 (the 1.5 Mb window), nearestGene, ID and pvalue.
# With no significant variant the output contains only the header line.
run:
    d = pd.read_csv(input[0], sep='\t', compression='gzip', usecols=['CHR', 'POS', 'pvalue', 'nearestGene', 'ID'])
    # .copy(): the in-place sort/de-duplication must not act on a view of `d`.
    df = d.loc[d.pvalue < 5*10**-8, :].copy()
    df.sort_values(by='pvalue', ascending=True, inplace=True)
    df.drop_duplicates(subset=['CHR', 'POS'], keep='first', inplace=True)
    df_list = list()
    for chrom in set(df.CHR):
        d_temp = df.loc[df.CHR == chrom, :]
        positions = d_temp.POS.values
        for pos in positions:
            if pos in d_temp.POS.values:
                df_list.append(d_temp.loc[d_temp.POS == pos, :])
                d_temp = d_temp.loc[(d_temp.POS < pos - (1.5*10**6)) | (d_temp.POS > pos + (1.5 * 10**6)), :]
            else:
                continue
    # pd.concat() raises on an empty list; fall back to the (empty) filtered frame.
    x = pd.concat(df_list) if df_list else df.copy()
    x['pos1'] = x.POS - 1.5*10**6
    x['pos2'] = x.POS + 1.5*10**6
    x['CHR'] = x.CHR.astype(str)
    x['CHR'] = np.where(x.CHR == '23', 'X', x.CHR)
    x.to_csv(output[0], sep='\t', header=True, index=False, columns=['CHR', 'pos1', 'pos2', 'nearestGene', 'ID', 'pvalue'])
# Drop every line of input[0] that mentions the cohort wildcard and replace
# the 'to_replace' placeholder with params[0].
shell:
    'grep -v {wildcards.GAraw_coh} {input[0]} | sed -e "s/to_replace/{params[0]}/g" > {output[0]}'
# Load only the marker name, first allele and p-value columns.
run:
    d = pd.read_csv(
        input[0],
        sep='\t',
        header=0,
        usecols=['MarkerName', 'Allele1', 'P-value'],
    )
# For each feature in input[0], report the closest feature(s) of input[1]
# (-t all keeps every tie).
shell:
    'bedtools closest -t all -a {input[0]} -b {input[1]} > {output[0]}'
# Harmonise the meta-analysis table (column names look like METAL output - confirm):
# keep well-covered variants, orient alleles so that REF sorts before EFF
# (flipping BETA and EAF accordingly), build CHR:POS:REF:EFF IDs, drop p-values of 0 or 1.
run:
    d = pd.read_csv(input[0], sep='\t', header=0)
    for allele_col in ('Allele1', 'Allele2'):
        d[allele_col] = d[allele_col].str.upper()
    d = d.loc[d.TOTALSAMPLESIZE > d['TOTALSAMPLESIZE'].max() / 2, :]
    d[['CHR', 'POS', 'REF', 'EFF', 'SNP']] = d['MarkerName'].str.split(':', expand=True)
    d['CHR'] = d['CHR'].astype(str).astype(int)
    d['POS'] = d['POS'].astype(str).astype(int)
    d = d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value', 'HetISq', 'HetPVal']]
    # Allele1 becomes the effect allele, Allele2 the reference allele.
    d.columns = ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue', 'HetISq', 'HetPVal']
    # REF/EFF are untouched until the swap below, so one mask serves all three steps.
    needs_swap = d.REF > d.EFF
    d['BETA'] = np.where(needs_swap, -1 * d.BETA, d.BETA)
    d['EAF'] = np.where(needs_swap, 1 - d.EAF, d.EAF)
    d['CHR'] = d['CHR'].astype(str).astype(int)
    d['POS'] = d['POS'].astype(str).astype(int)
    d['pvalue'] = d['pvalue'].astype(str).astype(float)
    d.loc[needs_swap, ['REF', 'EFF']] = d.loc[needs_swap, ['EFF', 'REF']].values
    d['ID'] = d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF
    d = d.loc[(d.pvalue > 0) & (d.pvalue < 1), :]
# Pick independent genome-wide significant loci (p < 5e-8): per chromosome,
# walk variants from smallest to largest p-value, keep a variant if it has
# not been removed yet, then remove everything within +/- 1.5 Mb of it.
# Writes CHR, pos1, pos2 (the 1.5 Mb window), nearestGene, ID and pvalue.
# With no significant variant the output contains only the header line.
run:
    d = pd.read_csv(input[0], sep='\t', compression='gzip', usecols=['CHR', 'POS', 'pvalue', 'nearestGene', 'ID'])
    # .copy(): the in-place sort/de-duplication must not act on a view of `d`.
    df = d.loc[d.pvalue < 5*10**-8, :].copy()
    df.sort_values(by='pvalue', ascending=True, inplace=True)
    df.drop_duplicates(subset=['CHR', 'POS'], keep='first', inplace=True)
    df_list = list()
    for chrom in set(df.CHR):
        d_temp = df.loc[df.CHR == chrom, :]
        positions = d_temp.POS.values
        for pos in positions:
            if pos in d_temp.POS.values:
                df_list.append(d_temp.loc[d_temp.POS == pos, :])
                d_temp = d_temp.loc[(d_temp.POS < pos - (1.5*10**6)) | (d_temp.POS > pos + (1.5 * 10**6)), :]
            else:
                continue
    # pd.concat() raises on an empty list; fall back to the (empty) filtered frame.
    x = pd.concat(df_list) if df_list else df.copy()
    x['pos1'] = x.POS - 1.5*10**6
    x['pos2'] = x.POS + 1.5*10**6
    x['CHR'] = x.CHR.astype(str)
    x['CHR'] = np.where(x.CHR == '23', 'X', x.CHR)
    x.to_csv(output[0], sep='\t', header=True, index=False, columns=['CHR', 'pos1', 'pos2', 'nearestGene', 'ID', 'pvalue'])
# Write a two-column MarkerName ('chr<CHR>:<POS>') / P-value table, keeping
# the smallest p-value per CHR/POS.
run:
    d = pd.read_csv(input[0], sep='\t', header=0, usecols=['CHR', 'POS', 'pvalue', 'nearestGene'], compression='gzip')
    d = d.sort_values(['pvalue'], ascending=True)
    d = d.drop_duplicates(['CHR', 'POS'], keep='first')
    d['ID'] = 'chr' + d.CHR.apply(str) + ':' + d.POS.apply(str)
    # NOTE(review): positional rename - relies on the input's column order - confirm.
    d.columns = ['CHR', 'POS', 'P-value', 'nearestGene', 'MarkerName']
    d.to_csv(output[0], sep='\t', header=True, index=False, columns=['MarkerName', 'P-value'])
# Greedy selection of lead variants: within each chromosome, go through
# genome-wide significant variants by increasing p-value and drop everything
# within +/- 1.5 Mb of each variant that is kept.
run:
    df = pd.read_csv(input[0], sep='\t', header=0, usecols=['CHR', 'POS', 'pvalue', 'nearestGene'], compression='gzip')
    df = df.loc[df.pvalue < 5*10**-8, :]
    df.sort_values(by='pvalue', ascending=True, inplace=True)
    df.drop_duplicates(subset=['CHR', 'POS'], keep='first', inplace=True)
    df_list = []
    for chrom in set(df.CHR):
        d_temp = df.loc[df.CHR == chrom, :]
        for pos in d_temp.POS.values:
            if pos not in d_temp.POS.values:
                # already removed as part of an earlier lead variant's window
                continue
            df_list.append(d_temp.loc[d_temp.POS == pos, :])
            d_temp = d_temp.loc[(d_temp.POS < pos - (1.5 * 10**6)) | (d_temp.POS > pos + (1.5 * 10**6)), :]
    df = pd.concat(df_list)
    df['CHR'] = df.CHR.astype(str)
# One LocusZoom plot per row of input[0]; the first page of each PDF is kept
# as params[1] + 'chr<chr>_<nearestGene>.pdf' and the original PDF is removed.
run:
    if not os.path.exists(params[1]):
        os.makedirs(params[1])
    df = pd.read_csv(input[0], sep='\t', header=0)
    for _, row in df.iterrows():
        gene = row['nearestGene']
        snp = row['snp']
        title = '"' + gene + '"'
        shell('python2 /home/pol/software/locuszoom/bin/locuszoom --metal {input[1]} --refsnp {snp} --flank 250kb --plotonly --no-date --build hg19 --pop EUR --source 1000G_March2012 --prefix {params[0]} title={title} theme=publication')
        outfile = params[1] + 'chr' + str(row['chr']) + '_' + gene + '.pdf'
        # the expected PDF is <prefix>_<refsnp with ':' replaced by '_'>.pdf
        infile = params[0] + '_' + snp.replace(':', '_') + '.pdf'
        shell('qpdf --empty --pages {infile} 1 -- {outfile}; rm {infile}')
# With exactly one input there is nothing to combine: copy it to output[0].
run:
    if len(input) == 1:
        shell('cp {input[0]} {output[0]}')
# Load chromosome, position and p-value from the gzipped table.
run:
    d = pd.read_csv(
        input[0],
        sep='\t',
        header=0,
        usecols=['CHR', 'POS', 'pvalue'],
        compression='gzip',
    )
# LocusZoom plot around rs9823520 (+/- 250 kb), titled "WNT4-GA".
run:
    plot_dir = params[1]
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)
    shell('python2 /home/pol/software/locuszoom/bin/locuszoom --metal {input[0]} --refsnp rs9823520 --flank 250kb --plotonly --no-date --build hg19 --pop EUR --source 1000G_March2012 --prefix {params[0]} title="WNT4-GA" theme=publication')
# Run METAL on the script given in params[0]; its stdout is appended to output[1].
shell:
    '/home/pol/software/generic-metal/metal {params[0]} >> {output[1]}'
# Run METAL on three scripts in turn; all stdout is appended to output[3].
shell:
    '''
    /home/pol/software/generic-metal/metal {input[0]} >> {output[3]}
    /home/pol/software/generic-metal/metal {input[1]} >> {output[3]}
    /home/pol/software/generic-metal/metal {input[2]} >> {output[3]}
    '''
# Run METAL on two scripts in turn; all stdout is appended to output[2].
shell:
    '''
    /home/pol/software/generic-metal/metal {input[0]} >> {output[2]}
    /home/pol/software/generic-metal/metal {input[1]} >> {output[2]}
    '''
# Run METAL on the script in input[0]; its stdout is appended to output[1].
shell:
    '/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}'
# Run METAL on the script in input[0]; its stdout is appended to output[1].
shell:
    '/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}'
# Run METAL on the script in input[0]; its stdout is appended to output[1].
shell:
    '/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}'
# Run METAL on the script in input[0]; its stdout is appended to output[1].
shell:
    '/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}'
# Run METAL on the script in input[0]; its stdout is appended to output[1].
shell:
    '''
    /home/pol/software/generic-metal/metal {input[0]} >> {output[1]}
    '''
library(MendelianRandomization)
library(data.table)
library(dplyr)

# Exposure effects (input 1). Unless the output path contains 'cluster',
# the columns are named here.
d <- fread(snakemake@input[[1]])
if (!grepl('cluster', snakemake@output[[1]])) {
  names(d) <- c('ID', 'beta', 'se', 'pvalue', 'trait')
}

# Outcome effects (input 2), one row per variant ID.
x <- fread(snakemake@input[[2]])
x <- filter(x, !duplicated(ID))

d <- inner_join(d, x, by = 'ID')

# Run MR for the instruments of one trait: every method of mr_allmethods()
# when more than three variants are available, IVW only otherwise.
# Returns one row per method with estimate, SE, 95% CI, p-value and trait.
funk <- function(temp_df) {
  inputMR <- mr_input(
    bx = temp_df$beta, bxse = temp_df$se,
    by = temp_df$BETA, byse = temp_df$SE
  )
  trait <- unique(temp_df$trait)

  if (nrow(temp_df) <= 3) {
    ivw <- mr_ivw(inputMR)
    return(data.frame(
      method = 'IVW', estimate = ivw$Estimate, se = ivw$StdError,
      lo95 = ivw$CILower, up95 = ivw$CIUpper, pvalue = ivw$Pvalue,
      trait = trait
    ))
  }

  z <- mr_allmethods(inputMR)$Values
  names(z) <- c('method', 'estimate', 'se', 'lo95', 'up95', 'pvalue')
  z$trait <- trait
  return(z)
}

mr <- lapply(split(d, d$trait), funk)
mr <- do.call('rbind', mr)

fwrite(mr, snakemake@output[[1]], sep = '\t')
16 17 18 19 20 21 22 23 24 25 | run: d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['CHR', 'POS', 'pvalue', 'ID']) x= pd.read_csv(input[1], sep= '\t', header= 0, usecols= ['ID', 'EAF']) x= x.loc[((x.EAF>=0.01) & (x.EAF<= 0.99)), :] d= d.loc[d.pvalue< 5e-8, :] d= d.loc[d.ID.isin(x.ID.values), :] d.drop_duplicates('ID', inplace= True) if d.shape[0] == 0: open(output[0], 'a').close() else: |
# Rewrite the variant table with CHR:POS:<allele>:<allele> identifiers
# (alleles in sorted order; the longer allele of an indel coded 'I', the
# shorter 'D'), and list every identifier that occurs more than once in output[1].
run:
    d = pd.read_csv(input[0], sep='\t', header=None, names=['CHR', 'SNP', 'x1', 'POS', 'A1', 'A2'])
    # The four steps below depend on each other's results; keep the order.
    d['A1'] = np.where(d.A1.str.len() > d.A2.str.len(), 'I', d.A1)
    d['A2'] = np.where(d.A1.str.len() < d.A2.str.len(), 'I', d.A2)
    d['A1'] = np.where(d.A2 == 'I', 'D', d.A1)
    d['A2'] = np.where(d.A1 == 'I', 'D', d.A2)
    locus = d.CHR.apply(str) + ':' + d.POS.apply(str) + ':'
    d['SNP'] = np.where(d.A1 > d.A2, locus + d.A2 + ':' + d.A1, locus + d.A1 + ':' + d.A2)
    d.to_csv(output[0], sep='\t', header=False, index=False)
    repeated = d[d.duplicated(['SNP'], keep=False)].drop_duplicates('SNP', keep='first')
    repeated.to_csv(output[1], sep='\t', columns=['SNP'], index=False, header=False)
# LD clumping with PLINK (r2 0.001, 1000 kb, p1 5e-8, p2 1e-5), excluding the
# variants listed in input[1]. `|| true` keeps the job from failing when
# PLINK exits non-zero.
run:
    shell('~/software/plink --bim {input[2]} --bed {input[3]} --fam {input[4]} --clump {input[0]} --exclude {input[1]} --clump-r2 0.001 --clump-kb 1000 --clump-p1 5e-8 --clump-p2 1e-5 --out {params[1]} || true')
# Keep the summary statistics of the variants listed in the 'SNP' column of
# input[1] (one row per ID) and tag them with the phenotype wildcard.
# An empty input[1] yields an empty output file.
run:
    if os.stat(input[1]).st_size == 0:
        # The original referenced `.close` without calling it, leaving the
        # handle to be closed by garbage collection.
        open(output[0], "w").close()
    else:
        d = pd.read_csv(input[0], sep='\t', header=0, usecols=['ID', 'BETA', 'SE', 'pvalue'])
        x = pd.read_csv(input[1], delim_whitespace=True, header=0)
        d = d.loc[d.ID.isin(x.SNP.values), :]
        d = d.groupby('ID').head(1)
        d = d[['ID', 'BETA', 'SE', 'pvalue']]
        d['trait'] = wildcards.repr_pheno
        d.to_csv(output[0], sep='\t', header=False, index=False)
# Concatenate the per-trait instrument files into one headerless table.
# The original piped an `echo` header line into `cat {input}`, but cat ignores
# stdin when given file operands, so that header was never written. The dead
# echo is removed here so the command shows what the rule actually produces.
# NOTE(review): if a header line is wanted, use `cat - {input}` and update
# every reader of this file - confirm against the consumers.
shell:
    'cat {input} > {output[0]}'
# Delegate to the R script (it reads its inputs/outputs from the `snakemake` object).
script:
    'MR_reproductive_traits.R'
# Build the instrument table for the two female hormone clusters.
# Each signal contributes the original variant when OKG_proxy is
# 'Signal in 1KG' or 'No 1KG proxy'; otherwise the OKG proxy when both of its
# alleles are single characters, else the HM proxy. Variants are matched by
# rsid to input[1], required to share both alleles with it, and written with
# CHR:POS:<allele>:<allele> IDs (alleles sorted, beta sign flipped to match).
run:
    d = pd.read_csv(input[0], sep='\t', header=0)
    x = pd.read_csv(input[1], sep='\t', header=0, usecols=['#CHROM', 'POS', 'ID', 'REF', 'ALT'])
    x.columns = ['CHR', 'POS', 'ID', 'REF', 'ALT']

    def to_common(frame, rsid, weight, se, other, raising):
        # Same six columns for every source of instruments.
        return pd.DataFrame({'RSID': frame[rsid], 'beta': frame[weight], 'se': frame[se],
                             'ref': frame[other], 'eff': frame[raising], 'Cluster': frame['Cluster']})

    use_signal = d.OKG_proxy.isin(['Signal in 1KG', 'No 1KG proxy'])
    df = d.loc[~use_signal, :]
    # Both masks are built from `df`; the original mixed in a mask computed on
    # `d`, which only worked through pandas index alignment.
    okg_is_snv = (df.OKG_Other_allele.str.len() == 1) & (df.OKG_Trait_raising.str.len() == 1)
    okg = to_common(df.loc[okg_is_snv, :], 'OKG_proxy', 'OKG_Weight', 'OKG_SE_weight', 'OKG_Other_allele', 'OKG_Trait_raising')
    hm = to_common(df.loc[~okg_is_snv, :], 'HM_proxy', 'HM_Weight', 'HM_SE_weight', 'HM_Other_allele', 'HM_Trait_raising')
    d = to_common(d.loc[use_signal, :], 'Signal', 'Weight', 'SE_weight', 'Other_allele', 'Trait_raising')

    d = pd.concat([d, hm, okg])
    d = pd.merge(d, x, left_on=['RSID'], right_on='ID')
    d = d.loc[(d.ALT == d.ref) | (d.REF == d.ref), :]
    # .copy(): new columns are assigned below, so work on an independent frame.
    d = d.loc[(d.ALT == d.eff) | (d.REF == d.eff), :].copy()
    d['beta'] = np.where(d.ref > d.eff, -1 * d.beta, d.beta)
    d['ID'] = np.where(d.ref > d.eff,
                       d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.eff + ':' + d.ref,
                       d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.ref + ':' + d.eff)
    d['trait'] = np.where(d.Cluster == 'Female SHBG cluster', 'SHBG_fem_cluster', 'Testosterone_fem_cluster')
    d.to_csv(output[0], sep='\t', header=True, index=False, columns=['ID', 'beta', 'se', 'trait'])
# Delegate to the R script (it reads its inputs/outputs from the `snakemake` object).
script:
    'MR_reproductive_traits.R'
# Write CHR / POS / POS / ID for the SHBG, testosterone and CBAT (female)
# instruments, one row per variant ID, with chromosome X coded as 23.
run:
    x = pd.read_csv(input[0], sep='\t', header=None, names=['ID', 'beta', 'se', 'pvalue', 'trait'])
    x = x.loc[x.trait.isin(['SHBG_fem', 'Testosterone_fem', 'CBAT_fem']), :]
    x.drop_duplicates(subset='ID', inplace=True)
    x[['CHR', 'POS', 'REF', 'EFF']] = x.ID.str.split(':', expand=True)
    x['CHR'] = np.where(x.CHR == 'X', '23', x.CHR)
    x.to_csv(output[0], sep='\t', header=False, index=False, columns=['CHR', 'POS', 'POS', 'ID'])
# Subset the PLINK fileset in params[0] to the ranges listed in input[0]
# (--extract bed1) and write a new binary fileset.
shell:
    '~/software/plink2 --bfile {params[0]} --extract bed1 {input[0]} --memory 5000 --threads {threads} --make-bed --out {params[1]}'
# Rewrite the variant table with CHR:POS:<allele>:<allele> identifiers
# (alleles sorted, indels coded I/D, X coded as 23), list repeated
# identifiers in output[3], and move input[1]/input[2] to output[1]/output[2].
run:
    d = pd.read_csv(input[0], sep='\t', header=None, names=['CHR', 'SNP', 'x1', 'POS', 'A1', 'A2'])
    # The four steps below depend on each other's results; keep the order.
    d['A1'] = np.where(d.A1.str.len() > d.A2.str.len(), 'I', d.A1)
    d['A2'] = np.where(d.A1.str.len() < d.A2.str.len(), 'I', d.A2)
    d['A1'] = np.where(d.A2 == 'I', 'D', d.A1)
    d['A2'] = np.where(d.A1 == 'I', 'D', d.A2)
    d['CHR'] = d.CHR.apply(str)
    d['CHR'] = np.where(d.CHR == 'X', '23', d.CHR)
    locus = d.CHR.apply(str) + ':' + d.POS.apply(str) + ':'
    d['SNP'] = np.where(d.A1 > d.A2, locus + d.A2 + ':' + d.A1, locus + d.A1 + ':' + d.A2)
    d.to_csv(output[0], sep='\t', header=False, index=False)
    repeated = d[d.duplicated(['SNP'], keep=False)].drop_duplicates('SNP', keep='first')
    # NOTE(review): this writes the index and a header line, unlike the
    # otherwise identical rule that passes index=False, header=False - confirm intent.
    repeated.to_csv(output[3], sep='\t', columns=['SNP'])
    shell('mv {input[1]} {output[1]}')
    shell('mv {input[2]} {output[2]}')
# Square LD (r) matrix for the PLINK fileset in params[0].
shell:
    '~/software/plink --bfile {params[0]} --r square --out {params[1]}'
# Delegate to the R script (it reads its inputs/outputs from the `snakemake` object).
script:
    'MVMR.R'
library(data.table)
library(dplyr)

# Reference panel frequencies (must provide `ID` and `eaf`).
hrc <- fread(snakemake@input[[1]])

# Quality-control one cohort summary-statistics file and write it to
# snakemake@output[[1]].
#
# Steps: drop p-values of 0/1 and rows whose reported p-value differs by more
# than 10% (on the -log10 scale) from the one implied by BETA/SE; build sorted
# CHR:POS:allele:allele IDs; join with the reference panel; orient alleles so
# REF sorts before EFF (flipping BETA and EAF); filter on MAF, minor allele
# count and agreement with the reference MAF; keep the smallest p-value per ID.
#
# infile: path to a cohort file with columns CHR, POS, EFF, REF, BETA, SE,
#         pvalue, EAF and N.
funk <- function(infile) {
  d <- fread(infile)
  print(paste('Filtering file: ', infile))
  d <- arrange(d, CHR, POS, EFF, REF)
  d <- filter(d, pvalue < 1, pvalue > 0)

  # Two-sided p-value recomputed from the z-score.
  d$pval <- pnorm(-abs(d$BETA / d$SE)) * 2
  d <- filter(d, (abs(-log10(pvalue) - -log10(pval)) / -log10(pval)) * 100 <= 10)

  d$ID <- with(d, ifelse(REF > EFF,
                         paste(CHR, POS, EFF, REF, sep = ':'),
                         paste(CHR, POS, REF, EFF, sep = ':')))
  d$SNP <- with(d, ifelse(grepl('I', ID),
                          paste(ID, 'INDEL', sep = ':'),
                          paste(ID, 'SNP', sep = ':')))
  print(str(d))

  d <- inner_join(d, hrc, by = 'ID')
  d$EAF <- ifelse(is.na(d$EAF), d$eaf, d$EAF)

  # Orient every variant so that REF sorts before EFF.
  d$BETA <- ifelse(d$REF > d$EFF, -1 * d$BETA, d$BETA)
  d$EAF <- ifelse(d$REF > d$EFF, 1 - d$EAF, d$EAF)
  d[d$REF > d$EFF, c("REF", "EFF")] <- d[d$REF > d$EFF, c("EFF", "REF")]

  d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)
  d <- filter(d, pvalue > 0, pvalue < 1, MAF >= 0.005, SE > 0)
  d <- filter(d, (MAF * 2 * N) > 6)
  d$maf <- ifelse(d$eaf > 0.5, 1 - d$eaf, d$eaf)
  d <- filter(d, abs(maf - MAF) < 0.2)

  # For these cohort/phenotype files, variants whose effect-allele frequency
  # disagrees with the reference by more than 0.2 are treated as allele flips.
  # The flip mask is computed ONCE: previously the BETA line re-tested
  # abs(eaf - EAF) after EAF had already been corrected, so BETA kept its sign.
  if (grepl('GAraw/Viva|GAnrm/Viva|postTerm/HUNT', infile)) {
    flipped <- abs(d$eaf - d$EAF) > 0.2
    d$EAF <- ifelse(flipped, 1 - d$EAF, d$EAF)
    d$BETA <- ifelse(flipped, -1 * d$BETA, d$BETA)
  }

  d <- arrange(d, pvalue)
  d <- filter(d, !duplicated(ID))
  d <- select(d, -c(MAF, ID, eaf, pval))
  d$STRAND <- '+'

  #outfile= paste0(snakemake@params[[1]], gsub('_temp.txt', '', unlist(strsplit(infile, '/'))[9]), '.txt')
  fwrite(d, snakemake@output[[1]], sep = '\t')
}

#input_files= snakemake@input[grepl('sumstats', snakemake@input)]

lapply(snakemake@input[[2]], funk)
# Hand input[0] and output[0] to format_list() (defined elsewhere in the workflow).
run:
    format_list(input[0], output[0])
# Move each input to params[0] + <file name up to the first '-'> + '_temp.txt'.
run:
    for infile in input:
        stem = infile.split('-')[0].split('/')[-1]
        outfile = params[0] + stem + '_temp.txt'
        shell('mv {infile} {outfile}')
# Hand input[0] and output[0] to format_list() (defined elsewhere in the workflow).
run:
    format_list(input[0], output[0])
# Move each input to params[0] + <file name up to the first '-'> + '_temp.txt'.
run:
    for infile in input:
        stem = infile.split('-')[0].split('/')[-1]
        outfile = params[0] + stem + '_temp.txt'
        shell('mv {infile} {outfile}')
# Hand input[0] and output[0] to format_list() (defined elsewhere in the workflow).
run:
    format_list(input[0], output[0])
# Move each input to params[0] + <file name up to the first '-'> + '_temp.txt'.
run:
    for infile in input:
        stem = infile.split('-')[0].split('/')[-1]
        outfile = params[0] + stem + '_temp.txt'
        shell('mv {infile} {outfile}')
# Hand input[0] and output[0] to format_list() (defined elsewhere in the workflow).
run:
    format_list(input[0], output[0])
# Move each input to params[0] + <file name up to the first '-'> + '_temp.txt'.
run:
    for infile in input:
        stem = infile.split('-')[0].split('/')[-1]
        outfile = params[0] + stem + '_temp.txt'
        shell('mv {infile} {outfile}')
# Load two reference frequency tables (`d` from input[0], `KG` from input[1]),
# express the frequency for the alphabetically larger allele, then start
# recoding indel alleles as I/D.
run:
    d = pd.read_csv(input[0], header=0, sep='\t', usecols=['#CHROM', 'POS', 'REF', 'ALT', 'AF_EXCLUDING_1000G'])
    d.columns = ['CHR', 'POS', 'REF', 'ALT', 'eaf']
    d['CHR'] = np.where(d.CHR == 'X', '23', d.CHR)
    KG = pd.read_csv(input[1], header=0, sep='\t', compression='gzip', names=['ID', 'ALT', 'REF', 'eaf'])
    KG['ID'] = KG['ID'].str.replace(':ID', '').str.replace('X', '23')
    # Frequencies are flipped on the original allele strings, before the I/D recoding.
    d['eaf'] = np.where(d['REF'] > d['ALT'], 1 - d.eaf, d.eaf)
    KG['eaf'] = np.where(KG['REF'] > KG['ALT'], 1 - KG.eaf, KG.eaf)
    # The four steps below depend on each other's results; keep the order.
    d['REF'] = np.where(d.REF.str.len() > d.ALT.str.len(), 'I', d.REF)
    d['ALT'] = np.where(d.REF.str.len() < d.ALT.str.len(), 'I', d.ALT)
    d['REF'] = np.where(d.ALT == 'I', 'D', d.REF)
    d['ALT'] = np.where(d.REF == 'I', 'D', d.ALT)
    KG['REF'] = np.where(KG.REF.str.len() > KG.ALT.str.len(), 'I', KG.REF)
# Delegate to the R script (it reads its inputs/outputs from the `snakemake` object).
script:
    'filter_SNPs.R'
library(data.table)
library(dplyr)

# Filter one cohort file against the reference panel and log how many
# variants were read and kept.
#   input 1:  cohort summary statistics (CHR, POS, EFF, REF, ID, EAF, N, P, ...)
#   input 2:  reference panel with `ID` and `eaf`
#   output 1: filtered table
#   output 2: one tab-separated line: cohort, rows read, rows kept

d <- fread(snakemake@input[[1]])

x1 <- nrow(d)

d <- arrange(d, CHR, POS, EFF, REF)

hrc <- fread(snakemake@input[[2]], header = TRUE)

d <- inner_join(d, hrc, by = 'ID')

rm(hrc)

# Fall back to the reference frequency when the cohort did not report one.
d$EAF <- ifelse(is.na(d$EAF), d$eaf, d$EAF)

# NOTE(review): alleles are reordered here without touching effect sizes or
# EAF - confirm that this is handled before or after this script.
d[d$REF > d$EFF, c("REF", "EFF")] <- d[d$REF > d$EFF, c("EFF", "REF")]

d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)

d <- filter(d, MAF > 0.005)
d <- filter(d, (MAF * 2 * N) > 6)

d$maf <- ifelse(d$eaf > 0.5, 1 - d$eaf, d$eaf)
d$P <- as.numeric(d$P)
d <- filter(d, P < 1, P > 0)

# Drop variants whose MAF differs from the reference MAF by 0.2 or more.
d <- filter(d, abs(MAF - maf) < 0.2)

d <- select(d, -c(maf, MAF, eaf))

x2 <- nrow(d)

write.table(d, snakemake@output[[1]], col.names = TRUE, row.names = FALSE, sep = '\t', quote = FALSE)

# NOTE(review): the cohort name is the 2nd '_'-separated token of the 10th
# path component - depends on the directory layout; confirm.
cohort <- unlist(strsplit(unlist(strsplit(snakemake@input[[1]], '/'))[[10]], '_'))[2]

# sep = '' so the fields are separated by tabs only; cat()'s default
# separator put a space on both sides of every tab and before the newline.
cat(cohort, '\t', x1, '\t', x2, '\n', sep = '', file = snakemake@output[[2]])
"""format_VEP.py -- attach VEP annotations to common meta-analysis variants.

snakemake.input[0]:  meta-analysis table (MarkerName, Allele1, Allele2,
                     TOTALSAMPLESIZE, Freq1, P-value, ...)
snakemake.input[1]:  VEP tab-delimited output ('#' lines are skipped)
snakemake.output[0]: filtered variants with one VEP annotation each (left join on ID)
"""
import pandas as pd
import numpy as np
import re

# Keys extracted from VEP's 'Extra' column (KEY=value pairs separated by ';').
col_list = ['IMPACT', 'DISTANCE', 'SYMBOL', 'SYMBOL_SOURCE', 'BIOTYPE']

vep_names = ['Variation', 'Location', 'Allele', 'Gene', 'Feature', 'Feature_type',
             'Consequence', 'cDNA_position', 'CDS_position', 'Protein_position',
             'Amino_acids', 'Codons', 'Existing_variation', 'Extra']

# Split only on ';' that is followed by a word character.
extra_sep = re.compile(r';(?=\w)')


def parse_extra(extra):
    """Return the KEY=value pairs of one 'Extra' string as a dict."""
    return dict(item.split('=', 1) for item in extra_sep.split(extra) if '=' in item)


df_list = list()

for chunk in pd.read_csv(snakemake.input[1], sep='\t', header=None, names=vep_names,
                         comment='#', chunksize=100000):
    # Parse each row once and look keys up with .get(): a key that is absent
    # yields '' even when its name occurs inside another key (SYMBOL vs SYMBOL_SOURCE).
    parsed = chunk['Extra'].apply(parse_extra)
    for key in col_list:
        chunk[key] = parsed.apply(lambda fields: fields.get(key, ''))

    vep = chunk[['Variation', 'Location', 'Existing_variation', 'Gene', 'SYMBOL',
                 'Consequence', 'IMPACT', 'DISTANCE', 'SYMBOL_SOURCE', 'BIOTYPE']].copy()
    vep.columns = ['ID', 'Location', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT',
                   'DISTANCE', 'SYMBOL_SOURCE', 'BIOTYPE']

    # Rank used when one variant has several rows: protein_coding (0) first,
    # anything containing 'pseudo' (2) last, everything else (1) in between.
    vep['BIOTYPE1'] = np.where(vep.BIOTYPE == 'protein_coding', 0,
                               np.where(vep.BIOTYPE.str.contains('pseudo'), 2, 1))
    vep['DISTANCE'] = np.where(vep.DISTANCE == '', 0, vep.DISTANCE)

    # The variation name is '<chr>_<pos>_<a1>/<a2>'; order the allele pair
    # alphabetically so the ID matches the one built for the summary statistics.
    vep[['chr', 'pos', 'All']] = vep.ID.str.split('_', expand=True)
    vep[['EFF', 'REF']] = vep.All.str.split('/', expand=True)
    swap = vep.REF > vep.EFF
    vep.loc[swap, ['REF', 'EFF']] = vep.loc[swap, ['EFF', 'REF']].values

    vep[['CHR', 'POS']] = vep['Location'].str.split(':', expand=True)
    vep['CHR'] = np.where(vep['CHR'] == 'X', '23', vep['CHR'])
    vep['ID'] = (vep.CHR.astype(int).astype(str) + ':' + vep.POS.astype(int).astype(str)
                 + ':' + vep.REF + ':' + vep.EFF)

    vep = vep[['ID', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE',
               'BIOTYPE', 'BIOTYPE1']]
    vep = vep.sort_values(by=['BIOTYPE1'], ascending=True)
    vep = vep.drop_duplicates(subset=['ID'], keep='first')
    df_list.append(vep)

# A variant can appear in more than one chunk, so de-duplicate again after concatenation.
vep = pd.concat(df_list)
vep = vep.sort_values(by=['BIOTYPE1'], ascending=True)
vep = vep.drop_duplicates(subset=['ID'], keep='first')
vep = vep[['ID', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'BIOTYPE']]

d = pd.read_csv(snakemake.input[0], sep='\t', header=0)
d['Allele1'] = d['Allele1'].str.upper()
d['Allele2'] = d['Allele2'].str.upper()

# Sample-size filters: more than half of the maximum, and above a fixed floor.
# NOTE(review): 66106 is a hard-coded threshold; its origin is not visible here.
d = d.loc[(d.TOTALSAMPLESIZE > (d['TOTALSAMPLESIZE'].max()) / 2), :]
d = d.loc[d.TOTALSAMPLESIZE > 66106, :].copy()

d[['CHR', 'POS', 'REF', 'EFF']] = d['MarkerName'].str.split(':', expand=True)
d['CHR'] = d['CHR'].astype(str).astype(int)
d['POS'] = d['POS'].astype(str).astype(int)

d = d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'P-value']].copy()
d.columns = ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'pvalue']

# Express the frequency for the alphabetically ordered allele pair, then swap.
swap = d.REF > d.EFF
d['EAF'] = np.where(swap, 1 - d.EAF, d.EAF)
d['pvalue'] = d['pvalue'].astype(str).astype(float)
d.loc[swap, ['REF', 'EFF']] = d.loc[swap, ['EFF', 'REF']].values

d['ID'] = (d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str)
           + ':' + d.REF + ':' + d.EFF)

d = d.loc[((d.pvalue > 0) & (d.pvalue < 1)), :].copy()

# Keep common variants only (MAF >= 10%).
d['MAF'] = np.where(d.EAF > 0.5, 1 - d.EAF, d.EAF)
d = d.loc[d.MAF >= 0.1, :]

d = pd.merge(d, vep, on=['ID'], how='left')
d.to_csv(snakemake.output[0], header=True, index=False, sep='\t')
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | run: d= pd.read_csv(input[0], sep= '\t', header= 0) dcols= d.columns.values[1:] d.drop('INFO', 1, inplace= True) d.columns= dcols d= d.loc[d.INFO>= 0.4, :] d['MAF']= np.where(d.EAF_CONTR> 0.5, 1- d.EAF_CONTR, d.EAF_CONTR) d= d.loc[d.MAF * 2 * d.N >6, :] d.drop('MAF', 1, inplace= True) d['REF_ALLELE']= np.where(d.REF_ALLELE.str.len()> d.EFF_ALLELE.str.len(), 'I', d.REF_ALLELE) d['EFF_ALLELE']= np.where(d.REF_ALLELE.str.len()< d.EFF_ALLELE.str.len(), 'I', d.EFF_ALLELE) d['REF_ALLELE']= np.where(d.EFF_ALLELE== 'I', 'D', d.REF_ALLELE) d['EFF_ALLELE']= np.where(d.REF_ALLELE== 'I', 'D', d.EFF_ALLELE) d['CHR']= d.CHR.apply(str) d['CHR']= np.where(d.CHR== 'X', '23', d.CHR) d['ID']= np.where(d.REF_ALLELE> d.EFF_ALLELE, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.EFF_ALLELE + ':' + d.REF_ALLELE, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.REF_ALLELE + ':' + d.EFF_ALLELE) d= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_DOM', 'P_VAL_REC', 'INFO']] df= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_REC', 'INFO']] d= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_DOM','INFO']] d= d.loc[(d.P_VAL_DOM!= '.' ), :] df= df.loc[(df.P_VAL_REC!= '.'), :] d[['BETA_ADD', 'P_VAL_DOM']]= d[['BETA_ADD', 'P_VAL_DOM']].apply(pd.to_numeric, errors= 'coerce') df[['BETA_ADD', 'P_VAL_REC']]= df[['BETA_ADD', 'P_VAL_REC']].apply(pd.to_numeric, errors= 'coerce') d.dropna(axis= 0, inplace= True) df.dropna(axis= 0, inplace= True) d.columns= ['ID', 'CHR', 'POS', 'EFF', 'REF', 'N', 'EAF', 'BETA', 'P', 'INFO'] d.to_csv(output[0], sep= '\t', header= True, index= False) df.columns= ['ID', 'CHR', 'POS', 'EFF', 'REF', 'N', 'EAF', 'BETA', 'P', 'INFO'] df.to_csv(output[1], sep= '\t', header= True, index= False) |
56 57 58 59 60 | run: for i in range(len(input)): d= pd.read_csv(input[i], sep= '\t', header= 0) d= d.loc[d.INFO>= 0.4, :] d['MAF']= np.where(d.EAF> 0.5, 1- d.EAF, d.EAF) |
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 | run: for i in range(len(input)): print(input[i]) d= pd.read_csv(input[i], header= 0, delim_whitespace= True) d[['BETA_ADD', 'P_VAL_DOM', 'P_VAL_REC', 'INFO', 'EAF_CONTR']]= d[['BETA_ADD', 'P_VAL_DOM', 'P_VAL_REC', 'INFO', 'EAF_CONTR']].apply(pd.to_numeric, errors= 'coerce') d= d.loc[d.INFO>= 0.4, :] d['MAF']= np.where(d.EAF_CONTR> 0.5, 1- d.EAF_CONTR, d.EAF_CONTR) d= d.loc[d.MAF * 2 * d.N >6, :] d.drop('MAF', 1, inplace= True) d['REF_ALLELE']= np.where(len(d.REF_ALLELE)> len(d.EFF_ALLELE), 'I', d.REF_ALLELE) d['EFF_ALLELE']= np.where(len(d.REF_ALLELE)< len(d.EFF_ALLELE), 'I', d.EFF_ALLELE) d['REF_ALLELE']= np.where(d.EFF_ALLELE== 'I', 'D', d.REF_ALLELE) d['EFF_ALLELE']= np.where(d.REF_ALLELE== 'I', 'D', d.EFF_ALLELE) d['CHR']= d.CHR.apply(str) d['CHR']= np.where(d.CHR== '0X', 'X', d.CHR) d['CHR']= np.where(d.CHR== 'X', '23', d.CHR) d['ID']= np.where(d.REF_ALLELE> d.EFF_ALLELE, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.EFF_ALLELE + ':' + d.REF_ALLELE, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.REF_ALLELE + ':' + d.EFF_ALLELE) d= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_DOM', 'P_VAL_REC', 'INFO']] df= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_REC', 'INFO']] d= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_DOM','INFO']] d= d.loc[(d.P_VAL_DOM!= '.' ), :] df= df.loc[(df.P_VAL_REC!= '.'), :] d.dropna(axis= 0, inplace= True) df.dropna(axis= 0, inplace= True) d.columns= ['ID', 'CHR', 'POS', 'EFF', 'REF', 'N', 'EAF', 'BETA', 'P', 'INFO'] d.to_csv(output[i], sep= '\t', header= True, index= False) df.columns= ['ID', 'CHR', 'POS', 'EFF', 'REF', 'N', 'EAF', 'BETA', 'P', 'INFO'] out= output[i].replace('dom', 'rec') df.to_csv(out, sep= '\t', header= True, index= False) |
145 146 | script: 'filter_SNPs.R' |
154 155 | shell: 'cat {input} > {output[0]}' |
165 166 167 168 | shell: ''' /home/pol/software/generic-metal/metal {input[0]} >> {output[1]} ''' |
176 177 178 | run: d= pd.read_csv(input[0], sep= '\t', header= 0) d[['CHR', 'POS', 'REF', 'EFF']]= d['MarkerName'].str.split(':', expand= True) |
194 195 | shell: '/home/pol/software/ensembl-vep/vep -i {input[0]} --check_existing --symbol --biotype --cache -O {output[0]} --offline --force_overwrite' |
205 206 | script: 'format_VEP.py' |
214 215 216 217 218 219 220 221 222 223 | run: d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['MarkerName', 'Allele1']) d['CHR']= d.MarkerName.str.split(':').str[0] d['end']= d.MarkerName.str.split(':').str[1] d['CHR']= d.CHR.astype('str').astype('int') d['end']= d.end.astype('str').astype('int') d['start']= d.end - 1 d.sort_values(by= ['CHR', 'start'], inplace= True) d= d[['CHR', 'start', 'end', 'MarkerName']] d.to_csv(output[0], sep= '\t', header= False, index= False) |
232 233 | shell: 'bedtools closest -t all -a {input[0]} -b {input[1]} > {output[0]}' |
244 245 246 247 248 249 250 251 252 253 254 255 | run: d= pd.read_csv(input[0], sep= '\t', header=0) rs= pd.read_csv(input[1], sep= '\t', header=0) d= pd.merge(d, rs, on= 'ID', how= 'left') d['RSID']= np.where(pd.isnull(d.RSID), d.name, d.RSID) d['RSID']= np.where(d.RSID== '', d.name, d.RSID) d['RSID']= np.where(d.RSID== '-', d.name, d.RSID) d.drop('name', 1, inplace= True) ne= pd.read_csv(input[2], sep= '\t', header= None, names= ['CHR', 'X', 'POS', 'ID', 'c1', 'p1', 'p2', 'nearestGene', 'Ensembl_gene']) ne= ne[['ID', 'nearestGene']] d= pd.merge(d, ne, on= 'ID', how= 'left') d.to_csv(output[0], sep= '\t', header= True, index= False, compression= 'gzip') |
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 | run: for nfile in range(len(input)): d= pd.read_csv(input[nfile], sep= '\t', header= 0) d['Allele1']= d['Allele1'].str.upper() d['Allele2']= d['Allele2'].str.upper() d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :] d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True) d['CHR']= d['CHR'].astype(str).astype(int) d['POS']= d['POS'].astype(str).astype(int) d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']] d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue'] d['BETA']=np.where(d.REF > d.EFF, -1* d.BETA, d.BETA) d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF) d['CHR']= d['CHR'].astype(str).astype(int) d['POS']= d['POS'].astype(str).astype(int) d['pvalue']= d['pvalue'].astype(str).astype(float) d.loc[d.REF > d.EFF, ['REF', 'EFF']] = d.loc[d.REF > d.EFF, ['EFF', 'REF']].values d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :] d.to_csv(output[nfile], header=True, index= False, sep= '\t') |
48 49 | run: for fnumber in range(len(input)): |
72 73 74 75 76 77 | run: meta_files= [x for x in input if 'other_meta' in x] for nfile in range(len(meta_files)): meta= meta_files[nfile] out= output[nfile] shell('bedtools closest -t all -a {meta} -b {input[0]} > {out}') |
96 97 | run: rs= pd.read_csv(input[0], sep= '\t', header=0) |
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 | run: for nfile in range(len(input)): d= pd.read_csv(input[nfile], sep= '\t', header= 0) d['Allele1']= d['Allele1'].str.upper() d['Allele2']= d['Allele2'].str.upper() d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :] d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True) d['CHR']= d['CHR'].astype(str).astype(int) d['POS']= d['POS'].astype(str).astype(int) d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']] d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue'] d['BETA']=np.where(d.REF > d.EFF, -1* d.BETA, d.BETA) d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF) d['CHR']= d['CHR'].astype(str).astype(int) d['POS']= d['POS'].astype(str).astype(int) d['pvalue']= d['pvalue'].astype(str).astype(float) d.loc[d.REF > d.EFF, ['REF', 'EFF']] = d.loc[d.REF > d.EFF, ['EFF', 'REF']].values d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :] d.to_csv(output[nfile], header=True, index= False, sep= '\t') |
144 145 146 147 148 149 150 | run: rs= pd.read_csv(input[0], sep= '\t', header=0) d= pd.read_csv(input[1], sep= '\t', header=0) d= pd.merge(d, rs, on= 'ID', how= 'left') d['RSID']= d.name d.drop('name', 1, inplace= True) d.to_csv(output[0], sep= '\t', header= True, index= False, compression= 'gzip') |
# coloc_pQTL.R -- colocalisation (coloc.abf) between the trait in input[[1]] and
# the summary statistics in input[[3]], run once per `nearestGene` window.
#
# snakemake@input[[1]]:  trait summary statistics (CHR, POS, ID, BETA, SE,
#                        TOTALSAMPLESIZE, EAF)
# snakemake@input[[2]]:  window centres (CHR, POS, nearestGene); variants within
#                        +/- 1.5 Mb of a centre belong to that gene's window
# snakemake@input[[3]]:  second data set (chr, pos, Allele1, Allele2, Freq1,
#                        Effect, StdErr, TotalSampleSize)
# snakemake@output[[1]]: posterior probabilities per window
# snakemake@output[[2]]: per-SNP coloc results per window
library(data.table)
library(dplyr)
library(coloc)
library(parallel)

trait_file <- snakemake@input[[1]]

df <- fread(trait_file, select = c('CHR', 'POS', 'ID', 'BETA', 'SE', 'TOTALSAMPLESIZE', 'EAF'))
df <- filter(df, !duplicated(ID))
df$MAF <- ifelse(df$EAF > 0.5, 1 - df$EAF, df$EAF)

x <- fread(snakemake@input[[2]], select = c('CHR', 'POS', 'nearestGene'))
x <- x[, c('CHR', 'POS', 'nearestGene')]
names(x) <- c('CHR', 'pos2', 'nearestGene')

# Pair every variant with every window centre on its chromosome, then keep the
# pairs that lie inside the window.
df <- inner_join(df, x, by = 'CHR')
df <- filter(df, POS >= pos2 - 1.5 * 10**6, POS < pos2 + 1.5 * 10**6)

z <- fread(snakemake@input[[3]],
           select = c('chr', 'pos', 'Allele1', 'Allele2', 'Freq1', 'Effect', 'StdErr', 'TotalSampleSize'))
z$Allele1 <- toupper(z$Allele1)
z$Allele2 <- toupper(z$Allele2)

# Same ID convention as the trait file: chr:pos:<alleles in alphabetical order>.
z$ID <- with(z, ifelse(Allele1 > Allele2,
                       paste(chr, pos, Allele2, Allele1, sep = ':'),
                       paste(chr, pos, Allele1, Allele2, sep = ':')))
z$maf <- ifelse(z$Freq1 > 0.5, 1 - z$Freq1, z$Freq1)
z <- select(z, ID, maf, Effect, StdErr, TotalSampleSize)

df <- inner_join(df, z, by = 'ID')
rm(z)

pph_outfile <- snakemake@output[[1]]
results_outfile <- snakemake@output[[2]]

cat('nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tprotein\n', file = snakemake@output[[1]])
cat('snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tprotein\n', file = snakemake@output[[2]])

prior1 <- 1 * 10**-4
prior2 <- 1 * 10**-4
prior12 <- 5 * 10**-6

# Trait 1 is treated as case-control when the input path names a binary
# phenotype; `s` is the case fraction passed to coloc for that phenotype.
case_fraction <- if (grepl('allPTD', trait_file)) {
  0.067
} else if (grepl('postTerm', trait_file)) {
  0.122
} else {
  NULL
}

df <- data.frame(df)

# All-zero placeholder rows written for a window that could not be analysed.
empty_result <- function(protein) {
  list(
    pph = data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0,
                     PP.H3.abf = 0, PP.H4.abf = 0, protein = protein),
    res = data.frame(snp = 'none', V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0,
                     V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0,
                     internal.sum.lABF = 0, SNP.PP.H4 = 0, protein = protein)
  )
}

# Run coloc.abf on one window.
# temp_df: rows of `df` for a single nearestGene.
# Returns list(pph = <one-row summary>, res = <per-SNP table>), or NULL for an
# empty window. Nothing is written here: results are collected and written once
# by the parent process, so forked workers never append to the same file at the
# same time.
colocalization_eqtl <- function(temp_df) {
  if (nrow(temp_df) == 0) {
    return(NULL)
  }
  protein <- unique(temp_df$nearestGene)

  temp_df <- filter(temp_df, SE > 0, StdErr > 0)

  data1 <- list(beta = temp_df$BETA, varbeta = temp_df$SE**2, N = temp_df$TOTALSAMPLESIZE,
                type = if (is.null(case_fraction)) 'quant' else 'cc',
                snp = temp_df$ID, MAF = temp_df$MAF)
  if (!is.null(case_fraction)) {
    data1$s <- case_fraction
  }
  data2 <- list(beta = temp_df$Effect, varbeta = temp_df$StdErr**2, N = temp_df$TotalSampleSize,
                type = 'quant', snp = temp_df$ID, MAF = temp_df$maf)

  myres <- tryCatch(
    suppressWarnings(coloc.abf(data1, data2, p1 = prior1, p2 = prior2, p12 = prior12)),
    error = function(e) {
      # Report the failure instead of discarding it; the window still gets
      # placeholder rows in both outputs.
      message('coloc.abf failed for ', paste(protein, collapse = ','), ': ', conditionMessage(e))
      NULL
    }
  )
  if (is.null(myres)) {
    return(empty_result(protein))
  }

  PPH <- data.frame(t(myres[[1]]))
  PPH$protein <- protein
  res <- myres[[2]]
  res$protein <- protein
  list(pph = PPH, res = res)
}

results <- mclapply(split(df, df$nearestGene), colocalization_eqtl, mc.cores = 3)

# mclapply returns a 'try-error' object for a worker that failed outright.
usable <- vapply(results, function(r) is.list(r) && !inherits(r, 'try-error'), logical(1))
if (!all(usable)) {
  warning(sum(!usable), ' window(s) produced no result and were skipped', call. = FALSE)
}
results <- results[usable]

if (length(results) > 0) {
  # Columns are appended by position under the header written above.
  fwrite(rbindlist(lapply(results, `[[`, 'pph'), use.names = FALSE), pph_outfile,
         sep = '\t', row.names = FALSE, col.names = FALSE, quote = FALSE, append = TRUE)
  fwrite(rbindlist(lapply(results, `[[`, 'res'), use.names = FALSE), results_outfile,
         sep = '\t', row.names = FALSE, col.names = FALSE, quote = FALSE, append = TRUE)
}
12 13 | script: 'coloc_pQTL.R' |
/* Report layout: cap the page body at 1280px and pin the main container to it. */
body {
  max-width: 1280px !important;
}

body .main-container {
  width: 1280px !important;
  max-width: 1280px !important;
}
# Phenotype label: the 8th '/'-separated component of the second input path.
path_parts <- strsplit(snakemake@input[[2]], '/')[[1]]
pheno <- path_parts[8]
# Packages used by this report. The attach order is kept as-is because it
# decides which package's functions mask the others'.
library(ggplot2)
library(dplyr)
library(knitr)
library(tidyr)
library(cowplot)
library(kableExtra)
library(data.table)
library(moments)

# Silence warnings and widen figures for every chunk.
options(warn = -1)
opts_chunk$set(fig.width = 12)
R Markdown
ggplot2
dplyr
data.table
tidyr
cowplot
knitr
kableExtra
From
line
40
of
reports/all_files_QC.Rmd
# Per-cohort QC summaries for the plots that follow.
#   p1: median N and mean SE per cohort
#   p2: sqrt(max N) and median(1 / sqrt(2 * EAF * (1 - EAF))) / median(SE)
#   p3: skewness and kurtosis of BETA/SE among variants above the median p-value
#   d : up to 5000 variants sampled (with replacement) per chromosome and cohort
#   d2: the same sampling after joining the DECODE betas (beta_diff = BETA - BETA_dec)
p1_list <- list()
p2_list <- list()
p3_list <- list()
df_list <- list()
df2_list <- list()

fl <- length(snakemake@input)

# Exactly one input must be the DECODE file; fail early with a clear message.
dec_idx <- grep('DECODE', snakemake@input)
stopifnot(length(dec_idx) == 1)
dec <- fread(snakemake@input[[dec_idx]])
dec <- select(dec, c(SNP, BETA))
names(dec) <- c('SNP', 'BETA_dec')

# Inputs 2..fl are cohort files. seq_len(fl)[-1] is empty when there are fewer
# than two inputs, whereas 2:fl would count backwards (2, 1).
for (i in seq_len(fl)[-1]) {
  df <- fread(snakemake@input[[i]])
  df <- select(df, -c(STRAND))
  # Cohort name is the 9th '/'-separated component of the input path.
  cohort <- unlist(strsplit(snakemake@input[[i]], '/'))[9]
  df$cohort <- cohort
  df <- filter(df, !is.na(EAF))

  p1 <- summarize(df, n_m = median(N, na.rm = TRUE), se_m = mean(SE, na.rm = TRUE))
  p1$cohort <- cohort

  p2 <- summarize(df, N_max = sqrt(max(N)),
                  EAF_m = median(1 / sqrt(2 * EAF * (1 - EAF)), na.rm = TRUE) / median(SE))
  p2$cohort <- cohort

  d <- filter(df, pvalue > median(pvalue, na.rm = TRUE))
  p3 <- summarize(d, SK = skewness(BETA / SE), KU = kurtosis(BETA / SE))
  p3$cohort <- cohort

  p1_list[[cohort]] <- p1
  p2_list[[cohort]] <- p2
  p3_list[[cohort]] <- p3
  df_list[[cohort]] <- group_by(df, CHR) %>% sample_n(5000, replace = TRUE)

  df <- inner_join(df, dec, by = 'SNP')
  df$beta_diff <- df$BETA - df$BETA_dec
  df2_list[[cohort]] <- group_by(df, CHR) %>% sample_n(5000, replace = TRUE)
}

p1 <- do.call("rbind", p1_list)
p2 <- do.call("rbind", p2_list)
p3 <- do.call("rbind", p3_list)

d <- do.call("rbind", df_list)
d2 <- do.call('rbind', df2_list)
# Mean SE against median sample size, one labelled point per cohort.
ggplot(p1, aes(x = n_m, y = se_m)) +
  geom_point() +
  geom_text(aes(label = cohort), hjust = 0, vjust = 0) +
  theme_cowplot() +
  labs(x = 'Median(N)', y = 'Mean(SE)')
# Frequency/SE ratio against sqrt(max N), one labelled point per cohort.
ggplot(p2, aes(x = N_max, y = EAF_m)) +
  geom_point() +
  geom_text(aes(label = cohort), hjust = 0, vjust = 0) +
  theme_cowplot() +
  labs(x = 'SQRT(Max(N))',
       y = 'median(1/sqrt(2*EAF*(1-EAF)), na.rm=T) / median(SE)')
# Kurtosis against skewness of the z-scores, one labelled point per cohort.
ggplot(p3, aes(x = SK, y = KU)) +
  geom_point() +
  geom_text(aes(label = cohort), hjust = 0, vjust = 0) +
  theme_cowplot() +
  labs(x = 'Skewness (Z-score)', y = 'Kurtosis (Z-score)')
# Effect size against minor allele frequency, one panel per cohort.
d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)

maf_beta_plot <- ggplot(d, aes(x = MAF, y = BETA)) +
  geom_point() +
  facet_wrap(vars(cohort), scales = 'free_y', ncol = 3) +
  theme_cowplot() +
  labs(x = 'MAF', y = 'BETA')
maf_beta_plot
# Difference between cohort and DECODE effect sizes against MAF, per cohort.
d2$MAF <- ifelse(d2$EAF > 0.5, 1 - d2$EAF, d2$EAF)

beta_diff_plot <- ggplot(d2, aes(x = MAF, y = beta_diff)) +
  geom_point() +
  facet_wrap(vars(cohort), scales = 'free_y', ncol = 3) +
  theme_cowplot() +
  labs(x = 'MAF', y = 'BETA cohort - BETA DECODE')
beta_diff_plot
# Report setup: knitr/tint options and a display label for the phenotype wildcard.
library(tint)

knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tint'))
options(htmltools.dir.version = FALSE)
pdf.options(useDingbats = TRUE)
#knitr::opts_chunk$set(dpi=300)
knitr::opts_chunk$set(dev = 'png', warning = FALSE, message = FALSE, dpi = 600)

# Map the workflow's phenotype code to the label shown in the report; any code
# not listed falls through to the last, unnamed entry.
pheno <- snakemake@wildcards[['pheno']]
pheno <- switch(pheno,
  allPTD = 'Preterm Delivery',
  postTerm = 'Post Term',
  GAraw = 'Gestational duration',
  'Normalized Gestational Duration'
)
library("ggplot2")
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library("kableExtra")
library(ggrepel)
library("data.table")
options(warn=-1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

d <- fread(snakemake@input[[1]])

# p1 / p2 hold file paths; keep what follows the directory marker and strip the
# file suffix, leaving the bare trait code.
after_marker <- function(paths, marker) {
  vapply(strsplit(paths, marker), function(parts) parts[2], character(1))
}
d$p1 <- gsub('.txt.sumstats.gz', '', after_marker(d$p1, 'LDscore/'))
d$p2 <- gsub('.txt.sumstats.gz', '', after_marker(d$p2, 'LDSC/'))

# Display labels for the trait codes. Codes not listed are labelled
# 'Endometriosis'; missing codes stay NA.
trait_labels <- c(
  miscarriage = 'Miscarriage',
  GA_fetal = 'GA fetal effect',
  BW_maternal = 'Birth weight maternal effect',
  AFB = 'Age at first birth',
  AMenarche = 'Age at menarche',
  AMenopause = 'Age at menopause',
  NLB = 'Number of live births',
  Testosterone_fem = 'Testosterone (women)',
  SHBG_fem = 'SHBG (women)',
  SHBG_male = 'SHBG (men)',
  CBAT_fem = 'CBAT (women)',
  CBAT_male = 'CBAT (men)',
  Oestradiol_fem = 'Oestradiol (women)',
  POP = 'Pelvic Organ Prolapse',
  Testosterone_male = 'Testosterone (men)',
  leiomyoma_uterus = 'Leiomyoma uterus',
  BW_fetal = 'Birth weight fetal effect',
  BW_fetal_effect = 'Birth weight fetal effect (adjusted MG)',
  Preeclampsia = 'Pre-eclampsia',
  BW_maternal_effect = 'Birth weight maternal effect (adjusted FG)',
  PCOS = 'Polycistic ovary syndrome'
)

trait_code <- d$p2
trait_label <- unname(trait_labels[trait_code])
trait_label[is.na(trait_label) & !is.na(trait_code)] <- 'Endometriosis'
d$trait <- trait_label
R Markdown
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
kableExtra
From
line
30
of
reports/coloc.Rmd
# Print the trait labels as a markdown bullet list.
bullet_lines <- paste0('\n- ', as.character(d$trait))
cat(bullet_lines, sep = "\n")
82 | Testosterone in males was further included as a negative control, and only after a first round of genetic correlations. |
# Split the genetic-correlation table into birth-weight traits (plotted here)
# and all other traits (`df`, used by the next figure).
bw <- filter(d, grepl('Birth weight ', d$trait))
df <- filter(d, !grepl('Birth weight ', d$trait))

# Multiple-testing threshold: 0.05 divided by (number of non-birth-weight rows - 7).
df$significant <- ifelse(df$p < 0.05 / (nrow(df) - 7), '1', '0')

# Shorter axis labels for the four birth-weight effects.
bw$trait <- with(bw, ifelse(trait == 'Birth weight fetal effect', 'Fetal effect', ifelse(
  trait == 'Birth weight fetal effect (adjusted MG)', 'Fetal effect\n (adjusted MG)', ifelse(
    trait == 'Birth weight maternal effect (adjusted FG)', 'Maternal effect \n(adjusted FG)', ifelse(
      trait == 'Birth weight maternal effect', 'Maternal effect', '')))))
bw$trait <- gsub('Birth weight', '', bw$trait)

ggplot(bw, aes(trait, rg, colour = trait)) +
  geom_point() +
  geom_errorbar(aes(ymin = I(rg - 1.96 * se), ymax = (rg + 1.96 * se)),
                width = .1, position = position_dodge(.9)) +
  theme_cowplot() +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_colour_manual(guide = 'none', values = colorBlindBlack8[c(1, 2, 4, 8)]) +
  xlab('Birth weight') +
  ylab('Genetic correlation [95% CI]') +
  geom_hline(yintercept = 0) +
  scale_y_continuous(breaks = seq(-1, 1, 0.2)) +
  # ylim() followed by scale_y_continuous() was silently replaced by the latter,
  # so the -1..1 window never applied. coord_cartesian() sets the window without
  # dropping points or error bars that extend beyond it.
  coord_cartesian(ylim = c(-1, 1)) +
  geom_hline(yintercept = seq(-1, 1, 0.2), colour = 'grey', size = 0.3, linetype = 'dashed')
# Genetic correlations with the non-birth-weight traits; estimates passing the
# multiple-testing threshold (significant == '1') are highlighted.
ggplot(df, aes(trait, rg, colour = significant)) +
  geom_point() +
  geom_errorbar(aes(ymin = I(rg - 1.96 * se), ymax = (rg + 1.96 * se)),
                width = .2, position = position_dodge(.9)) +
  theme_cowplot() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  # Colours are named by level so '1' keeps the highlight colour even when no
  # '0' rows are present; guide = "none" replaces the deprecated guide = FALSE.
  scale_colour_manual(guide = 'none',
                      values = c('0' = '#737373', '1' = colorBlindBlack8[2])) +
  xlab('Reproductive traits') +
  ylab('Genetic correlation [95% CI]') +
  geom_hline(yintercept = 0) +
  scale_y_continuous(breaks = seq(-1, 1, 0.2)) +
  # ylim() followed by scale_y_continuous() was silently replaced by the latter;
  # coord_cartesian() applies the -1..1 window without dropping any data.
  coord_cartesian(ylim = c(-1, 1)) +
  geom_hline(yintercept = seq(-1, 1, 0.2), colour = 'grey', size = 0.3, linetype = 'dashed')
132 | We used a Bonferroni-corrected threshold for significance (0.05 / 13). We excluded testosterone in males from the correction, as this test was performed a posteriori as a negative control for testosterone in women. |
157 | While coloc naively assumes one causal variant, it does not require an LD matrix that represents the summary statistics used. This is almost impossible to obtain without an LD matrix from each of the studies used in the meta-analysis. |
# Read every input whose path contains 'pph' and stack them into one table.
inputs <- snakemake@input[grep('pph', snakemake@input)]

#d_temp$trait= gsub('.txt', '', unlist(strsplit(infile, '_'))[2])
d <- do.call('rbind', unname(lapply(inputs, function(infile) fread(infile))))

# Display labels for the trait codes. Codes not listed are labelled
# 'Endometriosis'; missing codes stay NA.
trait_labels <- c(
  miscarriage = 'Miscarriage',
  GA_fetal = 'GA fetal effect',
  BW_maternal = 'Birth weight maternal effect',
  AFB = 'Age at first birth',
  AMenarche = 'Age at menarche',
  AMenopause = 'Age at menopause',
  NLB = 'Number of live births',
  Testosterone_fem = 'Testosterone (women)',
  SHBG_fem = 'SHBG (women)',
  SHBG_male = 'SHBG (men)',
  CBAT_fem = 'CBAT (women)',
  CBAT_male = 'CBAT (men)',
  Oestradiol_fem = 'Oestradiol (women)',
  POP = 'Pelvic Organ Prolapse',
  Preeclampsia = 'Pre-eclampsia',
  Testosterone_male = 'Testosterone (men)',
  leiomyoma_uterus = 'Leiomyoma uterus',
  BW_fetal = 'Birth weight fetal effect',
  BW_fetal_effect = 'Birth weight fetal effect (adjusted MG)',
  BW_maternal_effect = 'Birth weight maternal effect (adjusted FG)',
  PCOS = 'Polycistic ovary syndrome'
)

trait_code <- d$trait
trait_label <- unname(trait_labels[trait_code])
trait_label[is.na(trait_label) & !is.na(trait_code)] <- 'Endometriosis'
d$trait <- trait_label

# Tidy the locus labels: underscores to spaces, 'chr' to 'Chr', '23' to 'X'.
# NOTE(review): the last substitution replaces every '23' in the label, not only
# the chromosome number -- a gene symbol containing '23' would be altered too.
# The per-SNP results are relabelled the same way before being matched to these
# loci, so any change has to be made in both places.
d$locus <- gsub('_', ' ', d$locus)
d$locus <- gsub('chr', 'Chr', d$locus)
d$locus <- gsub('23', 'X', d$locus)
203 | We identify the different loci as the chromosome where the locus is located and the nearest protein coding gene to the top associated genetic variant (e.g., Chr5 EBF1). |
# Classify each trait-locus pair by its posterior probability of a shared
# causal variant (PP.H4): < 0.5 none, 0.5 to < 0.75 suggestive, >= 0.75 strong.
d$coloc <- with(d, ifelse(PP.H4.abf < 0.5, 'No evidence',
                   ifelse(PP.H4.abf >= 0.50 & PP.H4.abf < 0.75, 'Suggestive evidence', 'Strong evidence')))

# Scale values are named by category so a category keeps its colour and
# transparency even when another category is absent from the data.
coloc_colours <- c('No evidence' = '#737373',
                   'Strong evidence' = colorBlindBlack8[2],
                   'Suggestive evidence' = colorBlindBlack8[4])
coloc_alpha <- c('No evidence' = 1,
                 'Strong evidence' = 0.55,
                 'Suggestive evidence' = 0.55)

ggplot(d, aes(trait, locus, size = PP.H4.abf, fill = coloc, color = coloc,
              shape = direction, alpha = coloc)) +
  geom_point() +
  theme_cowplot() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_size_binned('Posterior probability of colocalization', guide = 'none') +
  scale_alpha_manual('Colocalization', values = coloc_alpha) +
  scale_shape_manual('Effect direction', values = c(25, 21, 24)) +
  scale_fill_manual('Colocalization', values = coloc_colours) +
  scale_colour_manual('Colocalization', values = coloc_colours) +
  xlab('') +
  ylab('')
# Long table of the five coloc posterior probabilities (H0..H4): one row per
# locus, trait and hypothesis, with the probability in `coloc`.
x <- bind_rows(lapply(0:4, function(h) {
  part <- select(d, locus, coloc = all_of(paste0('PP.H', h, '.abf')), trait)
  part$PP <- paste0('H', h)
  part
}))

x <- x[order(x$PP, decreasing = TRUE), ]

# Bars with a posterior probability of at least 0.75 are drawn more opaque.
x$evidence <- ifelse(x$coloc >= 0.75, '1', '0')

ggplot(filter(x, PP == 'H3' | PP == 'H4'),
       aes(fill = factor(PP), y = coloc, x = locus, alpha = evidence)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_manual('Posterior probability',
                    values = c(colorBlindBlack8[2], colorBlindBlack8[4])) +
  # Alpha values are named by level so '1' stays opaque even when no '0' rows
  # exist; guide = "none" replaces the deprecated guide = FALSE.
  scale_alpha_manual('Posterior probability', values = c('0' = 0.55, '1' = 0.8),
                     guide = 'none') +
  facet_wrap(vars(trait), ncol = 3) +
  theme_cowplot() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
        strip.background = element_blank(),
        legend.position = 'bottom') +
  xlab('') +
  ylab('')
# Regional plots for every trait-locus pair with PP.H4 >= 0.75: -log10(p) by
# position for both phenotypes, with the SNP of highest SNP.PP.H4 labelled.
z <- filter(d, PP.H4.abf >= 0.75)

res_inputs <- snakemake@input[grep('results_', snakemake@input)]

df_list <- list()

for (infile in res_inputs) {
  x <- fread(infile, select = c('snp', 'z.df1', 'z.df2', 'SNP.PP.H4', 'locus', 'trait'))
  # One row per SNP and data set (z.df1 / z.df2), with a two-sided p-value
  # derived from the z-score.
  x <- gather(x, pheno, zscore, c(z.df1, z.df2))
  x$pvalue <- 2 * pnorm(-abs(x$zscore))
  x <- separate(x, snp, into = c('CHR', 'POS', 'REF', 'EFF'), sep = ':', remove = FALSE)
  x$POS <- as.numeric(x$POS)
  # z.df1 rows take the report's phenotype label; z.df2 rows the trait code.
  x$pheno <- ifelse(x$pheno == 'z.df1', pheno, x$trait)
  df_list[[infile]] <- x
}

df <- bind_rows(df_list)

# Display labels for the trait codes. Codes not listed are labelled
# 'Endometriosis'; missing codes stay NA.
trait_labels <- c(
  miscarriage = 'Miscarriage',
  GA_fetal = 'GA fetal effect',
  BW_maternal = 'Birth weight maternal effect',
  AFB = 'Age at first birth',
  AMenarche = 'Age at menarche',
  AMenopause = 'Age at menopause',
  NLB = 'Number of live births',
  Testosterone_fem = 'Testosterone (women)',
  SHBG_fem = 'SHBG (women)',
  SHBG_male = 'SHBG (men)',
  CBAT_fem = 'CBAT (women)',
  CBAT_male = 'CBAT (men)',
  Oestradiol_fem = 'Oestradiol (women)',
  POP = 'Pelvic Organ Prolapse',
  Preeclampsia = 'Pre-eclampsia',
  Testosterone_male = 'Testosterone (men)',
  BW_fetal = 'Birth weight fetal effect',
  BW_fetal_effect = 'Birth weight fetal effect (adjusted MG)',
  leiomyoma_uterus = 'Leiomyoma uterus',
  BW_maternal_effect = 'Birth weight maternal effect (adjusted FG)',
  PCOS = 'Polycistic ovary syndrome'
)
trait_code <- df$trait
trait_label <- unname(trait_labels[trait_code])
trait_label[is.na(trait_label) & !is.na(trait_code)] <- 'Endometriosis'
df$trait <- trait_label

# Locus labels must be built exactly as for `d`, because the two tables are
# matched on trait:locus below.
df$locus <- gsub('_', ' ', df$locus)
df$locus <- gsub('chr', 'Chr', df$locus)
df$locus <- gsub('23', 'X', df$locus)

z$id <- paste(z$trait, z$locus, sep = ':')
df$id <- paste(df$trait, df$locus, sep = ':')

df <- filter(df, id %in% z$id)
# Rows of the second data set now carry the display label instead of the code.
df$pheno <- ifelse(df$pheno == pheno, pheno, df$trait)

for (i in unique(df$id)) {
  PP <- filter(z, id == i)$PP.H4.abf
  temp_df <- filter(df, id == i)
  temp_df$POS <- temp_df$POS / 10**6
  high_df <- filter(temp_df, SNP.PP.H4 == max(SNP.PP.H4))

  locus_plot <- ggplot() +
    geom_point(data = temp_df, aes(POS, -log10(pvalue), colour = pheno), size = 1, alpha = 0.5) +
    geom_point(data = high_df, aes(POS, -log10(pvalue)), colour = colorBlindBlack8[1], size = 2) +
    facet_wrap(vars(pheno), nrow = 2, scales = "free_y") +
    theme_cowplot(font_size = 14) +
    theme(strip.background = element_blank()) +
    # guide = "none" replaces the deprecated guide = FALSE.
    scale_colour_manual(guide = 'none', values = colorBlindBlack8[c(4, 2)]) +
    ylab('-log10(pvalue)') +
    xlab('Position (Mbp)') +
    geom_text_repel(data = high_df, aes(x = POS, y = -log10(pvalue), label = snp),
                    hjust = 0.5, size = 3, vjust = 1) +
    ggtitle(paste('Locus: ', unique(temp_df$locus),
                  '. Posterior probability for shared causal variant: ', round(PP, 3)))
  print(locus_plot)

  cat('  \n')
}
# Cohort and phenotype are taken from fixed positions of the first input path
# (9th and 8th '/'-separated components, respectively).
input_path_parts <- unlist(strsplit(snakemake@input[[1]], '/'))
cohort <- input_path_parts[9]
pheno <- input_path_parts[8]
29 30 31 32 33 34 35 36 37 | library("ggplot2") library("dplyr") library("knitr") library("tidyr") library(cowplot) library("kableExtra") library("data.table") options(warn=-1) #opts_chunk$set(dpi=300, out.width="300px") |
R Markdown
ggplot2
dplyr
data.table
tidyr
cowplot
knitr
kableExtra
From
line
29
of
reports/file_level_qc.Rmd
# Read the first input file; the first row holds the column names.
# `header = TRUE` is spelled out instead of relying on partial matching (`h`)
# and on the reassignable shorthand `T`.
d <- fread(snakemake@input[[1]], header = TRUE)
53 | kable(summary(select(d, BETA, SE, pvalue, EAF, N))) |
#dec= fread(snakemake@input[[3]],h=T, select= c('CHR', 'POS', 'BETA', 'SE', 'EFF', 'REF', 'EAF', 'pvalue'))
#names(dec)= c('CHR', 'POS', 'BETA_dec', 'SE_dec', 'A1_dec', 'A2_dec', 'EAF_dec', 'pvalue_dec')
#dec$BETA_dec= ifelse(dec$A1_dec> dec$A2_dec, dec$BETA_dec, -1* dec$BETA_dec)
#dec$EAF_dec= ifelse(dec$A1_dec> dec$A2_dec, dec$EAF_dec, 1- dec$EAF_dec)

# Compare sample allele frequencies with the reference panel in the second
# input (columns used below: CHR, POS, ea, oa, eaf).
x <- fread(snakemake@input[[2]], header = TRUE)

df <- d

df$CHR <- as.numeric(df$CHR)
x$CHR <- as.numeric(x$CHR)

# Express the sample frequency for the alphabetically larger allele.
df$EAF <- ifelse(df$EFF > df$REF, df$EAF, 1 - df$EAF)

# dplyr joins take `by`, not `on`: the former `on=` was swallowed by `...`
# (natural join on all shared columns) and is an error in dplyr >= 1.1.
df <- inner_join(df, x, by = c('CHR', 'POS'))

# Keep variants whose allele pair matches the reference in either orientation.
df <- filter(df, (EFF == ea & REF == oa) | (REF == ea & EFF == oa))

if (nrow(df) > 0) {
  ggplot(df, aes(eaf, EAF)) +
    geom_point(alpha = 1/10) +
    theme_cowplot(12) +
    xlab('EAF HRC') +
    ylab('EAF Sample')
} else {
  print('No match')
}

# Free the temporary tables.
rm(df)
rm(x)
101 102 103 104 105 106 107 108 109 110 111 112 113 114 | d$BETA= ifelse(d$REF> d$EFF, d$BETA, -1* d$BETA) #dec= inner_join(d, dec, on= c('CHR', 'POS')) #dec= filter(dec, EFF== A1_dec, REF== A2_dec, pvalue_dec< 0.001) #dec= group_by(dec, CHR) %>% slice_sample(n= 1000, replace= T) #ggplot(dec, aes(BETA_dec/SE_dec, BETA/SE)) + #geom_point(alpha= 1/10) + #theme_cowplot(12) + #xlab('DECODE z-score') + #ylab('Observed z-score') #rm(dec) |
# P-value recomputed from BETA and SE (1-df chi-square on the squared z-score),
# plotted against the reported p-value.
# `lower.tail = FALSE` is spelled out instead of the partial match `lower=F`.
d$exp_pvalue <- pchisq((d$BETA / d$SE)^2, df = 1, lower.tail = FALSE)

# Draw 10000 variants per chromosome (with replacement) to keep the plot light.
ggplot(group_by(d, CHR) %>% sample_n(10000, replace = TRUE), aes(-log10(exp_pvalue), -log10(pvalue))) +
  geom_point(alpha = 1/10) +
  theme_cowplot() +
  xlab('Expected pvalue') +
  ylab('Observed pvalue')
# QQ plot: observed -log10(p) against the expected quantiles under the null.
# Expected quantiles use the rank among ALL variants; only p < 0.05 is drawn.
# seq_along() avoids the c(1, 0) sequence that 1:length() yields on empty input.
df <- arrange(d, pvalue) %>% mutate(exp1 = -log10(seq_along(pvalue) / length(pvalue)))

ggplot(filter(df, pvalue < 0.05), aes(exp1, -log10(pvalue))) +
  geom_point(size = 0.4) +
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
  # Previously theme_cowplot(12, font_size = 12): with font_size named, the
  # positional 12 was bound to the next formal, font_family.
  theme_cowplot(font_size = 12) +
  xlab('Expected (-log10(p-value))') +
  ylab('Observed (-log10(p-value))')
10 11 12 13 14 15 16 17 18 19 20 | library(tint) knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tint')) options(htmltools.dir.version = FALSE) pheno= snakemake@wildcards[['pheno']] pdf.options(useDingbats = TRUE) #knitr::opts_chunk$set(dpi=300) knitr::opts_chunk$set(dev = 'png', warning= FALSE, message= FALSE, dpi= 600) pheno= ifelse(pheno=='allPTD', 'Preterm Delivery', ifelse(pheno== 'postTerm', 'Post Term', ifelse(pheno=='GAraw', 'Gestational duration', 'Normalized Gestational Duration'))) |
29 30 31 32 33 34 35 36 37 38 39 | library("ggplot2") library("dplyr") library("knitr") library("tidyr") library(cowplot) library("kableExtra") library(ggrepel) library("data.table") options(warn=-1) colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") |
R Markdown
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
kableExtra
From
line
29
of
reports/forest_plots.Rmd
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | d= fread(snakemake@input[[1]]) z= fread(snakemake@input[[3]]) df= fread(snakemake@input[[2]], select= (c('MarkerName', 'Effect', 'StdErr', 'HetISq', 'HetPVal', 'TOTALSAMPLESIZE', 'P-value', 'Allele1', 'Allele2'))) names(df)= c('SNP', 'BETA', 'SE', 'HetISq', 'HetPval', 'N', 'pvalue', 'A1', 'A2') df= filter(df, SNP %in% d$SNP) df= separate(df, SNP, into= c('CHR', 'POS', 'Ax1', 'Ax2', 'ID'), sep= ':', remove= F) df$BETA= ifelse(df$A2 > df$A1, -1 * df$BETA, df$BETA) df$CHR= ifelse(df$CHR== 'X','23', df$CHR) df$CHR= as.integer(df$CHR) df$POS= as.integer(df$POS) df= select(df, -c(A1, A2, ID, Ax1, Ax2)) df$cohort= 'Meta-analysis' d= bind_rows(d, df) z$CHR= ifelse(z$CHR== 'X','23', z$CHR) z$CHR= as.integer(z$CHR) d= inner_join(d, z, by= 'CHR') %>% filter(POS> pos1, POS< pos2) d$locus= paste0('Chr ', d$CHR,': ', d$nearestGene) d$cohort= paste0(d$cohort, ' (n= ', d$N, ')') |
# One forest plot per locus: cohort-level estimates plus the meta-analysis row.
# The meta-analysis row is identified as the row with a non-missing HetISq.
for (i in unique(d$locus)) {
  temp_df <- d[d$locus == i, ]
  meta_row <- temp_df[!is.na(temp_df$HetISq), ]

  cat('\n')
  cat("\n# Forest plot for locus ", i, "\n")
  cat("\n")
  cat('\n')
  cat('Lead variant: \n', meta_row$SNP)
  cat('\n')
  cat(paste0('\n Meta-analysis: Beta= ', round(meta_row$BETA, 3),
             ' (95% CI= ', round(meta_row$BETA - 1.96 * meta_row$SE, 3), ', ',
             round(meta_row$BETA + 1.96 * meta_row$SE, 3), '); pvalue= ', meta_row$pvalue))
  cat('\n')
  cat('\n')

  # Largest sample first; the factor levels below follow this row order.
  temp_df <- temp_df[order(temp_df$N, decreasing = TRUE), ]
  meta_row <- temp_df[!is.na(temp_df$HetISq), ]

  (ggplot(temp_df, aes(x = factor(cohort, levels = unique(cohort)),  # unique(): duplicated labels would make factor() fail
                       y = BETA, ymin = BETA - 1.96 * SE, ymax = BETA + 1.96 * SE,
                       colour = !is.na(HetISq), shape = !is.na(HetISq))) +
    geom_pointrange(size = 1, alpha = 0.7) +
    # guide = 'none' replaces the deprecated guide = F.
    scale_shape_manual(values = c(15, 18), guide = 'none') +
    geom_hline(yintercept = 0, linetype = 2) +
    scale_y_continuous(sec.axis = dup_axis()) +
    coord_flip() +
    scale_colour_manual(values = c(colorBlindBlack8[3], colorBlindBlack8[4]), guide = 'none') +
    theme_cowplot() +
    xlab('') +
    ylab('Beta [95% CI]') +
    geom_vline(xintercept = 0, linetype = "dotted", colour = 'grey')) %>% print()

  cat('\n')
  cat('Test for heterogeneity: I^2^= ', meta_row$HetISq, '%; Het pvalue= ', meta_row$HetPval, '\n')

  cat('\\pagebreak')
}
10 11 12 13 14 15 16 17 18 19 20 | library(tint) knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tint')) options(htmltools.dir.version = FALSE) pheno= snakemake@wildcards[['pheno']] pdf.options(useDingbats = TRUE) #knitr::opts_chunk$set(dpi=300) knitr::opts_chunk$set(dev = 'png', warning= FALSE, message= FALSE, dpi= 600) pheno= ifelse(pheno=='allPTD', 'Preterm Delivery', ifelse(pheno== 'postTerm', 'Post Term', ifelse(pheno=='GAraw', 'Gestational duration', 'Normalized Gestational Duration'))) |
32 33 34 35 36 37 38 39 40 | library("ggplot2") library("dplyr") library("knitr") library("tidyr") library(cowplot) library("kableExtra") library(ggrepel) library("data.table") options(warn=-1) |
R Markdown
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
kableExtra
From
line
32
of
reports/meta_qc.Rmd
45 46 47 48 49 50 51 52 53 54 55 | d= fread(snakemake@input[[1]], h= T) d$MAF= ifelse(d$EAF>0.5, 1 - d$EAF, d$EAF) colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") ggplot(d, aes(MAF)) + geom_density(fill= colorBlindBlack8[4]) + theme_cowplot(font_size= 12) + xlab('MAF') #d= select(d, -MAF) |
61 62 63 64 | ggplot(d, aes(TOTALSAMPLESIZE)) + geom_density(fill= colorBlindBlack8[3]) + theme_cowplot(font_size= 12) + xlab('Sample size') |
72 73 74 75 | ggplot(d, aes(BETA)) + geom_density(fill= colorBlindBlack8[2]) + theme_cowplot(font_size= 12) + xlab('Beta') |
91 92 | **Effective sample size** for binary phenotypes was calculated as: $$\frac{2}{(\frac{1}{Ncases} + \frac{1}{Ncontrols})}$$ |
110 111 112 | ge= data.frame(CHR= c(5, 3, 1, 23, 1), pos_ge= c(157895049, 127881613, 22470407, 115164770, 22470407)) #kable(summary(select(d, BETA, SE, pvalue, EAF, TOTALSAMPLESIZE)), digits = c(3, 3, 5, 4, 0), col.names = c('Beta', 'Standard error', 'P-value', 'Effect allele frequency', 'Sample size'), caption= 'Summary statistics after QC.') |
124 | The same number of loci is obtained when using a larger radius (1.5Mb). |
128 | We note that we used a naive approach to identify independent loci, so these results should be interpreted cautiously. We mapped each top genetic variant to the nearest protein-coding gene (distance to the gene body, TSS or TES). |
134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 | df= arrange(d, pvalue) #df= group_by(df, CHR, POS) %>% filter(row_number() == 1) %>% ungroup() df= df[!duplicated(df[, c('CHR', 'POS')]), ] dg= df %>% arrange(CHR, POS) %>% filter(pvalue< 5*10**-8) %>% group_by(CHR) %>% mutate(d=POS-lag(POS, default=-Inf), clumpid=cumsum(d>250000)) %>% group_by(CHR, clumpid) %>% filter(rank(pvalue, ties.method = "random")==1) %>% mutate(GENE= ifelse(SYMBOL=='', RSID, SYMBOL)) %>% ungroup() dg= filter(dg, pvalue< 5*10**-8) dg= group_by(dg, CHR, POS) %>% filter(row_number()== 1) #dg$GENE= ifelse(grepl('rs|-', dg$GENE), dg$nearestGene, ifelse(dg$GENE=='', dg$nearestGene, dg$GENE)) dg$GENE= dg$nearestGene don <- df %>% group_by(CHR) %>% summarise(chr_len= max(POS)) %>% mutate(tot= cumsum(as.numeric(chr_len))-chr_len) %>% # Calculate cumulative position of each chromosome select(-chr_len) %>% left_join(df, ., by= 'CHR') %>% arrange(CHR, POS) %>% # Add a cumulative position of each SNP mutate( BPcum=POS+tot) %>% ungroup() axisdf = don %>% group_by(CHR) %>% summarize(center=( max(BPcum) + min(BPcum) ) / 2 ) names(axisdf)= c('CHR', 'center') HC= -log10(5*10**-8) dg= dg %>% ungroup() %>% select(ID, GENE, CHR, POS, MAF, BETA) don= left_join(don, select(dg, ID, GENE), by= 'ID') names(dg)= c('ID', 'GENE', 'CHR', 'POS_new', 'MAF', 'BETA') lims= 250000 don$disc= ifelse(don$pvalue> 5*10**-8, 0, NA) for (i in rownames(dg)) { don= mutate(don, disc= ifelse(CHR== as.integer(dg[i, 'CHR']) & POS>= as.integer(dg[i, 'POS_new']) - lims & POS<= as.integer(dg[i, 'POS_new']) + lims, 2, disc)) } for (i in rownames(ge)) { don= mutate(don, disc= ifelse(CHR== as.integer(ge[i, 'CHR']) & POS>= as.integer(ge[i, 'pos_ge']) - lims & POS<= as.integer(ge[i, 'pos_ge']) + lims, 1, disc)) } don= 
don[order(don$disc, decreasing= F, na.last= T), ] don$disc= factor(don$disc, levels=c(0, 1, 2), labels=c('Not significant', 'Previous discovery', 'New discovery')) cols <- c('Not significant'= 'grey', 'Previous discovery'= colorBlindBlack8[4], 'New discovery'= colorBlindBlack8[2]) don$GENE= ifelse(!is.na(don$GENE), don$nearestGene, don$GENE) ggplot(don) + geom_point(data= don, aes(x=BPcum, y= -log10(pvalue), colour= disc), size=0.3) + # Show all points theme_cowplot(font_size= 12) + #theme_minimal_hgrid(12, rel_small = -1) + #scale_alpha_manual(values= rep(c(1/10, 1/2), 23)) + scale_colour_manual(values= cols) + scale_x_continuous(label = axisdf$CHR, breaks= axisdf$center, expand=c(0,0) ) + # custom X axis scale_y_continuous(expand= c(0,0)) + xlab('Chromosome') + ylab('-log10(pvalue)') + labs(colour= '') + geom_hline(yintercept= 0, size= 0.5, colour= 'black') + geom_hline(yintercept= HC, size= 0.5, linetype= 2, colour= '#878787') + geom_text_repel(data= don, aes(x= BPcum, y= -log10(pvalue), label= GENE), size= 3, hjust= 1, force= 1, vjust= 1, colour= 'black') + theme(legend.position= 'bottom') + guides(colour = guide_legend(override.aes = list(size=3))) |
207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 | indep= fread(snakemake@input[[6]]) indep$nd2P= sapply(strsplit(indep$SP2, ','), length) indep= inner_join(indep, dg, by= 'CHR') indep= filter(indep, BP>= POS_new - 1.5*10**6, BP<= POS_new + 1.5*10**6) indep_df= group_by(indep, GENE) %>% summarize(total= sum(TOTAL), nsig= sum(NSIG), GWS= n(), sug_ev= sum(nd2P), mP= min(P)) indep_df= indep_df[order(indep_df$mP, decreasing=T), ] indep_df$GENE= factor(indep_df$GENE, levels= indep_df$GENE) indep_df= filter(indep_df, !grepl('HLA', GENE)) p1= ggplot(data=indep_df, aes(x= GENE, y= GWS)) + geom_col(fill=colorBlindBlack8[2], alpha= 0.6) + theme_cowplot() + ylab('# Independent GW significant') + xlab('Locus') + coord_flip() p2= ggplot(data=indep_df, aes(x= GENE, y= total)) + geom_col(fill=colorBlindBlack8[4], alpha= 0.6) + theme_cowplot() + ylab('Total # of \n genetic variants in locus') + xlab('Locus') + theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) + coord_flip() p3= ggplot(data=indep_df, aes(x= GENE, y= sug_ev / total)) + geom_col(fill=colorBlindBlack8[8], alpha= 0.6) + theme_cowplot() + ylab('Proportion of variants with P<1e-5') + xlab('Locus') + ylim(0, 1) + theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) + coord_flip() plot_grid(p1, p2, p3, align = "h", nrow= 1) |
# QQ plot stratified by minor allele frequency tertile.
d <- mutate(d, maf_tertiles = ntile(MAF, 3))
# Upper MAF bound of the first and second tertile (rounded for the legend).
m1 <- round(max(d[d$maf_tertiles == 1, 'MAF']), 3)
m2 <- round(max(d[d$maf_tertiles == 2, 'MAF']), 3)

# The middle label previously read '<m1> < MAF > <m2>', which has the second
# inequality the wrong way round for the middle tertile.
d$maf_tertiles <- factor(d$maf_tertiles, levels = c("1", "2", "3"),
                         labels = c(paste('MAF<', m1), paste(m1, '< MAF <', m2), paste('MAF>', m2)))

# Expected quantiles are computed within each tertile; seq_along() is safe for
# empty groups, unlike 1:length().
df <- arrange(d, pvalue) %>%
  group_by(maf_tertiles) %>%
  mutate(exp1 = -log10(seq_along(pvalue) / length(pvalue)))

ggplot(filter(df, pvalue < 0.05), aes(exp1, -log10(pvalue), color = maf_tertiles)) +
  geom_point(size = 0.4) +
  scale_color_manual(values = colorBlindBlack8[2:4]) +
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
  labs(colour = "") +
  theme_cowplot(font_size = 12) +
  xlab('Expected (-log10(p-value))') +
  ylab('Observed (-log10(p-value))') +
  theme(legend.position = 'bottom') +
  guides(colour = guide_legend(override.aes = list(size = 3)))
292 293 294 295 296 297 298 | don= filter(don, pvalue< 1*10**-4) d1= filter(don, pvalue<5*10**-8) d1= d1[order(d1$pvalue, decreasing= F), ] d1$pvalue= format(d1$pvalue, digits= 3) don$pvalue= format(don$pvalue, digits= 3) kable(filter(d1, GENE!= '') %>% select(ID, RSID, GENE, TOTALSAMPLESIZE, EAF, BETA, SE, pvalue, Consequence), col.names= c('CHR:POS:REF:EFF', 'RSID', 'Gene', 'N', 'EAF', 'Beta', 'SE', 'P-value', 'Consequence'), digits= 3) #%>% kable_styling(latex_options = c("striped", "scale_down")) |
# Absolute effect size against minor allele frequency for the top variants in `dg`.
if (nrow(dg) > 1) {
  # The former `alpha = 0.7` passed to ggplot() and `guide = F` passed to
  # geom_text_repel() matched no parameter and had no effect; both are dropped.
  (ggplot(dg, aes(MAF, abs(BETA), size = abs(BETA))) +
    geom_point(colour = colorBlindBlack8[4]) +
    theme_cowplot(font_size = 12) +
    # guide = 'none' replaces the deprecated guide = F.
    scale_size_continuous(name = 'Absolute Beta', guide = 'none') +
    geom_text_repel(data = dg, aes(label = GENE)) +
    xlab('Minor allele frequency') +
    ylab('Absolute effect size')) %>% print()
} else {
  print('Only one locus identified, check the table.')
  plot_comment <- ''
}
338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 | z= fread(snakemake@input[[5]], header= T, sep= '\t', select= c('MarkerName', 'Effect', 'P-value', 'HetPVal')) names(z)= c('ID', 'beta', 'pvalue', 'het_pvalue') z$ID= gsub(':SNP', '', z$ID) z$ID= gsub(':INDEL', '', z$ID) z= inner_join(z, dg, by= 'ID') if (nrow(z)>1) { plot_comment= 'No pattern between effect size and heterogeneity. Attention should be paid to the top hit.' z$Direction= ifelse(z$beta> 0, 'Positive', 'Negative') ggplot(z, aes(-log10(het_pvalue), -log10(pvalue), size= abs(beta)), alpha= 0.7) + geom_point(colour= colorBlindBlack8[4]) + theme_cowplot(font_size= 12) + scale_size_continuous(name= 'Absolute effect size') + geom_text_repel(data= z, aes(label= GENE), hjust =1, show.legend = FALSE) + xlab('-log10(Het pvalue)') + ylab('-log10(Association pvalue)') + theme(legend.position="bottom") } else{ print(paste('Pvalue for heterogeneity: ', z$het_pvalue)) #plot_comment= '' } |
376 377 378 379 | kable(filter(don, (IMPACT== 'HIGH') | (IMPACT== 'MODERATE')) %>% select(ID, RSID, SYMBOL, TOTALSAMPLESIZE, EAF, BETA, SE, pvalue, Consequence), col.names= c('CHR:POS:REF:EFF', 'RSID', 'Gene', 'N', 'EAF', 'Beta', 'SE', 'P-value', 'Consequence'), digits= 3) x= readLines(snakemake@input[[2]]) x= x[match('Heritability of phenotype 1', x) + 2] |
393 | Ideally, calculate LDscores from our sample (MOBAGENETICS) or from a bigger cohort (UKBIOBANK). |
# Genetic correlations read from the third input (columns used: p1, p2, rg, se).
d <- fread(snakemake@input[[3]])

# Phenotype names: file name of each path without the '.txt.sumstats.gz' suffix.
# fixed = TRUE matches the suffix literally (the dots were regex wildcards before).
d$pheno1 <- gsub('.txt.sumstats.gz', '', basename(as.character(d$p1)), fixed = TRUE)
d$pheno2 <- gsub('.txt.sumstats.gz', '', basename(as.character(d$p2)), fixed = TRUE)

# Clamp point estimates to [-1, 1].
d$rg <- pmin(pmax(d$rg, -1), 1)

# Axis limits: at least [-1, 1], widened when a confidence bound goes beyond.
maxy <- max(1, max(d$rg + 1.96 * d$se))
miny <- min(-1, min(d$rg - 1.96 * d$se))

ggplot(d, aes(pheno2, rg, colour = pheno2)) +
  geom_point(alpha = 0.5) +
  # No I() around ymin: ggplot2 >= 3.5 treats I() inside aes() as "bypass the
  # scale", which would misplace the lower bound.
  geom_errorbar(aes(ymin = rg - 1.96 * se, ymax = rg + 1.96 * se), width = .2, position = position_dodge(.9)) +
  theme_cowplot(font_size = 9) +
  scale_fill_manual(values = colorBlindBlack8[2:4]) +
  # guide = 'none' replaces the deprecated guide = F.
  scale_colour_manual(guide = 'none', values = colorBlindBlack8[2:4]) +
  xlab('Phenotype') +
  ylab(paste0('R coefficient [95% CI] \n', pheno)) +
  ylim(c(miny, maxy)) +
  theme(legend.position = 'none')

link <- 'https://drive.google.com/drive/folders/101ErlqwE4_iFwZFCTM0QZUtUVwOoOE1L?usp=sharing'
10 11 12 13 14 15 16 17 18 19 20 | library(tint) knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tint')) options(htmltools.dir.version = FALSE) pdf.options(useDingbats = TRUE) #knitr::opts_chunk$set(dpi=300) knitr::opts_chunk$set(dev = 'png', warning= FALSE, message= FALSE, dpi= 600) model= ifelse(grepl('rec', snakemake@input[[1]]), 'recessive', 'dominant') |
31 32 33 34 35 36 37 38 39 | library("ggplot2") library("dplyr") library("knitr") library("tidyr") library(cowplot) library("kableExtra") library(ggrepel) library("data.table") options(warn=-1) |
R Markdown
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
kableExtra
From
line
31
of
reports/nonadditive_qc.Rmd
44 45 46 47 48 49 50 51 52 53 54 55 56 | d= fread(snakemake@input[[1]], h= T) d$MAF= ifelse(d$EAF>0.5, 1 - d$EAF, d$EAF) d= filter(d, TOTALSAMPLESIZE> 66106) colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") ggplot(d, aes(MAF)) + geom_density(fill= colorBlindBlack8[4]) + theme_cowplot(font_size= 12) + xlab('MAF') #d= select(d, -MAF) |
62 63 64 65 | ggplot(d, aes(TOTALSAMPLESIZE)) + geom_density(fill= colorBlindBlack8[3]) + theme_cowplot(font_size= 12) + xlab('Sample size') |
95 96 97 98 | add_model= fread(snakemake@input[[2]]) add_model$CHR= ifelse(add_model$CHR== 'X', '23', add_model$CHR) add_model$CHR= as.numeric(add_model$CHR) add_model$pos= round((add_model$pos1 + add_model$pos2) / 2) |
110 | The same number of loci is obtained when using a larger radius (1.5Mb). |
114 | We note that we used a naive approach to identify independent loci, so these results should be interpreted cautiously. We mapped each top genetic variant to the nearest protein-coding gene (distance to the gene body, TSS or TES). |
120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 | df= arrange(d, pvalue) #df= group_by(df, CHR, POS) %>% filter(row_number() == 1) %>% ungroup() df= df[!duplicated(df[, c('CHR', 'POS')]), ] dg= df %>% arrange(CHR, POS) %>% filter(pvalue< 5*10**-8) %>% group_by(CHR) %>% mutate(d=POS-lag(POS, default=-Inf), clumpid=cumsum(d>250000)) %>% group_by(CHR, clumpid) %>% filter(rank(pvalue, ties.method = "random")==1) %>% mutate(GENE= ifelse(SYMBOL=='', RSID, SYMBOL)) %>% ungroup() dg= filter(dg, pvalue< 5*10**-8) dg= group_by(dg, CHR, POS) %>% filter(row_number()== 1) #dg$GENE= ifelse(grepl('rs|-', dg$GENE), dg$nearestGene, ifelse(dg$GENE=='', dg$nearestGene, dg$GENE)) dg$GENE= dg$nearestGene don <- df %>% group_by(CHR) %>% summarise(chr_len= max(POS)) %>% mutate(tot= cumsum(as.numeric(chr_len))-chr_len) %>% # Calculate cumulative position of each chromosome select(-chr_len) %>% left_join(df, ., by= 'CHR') %>% arrange(CHR, POS) %>% # Add a cumulative position of each SNP mutate( BPcum=POS+tot) %>% ungroup() axisdf = don %>% group_by(CHR) %>% summarize(center=( max(BPcum) + min(BPcum) ) / 2 ) names(axisdf)= c('CHR', 'center') HC= -log10(5*10**-8) dg= dg %>% ungroup() %>% select(ID, GENE, CHR, POS, MAF) don= left_join(don, select(dg, ID, GENE), by= 'ID') names(dg)= c('ID', 'GENE', 'CHR', 'POS_new', 'MAF') lims= 250000 don$disc= ifelse(don$pvalue> 5*10**-8, 0, NA) don= data.frame(don) dg= data.frame(dg) add_model= data.frame(add_model) for (i in rownames(dg)) { don= mutate(don, disc= ifelse(CHR== as.integer(dg[i, 'CHR']) & POS>= as.integer(dg[i, 'POS_new']) - lims & POS<= as.integer(dg[i, 'POS_new']) + lims, 2, disc)) } for (i in rownames(add_model)) { don= mutate(don, disc= ifelse(CHR== as.integer(add_model[i, 'CHR']) & POS>= 
as.integer(add_model[i, 'pos']) - lims & POS<= as.integer(add_model[i, 'pos']) + lims, 1, disc)) } don= don[order(don$disc, decreasing= F, na.last= T), ] don$disc= factor(don$disc, levels=c(0, 1, 2), labels=c('Not significant', 'Additive model discovery', 'New discovery')) cols <- c('Not significant'= 'grey', 'Additive model discovery'= colorBlindBlack8[4], 'New discovery'= colorBlindBlack8[2]) don$GENE= ifelse(!is.na(don$GENE), don$nearestGene, don$GENE) ggplot(don) + geom_point(data= don, aes(x=BPcum, y= -log10(pvalue), colour= disc), size=0.3) + # Show all points theme_cowplot(font_size= 12) + #theme_minimal_hgrid(12, rel_small = -1) + #scale_alpha_manual(values= rep(c(1/10, 1/2), 23)) + scale_colour_manual(values= cols) + scale_x_continuous(label = axisdf$CHR, breaks= axisdf$center, expand=c(0,0) ) + # custom X axis scale_y_continuous(expand= c(0,0)) + xlab('Chromosome') + ylab('-log10(pvalue)') + labs(colour= '') + geom_hline(yintercept= 0, size= 0.5, colour= 'black') + geom_hline(yintercept= HC, size= 0.5, linetype= 2, colour= '#878787') + geom_text_repel(data= don, aes(x= BPcum, y= -log10(pvalue), label= GENE), size= 3, hjust= 1, force= 1, vjust= 1, colour= 'black') + theme(legend.position= 'bottom') + guides(colour = guide_legend(override.aes = list(size=3))) |
# QQ plot stratified by minor allele frequency tertile.
d <- mutate(d, maf_tertiles = ntile(MAF, 3))
# Upper MAF bound of the first and second tertile (rounded for the legend).
m1 <- round(max(d[d$maf_tertiles == 1, 'MAF']), 3)
m2 <- round(max(d[d$maf_tertiles == 2, 'MAF']), 3)

# The middle label previously read '<m1> < MAF > <m2>', which has the second
# inequality the wrong way round for the middle tertile.
d$maf_tertiles <- factor(d$maf_tertiles, levels = c("1", "2", "3"),
                         labels = c(paste('MAF<', m1), paste(m1, '< MAF <', m2), paste('MAF>', m2)))

# Expected quantiles are computed within each tertile; seq_along() is safe for
# empty groups, unlike 1:length().
df <- arrange(d, pvalue) %>%
  group_by(maf_tertiles) %>%
  mutate(exp1 = -log10(seq_along(pvalue) / length(pvalue)))

ggplot(filter(df, pvalue < 0.05), aes(exp1, -log10(pvalue), color = maf_tertiles)) +
  geom_point(size = 0.4) +
  scale_color_manual(values = colorBlindBlack8[2:4]) +
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
  labs(colour = "") +
  theme_cowplot(font_size = 12) +
  xlab('Expected (-log10(p-value))') +
  ylab('Observed (-log10(p-value))') +
  theme(legend.position = 'bottom') +
  guides(colour = guide_legend(override.aes = list(size = 3)))
230 231 232 233 234 235 236 | don= filter(don, pvalue< 1*10**-4) d1= filter(don, pvalue<5*10**-8) d1= d1[order(d1$pvalue, decreasing= F), ] d1$pvalue= format(d1$pvalue, digits= 3) don$pvalue= format(don$pvalue, digits= 3) kable(filter(d1, GENE!= '') %>% select(ID, RSID, GENE, TOTALSAMPLESIZE, EAF, pvalue, Consequence), col.names= c('CHR:POS:REF:EFF', 'RSID', 'Gene', 'N', 'EAF', 'P-value', 'Consequence'), digits= 3) #%>% kable_styling(latex_options = c("striped", "scale_down")) |
249 | kable(filter(don, (IMPACT== 'HIGH') | (IMPACT== 'MODERATE')) %>% select(ID, RSID, SYMBOL, TOTALSAMPLESIZE, EAF, pvalue, Consequence), col.names= c('CHR:POS:REF:EFF', 'RSID', 'Gene', 'N', 'EAF', 'P-value', 'Consequence'), digits= 3) |
10 11 12 13 14 15 16 17 18 | library(tint) knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tint')) options(htmltools.dir.version = FALSE) pdf.options(useDingbats = TRUE) #knitr::opts_chunk$set(dpi=300) knitr::opts_chunk$set(dev = 'png', warning= FALSE, message= FALSE, dpi= 600) cohort= ifelse(grepl('MOBA', snakemake@input[[1]]), 'MoBa', '23andMe') pheno= ifelse(grepl('GAraw', snakemake@input[[1]]), 'GA days', 'GA normalized') |
30 31 32 33 34 35 36 37 38 | library("ggplot2") library("dplyr") library("knitr") library("tidyr") library(cowplot) library("kableExtra") library(ggrepel) library("data.table") options(warn=-1) |
R Markdown
ggplot2
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
kableExtra
From
line
30
of
reports/other_meta.Rmd
45 46 47 48 49 50 51 52 53 54 55 | d= fread(snakemake@input[[1]], h= T) d$MAF= ifelse(d$EAF>0.5, 1 - d$EAF, d$EAF) colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") ggplot(d, aes(MAF)) + geom_density(fill= colorBlindBlack8[4]) + theme_cowplot(font_size= 12) + xlab('MAF') #d= select(d, -MAF) |
61 62 63 64 | ggplot(d, aes(TOTALSAMPLESIZE)) + geom_density(fill= colorBlindBlack8[3]) + theme_cowplot(font_size= 12) + xlab('Sample size') |
72 73 74 75 | ggplot(d, aes(BETA)) + geom_density(fill= colorBlindBlack8[2]) + theme_cowplot(font_size= 12) + xlab('Beta') |
89 90 | **Effective sample size** for binary phenotypes was calculated as: $$\frac{2}{(\frac{1}{Ncases} + \frac{1}{Ncontrols})}$$ |
105 106 107 108 109 110 111 112 113 114 115 116 | ge= data.frame(CHR= c(5, 3, 1, 23, 1), pos_ge= c(157895049, 127881613, 22470407, 115164770, 22470407)) if (grepl('GAraw', snakemake@input[[1]])){ topids= c('1:22462111:A:G', '3:128038373:A:C', '5:157896786:C:T', '23:115184372:A:C', '1:228216997:A:C', '3:123112292:C:T', '3:141147414:C:T', '3:155859113:A:G', '23:131268226:C:T', '2:74207357:A:G', '4:174734471:A:G', '6:32589937:A:G', '6:49559793:G:T', '9:16408826:A:G', '20:62692060:A:C') } else { topids= c('1:22414785:G:T', '5:157895049:C:T', '23:115129904:C:T', '1:41955090:A:G', '1:50959262:A:C', '3:14293832:A:G', '3:139004333:A:G', '3:141147414:C:T', '3:155862524:A:G', '3:156697097:A:G', '2:74253326:A:G', '4:55895282:C:T', '4:174739258:A:G', '6:32604898:A:G', '8:75315146:C:G', '9:116935764:C:G') } fullmeta= fread(snakemake@input[[3]]) fullmeta= filter(fullmeta, ID %in% topids) %>% select(ID, POS, CHR, BETA, SE, pvalue) names(fullmeta)= c('ID_f', 'POS_f', 'CHR_f', 'BETA_f', 'SE_f', 'pvalue_f') |
128 | The same number of loci is obtained when using a larger radius (1.5Mb). |
132 | We note that we used a naive approach to identify independent loci. This should be interpreted cautiously. |
138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | df= arrange(d, pvalue) df= df[!duplicated(df[, c('CHR', 'POS')]), ] dg= df %>% arrange(CHR, POS) %>% filter(pvalue< 5*10**-8) %>% group_by(CHR) %>% mutate(d=POS-lag(POS, default=-Inf), clumpid=cumsum(d>250000)) %>% group_by(CHR, clumpid) %>% filter(rank(pvalue, ties.method = "random")==1) %>% mutate(GENE= nearestGene) %>% ungroup() dg= group_by(dg, CHR, POS) %>% filter(row_number()== 1) don <- df %>% group_by(CHR) %>% summarise(chr_len= max(POS)) %>% mutate(tot= cumsum(as.numeric(chr_len))-chr_len) %>% # Calculate cumulative position of each chromosome select(-chr_len) %>% left_join(df, ., by= 'CHR') %>% arrange(CHR, POS) %>% # Add a cumulative position of each SNP mutate( BPcum=POS+tot) %>% ungroup() axisdf = don %>% group_by(CHR) %>% summarize(center=( max(BPcum) + min(BPcum) ) / 2 ) names(axisdf)= c('CHR', 'center') HC= -log10(5*10**-8) dg= dg %>% ungroup() %>% select(ID, GENE, CHR, POS, MAF, BETA) don= left_join(don, select(dg, ID, GENE), by= 'ID') names(dg)= c('ID', 'GENE', 'CHR', 'POS_new', 'MAF', 'BETA') don= left_join(don, fullmeta, by= c('CHR'= 'CHR_f')) lims= 250000 don$disc= ifelse((don$POS> don$POS_f - lims) & (don$POS < don$POS_f + lims), 2, 0) don= don[order(don$disc, decreasing= T, na.last= T), ] don= group_by(don, ID) %>% filter(row_number() == 1) don$disc= ifelse(is.na(don$disc), 0, don$disc) don= left_join(don, select(dg, CHR, POS_new), by= 'CHR') don$disc= ifelse(don$disc== 2, 2, ifelse((don$POS> (don$POS_new - lims)) & (don$POS < (don$POS_new + lims)), 1, 0)) don$disc= ifelse(is.na(don$disc), 0, don$disc) don= don[order(don$disc, decreasing= T, na.last= T), ] don= group_by(don, ID) %>% filter(row_number() == 1) don$disc= ifelse(is.na(don$disc), 0, ifelse(don$disc== 1, 
2, ifelse(don$disc== 2, 1, 0))) don= don[order(don$disc, decreasing= F, na.last= T), ] don$disc= factor(don$disc, levels=c(0, 1, 2), labels=c('Not significant', 'Full meta discovery', 'New discovery')) cols <- c('Not significant'= 'grey', 'Full meta discovery'= colorBlindBlack8[4], 'New discovery'= colorBlindBlack8[2]) ggplot(don) + geom_point(data= don, aes(x=BPcum, y= -log10(pvalue), colour= disc), size=0.3) + # Show all points theme_cowplot(font_size= 12) + #theme_minimal_hgrid(12, rel_small = -1) + #scale_alpha_manual(values= rep(c(1/10, 1/2), 23)) + scale_colour_manual(values= cols) + scale_x_continuous(label = axisdf$CHR, breaks= axisdf$center, expand=c(0,0) ) + # custom X axis scale_y_continuous(expand= c(0,0)) + xlab('Chromosome') + ylab('-log10(pvalue)') + labs(colour= '') + geom_hline(yintercept= 0, size= 0.5, colour= 'black') + geom_hline(yintercept= HC, size= 0.5, linetype= 2, colour= '#878787') + geom_text_repel(data= don, aes(x= BPcum, y= -log10(pvalue), label= GENE), size= 3, hjust= 1, force= 1, vjust= 1, colour= 'black') + theme(legend.position= 'bottom') + guides(colour = guide_legend(override.aes = list(size=3))) |
# QQ plot stratified by minor allele frequency tertile.
d <- mutate(d, maf_tertiles = ntile(MAF, 3))
# Upper MAF bound of the first and second tertile (rounded for the legend).
m1 <- round(max(d[d$maf_tertiles == 1, 'MAF']), 3)
m2 <- round(max(d[d$maf_tertiles == 2, 'MAF']), 3)

# The middle label previously read '<m1> < MAF > <m2>', which has the second
# inequality the wrong way round for the middle tertile.
d$maf_tertiles <- factor(d$maf_tertiles, levels = c("1", "2", "3"),
                         labels = c(paste('MAF<', m1), paste(m1, '< MAF <', m2), paste('MAF>', m2)))

# Expected quantiles are computed within each tertile; seq_along() is safe for
# empty groups, unlike 1:length().
df <- arrange(d, pvalue) %>%
  group_by(maf_tertiles) %>%
  mutate(exp1 = -log10(seq_along(pvalue) / length(pvalue)))

ggplot(filter(df, pvalue < 0.05), aes(exp1, -log10(pvalue), color = maf_tertiles)) +
  geom_point(size = 0.4) +
  scale_color_manual(values = colorBlindBlack8[2:4]) +
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
  labs(colour = "") +
  theme_cowplot(font_size = 12) +
  xlab('Expected (-log10(p-value))') +
  ylab('Observed (-log10(p-value))') +
  theme(legend.position = 'bottom') +
  guides(colour = guide_legend(override.aes = list(size = 3)))
243 244 245 246 247 248 | don= filter(don, pvalue< 1*10**-4) d1= filter(don, pvalue<5*10**-8) d1$pvalue= format(d1$pvalue, digits= 3) don$pvalue= format(don$pvalue, digits= 3) kable(filter(d1, GENE!= '') %>% select(ID, RSID, GENE, TOTALSAMPLESIZE, EAF, BETA, SE, pvalue), col.names= c('CHR:POS:REF:EFF', 'RSID', 'Gene', 'N', 'EAF', 'Beta', 'SE', 'P-value'), digits= 3) caption= 'As expected, beta increases with decreasing minor allele frequency.' |
# Effect sizes of the selected top variants: full meta-analysis (BETA_f) against
# the current analysis (BETA), i.e. the one run without `cohort`.
x <- inner_join(fullmeta, d, by = c('ID_f' = 'ID'))

# The former `alpha = 0.7` passed to ggplot() and `guide = F` passed to
# geom_text_repel() matched no parameter and had no effect; both are dropped.
(ggplot(x, aes(BETA_f, BETA, size = abs(BETA))) +
  geom_point(colour = colorBlindBlack8[4]) +
  theme_cowplot(font_size = 12) +
  # guide = 'none' replaces the deprecated guide = F.
  scale_size_continuous(name = 'Absolute Beta', guide = 'none') +
  geom_text_repel(data = x, aes(label = RSID)) +
  xlab('Effect size full meta-analysis') +
  ylab(paste('Effect size without', cohort)) +
  # Identity line: points on it have the same effect in both analyses.
  geom_abline(intercept = 0, slope = 1, linetype = 'dashed', colour = 'grey')) %>% print()
# -log10(p) of the selected top variants: full meta-analysis (pvalue_f) against
# the current analysis (pvalue), i.e. the one run without `cohort`.
# The former `alpha = 0.7` passed to ggplot() and `guide = F` passed to
# geom_text_repel() matched no parameter and had no effect; both are dropped.
(ggplot(x, aes(-log10(pvalue_f), -log10(pvalue))) +
  geom_point(colour = colorBlindBlack8[4]) +
  theme_cowplot(font_size = 12) +
  geom_text_repel(data = x, aes(label = RSID)) +
  xlab('-log10(pvalue) full meta-analysis') +
  ylab(paste('-log10(pvalue) without', cohort)) +
  # Identity line: points on it have the same p-value in both analyses.
  geom_abline(intercept = 0, slope = 1, linetype = 'dashed', colour = 'grey')) %>% print()
11 12 13 14 15 16 17 | run: d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['#CHROM', 'POS', 'REF', 'ALT', 'AF_EXCLUDING_1000G']) d.columns= ['CHR', 'POS', 'oa', 'ea', 'eaf'] d= d.loc[((d.eaf> 0.05) & (d.eaf<0.95)), :] d['eaf']= np.where(d.oa> d.ea, 1 - d.eaf, d.eaf) d= d.sample(n= 1000000) d.to_csv(output[0], sep= '\t', header= True, index= False) |
27 28 | script: 'file_level_qc.Rmd' |
38 39 | script: 'file_level_qc.Rmd' |
49 50 | script: 'file_level_qc.Rmd' |
60 61 | script: 'file_level_qc.Rmd' |
70 71 | script: 'all_files_QC.Rmd' |
80 81 | script: 'all_files_QC.Rmd' |
91 92 | script: 'all_files_QC.Rmd' |
102 103 | script: 'all_files_QC.Rmd' |
117 118 | script: 'meta_qc.Rmd' |
129 130 131 132 133 134 135 136 137 138 139 140 141 | run: df= pd.read_csv(input[0], sep= '\t', header= 0) df.sort_values('SNP.PP.H4', ascending= False, inplace= True) d= df.groupby('locus').head(1).reset_index() df['trait']= input[1].split('pph_')[1].replace('.txt', '') d['direction']= np.where((d['z.df1'] > 0) & (d['z.df2'] > 0), 'Positive', np.where((d['z.df1'] < 0) & (d['z.df2'] < 0), 'Negative', 'Opposite')) x= pd.read_csv(input[1], sep= '\t', header= 0) x['trait']= input[1].split('pph_')[1].replace('.txt', '') x= pd.merge(x, d[['snp', 'locus', 'SNP.PP.H4', 'direction']]) x= x.loc[(x['PP.H0.abf'] != 0) & (x['PP.H1.abf'] != 0) & (x['PP.H2.abf'] != 0) & (x['PP.H0.abf'] != 0) & (x['PP.H4.abf'] != 0), :] x.dropna(axis= 0, inplace= True) x.to_csv(output[0], sep= '\t', header= True, index= False) df.to_csv(output[1], sep= '\t', header= True, index= False) |
151 152 | script: 'coloc.Rmd' |
161 162 163 164 165 166 167 168 169 170 | run: d= pd.read_csv(input[0], sep= '\t', header= 0, compression= 'gzip') df= pd.read_csv(input[1], sep= '\t', header= 0) df['CHR']= np.where(df.CHR== 'X', '23', df.CHR) df['CHR']= df.CHR.astype(str).astype(int) d= pd.merge(d, df, on= 'CHR') d= d.loc[((d.POS> d.pos1) & (d.POS < d.pos2)), :] d.sort_values('pvalue', ascending= True, inplace= True) d= d.groupby('nearestGene_y').first() d.to_csv(output[0], sep= '\t', header= False, index= False, columns= ['ID']) |
179 180 181 182 183 | shell: ''' grep -f {input[0]} {input[1]} > {output[0]} || true touch {output[0]} ''' |
192 193 194 195 196 | shell: ''' grep -f {input[0]} {input[1]} > {output[0]} || true touch {output[0]} ''' |
205 206 207 208 209 | shell: ''' grep -f {input[0]} {input[1]} > {output[0]} || true touch {output[0]} ''' |
217 218 219 220 221 | shell: ''' grep -f {input[0]} {input[1]} > {output[0]} || true touch {output[0]} ''' |
230 231 232 233 234 235 236 237 | run: df_list= list() for infile in input: d= pd.read_csv(infile, sep= '\t', header= None, names= ['SNP', 'CHR', 'POS', 'EAF', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue', 'STRAND', 'maf']) d['cohort']= infile.split('temp/')[1].replace('_topvariants.txt', '') df_list.append(d) d= pd.concat(df_list) d.to_csv(output[0], sep= '\t', header= True, index= False) |
245 246 247 248 249 250 251 252 | run: df_list= list() for infile in input: d= pd.read_csv(infile, sep= '\t', header= None, names= ['SNP', 'CHR', 'POS', 'EAF', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue', 'STRAND', 'maf']) d['cohort']= infile.split('temp/')[1].replace('_topvariants.txt', '') df_list.append(d) d= pd.concat(df_list) d.to_csv(output[0], sep= '\t', header= True, index= False) |
260 261 262 263 264 265 266 267 | run: df_list= list() for infile in input: d= pd.read_csv(infile, sep= '\t', header= None, names= ['SNP', 'CHR', 'POS', 'EAF', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue', 'STRAND', 'maf']) d['cohort']= infile.split('temp/')[1].replace('_topvariants.txt', '') df_list.append(d) d= pd.concat(df_list) d.to_csv(output[0], sep= '\t', header= True, index= False) |
275 276 277 278 279 | run: df_list= list() for infile in input: d= pd.read_csv(infile, sep= '\t', header= None, names= ['SNP', 'CHR', 'POS', 'EAF', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue', 'STRAND', 'maf']) d['cohort']= infile.split('temp/')[1].replace('_topvariants.txt', '') |
292 293 | script: 'forest_plots.Rmd' |
303 304 | script: 'other_meta.Rmd' |
314 315 | script: 'other_meta.Rmd' |
326 327 | script: 'other_meta.Rmd' |
337 338 | script: 'nonadditive_qc.Rmd' |
348 349 | script: 'file_level_qc.Rmd' |
359 360 | script: 'file_level_qc.Rmd' |
370 371 | script: 'file_level_qc.Rmd' |
381 382 | script: 'file_level_qc.Rmd' |
library(data.table)
library(dplyr)
library(metafor)

# Fixed-effect meta-analysis of the cohort-level estimates for one haplotype
# within one outcome. Returns a one-row data.frame with the pooled estimate.
meta_one_haplotype <- function(pheno_rows, pheno, hap) {
  hap_rows <- pheno_rows[pheno_rows$haplotype == hap, ]
  # Progress output: number of rows available for this outcome.
  print(nrow(pheno_rows))
  fit <- rma(yi = beta, sei = se, data = hap_rows, method = "FE")
  pooled <- data.frame(
    beta = fit$beta,
    se = fit$se,
    pvalue = fit$pval,
    lo95 = fit$ci.lb,
    up95 = fit$ci.ub,
    het_pvalue = fit$QEp,
    outcome = pheno,
    haplotype = hap
  )
  print(pooled)
  pooled
}

# Meta-analyse the three haplotypes (MT, MNT, PT) of a single outcome.
# Reads the combined cohort table `d` from the enclosing environment.
funk <- function(pheno) {
  pheno_rows <- d[d$outcome == pheno, ]
  per_haplotype <- lapply(
    c('MT', 'MNT', 'PT'),
    function(hap) meta_one_haplotype(pheno_rows, pheno, hap)
  )
  do.call('rbind', per_haplotype)
}

# Stack the three cohort-level result tables in input order.
cohorts <- lapply(1:3, function(k) fread(snakemake@input[[k]]))
d <- Reduce(rbind, cohorts)

# One pooled row per outcome x haplotype.
x <- do.call('rbind', lapply(unique(d$outcome), funk))

# Total sample size per haplotype and outcome, summed across cohorts.
df <- d %>%
  group_by(haplotype, outcome) %>%
  summarize(n = sum(n))

x <- inner_join(x, df, by = c('haplotype', 'outcome'))

fwrite(x, snakemake@output[[1]], sep = '\t')
library(data.table)
library(dplyr)
library(metafor)

# Fixed-effect meta-analysis of the cohort-level estimates for one haplotype
# within one exposure. Returns a one-row data.frame with the pooled estimate.
meta_one_haplotype <- function(pheno_rows, pheno, hap) {
  hap_rows <- pheno_rows[pheno_rows$haplotype == hap, ]
  # Progress output: number of rows available for this exposure.
  print(nrow(pheno_rows))
  fit <- rma(yi = beta, sei = se, data = hap_rows, method = "FE")
  pooled <- data.frame(
    beta = fit$beta,
    se = fit$se,
    pvalue = fit$pval,
    lo95 = fit$ci.lb,
    up95 = fit$ci.ub,
    het_pvalue = fit$QEp,
    exposure = pheno,
    haplotype = hap
  )
  print(pooled)
  pooled
}

# Meta-analyse the three haplotypes (MT, MNT, PT) of a single exposure.
# Reads the combined cohort table `d` from the enclosing environment.
funk <- function(pheno) {
  pheno_rows <- d[d$exposure == pheno, ]
  per_haplotype <- lapply(
    c('MT', 'MNT', 'PT'),
    function(hap) meta_one_haplotype(pheno_rows, pheno, hap)
  )
  do.call('rbind', per_haplotype)
}

# Stack the three cohort-level result tables in input order.
cohorts <- lapply(1:3, function(k) fread(snakemake@input[[k]]))
d <- Reduce(rbind, cohorts)

# One pooled row per exposure x haplotype.
x <- do.call('rbind', lapply(unique(d$exposure), funk))

# Total sample size per haplotype and exposure, summed across cohorts.
df <- d %>%
  group_by(haplotype, exposure) %>%
  summarize(n = sum(n))

x <- inner_join(x, df, by = c('haplotype', 'exposure'))

fwrite(x, snakemake@output[[1]], sep = '\t')
13 14 | script: 'PGS_repr_pheno_meta.R' |
26 27 | script: 'PGS_fetal_growth_meta.R' |
library(data.table)
library(dplyr)
library(DESeq2)
library(tidyverse)

# Differential expression (DESeq2) from per-sample quantification tables.
# Every file in snakemake@params[[1]] whose name matches 'CL' is one sample.
# The file name is expected to look like <subject>-...-<condition>.txt.

flist <- list.files(snakemake@params[[1]], 'CL', full.names = TRUE)
if (length(flist) == 0) {
  stop('No quantification files matching "CL" found in ', snakemake@params[[1]],
       call. = FALSE)
}

# Read one quantification file and name its read-count column after the file.
# basename() replaces the original fixed path depth (`[10]`), which only
# worked when the directory sat at exactly that depth.
read_counts <- function(path) {
  counts <- fread(path)
  counts <- select(counts, Name, NumReads)
  names(counts) <- c('Name', basename(path))
  counts
}

# seq_along-style iteration via lapply: safe for any number of files.
x <- lapply(flist, read_counts) %>%
  reduce(left_join, by = "Name")
x <- as.data.frame(x)

# All columns except the transcript name are samples (the original assumed
# exactly six samples in columns 2:7).
samples <- setdiff(colnames(x), 'Name')
name_parts <- strsplit(samples, '-')

# Sample annotation: condition is the last '-'-separated token of the file
# name (without the extension), subject is the first token.
cols <- data.frame(
  row.names = samples,
  condition = gsub(
    '.txt', '',
    vapply(name_parts, function(p) tail(p, 1), character(1)),
    fixed = TRUE
  ),
  subject = vapply(name_parts, function(p) head(p, 1), character(1))
)

cts <- as.matrix(x[, samples, drop = FALSE])
row.names(cts) <- x$Name

# Paired design: condition effect adjusted for subject. Counts are rounded
# because the input values may be fractional, and the matrix is passed to
# DESeq2 as a count matrix.
dds <- DESeqDataSetFromMatrix(countData = round(cts),
                              colData = cols,
                              design = ~ subject + condition)

dds <- DESeq(dds)
res <- results(dds, name = "condition_unt_vs_dec")

res <- data.frame(res)
res$geneid <- row.names(res)

fwrite(res, snakemake@output[[1]], sep = '\t')
16 17 | script: 'rna_seq_dif.R' |
28 29 | run: d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID']) |
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Display label for each trait code found in the p2 column.
trait_labels <- c(
  GAraw = 'Maternal gestational duration',
  miscarriage = 'Miscarriage',
  GA_fetal = 'GA fetal effect',
  BW_maternal = 'Maternal BW',
  AFB = 'Age at first birth',
  AMenarche = 'Age at menarche',
  AMenopause = 'Age at menopause',
  NLB = 'Number of live births',
  Testosterone_fem = 'Testosterone (women)',
  SHBG_fem = 'SHBG (women)',
  SHBG_male = 'SHBG (men)',
  CBAT_fem = 'CBAT (women)',
  CBAT_male = 'CBAT (men)',
  Oestradiol_fem = 'Oestradiol (women)',
  POP = 'Pelvic Organ Prolapse',
  Testosterone_male = 'Testosterone (men)',
  leiomyoma_uterus = 'Leiomyoma uterus',
  BW_fetal = 'Fetal',
  BW_fetal_effect = 'Fetal only',
  Preeclampsia = 'Pre-eclampsia',
  BW_maternal_effect = 'Maternal only',
  PCOS = 'Polycistic ovary syndrome'
)

# Reduce a summary-statistics path to its trait code: keep what follows
# `marker` and drop the '.txt.sumstats.gz' suffix. Paths without the marker
# give NA, as before.
extract_stem <- function(paths, marker) {
  pieces <- strsplit(as.character(paths), marker)
  stem <- vapply(pieces, function(p) p[2], character(1))
  gsub('.txt.sumstats.gz', '', stem)
}

# Read one genetic-correlation table and replace the p1/p2 paths by trait codes.
read_rg <- function(path) {
  rg <- fread(path)
  rg$p1 <- extract_stem(rg[['p1']], 'LDscore/')
  rg$p2 <- extract_stem(rg[['p2']], 'LDSC/')
  rg
}

d <- rbind(read_rg(snakemake@input[[1]]), read_rg(snakemake@input[[2]]))

# Map trait codes to labels. Codes not in the table fall back to
# 'Endometriosis' (the final branch of the original nested ifelse), and
# missing codes stay NA.
mapped <- unname(trait_labels[d$p2])
d$trait <- ifelse(is.na(d$p2), NA_character_,
                  ifelse(is.na(mapped), 'Endometriosis', mapped))

# Keep only male traits. Matching the literal '(men)' avoids the original
# substring match on 'men', which also kept 'Age at menarche' and
# 'Age at menopause'. '(women)' labels do not contain '(men)'.
d <- filter(d, grepl('(men)', trait, fixed = TRUE))

fwrite(d, snakemake@output[[1]], sep = '\t')
R
dplyr
data.table
tidyr
cowplot
ggrepel
knitr
showtext
From
line
1
of
tables/genetic_correlations_males.R
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 | run: d= pd.read_csv(input[0], sep='\t', header= 0) x= pd.read_csv(input[1], sep= '\t', header= 0) x['pheno']= 'Gestational duration' ptd= pd.read_csv(input[2], sep= '\t', header= 0) ptd['pheno']= 'Preterm delivery' postterm= pd.read_csv(input[3], sep= '\t', header= 0) postterm['pheno']= 'Post term delivery' gID= ['3:156697097:A:G', '5:158058432:G:T'] d= d.loc[d.ID.isin(gID), :] d= pd.concat([x, d]) d= pd.concat([d, ptd]) d= pd.concat([d, postterm]) d.sort_values('ID', inplace= True) d.to_csv(output[0], header= True, index= False, sep= '\t') |
34 35 36 37 38 | run: d= pd.read_csv(input[0], sep= '\t', header= 0) x= pd.read_csv(input[1], sep= '\t', header= 0) d= pd.merge(d[['ID', 'pheno']], x, on= 'ID', how= 'inner') d.to_csv(output[0], sep= '\t', header= True, index= False) |
49 50 | run: d= pd.read_csv(input[0], sep= '\t', header= 0) |
67 68 69 70 | run: d= pd.read_csv(input[1], sep= '\t', header= 0) top= pd.read_csv(input[2], sep= '\t', header= 0, usecols= ['ID', 'nearestGene', 'RSID']) d= pd.merge(d, top, left_on= 'rsid', right_on= 'RSID') |
82 83 84 85 | shell: ''' cp {input[0]} {output[0]} ''' |
95 96 97 98 99 100 101 102 103 | run: d= pd.read_csv(input[0], sep= '\t', header= 0) x= pd.read_csv(input[1], sep= '\t', header= 0) x['eqtl_data']= 'iPSC' d= pd.concat([d, x,]) df= pd.read_csv(input[2], sep= '\t', header= None, names= ['chr', 'pos1', 'pos2', 'Gene_symbol', 'EID'], usecols= ['Gene_symbol', 'EID']) df['EID']= df['EID'].str.split('.').str[0] d= pd.merge(d, df, left_on= 'gene', right_on= 'EID') d.to_csv(output[0], sep= '\t', header= True, index= False) |
113 114 | script: 'genetic_correlations_males.R' |
122 123 124 125 126 127 128 129 130 | run: df_list= list() for i in input: d= pd.read_csv(i, sep= '\t', header= 0, usecols= ['CHR', 'N']) coh= i.split('filtered/')[1].replace('.txt', '') df_dict= pd.DataFrame({'cohort': coh, 'N': d.N.max()}, index= [0]) df_list.append(df_dict) d= pd.concat(df_list) d.to_csv(output[0], sep= '\t', header= True, index= False) |
138 139 140 141 142 | run: d= pd.read_csv(input[0], sep= '\t', header= None, names= ['ID', 'beta', 'se', 'pvalue', 'trait']) d[['CHR', 'POS', 'REF', 'EFF']]= d.ID.str.split(':', expand= True) d['CHR']= np.where(d.CHR== '23', 'X', d.CHR) d.to_csv(output[0], sep= '\t', header= True, index= False) |
151 152 153 154 155 156 157 158 159 160 161 | run: d= pd.read_csv(input[0], sep= '\t', header= 0) x= pd.read_csv(input[1], sep= '\t', header= 0) d= pd.concat([d, x]) horm= ['CBAT_fem', 'SHBG_fem', 'Testosterone_fem', 'SHBG_fem_cluster', 'Testosterone_fem_cluster'] df= d.loc[d.trait.isin(horm), :] ivw= df.loc[df.method== 'IVW', :] egger= df.loc[df.method== 'MR-Egger', :] egger_int= df.loc[np.array(df.index[df.method== 'MR-Egger' ] + 1), :] d= pd.concat([ivw, egger, egger_int]) d.to_csv(output[0], sep= '\t', header= True, index= False) |
170 171 172 173 174 175 176 177 | run: d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['MarkerName', 'HetISq', 'HetPVal']) d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True) x= pd.read_csv(input[1], sep= '\t', header= 0) d['REF'], d['EFF']= np.where(d.REF> d.EFF, [d.EFF, d.REF], [d.REF, d.EFF]) d['ID']= np.where(d.REF> d.EFF, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.EFF + ':' + d.REF, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.REF + ':' + d.EFF) d= pd.merge(d, x, on= 'ID') d.to_csv(output[0], sep= '\t', header= True, index= False, columns= ['ID', 'HetISq', 'HetPVal']) |
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | run: d= pd.read_csv(input[0], sep= '\t', header= 0) d['Allele1']= d['Allele1'].str.upper() d['Allele2']= d['Allele2'].str.upper() d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :] d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True) d['CHR']= d['CHR'].astype(str).astype(int) d['POS']= d['POS'].astype(str).astype(int) d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']] d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue'] df= d.loc[d.pvalue< 5*10**-8, :] df.sort_values(by= 'pvalue', ascending= True, inplace= True) df.drop_duplicates(subset= ['CHR', 'POS'], keep= 'first', inplace= True) df_list= list() for chrom in set(df.CHR): d_temp= df.loc[df.CHR== chrom, :] positions= d_temp.POS.values for pos in positions: if pos in d_temp.POS.values: df_list.append(d_temp.loc[d_temp.POS== pos, :]) d_temp= d_temp.loc[(d_temp.POS < pos - (1.5*10**6)) | (d_temp.POS> pos + (1.5 * 10**6)), :] else: continue x= pd.concat(df_list) x['pos1']= x.POS - 1.5*10**6 x['pos2']= x.POS + 1.5*10**6 x['CHR']= x.CHR.astype(str) x['CHR']= np.where(x.CHR== '23', 'X', x.CHR) x.to_csv(output[0], sep='\t', header= True, index= False, columns= ['CHR', 'pos1', 'pos2']) |
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 | run: d= pd.read_csv(input[0], sep= '\t', compression= 'gzip', usecols= ['CHR', 'POS', 'pvalue', 'nearestGene']) df= d.loc[d.pvalue< 5*10**-8, :] df.sort_values(by= 'pvalue', ascending= True, inplace= True) df.drop_duplicates(subset= ['CHR', 'POS'], keep= 'first', inplace= True) df_list= list() for chrom in set(df.CHR): d_temp= df.loc[df.CHR== chrom, :] positions= d_temp.POS.values for pos in positions: if pos in d_temp.POS.values: df_list.append(d_temp.loc[d_temp.POS== pos, :]) d_temp= d_temp.loc[(d_temp.POS < pos - (1.5*10**6)) | (d_temp.POS> pos + (1.5 * 10**6)), :] else: continue x= pd.concat(df_list) x['pos1']= x.POS - 1.5*10**6 x['pos2']= x.POS + 1.5*10**6 x['CHR']= x.CHR.astype(str) x['CHR']= np.where(x.CHR== '23', 'X', x.CHR) x.to_csv(output[0], sep='\t', header= True, index= False, columns= ['CHR', 'pos1', 'pos2', 'nearestGene']) |
72 73 74 75 76 77 78 79 80 81 82 | run: d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'EAF', 'TOTALSAMPLESIZE', 'REF', 'EFF', 'RSID', 'ID', 'BETA', 'SE', 'pvalue']) x= pd.read_csv(input[1], sep= '\t', header= 0) x['CHR']= np.where(x.CHR== 'X', '23', x.CHR) x['CHR']= x.CHR.apply(int) d= pd.merge(d, x, on= 'CHR') d= d.loc[((d.POS>= d.pos1) & (d.POS <= d.pos2)), ] d.sort_values('pvalue', ascending= True, inplace= True) d= d.groupby('nearestGene').head(1) d= d[['CHR', 'POS', 'EAF', 'TOTALSAMPLESIZE', 'REF', 'EFF', 'RSID', 'nearestGene', 'ID', 'BETA', 'SE', 'pvalue']] d.to_csv(output[0], sep= '\t', header= True, index= False) |
93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | run: for i in range(2): d= pd.read_csv(input[i], sep= '\t', compression= 'gzip', usecols= ['CHR', 'POS', 'pvalue', 'nearestGene']) df= d.loc[d.pvalue< 5*10**-8, :] df.sort_values(by= 'pvalue', ascending= True, inplace= True) df.drop_duplicates(subset= ['CHR', 'POS'], keep= 'first', inplace= True) df_list= list() for chrom in set(df.CHR): d_temp= df.loc[df.CHR== chrom, :] positions= d_temp.POS.values for pos in positions: if pos in d_temp.POS.values: df_list.append(d_temp.loc[d_temp.POS== pos, :]) d_temp= d_temp.loc[(d_temp.POS < pos - (1.5*10**6)) | (d_temp.POS> pos + (1.5 * 10**6)), :] else: continue x= pd.concat(df_list) x['pos1']= x.POS - 1.5*10**6 x['pos2']= x.POS + 1.5*10**6 x['CHR']= x.CHR.astype(str) x['CHR']= np.where(x.CHR== '23', 'X', x.CHR) x.to_csv(output[i], sep='\t', header= True, index= False, columns= ['CHR', 'pos1', 'pos2', 'nearestGene']) |
126 127 128 129 130 131 132 133 134 135 136 137 | run: for i in range(2): d= pd.read_csv(input[i], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'EAF', 'TOTALSAMPLESIZE', 'REF', 'EFF', 'RSID', 'ID', 'pvalue']) x= pd.read_csv(input[i+2], sep= '\t', header= 0) x['CHR']= np.where(x.CHR== 'X', '23', x.CHR) x['CHR']= x.CHR.apply(int) d= pd.merge(d, x, on= 'CHR') d= d.loc[((d.POS>= d.pos1) & (d.POS <= d.pos2)), ] d.sort_values('pvalue', ascending= True, inplace= True) d= d.groupby('nearestGene').head(1) d= d[['CHR', 'POS', 'EAF', 'TOTALSAMPLESIZE', 'REF', 'EFF', 'RSID', 'nearestGene', 'ID', 'pvalue']] d.to_csv(output[i], sep= '\t', header= True, index= False) |
145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 | run: d= pd.read_csv(input[0], sep= '\t',usecols= ['CHR', 'POS', 'pvalue']) df= d.loc[d.pvalue< 5*10**-8, :] df.sort_values(by= 'pvalue', ascending= True, inplace= True) df.drop_duplicates(subset= ['CHR', 'POS'], keep= 'first', inplace= True) df_list= list() for chrom in set(df.CHR): d_temp= df.loc[df.CHR== chrom, :] positions= d_temp.POS.values for pos in positions: if pos in d_temp.POS.values: df_list.append(d_temp.loc[d_temp.POS== pos, :]) d_temp= d_temp.loc[(d_temp.POS < pos - (1.5*10**6)) | (d_temp.POS> pos + (1.5 * 10**6)), :] else: continue x= pd.concat(df_list) x['pos1']= x.POS - 1.5*10**6 x['pos2']= x.POS + 1.5*10**6 x['CHR']= x.CHR.astype(str) x['CHR']= np.where(x.CHR== '23', 'X', x.CHR) x.to_csv(output[0], sep='\t', header= True, index= False, columns= ['CHR', 'POS', 'pos1', 'pos2']) |
import pandas as pd
import numpy as np

# Build a two-column lookup (ID, name) keyed by CHR:POS:allele:allele, with the
# two alleles in sorted order, from a gzipped variant table (snakemake.input[0])
# and a second table (snakemake.input[1]); write it to snakemake.output[0].
d= pd.read_csv(snakemake.input[0], sep= '\t', header=0, compression= 'gzip')
# Drop rows whose chromosome name contains '_'
# (presumably alternate/unplaced contigs -- confirm).
d= d.loc[~d['#chrom'].str.contains('_'), :]
# First and second comma-separated entries of `alts`.
d['a1']= d.alts.str.split(',').str[0]
d['a2']= d.alts.str.split(',').str[1]
d['#chrom']= d['#chrom'].str.replace('chr', '')
# Position is chromStart when `ref` is shorter than the `alts` string, chromEnd otherwise.
# NOTE(review): the length comparison uses the whole `alts` string (commas
# included), not the individual allele -- confirm this is intended.
d['POS']= np.where(d.ref.str.len() < d.alts.str.len(), d.chromStart, d.chromEnd)
# Recode `ref` to 'I' / 'D' based on the same length comparison. Order matters:
# the second statement sees the value written by the first.
d['ref']= np.where(d.ref.str.len()< d.alts.str.len(), 'I', d.ref)
d['ref']= np.where(d.ref.str.len() > d.alts.str.len(), 'D', d.ref)
# Give `a1` the opposite code of a recoded `ref`.
d['a1']= np.where(d.ref== 'I', 'D', d.a1)
d['a1']= np.where(d.ref== 'D', 'I', d.a1)
# Second copy for rows that have a non-empty second alternate allele.
df= d.copy()
df= df.loc[df.a2!= '', :]
# Swap so that ref <= a1 (string comparison), then build the ID.
d.loc[d.ref > d.a1, ['ref', 'a1']] = d.loc[d.ref > d.a1, ['a1', 'ref']].values
d['ID']= d['#chrom'] + ':' + d['POS'].astype(int).astype(str) + ':' + d.ref + ':' + d.a1
# Same for the second alternate allele.
df.loc[df.ref > df.a2, ['ref', 'a2']] = df.loc[df.ref > df.a2, ['a2', 'ref']].values
df['ID']= df['#chrom'] + ':' + df['POS'].astype(int).astype(str) + ':' + df.ref + ':' + df.a2
df= df[['ID', 'name']]
d= d[['ID', 'name']]
d= pd.concat([d, df])
# Read RSIDs from HRC
x= pd.read_csv(snakemake.input[1], sep= '\t', header=0, usecols= ['#CHROM', 'POS', 'ID', 'REF', 'ALT'])
# Positional rename: assumes the columns come back in the order
# #CHROM, POS, ID, REF, ALT -- TODO confirm against the input file.
x.columns= ['CHROM', 'POS', 'name', 'REF', 'ALT']
# Drop rows without an identifier ('.').
x= x.loc[x.name!= '.', :]
# Chromosome X is coded as '23' in the IDs.
x['CHROM']= np.where(x.CHROM== 'X', '23', x.CHROM)
x['CHROM']= x.CHROM.apply(str)
# Swap so that REF <= ALT (string comparison), then build the ID.
x.loc[x.REF > x.ALT, ['REF', 'ALT']] = x.loc[x.REF > x.ALT, ['ALT', 'REF']].values
x['ID']= x['CHROM'] + ':' + x['POS'].astype(int).astype(str) + ':' + x.REF + ':' + x.ALT
x= x[['ID', 'name']]
# Only add IDs from the second table that the first table did not provide.
x= x.loc[~x.ID.isin(d.ID), :]
d= pd.concat([d, x])
d.to_csv(snakemake.output[0], sep= '\t', header= True, index= False)
import pandas as pd
import numpy as np
import re

# Annotate meta-analysis summary statistics (snakemake.input[0]) with VEP
# consequences (snakemake.input[1]) and write the merged table to
# snakemake.output[0]. Variants are keyed by CHR:POS:allele:allele with the two
# alleles in sorted order.

VEP_COLUMNS= ['Variation', 'Location', 'Allele', 'Gene', 'Feature', 'Feature_type', 'Consequence', 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids', 'Codons', 'Existing_variation', 'Extra']
# Keys extracted from the semicolon-separated key=value 'Extra' column.
col_list= ['IMPACT', 'DISTANCE', 'SYMBOL', 'SYMBOL_SOURCE', 'BIOTYPE']
# Raw string: '\w' in a normal string literal is an invalid escape sequence.
EXTRA_SPLIT= re.compile(r';(?=\w)')


def parse_extra(extra):
    '''Return the values of `col_list` found in one 'Extra' string ('' when absent).

    The original looked keys up with `dict(...)[key] if key in extra`, a
    substring test that raises KeyError when the key only occurs inside
    another key (e.g. 'SYMBOL' inside 'SYMBOL_SOURCE='). It also re-parsed
    the string once per key.
    '''
    if not isinstance(extra, str):
        return [''] * len(col_list)
    pairs= dict(field.split('=', 1) for field in EXTRA_SPLIT.split(extra) if '=' in field)
    return [pairs.get(key, '') for key in col_list]


df_list= list()

for vep in pd.read_csv(snakemake.input[1], sep= '\t', header= None, names= VEP_COLUMNS, comment= '#', chunksize= 100000):
    parsed= pd.DataFrame([parse_extra(e) for e in vep['Extra']], columns= col_list, index= vep.index)
    for i in col_list:
        vep[i]= parsed[i]
    # .copy() so the assignments below act on an independent frame, not a slice.
    vep= vep[['Variation', 'Location', 'Existing_variation', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'SYMBOL_SOURCE', 'BIOTYPE']].copy()
    vep.columns= ['ID', 'Location', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'SYMBOL_SOURCE', 'BIOTYPE']
    # Priority used when a variant has several annotations:
    # protein_coding (0) < other biotypes (1) < pseudogenes (2).
    vep['BIOTYPE1']= np.where(vep.BIOTYPE== 'protein_coding', 0, np.where(vep.BIOTYPE.str.contains('pseudo'), 2, 1))
    vep['DISTANCE']= np.where(vep.DISTANCE== '', 0, vep.DISTANCE)
    # The variant identifier is <chr>_<pos>_<allele>/<allele>.
    vep[['chr', 'pos', 'All']]= vep.ID.str.split('_', expand= True)
    vep[['EFF', 'REF']]= vep.All.str.split('/', expand= True)
    swap= vep.REF > vep.EFF
    vep.loc[swap, ['REF', 'EFF']] = vep.loc[swap, ['EFF', 'REF']].values
    vep[['CHR', 'POS']]= vep['Location'].str.split(':', expand= True)
    vep['CHR']= np.where(vep['CHR']== 'X', '23', vep['CHR'])
    vep['ID']= vep.CHR.astype(int).astype(str) + ':' + vep.POS.astype(int).astype(str) + ':' + vep.REF + ':' + vep.EFF
    vep= vep[['ID', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'BIOTYPE', 'BIOTYPE1']]
    # Keep the highest-priority annotation per variant within the chunk.
    vep= vep.sort_values(by= ['BIOTYPE1'], ascending= True).drop_duplicates(subset= ['ID'], keep= 'first')
    df_list.append(vep)

# A variant can span two chunks, so de-duplicate again across chunks.
vep= pd.concat(df_list)
vep= vep.sort_values(by= ['BIOTYPE1'], ascending= True).drop_duplicates(subset= ['ID'], keep= 'first')
vep= vep[['ID', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'BIOTYPE']]

d= pd.read_csv(snakemake.input[0], sep= '\t', header= 0)
d['Allele1']= d['Allele1'].str.upper()
d['Allele2']= d['Allele2'].str.upper()
# Keep variants present in more than half of the maximum total sample size.
d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :].copy()
d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
d['CHR']= d['CHR'].astype(str).astype(int)
d['POS']= d['POS'].astype(str).astype(int)
d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']].copy()
d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']
d['pvalue']= d['pvalue'].astype(str).astype(float)
# Align to sorted allele order. The effect size and allele frequency must be
# flipped with the mask computed BEFORE the alleles themselves are swapped.
flip= d.REF > d.EFF
d['BETA']= np.where(flip, -1* d.BETA, d.BETA)
d['EAF']= np.where(flip, 1 - d.EAF, d.EAF)
d.loc[flip, ['REF', 'EFF']] = d.loc[flip, ['EFF', 'REF']].values
d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF
d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]

# Left join: variants without a VEP annotation are kept with empty annotation columns.
d= pd.merge(d, vep, on= ['ID'], how= 'left')

d.to_csv(snakemake.output[0], header=True, index= False, sep= '\t')
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | run: d= pd.read_csv(input[0], sep= '\t', header= 0) x= pd.read_csv(input[1], sep= '\t', header= 0) x['CHR']= np.where(x['CHR']== '23', 'X', x['CHR']) d[['CHR', 'POS', 'REF', 'EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True) d= d.loc[d.SNP== 'SNP', :] d['POS2']= d['POS'] d['CHR']= np.where(d['CHR']== '23', 'X', d['CHR']) d['POS']= d['POS'].astype(str).astype(int) df_list= list() for index, row in x.iterrows(): temp_df= d.loc[d.CHR== row['CHR'], :] temp_df= temp_df.loc[((temp_df.POS >= int(row['pos1'])) & (temp_df.POS <= int(row['pos2']))), :] df_list.append(temp_df) d= pd.concat(df_list) d['Allele']= d['Allele1'].str.upper() + '/' + d['Allele2'].str.upper() d['STRAND']= '+' d.sort_values(by= ['CHR', 'POS'], inplace= True) d.to_csv(output[0], sep= '\t', header= False, index= False, columns= ['CHR', 'POS', 'POS2', 'Allele', 'STRAND']) |
37 38 | shell: '/home/pol/software/ensembl-vep/vep -i {input[0]} --check_existing --symbol --biotype --cache -O {output[0]} --offline --force_overwrite' |
48 49 | script: 'format_VEP.py' |
58 59 60 61 | run: d= pd.read_csv(input[0], sep= '\t', header= 0) x= pd.read_csv(input[1], sep= '\t', header= 0) d= d.loc[~d.geneSymbol.isin(x.name2), :] |
88 89 90 91 92 93 94 95 96 97 98 99 | run: d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['MarkerName', 'Allele1']) d['CHR']= d.MarkerName.str.split(':').str[0] d['end']= d.MarkerName.str.split(':').str[1] d['CHR']= d.CHR.astype('str').astype('int') d['end']= d.end.astype('str').astype('int') d['start']= d.end - 1 d['MarkerName']= d.MarkerName.str.replace(':SNP', '') d['MarkerName']= d.MarkerName.str.replace(':INDEL', '') d.sort_values(by= ['CHR', 'start'], inplace= True) d= d[['CHR', 'start', 'end', 'MarkerName']] d.to_csv(output[0], sep= '\t', header= False, index= False) |
108 109 | shell: 'bedtools closest -t all -a {input[0]} -b {input[1]} > {output[0]}' |
118 119 | script: 'format_dbSNP.py' |
129 130 131 132 133 134 135 136 137 138 139 140 | run: d= pd.read_csv(input[0], sep= '\t', header=0) rs= pd.read_csv(input[1], sep= '\t', header=0) d= pd.merge(d, rs, on= 'ID', how= 'left') d['RSID']= np.where(pd.isnull(d.RSID), d.name, d.RSID) d['RSID']= np.where(d.RSID== '', d.name, d.RSID) d['RSID']= np.where(d.RSID== '-', d.name, d.RSID) d.drop('name', 1, inplace= True) ne= pd.read_csv(input[2], sep= '\t', header= None, names= ['CHR', 'X', 'POS', 'ID', 'c1', 'p1', 'p2', 'nearestGene', 'Ensembl_gene']) ne= ne[['ID', 'nearestGene']] d= pd.merge(d, ne, on= 'ID', how= 'left') d.to_csv(output[0], sep= '\t', header= True, index= False, compression= 'gzip') |
148 149 150 151 152 153 154 | run: d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['ID', 'CHR', 'POS']) d['end']= d.POS d['start']= d.end - 1 d.sort_values(by= ['CHR', 'start'], inplace= True) d= d[['CHR', 'start', 'end', 'ID']] d.to_csv(output[0], sep= '\t', header= False, index= False) |
163 164 | shell: 'bedtools closest -t all -k 2 -a {input[0]} -b {input[1]} > {output[0]}' |
173 174 175 | run: d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['ID', 'nearestGene']) ne= pd.read_csv(input[1], sep= '\t', header= None, names= ['CHR', 'X', 'POS', 'ID', 'c1', 'p1', 'p2', 'nearestGene2', 'Ensembl_gene']) |
188 189 190 | run: d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['MarkerName', 'Allele1', 'Allele2', 'P-value']) d= d.loc[d['P-value']< 5e-5, :] |
207 208 | shell: '/home/pol/software/ensembl-vep/vep -i {input[0]} --check_existing --symbol --biotype --cache -O {output[0]} --offline --force_overwrite' |
217 218 | script: 'format_VEP.py' |
Support
Do you know this workflow well? If so, you can
request seller status, and start supporting this workflow.
Created: 1yr ago
Updated: 1yr ago
Maintainers:
public
URL:
https://github.com/PerinatalLab/metaGWAS
Name:
metagwas
Version:
v1.0.0
Downloaded:
0
Copyright:
Public Domain
License:
MIT License
Keywords:
Sample annotation
Analysis
Gene report
plink2
BCFtools
BEDTools
ggtree
GWAS
Pandas
pLink
Quant
Snakemake
Variant Effect Predictor (VEP)
coloc
cowplot
data.table
dendextend
dplyr
ggplot2
ggrepel
ggtern
gridExtra
kableExtra
knitr
metafor
plyr
scales
showtext
tidyr
tidyverse
numpy
scipy
Genetic variation
preeclampsia
- Future updates
Related Workflows

ENCODE pipeline for histone marks developed for the psychENCODE project
psychip pipeline is an improved version of the ENCODE pipeline for histone marks developed for the psychENCODE project.
The o...

Near-real time tracking of SARS-CoV-2 in Connecticut
Repository containing scripts to perform near-real time tracking of SARS-CoV-2 in Connecticut using genomic data. This pipeli...

snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...

ATLAS - Three commands to start analyzing your metagenome data
Metagenome-atlas is a easy-to-use metagenomic pipeline based on snakemake. It handles all steps from QC, Assembly, Binning, t...
raw sequence reads
Genome assembly
Annotation track
checkm2
gunc
prodigal
snakemake-wrapper-utils
MEGAHIT
Atlas
BBMap
Biopython
BioRuby
Bwa-mem2
cd-hit
CheckM
DAS
Diamond
eggNOG-mapper v2
MetaBAT 2
Minimap2
MMseqs
MultiQC
Pandas
Picard
pyfastx
SAMtools
SemiBin
Snakemake
SPAdes
SqueezeMeta
TADpole
VAMB
CONCOCT
ete3
gtdbtk
h5py
networkx
numpy
plotly
psutil
utils
metagenomics

RNA-seq workflow using STAR and DESeq2
This workflow performs a differential gene expression analysis with STAR and Deseq2. The usage of this workflow is described ...

This Snakemake pipeline implements the GATK best-practices workflow
This Snakemake pipeline implements the GATK best-practices workflow for calling small germline variants. The usage of thi...