Genome-to-genome analysis reveals associations between human and mycobacterial genetic variation in tuberculosis patients from Tanzania
Loading...
Preprint: https://doi.org/10.1101/2023.05.11.23289848
Summary Statistics
Full summary statistics: https://doi.org/10.5281/zenodo.6373034
Reported associations: summary_stats
Code
Snak
Code Snippets
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 | library(ape) library(parallel) GetGeneCoverage <- function(VCF_Files,ref_annot,n_cores){ pbmclapply(1:length(VCF_Files),function(i){ cur_vcf <- VCF_Files[i] cur_out <- gsub(x=cur_vcf,pattern = '.vcf.gz','.coverage') system(glue::glue("bcftools query -f '%POS [ %SDP]\n' {cur_vcf} > {cur_out}")) },mc.cores = n_cores) gff_file <- read.gff(ref_annot) gff_file_genes <- gff_file[gff_file$type=='gene',] gene_df <- data.frame(Gene = sapply(gff_file_genes$attributes,function(x) strsplit(x=strsplit(x=x,split = ';')[[1]][3],split = '=')[[1]][2]),start = gff_file_genes$start, end = gff_file_genes$end,length = gff_file_genes$end - gff_file_genes$start + 1 ,stringsAsFactors = F) cov_matrix <- matrix(NA,nrow = length(VCF_Files),ncol = nrow(gene_df)) sample_names <- sapply(VCF_Files,function(x) strsplit(x=strsplit(x=x,split = '/')[[1]][length(strsplit(x=x,split = '/')[[1]])],split = '\\.')[[1]][1]) rownames(cov_matrix) <- sample_names colnames(cov_matrix) <- gene_df$Gene for(i in 1:length(VCF_Files)){ print(i) cur_vcf <- VCF_Files[i] cur_out <- gsub(x=cur_vcf,pattern = '.vcf.gz','.coverage') cur_coverage <- data.table::fread(cur_out) gene_cov <- unlist(pbmclapply(1:nrow(gene_df),function(k) { start <- gene_df$start[k] end = gene_df$end[k] return(mean(cur_coverage$V2[cur_coverage$V1 >= start & cur_coverage$V1 <= end])) },mc.cores = n_cores)) cov_matrix[i,] <- gene_cov system(glue::glue("rm {cur_out}")) } return(list(cov_matrix = cov_matrix,gene_df=gene_df)) } #Get positions for each samples which contain missing call GetMissingMatrix <- function(missing_files){ #Get data frame for each sample, containing missing positions missing_tbl <- lapply(missing_files,function(x) data.table::fread(x)) names(missing_tbl) <- sapply(missing_files,function(x) { split_path <- strsplit(x=x,split = '/')[[1]] file_name <- split_path[length(split_path)] ID <- strsplit(file_name,split = '\\.')[[1]][1] return(ID) }) uniq_pos <- sort(unique(do.call(rbind, missing_tbl)$V1)) missing_mat <- matrix(F,nrow = length(uniq_pos),ncol = length(missing_tbl)) rownames(missing_mat) <- as.character(uniq_pos) colnames(missing_mat) <- names(missing_tbl) for(i in 1:ncol(missing_mat)){ missing_mat[as.character(missing_tbl[[i]]$V1),i] <- T } return(missing_mat) } args <- commandArgs(trailingOnly = TRUE) vcf_files <- args[grepl(args,pattern = '.vcf.gz')] missing_files <- args[grepl(args,pattern = '.missing')] params <- args[!grepl(args,pattern = '.vcf.gz') & !grepl(args,pattern = '.missing')] ref_annot <- params[[1]] out_dir <- params[[2]] n_cores <- as.numeric(params[[3]]) system(glue::glue("mkdir -p {out_dir}")) #Get average coverage for each gene # Gene_Result <- GetGeneCoverage(vcf_files,ref_annot,n_cores) # Gene_Cov_Matrix <- Gene_Resul$cov_matrix # saveRDS(Gene_Cov_Matrix,glue::glue("{out_dir}/Mtb_gene_coverage.rds")) #Get missing positions for each sample missing_mat <- GetMissingMatrix(missing_files) saveRDS(missing_mat,glue::glue("{out_dir}missing_mat.rds")) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 | library(glue) library(dplyr) #Convert VCF file to AA list for each sample with non-synonymous variants WriteAATable <- function(cur_file,locus_to_excl,snpEff,cur_id,out_path){ DATA_DIR <- paste0(strsplit(cur_file,split = '/')[[1]][1:(length(strsplit(cur_file,split = '/')[[1]]) - 1)],collapse = '/') cur_prefix <- strsplit(cur_file,split = '/')[[1]][length(strsplit(cur_file,split = '/')[[1]])] cur_file_unzip <- gsub(pattern = '.gz',replacement = '',cur_prefix) system(glue::glue("mkdir -p {DATA_DIR}/rep_filt/")) out_dir <- gsub(x=out_path,pattern = paste0(cur_id,'.txt'),replacement = '') system(glue::glue("mkdir -p {out_dir}")) #Loop through all possible reading frames for each nucleotide variant cur_annot_table <- list() loop_tbl = T cnt <- 0 #Loop through all frames while(loop_tbl){ #Remove SNPs in repetitive regions system(glue::glue("bedtools subtract -header -A -a {cur_file} -b {locus_to_excl} > {DATA_DIR}/rep_filt/{cur_file_unzip}")) #Get SNP consequences cur_table <- system(glue::glue("java -jar {snpEff} extractFields {DATA_DIR}/rep_filt/{cur_file_unzip} POS REF ALT ANN[{cnt}].EFFECT ANN[{cnt}].GENE EFF[{cnt}].AA GEN[0].GT GEN[0].FREQ> {DATA_DIR}{cur_file_unzip}.tbl"),intern = T) cur_table_parsed <- data.table::fread(glue::glue("{DATA_DIR}{cur_file_unzip}.tbl"),sep = '\t',fill = F) colnames(cur_table_parsed) <- c('POS','REF','ALT','EFFECT','GENE','AA_Change','GENOTYPE','FREQ') cur_table_parsed$FREQ <- sapply(cur_table_parsed$FREQ,function(x) as.numeric(gsub(x=x,pattern = '%',replacement = ''))) #End loop if there are no longer have any frames to search on. if(!all(is.na(cur_table_parsed$EFFECT))){ cnt <- cnt + 1 cur_annot_table <- c(cur_annot_table,list(cur_table_parsed)) }else{ loop_tbl <- F } } #Cleanup system(glue::glue("rm {DATA_DIR}{cur_file_unzip}.tbl")) if(length(cur_annot_table) == 0){ system(glue::glue("touch {out_path}")) return(NA) } cur_annot_table_full <- do.call(rbind,cur_annot_table) %>% dplyr::filter(EFFECT != 'synonymous_variant' & AA_Change != '') data.table::fwrite(cur_annot_table_full,file = out_path) } #Convert VCF file to Nucleotide list for each sample with synonymous variants WriteNucTableSyn <- function(cur_file,locus_to_excl,snpEff,cur_id,out_path){ DATA_DIR <- paste0(strsplit(cur_file,split = '/')[[1]][1:(length(strsplit(cur_file,split = '/')[[1]]) - 1)],collapse = '/') cur_prefix <- strsplit(cur_file,split = '/')[[1]][length(strsplit(cur_file,split = '/')[[1]])] cur_file_unzip <- gsub(pattern = '.gz',replacement = '',cur_prefix) system(glue::glue("mkdir -p {DATA_DIR}/rep_filt/")) out_dir <- gsub(x=out_path,pattern = paste0(cur_id,'.txt'),replacement = '') system(glue::glue("mkdir -p {out_dir}")) #Loop through all possible reading frames for each nucleotide variant cur_annot_table <- list() loop_tbl = T cnt <- 0 #Loop through all frames while(loop_tbl){ #Remove SNPs in repetitive regions system(glue::glue("bedtools subtract -header -A -a {cur_file} -b {locus_to_excl} > {DATA_DIR}/rep_filt/{cur_file_unzip}")) #Get SNP consequences cur_table <- system(glue::glue("java -jar {snpEff} extractFields {DATA_DIR}/rep_filt/{cur_file_unzip} POS REF ALT ANN[{cnt}].EFFECT ANN[{cnt}].GENE GEN[0].GT GEN[0].FREQ > {DATA_DIR}{cur_file_unzip}.tbl"),intern = T) cur_table_parsed <- data.table::fread(glue::glue("{DATA_DIR}{cur_file_unzip}.tbl"),sep = '\t',fill = F) colnames(cur_table_parsed) <- c('POS','REF','ALT','EFFECT','GENE','GENOTYPE','FREQ') cur_table_parsed$FREQ <- sapply(cur_table_parsed$FREQ,function(x) as.numeric(gsub(x=x,pattern = '%',replacement = ''))) #End loop if there are no longer have any frames to search on. if(!all(is.na(cur_table_parsed$EFFECT))){ cnt <- cnt + 1 cur_annot_table <- c(cur_annot_table,list(cur_table_parsed)) }else{ loop_tbl <- F } } #Cleanup system(glue::glue("rm {DATA_DIR}{cur_file_unzip}.tbl")) if(length(cur_annot_table) == 0){ system(glue::glue("touch {out_path}")) return(NA) } cur_annot_table_full <- do.call(rbind,cur_annot_table) syn_variants <- dplyr::filter(cur_annot_table_full,EFFECT == 'synonymous_variant') data.table::fwrite(syn_variants,file = out_path) } args <- commandArgs(trailingOnly = TRUE) vcf_file <- args[[1]] locus_to_excl <- args[[2]] snpEff <- args[[3]] cur_id <- args[[4]] out_path <- args[[5]] out_path_syn <- args[[6]] #Get VCF Files and write into AA tables WriteAATable(vcf_file,locus_to_excl,snpEff,cur_id,out_path) WriteNucTableSyn(vcf_file,locus_to_excl,snpEff,cur_id,out_path_syn) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 | library(glue) library(stringr) library(ape) library(parallel) library(pbmcapply) library(filematrix) library(data.table) #Filter variants which only appear in one lineage, and show no variability within the lineage. RemoveStratVariants <- function(AA_Matrix,Lineage_Df){ samples_to_keep <- intersect(Lineage_Df$G_NUMBER,rownames(AA_Matrix)) Lineage_Df <- Lineage_Df[match(samples_to_keep,Lineage_Df$G_NUMBER),] #Keep samples where Host data exists AA_Matrix_filt <- AA_Matrix[samples_to_keep,] #Remove Missing Variants/Genes AA_Matrix_filt <- AA_Matrix_filt[,!apply(AA_Matrix_filt,2,function(x) all(is.na(x)))] #Assign variants to lineages strat_table <- lapply(1:ncol(AA_Matrix_filt),function(i) table(ifelse(AA_Matrix_filt[,i]==0,0,1),Lineage_Df$Lineage)) strat_assng <- vector(mode = 'logical',length = length(strat_table)) for(i in 1:length(strat_table)){ cur_strat <- strat_table[[i]] MAC_by_lineage <- apply(cur_strat,2,function(x) min(x,na.rm = T)) #If a variant is perfectly stratified by lineage and there are no variability within a lineage, exclude from analysis if(sum(MAC_by_lineage > 0) == 0){ strat_assng[i] <- F }else strat_assng[i] <- T } return(AA_Matrix_filt[,strat_assng]) } #Merge AA table of each Mtb sequence into a matrix AATblToMatrix <- function(AA_Tbl_Files,phylo_snps = NULL,sift_table = NULL,sift_thresh = 0.05,het_thresh = het_thresh,n_cores = 10,missing_matrix){ if(!is.null(sift_table)){ #Include SNPs with SIFT score < threshold (Deleterious) sift_incl <- dplyr::filter(sift_table,SIFT_score > sift_thresh) %>% dplyr::select(POS = `#Position`,REF=Ref_allele,ALT=New_allele) %>% dplyr::filter(REF != ALT) } #Parse all sample IDs all_sample_ids <- sapply(AA_Tbl_Files,function(x) strsplit(x=x,split = '/')[[1]][length(strsplit(x=x,split = '/')[[1]])]) all_sample_ids <- sapply(all_sample_ids,function(x) gsub(pattern = '.txt',replacement = '',x=x)) #Order missing matrix missing_matrix <- missing_matrix[,all_sample_ids] #Get all non-syn variants for each Mtb sample all_variants <- pbmclapply(AA_Tbl_Files,function(x){ tbl <- data.table::fread(x) if(nrow(tbl) == 0){ return(data.frame(ID = character(),Genotype = numeric())) } if(!is.null(phylo_snps)){ tbl <- dplyr::anti_join(tbl,phylo_snps,by=c('POS'='POS','REF'='REF','ALT'='ALT')) } if(!is.null(sift_table)){ tbl <- dplyr::inner_join(tbl,sift_incl,by=c('POS'='POS','REF'='REF','ALT'='ALT')) } return(data.frame(ID = paste0(tbl$GENE,':',tbl$POS,':',tbl$AA_Change),Genotype = ifelse(tbl$GENOTYPE=='1/1',2,ifelse(tbl$GENOTYPE!='1/1' & tbl$FREQ > het_thresh,1,0)))) #Homozygotes as 2, heterzygotes (mixed calls) as 1 },mc.cores = n_cores) names(all_variants) <- all_sample_ids #Construct AA dosage matrix (0,1,or s) uniq_variant_ids <- unique(unlist(lapply(all_variants,function(x) x$ID))) uniq_variant_genes <- sapply(uniq_variant_ids,function(x) strsplit(x=x,split = ':')[[1]][1]) uniq_variant_nuc_pos <- as.numeric(sapply(uniq_variant_ids,function(x) strsplit(x=x,split = ':')[[1]][2])) uniq_variant_pos <- sapply(uniq_variant_ids,function(x) str_extract(strsplit(x=x,split = 'p\\.')[[1]][2],"[[:digit:]]+")) uniq_variant_df <- data.frame(ID = uniq_variant_ids,Gene = uniq_variant_genes, Pos = as.numeric(uniq_variant_pos),Nuc_pos = uniq_variant_nuc_pos) uniq_variant_df_sorted <- dplyr::group_by(uniq_variant_df,Gene) %>% dplyr::arrange(Pos,.by_group = T) sorted_variant_ids <- uniq_variant_df_sorted$ID #Initialize AA Matrix AA_Matrix <- matrix(0,nrow = length(all_variants),ncol = length(sorted_variant_ids)) rownames(AA_Matrix) <- names(all_variants) colnames(AA_Matrix) <- sorted_variant_ids #Fill in AA Matrix for(i in 1:nrow(AA_Matrix)){ AA_Matrix[i,all_variants[[i]]$ID] <- all_variants[[i]]$Genotype #Set missing positions missing_pos <- rownames(missing_matrix)[missing_matrix[,i]==T] missing_variants <- dplyr::filter(uniq_variant_df_sorted,Nuc_pos %in% missing_pos) AA_Matrix[i,missing_variants$ID] <- NA } return(AA_Matrix) } #Merge Synonymous Nuc table of each Mtb sequence into a matrix NucSynTblToMatrix <- function(AA_Tbl_Files,phylo_snps = NULL,missing_matrix,het_thresh=het_thresh,n_cores = n_cores){ #Parse all sample IDs all_sample_ids <- sapply(AA_Tbl_Files,function(x) strsplit(x=x,split = '/')[[1]][length(strsplit(x=x,split = '/')[[1]])]) all_sample_ids <- sapply(all_sample_ids,function(x) gsub(pattern = '.txt',replacement = '',x=x)) #Get all non-syn variants for each Mtb sample all_variants <- pbmclapply(AA_Tbl_Files,function(x){ tbl <- data.table::fread(x) if(nrow(tbl) == 0){ return(data.frame(ID = character(),Genotype = numeric())) } if(!is.null(phylo_snps)){ tbl <- dplyr::anti_join(tbl,phylo_snps,by=c('POS'='POS','REF'='REF','ALT'='ALT')) } return(data.frame(ID = paste0(tbl$GENE,':',tbl$POS,':',tbl$REF,'>',tbl$ALT),Genotype = ifelse(tbl$GENOTYPE=='1/1',2,ifelse(tbl$GENOTYPE!='1/1' & tbl$FREQ > het_thresh,1,0)))) },mc.cores = n_cores) names(all_variants) <- all_sample_ids #Construct AA dosage matrix (0s or 1s) uniq_variant_ids <- unique(unlist(lapply(all_variants,function(x) x$ID))) uniq_variant_genes <- sapply(uniq_variant_ids,function(x) strsplit(x=x,split = ':')[[1]][1]) uniq_variant_pos <- sapply(uniq_variant_ids,function(x) strsplit(x=x,split = ':')[[1]][2]) uniq_variant_df <- data.frame(ID = uniq_variant_ids,Gene = uniq_variant_genes, Pos = as.numeric(uniq_variant_pos)) uniq_variant_df_sorted <- dplyr::group_by(uniq_variant_df,Gene) %>% dplyr::arrange(Pos,.by_group = T) sorted_variant_ids <- uniq_variant_df_sorted$ID #Initialize AA Matrix AA_Matrix <- matrix(0,nrow = length(all_variants),ncol = length(sorted_variant_ids)) rownames(AA_Matrix) <- names(all_variants) colnames(AA_Matrix) <- sorted_variant_ids #Fill in AA Matrix for(i in 1:nrow(AA_Matrix)){ AA_Matrix[i,all_variants[[i]]$ID] <- all_variants[[i]]$Genotype #Set missing positions missing_pos <- rownames(missing_matrix)[missing_matrix[,i]==T] missing_variants <- dplyr::filter(uniq_variant_df_sorted,Pos %in% missing_pos) AA_Matrix[i,missing_variants$ID] <- NA } return(AA_Matrix) } GetGeneBurden <- function(AA_Matrix,AA_Matrix_Syn,metadata){ #Remove SNPs that are lineage markers AA_Matrix <- RemoveStratVariants(AA_Matrix,metadata) #Get Variant-Gene mapping for all non-syn variants non_syn_variants <- colnames(AA_Matrix) non_syn_genes <- sapply(non_syn_variants,function(x) strsplit(x=x,split = ':')[[1]][1]) non_syn_variant_df <- data.frame(SNP = non_syn_variants,Gene = non_syn_genes,stringsAsFactors = F) #Get non-syn burden per Gene non_syn_genes_uniq <- sort(unique(non_syn_variant_df$Gene)) non_syn_burden <- matrix(data = NA,nrow = nrow(AA_Matrix),ncol = length(non_syn_genes_uniq)) rownames(non_syn_burden) <- rownames(AA_Matrix) colnames(non_syn_burden) <- non_syn_genes_uniq non_syn_burden_homo <- non_syn_burden for(i in 1:length(non_syn_genes_uniq)){ cur_gene <- non_syn_genes_uniq[i] cur_burden <- apply(AA_Matrix[,dplyr::filter(non_syn_variant_df,Gene == cur_gene)$SNP,drop = F],1,function(x) sum(x != 0,na.rm = T)) non_syn_burden[,i] <- cur_burden } #Get Variant-Gene mapping for all syn variants, where gene contains syn variants syn_variants <- colnames(AA_Matrix_Syn) syn_genes <- sapply(syn_variants,function(x) strsplit(x=x,split = ':')[[1]][1]) syn_variant_df <- data.frame(SNP = syn_variants,Gene = syn_genes) %>% dplyr::filter(Gene %in% non_syn_genes_uniq) #Get syn burden per Gene syn_genes_uniq <- sort(unique(syn_variant_df$Gene)) syn_burden <- matrix(data = NA,nrow = nrow(AA_Matrix_Syn),ncol = length(syn_genes_uniq)) rownames(syn_burden) <- rownames(AA_Matrix_Syn) colnames(syn_burden) <- syn_genes_uniq syn_burden_homo <- syn_burden for(i in 1:length(syn_genes_uniq)){ cur_gene <- syn_genes_uniq[i] cur_burden <- apply(AA_Matrix_Syn[,dplyr::filter(syn_variant_df,Gene == cur_gene)$SNP,drop = F],1,function(x) sum(x!=0,na.rm = T)) syn_burden[,i] <- cur_burden } return(list(non_syn_burden=non_syn_burden,syn_burden=syn_burden,non_syn_variant_df=non_syn_variant_df,syn_variant_df=syn_variant_df)) } args <- commandArgs(trailingOnly = TRUE) AA_variant_files <- args[grepl(args,pattern = 'AA_')] Syn_variant_files <- args[grepl(args,pattern = 'Syn_')] params <- args[!grepl(args,pattern = 'AA_') & !grepl(args,pattern = 'Syn_')] missing_mat <- readRDS(params[[1]]) phylo_snps <- params[[2]] del_tbl <- data.table::fread(params[[3]]) sift_path <- params[[4]] IS_SIFT <- as.logical(params[[5]]) IS_Burden <- as.logical(params[[6]]) IS_Deletion <- as.logical(params[[7]]) out_path <- params[[8]] metadata <- data.table::fread(params[[9]]) het_thresh = as.numeric(params[[10]]) n_cores <- as.numeric(params[[11]]) # missing_mat <- readRDS('../scratch/Mtb_Coverage/HomoOnly_True/missing_mat.rds') # phylo_snps <- '../data/Mtb/PositionsPhylogeneticSNPs_20171004.txt' # del_tbl <- data.table::fread('../data/Mtb/binary_table_genes_Sinergia_final_dataset_human_bac_genome_available.txt') # sift_path <- '../data/Mtb/SIFT/GCA_000195955.2.22/Chromosome.gz' # IS_SIFT <- T # IS_Burden <- T # IS_Deletion <- F # out_path <- '../scratch//Mtb_Var_Tbl.rds' # metadata <- data.table::fread('../data/pheno/metadata_Sinergia_final_dataset_human_bac_genome_available_QCed.txt') # n_cores <- 1 # het_thresh <- 10 # AA_variant_files <- paste0('../scratch/AA_HomoOnly_True/',metadata$G_NUMBER,'.txt') # Syn_variant_files <- paste0('../scratch/Syn_HomoOnly_True/',metadata$G_NUMBER,'.txt') if(is.na(IS_SIFT) | is.na(IS_Burden) | is.na(IS_Deletion)){ stop('Please specify options') } phylo_snps <- data.table::fread(phylo_snps) %>% dplyr::filter(PhylogeneticSNP %in% c('PhyloMarkerANCL1','PhyloMarkerANCL234','PhyloMarkerL1','PhyloMarkerL2','PhyloMarkerL3','PhyloMarkerL4')) %>% dplyr::select(POS = Position_ref,REF = anc, ALT = derived) if(!IS_Burden){ #Convert AA Tables into a Matrix AA_Matrix <- AATblToMatrix(AA_Tbl_Files = AA_variant_files,phylo_snps=phylo_snps,missing_matrix=missing_mat,het_thresh = het_thresh,n_cores = n_cores) #Add deleted genes as variants if(IS_Deletion){ del_matrix <- as.matrix(del_tbl[,-c('GNUMBER','LINEAGE','NUMBER_OF_DELETED_GENES')]) del_matrix[del_matrix==1] <- 2 #Set all deletions as Homo calls colnames(del_matrix) <- paste0(colnames(del_matrix),':NA:del') rownames(del_matrix) <- del_tbl$GNUMBER matrix_to_append <- matrix(NA,nrow = nrow(AA_Matrix),ncol = ncol(del_matrix)) rownames(matrix_to_append) <- rownames(AA_Matrix) colnames(matrix_to_append) <- colnames(del_matrix) matrix_to_append <- del_matrix[rownames(AA_Matrix),] AA_Matrix <- cbind(AA_Matrix,matrix_to_append) } }else if(IS_Burden){ if(IS_SIFT){ sift_table <- data.table::fread(cmd = glue::glue("zcat {sift_path}")) AA_Matrix_NonSyn <- AATblToMatrix(AA_Tbl_Files = AA_variant_files,phylo_snps=phylo_snps,sift_table = sift_table,missing_matrix=missing_mat,het_thresh = het_thresh,n_cores = n_cores) }else{ AA_Matrix_NonSyn <- AATblToMatrix(AA_Tbl_Files = AA_variant_files,phylo_snps=phylo_snps,missing_matrix=missing_mat,het_thresh = het_thresh,n_cores = n_cores) } AA_Matrix_Syn <- NucSynTblToMatrix(AA_Tbl_Files = Syn_variant_files,phylo_snps=phylo_snps,missing_matrix = missing_mat,het_thresh = het_thresh,n_cores = n_cores) if(IS_Deletion){ del_matrix <- as.matrix(del_tbl[,-c('GNUMBER','LINEAGE','NUMBER_OF_DELETED_GENES')]) del_matrix[del_matrix==1] <- 2 #Set all deletions as Homo calls colnames(del_matrix) <- paste0(colnames(del_matrix),':NA:del') rownames(del_matrix) <- del_tbl$GNUMBER matrix_to_append <- matrix(NA,nrow = nrow(AA_Matrix_NonSyn),ncol = ncol(del_matrix)) rownames(matrix_to_append) <- rownames(AA_Matrix_NonSyn) colnames(matrix_to_append) <- colnames(del_matrix) matrix_to_append <- del_matrix[rownames(AA_Matrix_NonSyn),] AA_Matrix_NonSyn <- cbind(AA_Matrix_NonSyn,matrix_to_append) } Gene_Burden <- GetGeneBurden(AA_Matrix = AA_Matrix_NonSyn,AA_Matrix_Syn = AA_Matrix_Syn,metadata=metadata) AA_Matrix <- Gene_Burden } saveRDS(AA_Matrix,file = out_path) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 | library(pbmcapply) library(dplyr) library(data.table) library(glue) library(stringr) set.seed(12345) GetResults <- function(OUT_DIR,suffix = 'glm.logistic.hybrid',p_thresh=5e-8,n_cores=5,is_interaction = F,is_ordinal = F,tool = 'PLINK'){ all_files <- dir(OUT_DIR) result_files <- all_files[sapply(all_files,function(x) grepl(pattern = suffix,x=x))] if(!is_interaction){ if((tool == 'PLINK' | tool == 'PLINK-FIRTH') & !grepl(x=suffix,pattern = 'gz')){ results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0("awk '{ if (NR == 1 || $13 <= ",p_thresh,") {print} }' ",OUT_DIR,x)),mc.cores = n_cores) }else if ((tool == 'PLINK' | tool == 'PLINK-FIRTH') & grepl(x=suffix,pattern = 'gz')){ results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0('zcat ',OUT_DIR,x," | awk '{ if (NR == 1 || $13 <= ",p_thresh,") {print} }' ")),mc.cores = n_cores) }else if(tool == 'HLA-PLINK' ){ results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0('zcat ',OUT_DIR,x)),mc.cores = n_cores) }else if(tool == 'HLA-PLINK-PERM' ){ results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0('zcat ',OUT_DIR,x),select = c('ID','P')),mc.cores = n_cores) } }else{ if(is_ordinal){ results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0("zcat ",OUT_DIR,x," | awk -F ',' '{ if ($6 <= ",p_thresh,") {print} }' ")),mc.cores = n_cores) }else{ results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0("zcat ",OUT_DIR,x," | grep 'ADDx' | awk '{ if ($12 <= ",p_thresh,") {print} }' ")),mc.cores = n_cores) } } names(results) <- result_files return(results) } RunG2G <- function(G2G_Obj,SOFTWARE_DIR,OUT_DIR,tool,n_PC = 5,n_pPC = 6,n_cores = 20,covars_to_incl = c(), lineage = c(),stratified = T,debug=F,chr = c(),test_snp = c(),model = NA,var_type = 'both'){ system(glue::glue("mkdir -p {OUT_DIR}")) OUT_DIR <- glue::glue("{OUT_DIR}/{tool}/") system(glue::glue("mkdir -p {OUT_DIR}")) OUT_DIR <- glue::glue("{OUT_DIR}/PC_{n_PC}_pPC_{n_pPC}/") system(glue::glue("mkdir -p {OUT_DIR}")) # if(length(covars_to_incl) > 0){ # OUT_DIR <- glue::glue("{OUT_DIR}/PC_{n_PC}_pPC_{n_pPC}_cov_{paste0(sapply(covars_to_incl,function(x) gsub(x=x,pattern='_',replacement='')),collapse = '-')}/") # }else{ # OUT_DIR <- glue::glue("{OUT_DIR}/PC_{n_PC}_pPC_{n_pPC}/") # } OUT_DIR <- glue::glue("{OUT_DIR}/Stratified_{str_to_title(as.character(stratified))}/") system(glue::glue("mkdir -p {OUT_DIR}")) #Initialize results object saveRDS(list(),glue::glue("{OUT_DIR}/G2G_results.rds")) if (tool == 'PLINK' | tool == 'PLINK-FIRTH' | tool == 'HLA-PLINK' | tool == 'HLA-PLINK-PERM'){ if(length(lineage)==0 & stratified){ lineages_to_run <- unique(names(G2G_Obj$aa_matrix_filt)) }else if(length(lineage) > 0 & stratified){ lineages_to_run <- lineage }else if(!stratified){ lineages_to_run <- 'ALL' } for(i in 1:length(lineages_to_run)){ cur_lineage <- lineages_to_run[i] OUT_PATH_Lineage <- glue::glue("{OUT_DIR}/LINEAGE_{cur_lineage}/") system(glue::glue("mkdir -p {OUT_PATH_Lineage}")) system(glue::glue("mkdir -p {OUT_PATH_Lineage}/tmp")) #Get IDD and FID, ensure they're in correct order fam_file <- data.table::fread(glue::glue("{G2G_Obj$host_path[cur_lineage]}.fam")) colnames(fam_file)[1:2] <- c('FID','IID') if(!all(fam_file$IID == G2G_Obj$both_IDs_to_keep[[cur_lineage]]$FAM_ID)){ stop('Sample order incorrect') } #Extract AA matrix for current lineage if(stratified){ cur_aa_matrix_filt <- G2G_Obj$aa_matrix_filt[[cur_lineage]] }else{ cur_aa_matrix_filt <- G2G_Obj$aa_matrix_full } #Extract/convert dosage #If burden, > 1 (more than one variant in gene) also as present #Var_Type = Both, treat homo and hetero calls as present #Var_Type = Homo, treat homo calls as present, hetero calls as missing (encoded as NA) if(grepl(pattern = 'Burden_True',x=OUT_DIR)){ cur_aa_matrix_filt[cur_aa_matrix_filt > 1] <- 1 } else if(var_type == 'both' | var_type == 'homo'){ cur_aa_matrix_filt[cur_aa_matrix_filt==2] <- 1 }else{ stop('Invalid Var Type') } #Append IDs to matrix cur_aa_Matrix_mat <- cbind(fam_file[,c(1,2)],as.matrix(cur_aa_matrix_filt)) colnames(cur_aa_Matrix_mat) <- c('FID','IID',colnames(cur_aa_matrix_filt)) data.table::fwrite(cur_aa_Matrix_mat,col.names = T,row.names = F,sep = ' ',file = glue::glue("{OUT_PATH_Lineage}/tmp/AA_outcome.txt"),na = 'NA',quote = F) #Store PCs and covars host_PCs <- G2G_Obj$host_PCs[[cur_lineage]] host_PCs <- dplyr::select(host_PCs,'IID',paste0('PC',1:n_PC)) if(n_pPC != 0){ pPCs <- G2G_Obj$vir_pPCs[[cur_lineage]] %>% dplyr::inner_join(G2G_Obj$both_IDs_to_keep[[cur_lineage]]) %>% dplyr::rename(IID=FAM_ID) pPCs <-dplyr::select(pPCs,IID,paste0('PC',1:n_pPC)) colnames(pPCs) <- c('IID',paste0('pPC',1:n_pPC)) } cur_covars_to_incl <- covars_to_incl #Don't include lineage if only running in single lineage. if((nrow(str_locate_all(pattern = 'L',cur_lineage)[[1]]) == 1) & cur_lineage != 'ALL'){ cur_covars_to_incl <- setdiff(covars_to_incl,'LINEAGE') } if(length(cur_covars_to_incl) > 0){ covars_num_discrete <- G2G_Obj$covars[[cur_lineage]] #Add lineage as covariate if more than one lineage in current group if(stratified){ if('LINEAGE' %in% cur_covars_to_incl & (nrow(str_locate_all(pattern = 'L',cur_lineage)[[1]]) > 1 | cur_lineage == 'ALL')){ lineage_df <- G2G_Obj$both_IDs_to_keep[[cur_lineage]] %>% dplyr::select(IID=FAM_ID,LINEAGE) lineage_df$LINEAGE <- as.factor(lineage_df$LINEAGE) lineage_df <- one_hot(as.data.table(lineage_df),cols = 'LINEAGE') lineage_df <- lineage_df %>% dplyr::select(-colnames(lineage_df)[ncol(lineage_df)]) covars_num_discrete <- covars_num_discrete %>% dplyr::left_join(lineage_df) } } covars_num_discrete <- dplyr::select(covars_num_discrete,'IID',contains(cur_covars_to_incl)) covars_num_discrete[is.na(covars_num_discrete)] <- 'NONE' if(n_pPC != 0){ covars <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),host_PCs,by=c('IID'='IID')) %>% dplyr::left_join(pPCs,by=c('IID'='IID')) %>% dplyr::left_join(covars_num_discrete,by=c('IID'='IID')) %>% dplyr::relocate(FID,IID) }else{ covars <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),host_PCs,by=c('IID'='IID')) %>% dplyr::left_join(covars_num_discrete,by=c('IID'='IID')) %>% dplyr::relocate(FID,IID) } }else{ if(n_pPC != 0){ covars <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),host_PCs,by=c('IID'='IID')) %>% dplyr::left_join(pPCs,by=c('IID'='IID')) %>% dplyr::relocate(FID,IID) }else{ covars <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),host_PCs,by=c('IID'='IID')) %>% dplyr::relocate(FID,IID) } } covars[covars==''] <- NA data.table::fwrite(covars,col.names = F,row.names = F,sep = '\t',file = glue::glue("{OUT_PATH_Lineage}/tmp/plink-covars.txt"),na = 'NA',quote = F) #Run association study for each AA variant in the current lineage AA_Matrix_No_ID <- cur_aa_Matrix_mat[,-c(1,2)] if(debug){ end_index <- min(c(ncol(AA_Matrix_No_ID),5)) }else{ end_index <- ncol(AA_Matrix_No_ID) } #Generate Permutations if specified if(tool == 'HLA-PLINK-PERM'){ N_Perm = 100 Alleles_Path <- gsub(x=G2G_Obj$host_path[cur_lineage],pattern = 'G2G',replacement = 'HLA_Alleles_G2G') AA_Path <- gsub(x=G2G_Obj$host_path[cur_lineage],pattern = 'G2G',replacement = 'HLA_AA_G2G') system(glue::glue("mkdir -p {OUT_PATH_Lineage}/HLA_Allele/")) system(glue::glue("mkdir -p {OUT_PATH_Lineage}/HLA_AA/")) true_fam_file <- data.table::fread(glue::glue("{Alleles_Path}.fam")) perm_fam_file <- lapply(1:N_Perm,function(x) true_fam_file[sample(1:nrow(true_fam_file),size = nrow(true_fam_file),replace = F),]) lapply(1:N_Perm,function(x) data.table::fwrite(perm_fam_file[[x]],glue::glue("{OUT_PATH_Lineage}/perm_{x}.fam"),sep = '\t',col.names = F,row.names = F)) } pbmclapply(1:end_index,function(k){ cur_pathogen_variant <- colnames(AA_Matrix_No_ID)[k] if(tool == 'PLINK'){ if(is.na(model)){ tryCatch(system( glue::glue( "~/Software/plink2 --threads {min(c(n_cores,22))} --bfile {G2G_Obj$host_path[cur_lineage]} --glm cc-residualize hide-covar no-x-sex --covar-variance-standardize --1 --pheno {OUT_PATH_Lineage}/tmp/AA_outcome.txt --pheno-col-nums {k+2} --covar {OUT_PATH_Lineage}/tmp/plink-covars.txt --out {OUT_PATH_Lineage}{cur_pathogen_variant}" ) )) }else{ tryCatch(system( glue::glue( "~/Software/plink2 --threads {min(c(n_cores,22))} --bfile {G2G_Obj$host_path[cur_lineage]} --glm {model} cc-residualize hide-covar no-x-sex --covar-variance-standardize --1 --pheno {OUT_PATH_Lineage}/tmp/AA_outcome.txt --pheno-col-nums {k+2} --covar {OUT_PATH_Lineage}/tmp/plink-covars.txt --out {OUT_PATH_Lineage}{cur_pathogen_variant}.{model}" ) )) } system(glue::glue('pigz --fast {OUT_PATH_Lineage}{cur_pathogen_variant}*.hybrid')) }else if(tool == 'HLA-PLINK'){ Alleles_Path <- gsub(x=G2G_Obj$host_path[cur_lineage],pattern = 'G2G',replacement = 'HLA_Alleles_G2G') AA_Path <- gsub(x=G2G_Obj$host_path[cur_lineage],pattern = 'G2G',replacement = 'HLA_AA_G2G') system(glue::glue("mkdir -p {OUT_PATH_Lineage}/HLA_Allele/")) system(glue::glue("mkdir -p {OUT_PATH_Lineage}/HLA_AA/")) tryCatch(system( glue::glue( "~/Software/plink2 --threads {min(c(n_cores,22))} --bfile {Alleles_Path} --glm dominant cc-residualize hide-covar no-x-sex --covar-variance-standardize --1 --pheno {OUT_PATH_Lineage}/tmp/AA_outcome.txt --pheno-col-nums {k+2} --covar {OUT_PATH_Lineage}/tmp/plink-covars.txt --out {OUT_PATH_Lineage}/HLA_Allele/{cur_pathogen_variant}" ) )) tryCatch(system( glue::glue( "~/Software/plink2 --threads {min(c(n_cores,22))} --bfile {AA_Path} --glm dominant cc-residualize hide-covar no-x-sex --covar-variance-standardize --1 --pheno {OUT_PATH_Lineage}/tmp/AA_outcome.txt --pheno-col-nums {k+2} --covar {OUT_PATH_Lineage}/tmp/plink-covars.txt --out {OUT_PATH_Lineage}/HLA_AA/{cur_pathogen_variant}" ) )) system(glue::glue('pigz --fast {OUT_PATH_Lineage}/HLA_Allele/{cur_pathogen_variant}*.hybrid')) system(glue::glue('pigz --fast {OUT_PATH_Lineage}/HLA_AA/{cur_pathogen_variant}*.hybrid')) }else if(tool == 'HLA-PLINK-PERM'){ for (q in 1:N_Perm){ system(glue::glue("mkdir -p {OUT_PATH_Lineage}/HLA_Allele/Perm_{q}/")) system(glue::glue("mkdir -p {OUT_PATH_Lineage}/HLA_AA/Perm_{q}/")) tryCatch(system( glue::glue( "~/Software/plink2 --threads {min(c(n_cores,22))} --bfile {Alleles_Path} --fam {OUT_PATH_Lineage}/perm_{q}.fam --glm dominant cc-residualize hide-covar no-x-sex --covar-variance-standardize --1 --pheno {OUT_PATH_Lineage}/tmp/AA_outcome.txt --pheno-col-nums {k+2} --covar {OUT_PATH_Lineage}/tmp/plink-covars.txt --out {OUT_PATH_Lineage}/HLA_Allele/Perm_{q}/{cur_pathogen_variant}" ) )) tryCatch(system( glue::glue( "~/Software/plink2 --threads {min(c(n_cores,22))} --bfile {AA_Path} --fam {OUT_PATH_Lineage}/perm_{q}.fam --glm dominant cc-residualize hide-covar no-x-sex --covar-variance-standardize --1 --pheno {OUT_PATH_Lineage}/tmp/AA_outcome.txt --pheno-col-nums {k+2} --covar {OUT_PATH_Lineage}/tmp/plink-covars.txt --out {OUT_PATH_Lineage}/HLA_AA/Perm_{q}/{cur_pathogen_variant}" ) )) system(glue::glue('pigz --fast {OUT_PATH_Lineage}/HLA_Allele/Perm_{q}/{cur_pathogen_variant}*.hybrid')) system(glue::glue('pigz --fast {OUT_PATH_Lineage}/HLA_AA/Perm_{q}/{cur_pathogen_variant}*.hybrid')) } }else if(tool == 'PLINK-FIRTH'){ tryCatch(system( glue::glue( "~/Software/plink2 --threads {min(c(n_cores,22))} --bfile {G2G_Obj$host_path[cur_lineage]} --no-sex --glm firth hide-covar --1 --pheno {OUT_PATH_Lineage}/tmp/AA_outcome.txt --pheno-col-nums {k+2} --covar {OUT_PATH_Lineage}/tmp/plink-covars.txt --out {OUT_PATH_Lineage}{cur_pathogen_variant}" ) )) } },mc.cores = max(floor(n_cores / 22),1)) if(tool == 'PLINK'){ results <- list(GetResults(OUT_PATH_Lineage,suffix = 'glm.logistic.hybrid.gz',p_thresh=5e-8,n_cores=n_cores,is_interaction = F,is_ordinal = F,tool = tool)) names(results) <- cur_lineage all_res <- readRDS(glue::glue("{OUT_DIR}/G2G_results.rds")) all_res <- c(all_res,results) saveRDS(all_res,glue::glue("{OUT_DIR}/G2G_results.rds")) }else if(tool == 'HLA-PLINK'){ results_allele <- list(GetResults(paste0(OUT_PATH_Lineage,'HLA_Allele/'),suffix = 'glm.logistic.hybrid.gz',p_thresh=1,n_cores=n_cores,is_interaction = F,is_ordinal = F,tool = tool)) names(results_allele) <- cur_lineage results_AA <- list(GetResults(paste0(OUT_PATH_Lineage,'HLA_AA/'),suffix = 'glm.logistic.hybrid.gz',p_thresh=1,n_cores=n_cores,is_interaction = F,is_ordinal = F,tool = tool)) names(results_AA) <- cur_lineage all_res <- readRDS(glue::glue("{OUT_DIR}/G2G_results.rds")) all_res <- c(all_res,list(HLA_Allele = results_allele,HLA_AA = results_AA)) saveRDS(all_res,glue::glue("{OUT_DIR}/G2G_results.rds")) } } } } args <- commandArgs(trailingOnly = TRUE) G2G_Obj <- readRDS(args[[1]]) OUT_DIR <- args[[2]] tool <- args[[3]] n_PC <- as.numeric(args[[4]]) n_pPC <- as.numeric(args[[5]]) covars_to_incl <- args[[6]] if(covars_to_incl == 'NA'){ covars_to_incl <- c() }else{ covars_to_incl <- strsplit(covars_to_incl,split = ',')[[1]] } stratified <- as.logical(args[[7]]) homo_only <- as.logical(args[[8]]) n_cores <- as.numeric(args[[9]]) # G2G_Obj <- readRDS('../scratch/Burden_False_SIFT_False_Del_False_HomoOnly_True/G2G_Obj.rds') # OUT_DIR <- '../results/Burden_False_SIFT_False_Del_True_HomoOnly_True/' # tool <- 'PLINK' # n_PC <- 3 # n_pPC <- 0 # covars_to_incl <- c('patient_sex') # if(covars_to_incl == 'NA'){ # covars_to_incl <- c() # }else{ # covars_to_incl <- strsplit(covars_to_incl,split = ',')[[1]] # } # stratified <- F # homo_only <- T # n_cores <- 10 RunG2G(G2G_Obj,SOFTWARE_DIR,OUT_DIR,tool = tool,n_cores = n_cores,n_PC = n_PC,n_pPC = n_pPC,debug = F,covars_to_incl = covars_to_incl, model = NA,stratified = stratified,var_type = ifelse(homo_only,'homo','both')) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 | library(pbmcapply) library(dplyr) library(data.table) library(glue) library(stringr) GetResults <- function(OUT_DIR,suffix = 'glm.logistic.hybrid',p_thresh=5e-8,n_cores=5,is_interaction = F,is_ordinal = F,tool = 'PLINK'){ all_files <- dir(OUT_DIR) result_files <- all_files[sapply(all_files,function(x) grepl(pattern = suffix,x=x))] if(!is_interaction){ if((tool == 'PLINK' | tool == 'PLINK-FIRTH') & !grepl(x=suffix,pattern = 'gz')){ results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0("awk '{ if (NR == 1 || $13 <= ",p_thresh,") {print} }' ",OUT_DIR,x)),mc.cores = n_cores) }else if ((tool == 'PLINK' | tool == 'PLINK-FIRTH') & grepl(x=suffix,pattern = 'gz')){ results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0('zcat ',OUT_DIR,x," | awk '{ if (NR == 1 || $13 <= ",p_thresh,") {print} }' ")),mc.cores = n_cores) }else if(tool == 'GMMAT-SCORE'){ results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0('zcat ',OUT_DIR,x," | awk '{ if (NR == 1 || $11 <= ",p_thresh,") {print} }' ")),mc.cores = n_cores) } }else{ if(is_ordinal){ results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0("zcat ",OUT_DIR,x," | awk -F ',' '{ if ($6 <= ",p_thresh,") {print} }' ")),mc.cores = n_cores) }else{ results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0("zcat ",OUT_DIR,x," | grep 'ADDx' | awk '{ if ($12 <= ",p_thresh,") {print} }' ")),mc.cores = n_cores) } } names(results) <- result_files return(results) } RunInteraction <- function(G2G_Obj,OUT_DIR,tool = 'PLINK',n_PC = 3,n_pPC = 0,n_cores = 20,covars_to_incl = c(),lineage = c(),by_lineage = F,stratified = T,debug=F,model = NA,var_type = 'both',pheno = 'TB_score'){ OUT_DIR <- glue::glue("{OUT_DIR}/{tool}/") system(glue::glue("mkdir -p {OUT_DIR}")) OUT_DIR <- glue::glue("{OUT_DIR}/PC_{n_PC}_pPC_{n_pPC}/") system(glue::glue("mkdir -p {OUT_DIR}")) OUT_DIR <- glue::glue("{OUT_DIR}/Stratified_{str_to_title(as.character(stratified))}/") system(glue::glue("mkdir -p {OUT_DIR}")) saveRDS(list(),glue::glue("{OUT_DIR}/{pheno}_int_results.rds")) if(length(lineage)==0 & stratified){ lineages_to_run <- unique(names(G2G_Obj$aa_matrix_filt)) }else if(length(lineage) > 0 & stratified){ lineages_to_run <- lineage }else if(!stratified){ lineages_to_run <- 'ALL' } for(i in 1:length(lineages_to_run)){ cur_lineage <- lineages_to_run[i] OUT_DIR_Lineage <- glue::glue("{OUT_DIR}/LINEAGE_{cur_lineage}/") system(glue::glue("mkdir -p {OUT_DIR_Lineage}")) system(glue::glue("mkdir -p {OUT_DIR_Lineage}/tmp")) #Get IDD and FID, ensure they're in correct order fam_file <- data.table::fread(glue::glue("{G2G_Obj$host_path[cur_lineage]}.fam")) colnames(fam_file)[1:2] <- c('FID','IID') if(!all(fam_file$IID == G2G_Obj$both_IDs_to_keep[[cur_lineage]]$FAM_ID)){ stop('Sample order incorrect') } #Store AA matrix if(!by_lineage & stratified){ cur_aa_matrix_filt <- G2G_Obj$aa_matrix_filt[[cur_lineage]] }else if(!by_lineage & !stratified){ cur_aa_matrix_filt <- G2G_Obj$aa_matrix_full } else if(by_lineage){ #SNP x LINEAGE interaction cur_aa_matrix_filt <- as.matrix(one_hot(as.data.table(data.frame(LINEAGE = G2G_Obj$vir_pPCs$ALL$LINEAGE)))) } #Extract/convert dosage #Var_Type = Both, treat homo and hetero calls as present #Var_Type = Homo, treat homo calls as present, hetero calls as absent if(var_type == 'both'){ cur_aa_matrix_filt[cur_aa_matrix_filt==2] <- 1 }else if (var_type == 'homo'){ cur_aa_matrix_filt[cur_aa_matrix_filt==1] <- 0 cur_aa_matrix_filt[cur_aa_matrix_filt==2] <- 1 }else{ stop('Invalid Var Type') } #Store PCs and covars host_PCs <- G2G_Obj$host_PCs[[cur_lineage]] host_PCs <- dplyr::select(host_PCs,'IID',paste0('PC',1:n_PC)) if(n_pPC != 0){ pPCs <- G2G_Obj$vir_pPCs[[cur_lineage]] pPCs <-dplyr::select(pPCs,IID,paste0('PC',1:n_pPC)) colnames(pPCs) <- c('IID',paste0('pPC',1:n_pPC)) } if(length(covars_to_incl) > 0){ covars_num_discrete <- G2G_Obj$covars[[cur_lineage]] covars_num_discrete <- dplyr::select(covars_num_discrete,'IID',covars_to_incl) if(n_pPC != 0){ covars <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),host_PCs,by=c('IID'='IID')) %>% dplyr::left_join(pPCs,by=c('IID'='IID')) %>% dplyr::left_join(covars_num_discrete,by=c('IID'='IID')) %>% dplyr::relocate(FID,IID) }else{ covars <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),host_PCs,by=c('IID'='IID')) %>% dplyr::left_join(covars_num_discrete,by=c('IID'='IID')) %>% dplyr::relocate(FID,IID) } }else{ if(n_pPC != 0){ covars <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),host_PCs,by=c('IID'='IID')) %>% dplyr::left_join(pPCs,by=c('IID'='IID')) %>% dplyr::relocate(FID,IID) }else{ covars <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),host_PCs,by=c('IID'='IID')) %>% dplyr::relocate(FID,IID) } } #Change categorial binary covariates to numeric (0 and 1) cat_covars <- which(sapply(3:ncol(covars),function(q) any(is.character(as.vector(t(covars[,..q])))))) if(length(cat_covars) > 0){ for(col in (cat_covars+2)){ covars[,col] <- as.integer(as.factor(t(covars[,..col]))) - 1 } } #Write out Phenotype if(pheno == 'TB_score'){ tb_score <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),G2G_Obj$tb_score[[cur_lineage]] %>% dplyr::select(IID,TB_score),by=c('IID'='IID')) %>% dplyr::relocate(FID,IID,TB_score) data.table::fwrite(tb_score,col.names = T,quote = F,row.names = F,sep = ' ',file = glue::glue("{OUT_DIR_Lineage}/tmp/TB_score.txt"),na = 'NA') }else if(pheno == 'Xray_score'){ xray_score <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),G2G_Obj$xray_score[[cur_lineage]] %>% dplyr::select(IID,Xray_score),by=c('IID'='IID')) %>% dplyr::relocate(FID,IID,Xray_score) data.table::fwrite(xray_score,col.names = T,quote = F,row.names = F,sep = ' ',file = glue::glue("{OUT_DIR_Lineage}/tmp/Xray_score.txt"),na = 'NA') }else{ stop('Invalid Pheno') } #Run GWAS with lineage as covariate if(by_lineage){ cur_covar <- cbind(covars[,1:2],data.frame(LINEAGE = G2G_Obj$vir_pPCs$ALL$LINEAGE),covars[,-c(1,2)]) data.table::fwrite(cur_covar,col.names = T,quote = F,row.names = F,sep = ' ',na = 'NA',file = glue::glue("{OUT_DIR_Lineage}/tmp/lineage_covar.txt")) system( glue::glue( "plink2 --threads {n_cores} --bfile {G2G_Obj$host_path[cur_lineage]} --covar-variance-standardize --linear no-x-sex --pheno {OUT_DIR_Lineage}/tmp/{pheno}.txt --covar {OUT_DIR_Lineage}/tmp/lineage_covar.txt --out {OUT_DIR_Lineage}Lineage_GWAS" ) ) discrete_covar <- dplyr::select(cur_covar,c('FID','IID',intersect(colnames(cur_covar),c('Patient_Sex','HIV_Status','LINEAGE')))) num_covar <- dplyr::select(cur_covar,c('FID','IID',setdiff(colnames(cur_covar),colnames(discrete_covar)))) data.table::fwrite(discrete_covar,col.names = T,quote = F,row.names = F,sep = ' ',na = 'NA',file = glue::glue("{OUT_DIR_Lineage}/tmp/lineage_discrete_covar.txt")) data.table::fwrite(num_covar,col.names = T,quote = F,row.names = F,sep = ' ',na = 'NA',file = glue::glue("{OUT_DIR_Lineage}/tmp/lineage_num_covar.txt")) system( glue::glue( "gcta64 --bfile {G2G_Obj$host_path[cur_lineage]} --make-grm --out {G2G_Obj$host_path[cur_lineage]} --thread-num {n_cores}" ) ) system( glue::glue( "gcta64 --mlma --bfile {G2G_Obj$host_path[cur_lineage]} --grm {G2G_Obj$host_path[cur_lineage]} --autosome --pheno {OUT_DIR_Lineage}/tmp/{pheno}.txt --covar {OUT_DIR_Lineage}/tmp/lineage_discrete_covar.txt --qcovar {OUT_DIR_Lineage}/tmp/lineage_num_covar.txt --thread-num {n_cores} --out {OUT_DIR_Lineage}Lineage_GWAS" ) ) } if(debug){ end_index <- min(c(ncol(cur_aa_matrix_filt),2)) }else{ end_index <- ncol(cur_aa_matrix_filt) } for(k in 1:end_index){ tryCatch({ if(by_lineage){ cur_pathogen_variant <- colnames(cur_aa_matrix_filt)[k] cur_covar <- cbind(covars[,1:2],cur_aa_matrix_filt[,k,drop=FALSE],cur_aa_matrix_filt[,-k][,-1],covars[,-c(1,2)]) data.table::fwrite(cur_covar,col.names = T,quote = F,row.names = F,sep = ' ',na = 'NA',file = glue::glue("{OUT_DIR_Lineage}/tmp/{cur_pathogen_variant}.txt")) system( glue::glue( "plink2 --threads {n_cores} --bfile {G2G_Obj$host_path[cur_lineage]} --covar-variance-standardize --linear no-x-sex interaction --parameters 1-{ncol(cur_covar)} --pheno {OUT_DIR_Lineage}/tmp/{pheno}.txt --covar {OUT_DIR_Lineage}/tmp/{cur_pathogen_variant}.txt --out {OUT_DIR_Lineage}{cur_pathogen_variant}" ) ) }else{ cur_pathogen_variant <- colnames(cur_aa_matrix_filt)[k] cur_covar <- cbind(covars[,1:2],cur_aa_matrix_filt[,k,drop=FALSE],covars[,-c(1,2)]) data.table::fwrite(cur_covar,col.names = T,quote = F,row.names = F,sep = ' ',na = 'NA',file = glue::glue("{OUT_DIR_Lineage}/tmp/{cur_pathogen_variant}.txt")) system( glue::glue( "plink2 --threads {n_cores} --bfile {G2G_Obj$host_path[cur_lineage]} --covar-variance-standardize --linear no-x-sex interaction --parameters 1-{ncol(cur_covar)} --pheno {OUT_DIR_Lineage}/tmp/{pheno}.txt --covar {OUT_DIR_Lineage}/tmp/{cur_pathogen_variant}.txt --out {OUT_DIR_Lineage}{cur_pathogen_variant}" ) ) system(glue::glue("grep 'ADD' {OUT_DIR_Lineage}{cur_pathogen_variant}.{pheno}.glm.linear > {OUT_DIR_Lineage}{cur_pathogen_variant}.{pheno}.glm.linear.add")) system(glue::glue("pigz --fast {OUT_DIR_Lineage}{cur_pathogen_variant}.{pheno}.glm.linear.add")) system(glue::glue("rm {OUT_DIR_Lineage}{cur_pathogen_variant}.{pheno}.glm.linear")) } } ) } results <- list(GetResults(OUT_DIR_Lineage,suffix = glue::glue('{pheno}.glm.linear.add.gz'),p_thresh = 5e-8,n_cores = n_cores,is_interaction = T,is_ordinal = F,tool = tool)) names(results) <- cur_lineage all_res <- readRDS(glue::glue("{OUT_DIR}/{pheno}_int_results.rds")) all_res <- c(all_res,results) saveRDS(all_res,glue::glue("{OUT_DIR}/{pheno}_int_results.rds")) } } args <- commandArgs(trailingOnly = TRUE) G2G_Obj <- readRDS(args[[1]]) OUT_DIR <- args[[2]] tool <- args[[3]] n_PC <- as.numeric(args[[4]]) n_pPC <- as.numeric(args[[5]]) covars_to_incl <- args[[6]] if(covars_to_incl == 'NA'){ covars_to_incl <- c() }else{ covars_to_incl <- strsplit(covars_to_incl,split = ',')[[1]] } stratified <- as.logical(args[[7]]) homo_only <- as.logical(args[[8]]) pheno <- args[[9]] n_cores <- as.numeric(args[[10]]) # G2G_Obj <- readRDS('../scratch/Burden_False_SIFT_False_Del_False_HomoOnly_True/G2G_Obj.rds') # OUT_DIR <- '../results/Burden_False_SIFT_False_Del_False_HomoOnly_True/' # tool <- 'PLINK' # n_PC <- 3 # n_pPC <- 0 # covars_to_incl <- c('patient_sex') # if(covars_to_incl == 'NA'){ # covars_to_incl <- c() # }else{ # covars_to_incl <- strsplit(covars_to_incl,split = ',')[[1]] # } # stratified <- F # homo_only <- T # n_cores <- 10 # pheno <- 'TB_score' RunInteraction(G2G_Obj,OUT_DIR, tool = tool, n_cores = n_cores,n_PC = n_PC,n_pPC = n_pPC, covars_to_incl = covars_to_incl, debug = F,by_lineage = F,stratified=stratified,var_type = ifelse(homo_only,'homo','both'),pheno = pheno) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | library(dplyr) library(parallel) Map1KG <- function(bim_1KG,cur_lineage,scratch_dir){ cur_lineage <- paste0('LINEAGE_',cur_lineage) if(!file.exists(glue::glue("{scratch_dir}{cur_lineage}/mapped_ids.txt"))){ bim_1KG <- data.table::fread(bim_1KG,header = F) bim_1KG$V1 <- as.character(bim_1KG$V1) cur_host_bim <- data.table::fread(glue::glue("{scratch_dir}{cur_lineage}/TB_DAR_G2G.bim"),header = F) jned_file <- dplyr::left_join(cur_host_bim,bim_1KG,by=c('V1'='V1','V4'='V4','V5'='V5','V6'='V6')) data.table::fwrite(jned_file %>% dplyr::select(ID_Host=V2.x,ID_1KG=V2.y),sep = '\t', file = glue::glue("{scratch_dir}{cur_lineage}/mapped_ids.txt")) cur_mapped_ids <- data.table::fread(glue::glue("{scratch_dir}{cur_lineage}/mapped_ids.txt")) ID_1KG <- cur_mapped_ids$ID_1KG ID_1KG[ID_1KG == ""] <- NA write(ID_1KG,glue::glue("{scratch_dir}{cur_lineage}/mapped_1KG_ids.txt")) remove(cur_mapped_ids) gc() } } RunPASCAL <- function(GWAS_file,variant,scratch_dir,results_dir,PASCAL_path,filt = T,gene_scoring = 'max',cur_lineage='ALL'){ system(glue::glue("mkdir -p {results_dir}")) cur_lineage <- paste0('LINEAGE_',cur_lineage) #Write summary stats if(grepl(pattern = 'glm.logistic.hybrid.gz',x=GWAS_file)){ system(glue::glue("zcat {GWAS_file} | awk '{{if(NR > 1) print $13}}' > {results_dir}/{variant}.tmp")) }else{ system(glue::glue("zcat {GWAS_file} | grep 'ADDx' | awk '{{print $12}}' > {results_dir}/{variant}.tmp")) } system(glue::glue("paste {scratch_dir}{cur_lineage}/mapped_1KG_ids.txt {results_dir}/{variant}.tmp | grep -v 'NA' > {results_dir}/{variant}.pascal")) system(glue::glue("pigz --fast {results_dir}/{variant}.pascal")) system(glue::glue("rm {results_dir}/{variant}.tmp")) #Run PASCAL cur_wd = getwd() setwd(PASCAL_path) if(gene_scoring == 'max'){ system(glue::glue("./Pascal --pval {results_dir}{variant}.pascal.gz --outdir={results_dir} --customdir=./resources/1kg_AFR/ --custom=AFR --runpathway=on --genescoring=max > {results_dir}{variant}.log")) }else if(gene_scoring == 'sum'){ system(glue::glue("./Pascal --pval {results_dir}{variant}.pascal.gz --outdir={results_dir} --customdir=./resources/1kg_AFR/ --custom=AFR --runpathway=on --genescoring=sum > {results_dir}{variant}.log")) }else{ stop('gene_scoring misspecified') } setwd(cur_wd) } args <- commandArgs(trailingOnly = T) gwas_file <- args[[1]] variant <- args[[2]] bim_1KG <- args[[3]] scratch_dir <- args[[4]] Map1KG(bim_1KG,'ALL',scratch_dir) results_dir <- args[[5]] pascal_path <- args[[6]] RunPASCAL(gwas_file,variant,scratch_dir,results_dir,pascal_path,filt = T) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 | library(dplyr) library(ape) library(phylobase) library(adephylo) #Minor allele count of a TB variant GetMAC <- function(AA_Matrix_filt){ MAC <- apply(AA_Matrix_filt,2,function(x) { no_na <- x[!is.na(x)] if(length(no_na) == 0){ return(0) } no_na_binary <- ifelse(no_na == 0,0,1) #Set to binary matrix (0 for absent, 1 more present - either homo or hetero call or burden) return(ifelse(length(unique(no_na_binary)) != 1,min(table(no_na_binary)),length(no_na_binary) - min(table(no_na_binary)))) }) return(MAC) } #Phylogenetic PC ConstructpPCA <- function(AA_Table,Phylo_Tree_Path,ID_Mapping_df,pPCA = T,mac_thresh = 5){ tree <- ape::read.tree(Phylo_Tree_Path) ID_Mapping_df_filt <- dplyr::filter(ID_Mapping_df,G_NUMBER %in% tree$tip.label & !is.na(LINEAGE)) #Separate based on LINEAGES unique_lineages <- sort(unique(ID_Mapping_df_filt$LINEAGE)) unique_lineages <- c(unique_lineages,unlist(sapply(2:(length(unique_lineages)-1),function(x) apply(combn(unique_lineages,x),2,function(y) paste0(sort(y),collapse = '_'))))) lineage_samples <- lapply(unique_lineages,function(x) dplyr::filter(ID_Mapping_df_filt,LINEAGE %in% strsplit(x,split = '_')[[1]])$G_NUMBER) #Add in index which cover all LINEAGES lineage_samples <- c(list(ID_Mapping_df_filt$G_NUMBER),lineage_samples) names(lineage_samples) <- c('ALL',unique_lineages) vir_pca <- vector(mode = 'list',length = length(lineage_samples)) names(vir_pca) <- c('ALL',unique_lineages) for(i in 1:length(lineage_samples)){ tree_subset <-ape::keep.tip(tree, lineage_samples[[i]]) AA_Table_filt <- AA_Table[tree_subset$tip.label,] MAC <- GetMAC(AA_Table_filt) if(pPCA){ vir_4d <- phylobase::phylo4d(tree_subset, AA_Table_filt[,MAC > mac_thresh]) vir_pca[[i]] <- adephylo::ppca( vir_4d, scale = FALSE, center = T, scannf = F, nfposi = 10, method = "Abouheif" ) }else{ vir_pca[[i]] <- prcomp(AA_Table_filt[,MAC > mac_thresh],center = T,scale = F) } } return(vir_pca) } #Filter AA variants according to MAC and Group variants according to lineage #If a variant appears in multiple lineages, do unstratified analysis. #If a variant appears in only 1 lineage, do stratified analysis. FilterAAMatrix <- function(AA_Matrix,Lineage_Df,MAC_Thresh){ #Keep samples where Host data exists AA_Matrix_filt <- AA_Matrix[Lineage_Df$ALL$G_NUMBER,,drop=FALSE] #Keep variants which pass unstratified MAC threshold MAC_AA_Matrix_filt <- GetMAC(AA_Matrix_filt) AA_Matrix_filt <- AA_Matrix_filt[,MAC_AA_Matrix_filt > MAC_Thresh,drop=FALSE] #Remove Missing Variants/Genes AA_Matrix_filt <- AA_Matrix_filt[,!apply(AA_Matrix_filt,2,function(x) all(is.na(x))),drop=FALSE] #Assign variants to lineages strat_table <- lapply(1:ncol(AA_Matrix_filt),function(i) table(ifelse(AA_Matrix_filt[,i]==0,0,1),Lineage_Df$ALL$LINEAGE)) strat_assng <- vector(mode = 'character',length = length(strat_table)) for(i in 1:length(strat_table)){ cur_strat <- strat_table[[i]] MAC_by_lineage <- apply(cur_strat,2,function(x) min(x,na.rm = T)) #If a variant is perfectly stratified by lineage and there are no variability within a lineage, exclude from analysis if(sum(MAC_by_lineage > MAC_Thresh) == 0){ strat_assng[i] <- NA #If a variant only show variability in a single lineage, assign it to that lineage }else if (sum(MAC_by_lineage > MAC_Thresh) == 1){ strat_assng[i] <- names(MAC_by_lineage)[which(MAC_by_lineage > MAC_Thresh)] }else{ strat_assng[i] <- paste0(sort(names(MAC_by_lineage)[which(MAC_by_lineage > MAC_Thresh)]),collapse = '_') if(strat_assng[i] == 'L1_L2_L3_L4'){ strat_assng[i] <- 'ALL' } } } #Construct AA Matrix for each lineage uniq_assgn <- sort(unique(strat_assng)) uniq_assgn <- uniq_assgn[!is.na(uniq_assgn)] #Exclude ALL lineage (Will be run seperately anyways) #uniq_assgn <- uniq_assgn[uniq_assgn != paste0(sort(unique(Lineage_Df$ALL$LINEAGE)),collapse = '_')] #For each gene, assign to lineage groups AA_Matrix_filt_by_lineage <- lapply(uniq_assgn,function(x) AA_Matrix_filt[Lineage_Df[[x]]$G_NUMBER,which(strat_assng == x),drop=FALSE]) names(AA_Matrix_filt_by_lineage) <- uniq_assgn return(AA_Matrix_filt_by_lineage) } SetUpG2G <- function(OUT_DIR,Metadata_Path,Host_PC_Path,Var_Tbl_Path,Phylo_Tree_Path,Mtb_Nuc_Out,Host_MAF,Pathogen_MAC_pPCA,Pathogen_MAC_AA_Lineage,Pathogen_MAC_AA,Pathogen_Missing,BFILE_Path,Host_Files,n_cores){ #Get metadata file parsed_mapping <- data.table::fread(Metadata_Path) %>% dplyr::select(PATIENT_ID,G_NUMBER,LINEAGE=Lineage) #Get nucleotide variant matrix nuc_matrix <- data.table::fread(Mtb_Nuc_Out) colnames(nuc_matrix) <- sapply(colnames(nuc_matrix),function(x) strsplit(x=x,split = ']')[[1]][2]) nuc_matrix_trans <- nuc_matrix[,-c('CHROM','POS','REF','ALT')] nuc_matrix_trans <- t(as.matrix(nuc_matrix_trans)) nuc_matrix_trans[nuc_matrix_trans==2] <- 1 rownames(nuc_matrix_trans) <- colnames(nuc_matrix[,-c('CHROM','POS','REF','ALT')]) colnames(nuc_matrix_trans) <- paste0(nuc_matrix$POS,'_',nuc_matrix$REF,'_',nuc_matrix$ALT) #Calculate Mtb pPCs vir_pPCA <- ConstructpPCA(nuc_matrix_trans,Phylo_Tree_Path,parsed_mapping,Pathogen_MAC_pPCA) vir_pPCs <- lapply(vir_pPCA,function(x) cbind(G_NUMBER=rownames(x$li),as.data.frame(x$li)) %>% dplyr::left_join(parsed_mapping)) #Get consensus samples host_IDs <- data.table::fread(glue::glue("{BFILE_Path}.fam"))$V2 host_IDs_simple <- sapply(host_IDs,function(x) gsub(x=x,pattern = 'WGS_Fellay.',replacement = '')) host_IDs_simple <- sapply(host_IDs_simple,function(x) gsub(x=x,pattern = 'Batch1_',replacement = '')) host_IDs_simple <- sapply(host_IDs_simple,function(x) gsub(x=x,pattern = 'Batch2_',replacement = '')) #Initialize variables to store both_IDs_to_keep <- vector(mode = 'list',length = length(vir_pPCs)); names(both_IDs_to_keep) <- names(vir_pPCs) host_PCs <- vector(mode = 'list',length = length(vir_pPCs)); names(host_PCs) <- names(vir_pPCs) covars <- vector(mode = 'list',length = length(vir_pPCs)); names(covars) <- names(vir_pPCs) #Get Host PCs host_PCs_raw <- data.table::fread(Host_PC_Path) #Get Covars covars_discrete <- data.table::fread(glue::glue("{Host_Files}/covars_discrete"),sep = '\t',na.strings = c("",'unknown')) covars_numeric <- data.table::fread(glue::glue("{Host_Files}/covars_numeric"),sep = '\t',na.strings = c("",'unknown')) covars_raw <- dplyr::inner_join(covars_discrete,covars_numeric) #Loop through lineages, store PLINK files, covars, and host PCs for(i in 1:length(vir_pPCs)){ system(glue::glue('mkdir -p {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/')) pathogen_IDs_to_keep <- dplyr::filter(parsed_mapping,G_NUMBER %in% vir_pPCs[[i]]$G_NUMBER) both_IDs_to_keep[[i]] <- dplyr::filter(pathogen_IDs_to_keep,PATIENT_ID %in% host_IDs_simple) both_IDs_to_keep[[i]]$FAM_ID <- host_IDs[match(both_IDs_to_keep[[i]]$PATIENT_ID,host_IDs_simple)] data.table::fwrite(data.frame(FID=both_IDs_to_keep[[i]]$FAM_ID,IID=both_IDs_to_keep[[i]]$FAM_ID), file = glue::glue("{OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/G2G_Samples.txt"),sep = '\t',quote = F) #Write out PLINK files with samples to keep system( glue::glue( "~/Software/plink2 --bfile {BFILE_Path} --keep {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/G2G_Samples.txt --indiv-sort f {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/G2G_Samples.txt --make-bed --out {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/TB_DAR_G2G_Raw" ) ) #Apply MAF Filter system( glue::glue( "~/Software/plink2 --bfile {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/TB_DAR_G2G_Raw --maf {Host_MAF} --indiv-sort f {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/G2G_Samples.txt --make-bed --out {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/TB_DAR_G2G" ) ) #Write out HLA PLINK files with samples to keep system( glue::glue( "~/Software/plink2 --bfile {Host_Files}/TB_DAR_HLA_Alleles --keep {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/G2G_Samples.txt --indiv-sort f {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/G2G_Samples.txt --make-bed --out {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/TB_DAR_HLA_Alleles_G2G" ) ) system( glue::glue( "~/Software/plink2 --bfile {Host_Files}/TB_DAR_HLA_AA --keep {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/G2G_Samples.txt --indiv-sort f {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/G2G_Samples.txt --make-bed --out {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/TB_DAR_HLA_AA_G2G" ) ) system(glue::glue("rm {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/TB_DAR_G2G_Raw*")) #Order and filter host PCs host_PCs[[i]] <- dplyr::left_join(data.frame('IID' = both_IDs_to_keep[[i]]$FAM_ID),host_PCs_raw[,-1],by=c('IID'='V2')) colnames(host_PCs[[i]]) <- c('IID',paste0('PC',seq(1,ncol(host_PCs[[i]])-1,1))) #Order and filter pPCs vir_pPCs[[i]] <- dplyr::left_join(data.frame(G_NUMBER=both_IDs_to_keep[[i]]$G_NUMBER),vir_pPCs[[i]]) #Order and filter covars covars[[i]] <- dplyr::left_join(data.frame(IID=both_IDs_to_keep[[i]]$FAM_ID),covars_raw) %>% dplyr::select(-contains("PC")) } #Unstratified AA matrix aa_matrix <- readRDS(Var_Tbl_Path) if(grepl(pattern = 'Burden_True',x=OUT_DIR)){ aa_matrix <- aa_matrix$non_syn_burden } aa_matrix_full <- aa_matrix[both_IDs_to_keep[['ALL']]$G_NUMBER,,drop=FALSE] aa_matrix_full<- aa_matrix_full[,GetMAC(aa_matrix_full) > Pathogen_MAC_AA,drop=FALSE] #Filter by missingness aa_missingness <- apply(aa_matrix_full,2,function(x) sum(is.na(x)) / nrow(aa_matrix_full)) aa_matrix_full<- aa_matrix_full[,aa_missingness < Pathogen_Missing,drop=FALSE] #Filter AA Matrix, decide for each variant whether to do stratified or stratified analysis aa_matrix_filt <- FilterAAMatrix(aa_matrix_full,both_IDs_to_keep,MAC_Thresh = Pathogen_MAC_AA_Lineage) #Filter AA Matrix based on MAC per lineage # aa_matrix_mac_by_lineage <- sapply(1:ncol(aa_matrix_full),function(x) max(apply(table(aa_matrix_full[,x],both_IDs_to_keep[['ALL']]$LINEAGE),2,min))) aa_matrix_mac_by_lineage <- sapply(1:ncol(aa_matrix_full),function(x) max(sapply(unique(both_IDs_to_keep[['ALL']]$LINEAGE), function(y) GetMAC(aa_matrix_full[both_IDs_to_keep[['ALL']]$LINEAGE == y,x,drop=F])))) aa_matrix_full <- aa_matrix_full[,aa_matrix_mac_by_lineage > Pathogen_MAC_AA] #Path to VCF file (for each lineage) host_path <- glue::glue("{OUT_DIR}/LINEAGE_{c(names(vir_pPCs),'ALL')}/TB_DAR_G2G") names(host_path) <- c(names(vir_pPCs),'ALL') #Load TB Score tb_score_df <- data.table::fread(glue::glue("{Host_Files}/TB_score")) tb_score_by_lineage <- lapply(both_IDs_to_keep,function(x) dplyr::left_join(x %>% dplyr::select(IID=FAM_ID),tb_score_df,c('IID'='IID')) %>% dplyr::relocate(FID,IID,TB_score)) #Load X-Ray Score xray_score_df <- data.table::fread(glue::glue("{Host_Files}/Xray_score")) xray_score_by_lineage <- lapply(both_IDs_to_keep,function(x) dplyr::left_join(x %>% dplyr::select(IID=FAM_ID),xray_score_df,c('IID'='IID')) %>% dplyr::relocate(FID,IID,Xray_score)) return(list(host_PCs=host_PCs, vir_pPCs=vir_pPCs, covars = covars, tb_score=tb_score_by_lineage, xray_score = xray_score_by_lineage, aa_matrix_filt=aa_matrix_filt, aa_matrix_full=aa_matrix_full, nuc_matrix = nuc_matrix_trans, both_IDs_to_keep=both_IDs_to_keep, host_path = host_path, raw_pPCA = vir_pPCA)) } args <- commandArgs(trailingOnly = TRUE) OUT_DIR <- args[[1]] Metadata_Path <- args[[2]] Host_PC_Path <- args[[3]] Var_Tbl_Path <- args[[4]] Phylo_Tree_Path <- args[[5]] Mtb_Nuc_Out <- args[[6]] Host_MAF <- as.numeric(args[[7]]) Pathogen_MAC_pPCA <- as.numeric(args[[8]]) Pathogen_MAC_AA_Lineage <- as.numeric(args[[9]]) Pathogen_MAC_AA <- as.numeric(args[[10]]) Pathogen_Missing <- as.numeric(args[[11]]) BFILE_Path <- gsub(args[[12]],pattern = '.bed',replacement = '') Host_Files <- args[[13]] n_cores <- as.numeric(args[[14]]) # OUT_DIR <- '../../scratch/Burden_False_SIFT_False_Del_False_HomoOnly_True_HetThresh_10/' # Metadata_Path <- '../../data/pheno/metadata_Sinergia_final_dataset_human_bac_genome_available_QCed.txt' # Host_PC_Path <- '../../scratch/Host/TB_DAR_GWAS_PCA.eigenvec' # Var_Tbl_Path <- '../../scratch/Burden_False_SIFT_False_Del_False_HomoOnly_True_HetThresh_10/Mtb_Var_Tbl.rds' # Phylo_Tree_Path <- '../../data/Mtb/RAxML_bestTree.Sinergia_final_dataset_human_bac_genome_available_rerooted.nwk' # Mtb_Nuc_Out <- '../../data/Mtb/merged/merged.var.homo.SNPs.vcf.dosage' # Host_MAF <- 0.05 # Pathogen_MAC_pPCA <- 15 # Pathogen_MAC_AA_Lineage <- 15 # Pathogen_MAC_AA <- 15 # Pathogen_Missing <- 0.05 # BFILE_Path <- '../../data/Genotyping_WGS/TBDAR.WGS.Imputed.GWASReady' # Host_Files <- '../../scratch/Host/' # n_cores <- 1 G2G_Obj <- SetUpG2G(OUT_DIR=OUT_DIR,Metadata_Path=Metadata_Path,Host_PC_Path=Host_PC_Path,Var_Tbl_Path=Var_Tbl_Path,Phylo_Tree_Path=Phylo_Tree_Path,Mtb_Nuc_Out=Mtb_Nuc_Out,Host_MAF=Host_MAF,Pathogen_MAC_pPCA=Pathogen_MAC_pPCA,Pathogen_MAC_AA_Lineage=Pathogen_MAC_AA_Lineage,Pathogen_MAC_AA=Pathogen_MAC_AA,Pathogen_Missing=Pathogen_Missing,BFILE_Path=BFILE_Path,Host_Files=Host_Files,n_cores=n_cores) saveRDS(G2G_Obj,glue::glue("{OUT_DIR}/G2G_Obj.rds")) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 | library(dplyr) library(tidyr) library(dummies) library(data.table) SetUpHost <- function(Metadata_Path,BFILE_Path,Out_Path,excl_regions,covars_discrete_to_incl = c('patient_sex','HIV_status'),covars_numeric_to_incl = c('age','BMI'),pheno_name = c('TB_score','Xray_score'),n_PC = 3,not_chr_6 = T,n_cores){ system(glue::glue("mkdir -p {Out_Path}")) system(glue::glue("mkdir -p {Out_Path}/Host/")) #File for PCA (Prune and remove long-range LD) system( glue::glue( "plink2 --bfile {BFILE_Path} --threads {n_cores} --keep-allele-order --new-id-max-allele-len 50 --set-all-var-ids @:#[b37]\\$r,\\$a --rm-dup force-first --make-bed --out {Out_Path}/Host/TB_DAR_GWAS_PCA_tmp" ) ) system( glue::glue( "plink2 --bfile {Out_Path}/Host/TB_DAR_GWAS_PCA_tmp --threads {n_cores} --keep-allele-order --indep-pairwise 200 100 0.2 --out {Out_Path}/Host/TB_DAR_GWAS_PCA_tmp" ) ) if(not_chr_6){ system( glue::glue( "plink2 --bfile {Out_Path}/Host/TB_DAR_GWAS_PCA_tmp --threads {n_cores} --exclude range {excl_regions} --not-chr 6 --keep-allele-order --extract {Out_Path}/Host/TB_DAR_GWAS_PCA_tmp.prune.in --make-bed --out {Out_Path}/Host/TB_DAR_GWAS_PCA" ) ) }else{ system( glue::glue( "plink2 --bfile {Out_Path}/Host/TB_DAR_GWAS_PCA_tmp --threads {n_cores} --exclude range {excl_regions} --keep-allele-order --extract {Out_Path}/Host/TB_DAR_GWAS_PCA_tmp.prune.in --make-bed --out {Out_Path}/Host/TB_DAR_GWAS_PCA" ) ) } ### Calculate PCs ### system( glue::glue( "gcta64 --bfile {BFILE_Path} --make-grm --out {Out_Path}/Host/TB_DAR_GWAS --thread-num {n_cores}" ) ) system( glue::glue( "gcta64 --bfile {Out_Path}/Host/TB_DAR_GWAS_PCA --make-grm --out {Out_Path}/Host/TB_DAR_GWAS_PCA --thread-num {n_cores}" ) ) system( glue::glue( "gcta64 --grm {Out_Path}/Host/TB_DAR_GWAS_PCA --pca 20 --out {Out_Path}/Host/TB_DAR_GWAS_PCA --thread-num {n_cores}" ) ) ### Set-up Phenotype ### #Load metadata file metadata <- data.table::fread(Metadata_Path,na.strings = c("",'unknown','NA'),stringsAsFactors = T) %>% dplyr::select(any_of(c('PATIENT_ID',covars_discrete_to_incl,covars_numeric_to_incl))) %>% dplyr::mutate(PATIENT_ID = as.character(PATIENT_ID)) #Parse sample ID samples_for_GWAS <- data.table::fread(glue::glue("{BFILE_Path}.fam")) %>% dplyr::select(IID=V2) samples_for_GWAS_simple <- sapply(samples_for_GWAS$IID,function(x) gsub(x=x,pattern = 'WGS_Fellay.',replacement = '')) samples_for_GWAS_simple <- sapply(samples_for_GWAS_simple,function(x) gsub(x=x,pattern = 'Batch1_',replacement = '')) samples_for_GWAS_simple <- sapply(samples_for_GWAS_simple,function(x) gsub(x=x,pattern = 'Batch2_',replacement = '')) if(is.null(covars_discrete_to_incl)){ covars_discrete <- NULL }else{ covars_discrete <- dplyr::left_join(data.frame(ID = samples_for_GWAS_simple,stringsAsFactors = F),metadata %>% dplyr::select(any_of(c('PATIENT_ID',covars_discrete_to_incl))),by=c('ID'='PATIENT_ID')) %>% dplyr::select(-ID) } if(is.null(covars_numeric_to_incl)){ covars_numeric <- NULL }else{ covars_numeric <- dplyr::left_join(data.frame(ID = samples_for_GWAS_simple,stringsAsFactors = F),metadata %>% dplyr::select(any_of(c('PATIENT_ID',covars_numeric_to_incl))),by=c('ID'='PATIENT_ID')) %>% dplyr::select(-ID) } PCs <- data.table::fread(glue::glue("{Out_Path}/Host/TB_DAR_GWAS_PCA.eigenvec")) colnames(PCs) <- c('FID','IID',paste0('PC',seq(1,ncol(PCs) - 2))) #Top N PCs and numeric covars data.table::fwrite(cbind(PCs[,1:(2+n_PC)],covars_numeric),sep = '\t',col.names = T,na = 'NA',file = glue::glue("{Out_Path}/Host/covars_numeric")) #discrete covars data.table::fwrite(cbind(PCs[,1:2],covars_discrete),sep = '\t',col.names = T,na = 'NA',file = glue::glue("{Out_Path}/Host/covars_discrete")) for(i in 1:length(pheno_name)){ metadata_pheno <- data.table::fread(Metadata_Path,na.strings = c("",'unknown'),stringsAsFactors = T) %>% dplyr::select(any_of(c('PATIENT_ID',pheno_name[i]))) %>% dplyr::mutate(PATIENT_ID = as.character(PATIENT_ID)) %>% tidyr::drop_na() #Join with pheno table, extract relevant covars pheno_vect <- dplyr::left_join(data.frame(ID = samples_for_GWAS_simple,stringsAsFactors = F),metadata_pheno %>% dplyr::select(PATIENT_ID,any_of(pheno_name[i])),by=c('ID'='PATIENT_ID'))[,pheno_name[i],drop=T] #pheno if(pheno_name[i] == 'Lineage'){ pheno_df <- cbind(PCs[,1:2],data.frame(Lineage = pheno_vect)) %>% tidyr::drop_na() pheno_df <- cbind(pheno_df[,1:2],dummy.data.frame(pheno_df[,-c(1,2)])) }else if(pheno_name[i] == 'Sublineage'){ pheno_df <- cbind(PCs[,1:2],data.frame(Lineage = pheno_vect)) %>% tidyr::drop_na() dummy_data <- dummy.data.frame(pheno_df[,-c(1,2)]) #Filter Sublineage with less than 30 counts cnts <- apply(dummy_data,2,function(x) min(c(sum(x==1),sum(x==0)))) dummy_data <- dummy_data[,cnts > 30] pheno_df <- cbind(pheno_df[,1:2],dummy_data) } else{ pheno_df <- cbind(PCs[,1:2],data.frame(pheno=pheno_vect)) colnames(pheno_df)[3] <- pheno_name[i] } data.table::fwrite(pheno_df,sep = ' ',col.names = T,na = 'NA',file = glue::glue("{Out_Path}/Host/{pheno_name[i]}"),quote = F) } } SetUpHLA <- function(VCF_File,Out_Path,MAF_Thresh = 0.01,Info_Thresh = 0.8){ info_file <- data.table::fread(cmd = glue::glue("zcat {gsub(VCF_File,pattern = '.dose.vcf.gz',replacement = '.info.gz')}")) hla_alleles <- dplyr::filter(info_file,Rsq > Info_Thresh & grepl('HLA',SNP)) %>% dplyr::select(SNP) hla_AA <- dplyr::filter(info_file,Rsq > Info_Thresh & grepl('AA',SNP)) %>% dplyr::select(SNP) write(hla_AA$SNP,file = glue::glue("{Out_Path}Host/AA_to_keep.txt")) write(hla_alleles$SNP,file = glue::glue("{Out_Path}Host/Alleles_to_keep.txt")) system(glue::glue("plink2 --vcf {VCF_File} --extract {Out_Path}Host/AA_to_keep.txt --maf {MAF_Thresh} --make-bed --double-id --out {Out_Path}Host/TB_DAR_HLA_AA")) system(glue::glue("plink2 --vcf {VCF_File} --extract {Out_Path}Host/Alleles_to_keep.txt --maf {MAF_Thresh} --make-bed --double-id --out {Out_Path}Host/TB_DAR_HLA_Alleles")) } # Metadata_Path <- '../../data/pheno/metadata_Sinergia_final_dataset_human_bac_genome_available_QCed.txt' # BFILE_Path <- '../../data/Genotyping_WGS/TBDAR.WGS.Imputed.GWASReady' # HLA_Path <-'../../data/HLA/FourDigits/chr6.dose.vcf.gz' # OUT_dir <- '../../scratch/' # excl_regions <- '../../data/exclusion_regions_hg19.txt' # n_cores <- 1 args <- commandArgs(trailingOnly = TRUE) Metadata_Path <- args[[1]] BFILE_Path <- gsub(args[[2]],pattern = '.bed',replacement = '') HLA_Path <- args[[3]] OUT_dir <- args[[4]] excl_regions <- args[[5]] n_cores <- as.numeric(args[[6]]) SetUpHost(Metadata_Path,BFILE_Path,OUT_dir,excl_regions=excl_regions,covars_discrete_to_incl = c('patient_sex','HIV_status','TB_RF_smoking'),covars_numeric_to_incl = c('age','BMI'),n_cores=n_cores) SetUpHLA(VCF_File = HLA_Path,Out_Path = OUT_dir) |
32 33 34 35 36 37 38 39 40 41 42 43 44 | shell: ''' bcftools filter -i 'GT=="1/1"' -O z -o {output.homo_snps} {input.VCF_file} #Keep both homo calls only. bcftools index -t {output.homo_snps} bcftools filter -i 'GT=="1/1" || GT=="0/1" || GT=="1/0"' -O z -o {output.homo_het_snps} {input.VCF_file} #Keep both homo and mixed calls. bcftools index -t {output.homo_het_snps} bcftools filter -i 'GT=="./." || GT=="0/1" || GT=="1/0"' {input.VCF_file} | bcftools query -f '%POS %REF %ALT\n' > {output.missing_snps_homo} bcftools filter -i 'GT=="./."' {input.VCF_file} | bcftools query -f '%POS %REF %ALT\n' > {output.missing_snps_homo_het} bcftools query -f '%POS %REF %ALT [%AD] [%RD] [%SDP] [%DP] [%RBQ] [%ABQ] [%RDF] [%RDR] [%ADF] [%ADR] [%FREQ] [%GT] [%GQ]\n' {input.VCF_file} | pigz --fast > {output.reads} ''' |
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 | shell: ''' bcftools merge --threads {threads} -0 -m none {input.homo_snps} -O v -o {output.homo_snps}.tmp bedtools subtract -header -A -a {output.homo_snps}.tmp -b {input.locus_to_excl} > {output.homo_snps} bgzip -c {output.homo_snps} > {output.homo_snps}.gz bcftools index -t {output.homo_snps}.gz rm {output.homo_snps}.tmp bcftools +dosage {output.homo_snps}.gz > {output.homo_snps}.dosage bcftools merge --threads {threads} -0 -m none {input.homo_het_snps} -O v -o {output.homo_het_snps}.tmp bedtools subtract -header -A -a {output.homo_het_snps}.tmp -b {input.locus_to_excl} > {output.homo_het_snps} bgzip -c {output.homo_het_snps} > {output.homo_het_snps}.gz bcftools index -t {output.homo_het_snps}.gz rm {output.homo_het_snps}.tmp bcftools +dosage {output.homo_het_snps}.gz > {output.homo_het_snps}.dosage ''' |
99 100 | shell: "Rscript ./scripts/GetCoverage.R {input.VCF_Files} {input.missing_files} {input.gff} {params.out_dir} {threads}" |
122 123 124 125 | shell: ''' Rscript ./scripts/MakeAATable.R {input.snps} {input.locus_to_excl} {params.snpEff} {wildcards.id} {output.AA_variant} {output.Syn_variant} ''' |
147 148 | shell: "Rscript ./scripts/MakeVariantMatrix.R {input.AA_variant} {input.Syn_variant} {input.missing_mat} {input.phylo_snps} {input.del_tbl} {input.sift_path} {params.IS_SIFT} {params.IS_Burden} {params.IS_Deletion} {output.Mtb_Var_Tbl} {params.metadata} {params.Het_Thresh} {threads}" |
172 173 174 175 | shell: ''' Rscript ./scripts/SetUpHost.R {params.metadata} {input.host_bfile} {input.host_hla_vcf} {params.Out_DIR} {params.excl_region} {threads} ''' |
199 200 | shell: "Rscript ./scripts/SetUpG2G.R {params.Out_DIR} {params.metadata} {input.host_pc} {input.Mtb_Var_Tbl} {params.Phylo_Tree_Path} {input.merged_dosage} {params.Host_MAF} {params.Pathogen_MAC_pPCA} {params.Pathogen_MAC_AA_Lineage} {params.Pathogen_MAC_AA} {params.Pathogen_Missing} {input.host_bfile} {params.Host_Files} {threads}" |
220 221 | shell: " Rscript ./scripts/RunG2G.R {input.g2g_obj} {params.out_dir} {params.tool} {params.N_PC} {params.N_pPC} {params.covars_to_incl} {params.stratified} {params.homo_only} {threads}" |
242 243 | shell: "Rscript ./scripts/RunInteraction.R {input.g2g_obj} {params.out_dir} {params.tool} {params.N_PC} {params.N_pPC} {params.covars_to_incl} {params.stratified} {params.homo_only} {params.pheno} {threads}" |
258 259 | shell: "Rscript ./scripts/RunPASCAL.R {input.res_obj} {input.bim_1KG} {params.scratch_dir} {params.output}" |
Support
Do you know this workflow well? If so, you can
request seller status , and start supporting this workflow.
Created: 1yr ago
Updated: 1yr ago
Maitainers:
public
URL:
https://github.com/zmx21/G2G-TB
Name:
g2g-tb
Version:
1
Accessed: 1
Downloaded:
0
Copyright:
Public Domain
License:
None
Keywords:
- Future updates
Related Workflows

ENCODE pipeline for histone marks developed for the psychENCODE project
psychip pipeline is an improved version of the ENCODE pipeline for histone marks developed for the psychENCODE project.
The o...

Near-real time tracking of SARS-CoV-2 in Connecticut
Repository containing scripts to perform near-real time tracking of SARS-CoV-2 in Connecticut using genomic data. This pipeli...

snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...

ATLAS - Three commands to start analyzing your metagenome data
Metagenome-atlas is a easy-to-use metagenomic pipeline based on snakemake. It handles all steps from QC, Assembly, Binning, t...
raw sequence reads
Genome assembly
Annotation track
checkm2
gunc
prodigal
snakemake-wrapper-utils
MEGAHIT
Atlas
BBMap
Biopython
BioRuby
Bwa-mem2
cd-hit
CheckM
DAS
Diamond
eggNOG-mapper v2
MetaBAT 2
Minimap2
MMseqs
MultiQC
Pandas
Picard
pyfastx
SAMtools
SemiBin
Snakemake
SPAdes
SqueezeMeta
TADpole
VAMB
CONCOCT
ete3
gtdbtk
h5py
networkx
numpy
plotly
psutil
utils
metagenomics

RNA-seq workflow using STAR and DESeq2
This workflow performs a differential gene expression analysis with STAR and Deseq2. The usage of this workflow is described ...

This Snakemake pipeline implements the GATK best-practices workflow
This Snakemake pipeline implements the GATK best-practices workflow for calling small germline variants. The usage of thi...