Repo to analyze population genetic data with many different methods

public 1yr ago 0 bookmarks

View Workflow

Help improve this workflow!

This workflow has been published but could be further improved with some additional meta data:

Keyword(s) in categories input, output, operation, topic

You can help improve this workflow by suggesting the addition or removal of keywords, suggesting changes, reporting issues, or requesting to become a maintainer of the workflow.

Goal

This pipeline was built for the Peter et al. 2019 manuscript, which applies EEMS to a number of human populations and compares the results to PCA on the same datasets. The pipeline shared here also includes a workflow that allows comparison between several additional methods (listed below).

Reproducing results from Peter et al. 2019

As some of the data used requires permission, we are not free to redistribute it. To regenerate all figures from the paper, it will be necessary to:

acquire access to all data and create the master data set as described in the merge-pipeline
change paths in config/config.json to reflect your working environment
run snakemake all

Implementation details

Genotypic data is stored in plink format. Metadata/location data is stored using the PopGenStructures data format, with some minor (recommended) changes. The pipeline is implemented using Snakemake, with Python used for most data wrangling and R for most plotting.

Implemented methods

EEMS
flashpca
admixture
pong visualization of admixture
TESS3
treemix
Spacemix
conStruct
FST (using plink)

Code Snippets

library(dplyr)

# Tabulate, for every panel listed under `paper` in the Snakemake config, the
# populations that occur in the panel's "full" subset but not in its "main"
# subset, together with the number of individuals per population, and write
# the combined table to the rule's `excluded` output.
old <- function() {
    panel_cfg <- snakemake@config$paper
    panels <- names(panel_cfg)

    outfile <- snakemake@output$excluded
    print(outfile)

    # Empty seed table; the per-panel tables are appended to it below.
    excluded_table <- data.frame(panel = c(), popId = c(), abbrev = c(), N = c())

    pd <- read.csv(snakemake@input$pop_display)

    for (panel in panels) {
        print(panel)
        main <- panel_cfg[[panel]][["main"]]
        full <- panel_cfg[[panel]][["full"]]

        # Panels with no "full" subset (missing or FALSE) are skipped.
        if (is.null(full)) {
            print("X!")
            next
        }
        if (full == FALSE) {
            print("X2")
            next
        }

        main_geo <- read.csv(sprintf("subset/%s.pop_geo", main))
        full_geo <- read.csv(sprintf("subset/%s.pop_geo", full))
        full_meta <- read.csv(sprintf("subset/%s.indiv_meta", full))

        # Population ids that appear only in the full subset.
        only_in_full <- setdiff(select(full_geo, popId), select(main_geo, popId))

        # Count individuals per excluded population, then attach abbreviations.
        per_pop <- summarize(group_by(left_join(only_in_full, full_meta), popId),
                             N = n())
        tbl <- left_join(per_pop, select(pd, popId, abbrev))
        tbl[["panel"]] <- panel
        tbl[["full"]] <- full
        tbl[["main"]] <- main

        excluded_table <- bind_rows(excluded_table, tbl)
    }

    # Order rows by panel in config order, then by abbreviation and count.
    excluded_table$panel <- factor(excluded_table$panel, levels = panels)
    excluded_table <- arrange(excluded_table, panel, abbrev, N)

    write.csv(excluded_table, file = outfile, row.names = FALSE)

    save.image(".excluded.rdebug")
}

R dplyr From line 1 of scripts/get_excluded.R

# Driver script for the error plots: reads the distance tables and metadata
# named in the Snakemake inputs, builds five error plots with the helpers
# sourced below, and writes each plot as an image; three of them are also
# saved as RDS objects.
suppressPackageStartupMessages({
source("scripts/load_pop_meta.R") #load raw
source("scripts/ggeems/error.R")
source("scripts/config.R")
})
#save.image(".rdebug")

# Plot settings (nmax, label, width, height) for the 'error' plot.
CC <- get_config(snakemake, plotname='error')

dist <- read.csv(snakemake@input$dist)
inddist <- read.csv(snakemake@input$inddist)
grid <- read.csv(snakemake@input$grid)
pd <- read.csv(snakemake@input$pop_display)
pg <- read.csv(snakemake@input$popgrid)
# Only the sample-to-population mapping is kept from the individual metadata.
im <- read.csv(snakemake@input$ind_meta) %>%
    select(sampleId, popId)
# NOTE(review): nmax is not referenced again in this script (CC$nmax is used
# directly below); confirm that no sourced helper reads it as a global.
nmax <- CC$nmax


# `annotate` is expected to come from one of the sourced scripts.
pd <- annotate(pd)

# Population-level errors.
dist_err <- get_marginal(dist, pd) 
P_dist_err <- plot_error(dist_err, CC$label,
                         CC$nmax)

# Grid-level errors.
# NOTE(review): this call passes 'labels' while the other calls pass 'label'
# or CC$label -- confirm this is the intended column name.
grid_err <- get_marginal_grid(grid, pg, pd)
P_grid_err <- plot_error(grid_err, 'labels', CC$nmax)

worst_errors <- get_worst_errors(dist, pd)
P_worst_err <- plot_error(worst_errors, 'label', CC$nmax)

# Individual-level errors, labelled by sample id.
ind_err <- get_marginal_ind(inddist, im, pd)
P_ind_err <- plot_error(ind_err, "sampleId", CC$nmax)
#P_ind_err <- plot_error(ind_err, "pop", CC$nmax)
worst_ind_errors <- get_worst_errors_ind(inddist, im, pd)
P_worst_ind_err <- plot_error(worst_ind_errors, 'label', CC$nmax)

# Write all five plots as images; only the pop, grid and worst plots are
# additionally stored as RDS.
ggsave(snakemake@output$err_pop, P_dist_err, width=CC$width, height=CC$height)
ggsave(snakemake@output$err_grid, P_grid_err, width=CC$width, height=CC$height)
ggsave(snakemake@output$err_worst, P_worst_err, width=CC$width, height=CC$height)
ggsave(snakemake@output$err_ind, P_ind_err, width=CC$width, height=CC$height)
ggsave(snakemake@output$err_worst_ind, P_worst_ind_err, width=CC$width, height=CC$height)
saveRDS(P_dist_err,snakemake@output$err_pop_rds)
saveRDS(P_grid_err,snakemake@output$err_grid_rds)
saveRDS(P_worst_err,snakemake@output$err_worst_rds)

R From line 1 of ggeems/run_error.R

# Bar plot of the proportion of variance explained (PVE) per principal
# component. Reads the first column of the PVE file given as Snakemake input,
# keeps at most C$nmax components, and writes the plot as an image and as RDS.
suppressPackageStartupMessages({
    library(ggplot2)
    library(dplyr)
    source("scripts/config.R")
    source("scripts/ggpca2d.R")
    source("scripts/themes.R")
})
C <- get_config(snakemake, 'pve')

# head() truncates to C$nmax values but, unlike indexing with 1:C$nmax, does
# not pad with NA when the file holds fewer components than requested.
pve <- head(read.table(snakemake@input$pve_file)[, 1], C$nmax)
# seq_along() stays correct for an empty vector, where 1:length() is c(1, 0).
df <- data.frame(PC = seq_along(pve), pve = pve)
G <- ggplot(df, aes(y = pve, x = as.factor(PC))) +
    geom_bar(stat = "identity") +
    pve_theme(base_size = C$theme_size) +
    xlab("PC")
ggsave(snakemake@output$png, G, width = C$width, height = C$height)
saveRDS(G, snakemake@output$rds)

R ggplot2 dplyr From line 1 of pca/pve.R

library(SpaceMix)
library(dplyr)

# Draw a SpaceMix map from a map list (as built by make.spacemix.map.list).
#
# spacemix.map.list  list whose elements are referenced by name below
#                    (MAPP.geogen.coords, k, color.vector, name.vector, ...)
# text               draw population names at the geogenetic coordinates?
# ellipses           draw credible ellipses?
# source.option      draw admixture sources (names, ellipses, arrows)?
# xlim, ylim, ...    forwarded to plot()
#
# Returns the string "spacemix map!" invisibly.
make.spacemix.map <- function (spacemix.map.list, text = FALSE, ellipses = TRUE, source.option = TRUE, 
    xlim = NULL, ylim = NULL, ...) {
    with(spacemix.map.list, {
        # Empty canvas spanning the geogenetic coordinates.
        plot(MAPP.geogen.coords, type = "n", xlim = xlim, ylim = ylim,
             xlab = "", ylab = "", ...)

        if (ellipses) {
            for (idx in 1:k) {
                plot.credible.ellipse(pp.geogen.ellipses[[idx]],
                                      color.vector[idx])
            }
        }

        if (text) {
            text(MAPP.geogen.coords, col = color.vector, font = 2,
                 labels = name.vector, cex = 0.7)
        }

        # Admixture-source ellipses are drawn first so that the source labels
        # and arrows end up on top of them.
        if (source.option && ellipses) {
            for (idx in 1:k) {
                plot.credible.ellipse(pp.admix.source.ellipses[[idx]],
                                      admix.source.color.vector[idx],
                                      fading = 1, lty = 2)
            }
        }
        if (source.option) {
            text(MAPP.admix.source.coords, col = admix.source.color.vector,
                 font = 3, labels = name.vector, cex = 0.7)
            best_proportions <- MCMC.output$admix.proportions[, best.iter]
            plot.admix.arrows(MAPP.admix.source.coords, MAPP.geogen.coords,
                              admix.proportions = best_proportions,
                              colors = admix.source.color.vector,
                              length = 0.1)
        }

        box(lwd = 2)
    })
    invisible("spacemix map!")
}


# Build the SpaceMix map list for one MCMC output file.
#
# opt       path to the MCMC output file, passed on as `MCMC.output.file`
#           (an earlier, disabled variant derived it as
#           "%s/__LongRun/__space_MCMC_output1.Robj" from opt)
# pop_meta  data frame with longitude, latitude, name and color columns
# ...       accepted but not used
plot_object <- function(opt, pop_meta, ...){
    sample_coords <- as.matrix(pop_meta[, c('longitude', 'latitude')])
    make.spacemix.map.list(
        MCMC.output.file = opt,
        geographic.locations = sample_coords,
        name.vector = pop_meta$name,
        color.vector = pop_meta$color,
        quantile = 0.95,
        burnin = 0
    )
}

    # Script entry point: draw the SpaceMix map for one MCMC output and save
    # it as a PNG. The four paths come from the Snakemake object when the
    # script runs under Snakemake, otherwise from the command line.
    args <- commandArgs(trailingOnly = TRUE)
    if (exists("snakemake")) {
        spm_out <- snakemake@input$spacemix_output
        pop_geo <- snakemake@input$pop_geo
        pop_display <- snakemake@input$pop_display
        opt <- snakemake@output[[1]]
    } else if (length(args) >= 4) {
        spm_out <- args[1]
        pop_geo <- args[2]
        pop_display <- args[3]
        opt <- args[4]
    } else {
        # Without this branch the script would only fail later with an
        # unhelpful "object 'pop_geo' not found".
        stop("expected 4 arguments: <spacemix_output> <pop_geo> ",
             "<pop_display> <output.png>", call. = FALSE)
    }

    pop_g <- read.csv(pop_geo)
    pop_d <- read.csv(pop_display, stringsAsFactors = FALSE)
    pop_meta <- pop_g %>% left_join(pop_d) %>% arrange(popId)

    # NOTE(review): the loaded object is not used below; the call is kept
    # because it was part of the original script -- confirm it can go.
    q <- SpaceMix::load_MCMC_output(spm_out)
    pobj <- plot_object(spm_out, pop_meta)

    # library() fails loudly if maps is missing, before a device is opened;
    # require() would only return FALSE and let map() fail later.
    library(maps)

    png(opt, width = 1600 / 2, height = 600)
    make.spacemix.map(pobj, text = TRUE, source.option = TRUE,
        xlim = range(pop_g$longitude),
        ylim = range(pop_g$latitude))
    # Mark the sampling locations and overlay the map outlines.
    points(pop_g$longitude, pop_g$latitude, pch = 16, col = 'red')
    map(add = TRUE)
    dev.off()

R Snakemake dplyr maps From line 1 of scripts/plot_spacemix.R

# Draw the sample map for one subset: the subset's polygon as a dashed red
# outline, the map outlines on top, and each population id printed at its
# coordinates. The figure is written to subset/<name>_sample_map.png.
#
# name  subset name used to build the input and output file paths
do_plot <- function(name) {
    library(maps)
    p <- read.table(sprintf("subset/%s.polygon", name))
    a <- read.csv(sprintf("subset/%s.pop_geo", name))

    #pdf(sprintf("subset/%s_sample_map.pdf", name), width=8)
    png(sprintf("subset/%s_sample_map.png", name), height = 5 * 300, width = 5 * 300)
    # Close the device even if one of the plotting calls below fails.
    on.exit(dev.off(), add = TRUE)

    plot(p, type = 'l', col = 'red', lty = 2, lwd = 2, asp = 1)
    map(add = TRUE)
    text(a$longitude, a$latitude, a$popId, col = 'black', pch = 16, cex = 2)
    invisible(NULL)
}


do_plot(snakemake@wildcards$name)

R maps From line 2 of scripts/sample_plot.R

script: 
    "../" + "scripts/construct/make_mat.R"

SnakeMake From line 9 of sfiles/construct.snake

script: "../" + "scripts/construct/make_coords.R"

SnakeMake From line 20 of sfiles/construct.snake

script: "../" + "scripts/construct/run.R"

SnakeMake From line 30 of sfiles/construct.snake

    script: "../" + gidscript
rule eems_ind_dist:
    input:
        geodist="dists/{name}.eemsdist0",
        indiv_meta="subset/{name}.indiv_meta",
        script=gidscript,
    output:
        "dists/{name}.eemsinddist"

SnakeMake From line 13 of sfiles/distances.snake

    script: "../" + gidscript

pcindscript="scripts/pcinddist.R"

SnakeMake From line 21 of sfiles/distances.snake

    script:  "../" + pcindscript

pcpopscript="scripts/pcdist.R"

SnakeMake From line 33 of sfiles/distances.snake

    script:  "../" + pcpopscript

pcgridscript="scripts/pcdistgrid.R"

SnakeMake From line 42 of sfiles/distances.snake

    script:  "../" + pcgridscript

geopopscript="scripts/geodist.R"

SnakeMake From line 52 of sfiles/distances.snake

    script:  "../" + geopopscript

gendistscript = "scripts/gendist.R"

SnakeMake From line 62 of sfiles/distances.snake

    script: "../" + gendistscript

eemsdistscript = "scripts/eemsdist.R"

SnakeMake From line 74 of sfiles/distances.snake

    script: "../" + eemsdistscript

# Runs the script named by `eemsdistscript` on the rdistJtDhatJ.txt files of
# all runs under eemsout0/ and produces dists/{name}.eems0dist and
# dists/{name}.popgrid0.
rule eems0_pop_dist:
    input:
        # One file per run index in range(N_EEMS_RUNS). name=['{name}'] makes
        # expand() re-insert the literal '{name}', so it stays a rule wildcard.
        Dhat=expand("eemsout0/{i}/{name}/rdistJtDhatJ.txt",
            i = range(N_EEMS_RUNS), 
            name = ['{name}']),
        # ipmap is taken from run 0 only.
        ipmap='eemsout0/0/{name}/ipmap.txt',
        order="eems/{name}.order",
        indiv_meta="subset/{name}.indiv_meta",
        # The script itself is declared as an input file.
        script=eemsdistscript
    params:
        statname='eems0dist'
    output:
        "dists/{name}.eems0dist",
        "dists/{name}.popgrid0",
    script: "../" + eemsdistscript

eemsgridscript="scripts/eems_grid_dist.R"

SnakeMake From line 92 of sfiles/distances.snake

    script: "../" + eemsgridscript

rule gen_grid_dist:
    input:
        mat=expand("eemsout/{i}/{name}/rdistJtDobsJ.txt",
            i = range(N_EEMS_RUNS), 
            name = ['{name}']),
        script=eemsgridscript
    params:
        statname='gendist'
    output:
        "dists/{name}.gengriddist"

SnakeMake From line 121 of sfiles/distances.snake

    script: "../" + eemsgridscript

geogriddistscript="scripts/geodistgrid.R"

SnakeMake From line 133 of sfiles/distances.snake

    script: "../" + geogriddistscript


rule dist_grid_all:
    input:
        "dists/{name}.gengriddist",
        "dists/{name}.eemsgriddist",
        "dists/{name}.geogriddist",
        "dists/{name}_dim2.pcgriddist",
        "dists/{name}_dim10.pcgriddist",
    output:
        "dists/{name}.grid"

SnakeMake From line 143 of sfiles/distances.snake

script: "../" + "scripts/merge_dists.R"

SnakeMake From line 155 of sfiles/distances.snake

script: "../" + "scripts/merge_dists.R"

SnakeMake From line 167 of sfiles/distances.snake

script: "../" + "scripts/merge_dists.R"

SnakeMake From line 179 of sfiles/distances.snake

script:
    "../" + "scripts/dist_rsq.R"

SnakeMake From line 188 of sfiles/distances.snake

script:
    "../" + "scripts/dist_decile_rsq.R"

SnakeMake From line 198 of sfiles/distances.snake

script:
    "../" + "scripts/inddists.R"

SnakeMake From line 208 of sfiles/distances.snake

script:
    "../" + "scripts/dists.R"

SnakeMake From line 219 of sfiles/distances.snake

run:
    # Hard-link the shared .outer and .diffs files under this run's own
    # prefix, then start the eems0 executable with the run's ini file,
    # sending stderr to the rule's log.
    run_prefix = 'eems/%s-run%s' % (wildcards.name, wildcards.i)
    link_jobs = [(input.outer, run_prefix + '.outer'),
                 (input.diffs, run_prefix + '.diffs')]
    for src, dst in link_jobs:
        shell('ln -f %s %s ' % (src, dst))
    eems_cmd = " ".join([config['EXE']['eems0'], "--params", input.inifile])
    shell(eems_cmd + " 2> {log}")

SnakeMake From line 41 of sfiles/eems0.snake

run:
    # Hard-link the shared .outer and .diffs files under this run's own
    # prefix, then start the eems0 executable with the run's ini file,
    # sending stderr to the rule's log.
    run_prefix = 'eems/%s-run%s' % (wildcards.name, wildcards.i)
    link_jobs = [(input.outer, run_prefix + '.outer'),
                 (input.diffs, run_prefix + '.diffs')]
    for src, dst in link_jobs:
        shell('ln -f %s %s ' % (src, dst))
    eems_cmd = " ".join([config['EXE']['eems0'], "--params", input.inifile])
    shell(eems_cmd + " 2> {log}")

SnakeMake From line 100 of sfiles/eems0.snake

run:
    # Write the ini file for an eems0 pilot run. Settings are layered in this
    # order: defaults, default pilot overrides, panel-specific settings,
    # panel-specific pilot overrides.
    from subsetter.eems import create_diffs_file, create_ini_file
    from subsetter.load import load_pop_geo, load_indiv_meta
    from subsetter.intersect import intersect
    import pandas as pd
    import numpy as np

    # The panel-specific section is optional; treating a missing section as
    # empty avoids a KeyError in the pilot lookup below.
    name_cfg = config['eems'].get(wildcards.name, {})

    cfg = config['eems']['__default__'].copy()
    if "pilot" in config['eems']['__default__']:
        print("updating with default pilot")
        cfg.update(config['eems']['__default__']['pilot'])

    if wildcards.name in config['eems']:
        print("updating with name")
        cfg.update(name_cfg)

    if "pilot" in name_cfg:
        print("updating with name")
        cfg.update(name_cfg['pilot'])

    n_demes = cfg.pop('nDemes')
    # The site count is the number of rows in the .bim file.
    n_sites = np.loadtxt(input.bim, str).shape[0]
    cfg['nSites'] = n_sites
    mcmcpath='eemspilot0/' + wildcards.i + "/" + wildcards.name + '/'
    datapath = base(input.coord)
    ini = base(output.inifile)
    meta_data= pd.read_table(input.coord, header=None)

    # With gridsrc == 'auto' no gridpath is set here.
    if "gridsrc" in cfg and cfg['gridsrc'] == 'auto':
        pass
    else:
        cfg['gridpath'] = datapath

    #adapt ini: the iteration counts are divided by EEMSO_FACTOR
    cfg['numBurnIter'] = int(cfg['numBurnIter'] /EEMSO_FACTOR)
    cfg['numMCMCIter'] = int(cfg['numMCMCIter'] /EEMSO_FACTOR)
    cfg['numThinIter'] = int(cfg['numThinIter'] /EEMSO_FACTOR)

    create_ini_file(ini, mcmcpath, datapath,
        meta_data=meta_data,
        n_demes=n_demes, **cfg)

SnakeMake Pandas numpy From line 123 of sfiles/eems0.snake

run:
    # Write the ini file for a full eems0 run, continuing from the pilot run
    # with the highest score.
    import os
    from subsetter.eems import create_diffs_file, create_ini_file
    from subsetter.load import load_pop_geo, load_indiv_meta
    from subsetter.intersect import intersect
    import pandas as pd
    import numpy as np

    # first, get best pilot: the score of a pilot is the second value minus
    # the first value found in its file
    best_posterior = -np.inf
    best_pilot = ''
    for infile in input.prevs:
        pilogl = np.loadtxt(infile)
        print(pilogl)
        posterior = pilogl[1] - pilogl[0]
        if posterior > best_posterior:
            best_posterior = posterior
            best_pilot = os.path.dirname(infile)

    # Fail with a clear message instead of an IndexError on the split below.
    if not best_pilot:
        raise ValueError("no usable pilot run found for %s" % wildcards.name)
    # The run id is the second path component of the pilot directory.
    best_run_id = best_pilot.split("/")[1]

    # then, do same stuff as for regular ini
    # The panel-specific section is optional; a missing section counts as
    # empty so that the 'continue' lookup below cannot raise a KeyError.
    name_cfg = config['eems'].get(wildcards.name, {})
    cfg = config['eems']['__default__'].copy()
    cfg.update(name_cfg)

    n_demes = cfg.pop('nDemes')
    n_sites = np.loadtxt(input.bim, str).shape[0]
    cfg['nSites'] = n_sites
    mcmcpath='eemsout0/' + wildcards.i + "/" + wildcards.name + '/'

    # With 'continue' set (and allowed) the run keeps its own data prefix;
    # otherwise it uses the prefix of the best pilot run.
    if 'continue' in name_cfg and ALLOW_CONTINUE:
        datapath = 'eems/%s-run%s' % (wildcards.name, wildcards.i)
    else:
        datapath = 'eems/%s-run%s' % (wildcards.name, best_run_id)
    coordfile = '%s.coord' % datapath
    ini = base(output.inifile)
    meta_data= pd.read_table(coordfile, header=None)

    cfg['gridpath'] = datapath
    cfg['prevpath'] = best_pilot

    #adapt ini: the iteration counts are divided by EEMSO_FACTOR
    cfg['numBurnIter'] = int(cfg['numBurnIter'] /EEMSO_FACTOR)
    cfg['numMCMCIter'] = int(cfg['numMCMCIter'] /EEMSO_FACTOR)
    cfg['numThinIter'] = int(cfg['numThinIter'] /EEMSO_FACTOR)

    create_ini_file(ini, mcmcpath, datapath,
        meta_data=meta_data,
        n_demes=n_demes, **cfg)

SnakeMake Pandas numpy From line 203 of sfiles/eems0.snake

shell: 'touch {output}'

SnakeMake From line 266 of sfiles/eems0.snake

run:
    # Call scripts/eems_plot/make_plots.r for this panel. The grid comes from
    # the panel's eems settings and falls back to the default grid when the
    # panel has no section or no 'grid' entry, instead of raising a KeyError.
    name_cfg = config['eems'].get(wildcards.name, {})
    if 'grid' in name_cfg:
        grid = name_cfg['grid']
    else:
        grid = config['eems']['__default__']['grid']
    # {grid} and the other placeholders are filled in by shell().
    s =  "%s scripts/eems_plot/make_plots.r" % config['EXE']['R']
    s +=   " {wildcards.nruns} {wildcards.name} {grid}"
    s += " {input.pop_display} {input.pop_geo} {input.indiv_label} 0"
    shell(s)

SnakeMake From line 287 of sfiles/eems0.snake

    script: "../" + "scripts/bf.R"


"""
rule ggplot_eems0:
    input:
        eems0in,
        pop_display=_POP_DISPLAY_,
        pop_geo='subset/{name}.pop_geo',
        indiv_label='subset/{name}.indiv_meta'
    params:
        RES=200,
        ZOOM=4,
        fancy=0
    output:
        mplot="eemsout_gg/{name}_nruns{nruns}-mrates01.png",
        m2plot="eemsout_gg/{name}_nruns{nruns}-mrates02.png"
    run:
        s =  "%s scripts/ggeems/run.R" % config['EXE']['R']
        s +=   " {wildcards.nruns} {wildcards.name} "
        s += " {input.pop_display} {input.pop_geo} {input.indiv_label}"
        s += " {params.RES} {params.ZOOM} {params.fancy}"
        shell(s)

rule ggeems_scatter:
    input:
        eemsin,
        pop_display=_POP_DISPLAY_,
        pop_geo='subset/{name}.pop_geo',
        indiv_label='subset/{name}.indiv_meta',
        diffs='eems/{name}.diffs',
        order='eems/{name}.order',
	ggpcvsgrid='figures/pcvsgrid/{name}_pc1-2.rds',
	ggrsq='figures/rsq/{name}_pc1-10.rds',
        script='scripts/ggeems/scatter.R'
    output:
        p1="eemsout_gg/{name}_nruns{nruns, \d+}-scatter01.png",
        p2="eemsout_gg/{name}_nruns{nruns, \d+}-scatter02.png",
        p3="eemsout_gg/{name}_nruns{nruns, \d+}-scatter03.png",
        p4="eemsout_gg/{name}_nruns{nruns, \d+}-scatter04.png",
        p5="eemsout_gg/{name}_nruns{nruns, \d+}-scatter05.png",
        p6="eemsout_gg/{name}_nruns{nruns, \d+}-scatter06.png",
        p7="eemsout_gg/{name}_nruns{nruns, \d+}-scatter07.png",
        paperfig="figures/paper/scatter_{name}_nruns{nruns, \d+}.png",
    run:
        s =  "%s scripts/ggeems/run_scatter.R" % config['EXE']['R']
        s +=   " {wildcards.nruns} {wildcards.name} "
        s += " {input.pop_display} {input.pop_geo} {input.indiv_label} "
        s += " {input.diffs} {input.order} "
        shell(s)

rule ggeems_scatter_hlex:
    input:
        eemsin,
        pop_display=_POP_DISPLAY_,
        pop_geo='subset/{name}.pop_geo',
        indiv_label='subset/{name}.indiv_meta',
        exfam='subset/{exname}.fam',
        diffs='eems/{name}.diffs',
        order='eems/{name}.order',
    output:
        p1="eemsout_gg/{name}_nruns{nruns}_ex:{exname}-scatter01.png",
        p2="eemsout_gg/{name}_nruns{nruns}_ex:{exname}-scatter02.png",
        p3="eemsout_gg/{name}_nruns{nruns}_ex:{exname}-scatter03.png",
        p4="eemsout_gg/{name}_nruns{nruns}_ex:{exname}-scatter04.png",
    run:
        s =  "%s scripts/ggeems/run_scatter.R" % config['EXE']['R']
        s +=   " {wildcards.nruns} {wildcards.name} "
        s += " {input.pop_display} {input.pop_geo} {input.indiv_label} "
        s += " {input.diffs} {input.order} "
        s += " {input.exfam} {wildcards.exname}"

        shell(s)

rule all_figs:
    input:
        "eemsout_gg/{name}_nruns4-mrates01.png",
        "eemsout/{name}_nruns4-mrates01.png",
        "eemsout_gg/{name}_nruns4-scatter01.png",
        "figures/pcvsgrid/{name}_pc1-10.png",
        "figures/pcvsgrid/{name}_pc1-2.png",
        "figures/pca/pc1d_{name}_pc1.png",
	"figures/pca/loadings_{name}_pc20.png"
    output:
        "{name}.figs"
    shell:
        "touch {output}"
"""

SnakeMake From line 306 of sfiles/eems0.snake

run:
    # Assemble the command line for scripts/ggeems/run.R piece by piece; the
    # {...} placeholders are filled in by shell().
    cmd_parts = [
        "%s scripts/ggeems/run.R" % config['EXE']['R'],
        " {wildcards.nruns} {wildcards.name} ",
        " {input.pop_display} {input.pop_geo} {input.indiv_label}",
        " {params.RES} {params.ZOOM} {params.fancy}",
    ]
    shell("".join(cmd_parts))

SnakeMake From line 323 of sfiles/eems0.snake

run:
    # Assemble the command line for scripts/ggeems/run_scatter.R piece by
    # piece; the {...} placeholders are filled in by shell().
    cmd_parts = [
        "%s scripts/ggeems/run_scatter.R" % config['EXE']['R'],
        " {wildcards.nruns} {wildcards.name} ",
        " {input.pop_display} {input.pop_geo} {input.indiv_label} ",
        " {input.diffs} {input.order} ",
    ]
    shell("".join(cmd_parts))

SnakeMake From line 350 of sfiles/eems0.snake

run:
    # Same as the plain scatter call, with the excluded-sample .fam file and
    # the excluded subset's name appended; the {...} placeholders are filled
    # in by shell().
    cmd_parts = [
        "%s scripts/ggeems/run_scatter.R" % config['EXE']['R'],
        " {wildcards.nruns} {wildcards.name} ",
        " {input.pop_display} {input.pop_geo} {input.indiv_label} ",
        " {input.diffs} {input.order} ",
        " {input.exfam} {wildcards.exname}",
    ]
    shell("".join(cmd_parts))

SnakeMake From line 371 of sfiles/eems0.snake

run:
    # Create the diffs file for this panel. The config key naming the
    # bed2diffs executable comes from the panel's eems section when it has a
    # 'bed2diffs' entry, otherwise from the defaults.
    from subsetter.eems import create_diffs_file, create_ini_file

    bed_prefix = base(input.bed)
    diffs_prefix = base(output.diffs)
    tmpbim_prefix = base(output.tmpbim)

    print(config['eems']['__default__'])
    panel_section = config['eems'].get(wildcards.name, {})
    if 'bed2diffs' in panel_section:
        bed2diffs = panel_section['bed2diffs']
    else:
        bed2diffs = config['eems']['__default__']['bed2diffs']

    create_diffs_file(bedfile=bed_prefix,
                      bed2diffs=config['EXE'][bed2diffs],
                      outname=diffs_prefix, tmpbim=tmpbim_prefix)

SnakeMake From line 21 of sfiles/eems.snake

run:
    # Write the coordinate file for one run: every sample's longitude and
    # latitude is redrawn from a normal distribution centred on its recorded
    # location, formatted to two decimals, and saved space-separated without
    # header.
    from subsetter.eems import create_diffs_file, create_ini_file
    from subsetter.load import load_pop_geo, load_indiv_meta
    from subsetter.intersect import intersect
    import pandas as pd
    import numpy as np

    location_data = load_pop_geo(input.pop_geo)
    sample_data = load_indiv_meta(input.indiv_meta)
    # The order file is read here but not referenced again in this block.
    order = pd.read_table(input.order, header=None, sep=" ")
    meta_data = sample_data.merge(location_data)

    # The seed depends only on the run index and the panel name.
    name_checksum = sum(ord(ch) for ch in wildcards.name)
    np.random.seed(int(wildcards.i) + name_checksum)

    # Spread of the jitter: accuracy scaled by config['sdfactor'], plus EPS.
    sd = meta_data['accuracy'] * config['sdfactor'] + EPS

    def two_decimals(values):
        return ["%2.2f" % v for v in values]

    # Longitude is drawn before latitude, so the random stream is consumed in
    # a fixed order.
    long_jitter = two_decimals(np.random.normal(meta_data['longitude'], sd))
    lat_jitter = two_decimals(np.random.normal(meta_data['latitude'], sd))

    coord_table = pd.DataFrame({'longitude': long_jitter,
                                'latitude': lat_jitter})
    coord_table.to_csv(output.coord, sep=" ", header=False, index=False,
                       columns=('longitude', 'latitude'))

SnakeMake Pandas numpy From line 44 of sfiles/eems.snake

shell:
    'cp {input.outer} {output.outer}'

SnakeMake From line 74 of sfiles/eems.snake

run:
    # Intersect the configured grid with this panel's outer boundary and
    # coordinates. The grid comes from the panel's eems section when it has a
    # 'grid' entry, otherwise from the defaults.
    from subsetter.intersect import intersect
    name = wildcards.name
    # The panel section is optional elsewhere in this workflow, so a missing
    # section is treated as empty here instead of raising a KeyError.
    name_cfg = config['eems'].get(name, {})
    if 'grid' in name_cfg:
        grid2 = GRID_PATH % name_cfg['grid']
    else:
        grid2 = GRID_PATH % config['eems']['__default__']['grid']

    out_path = base(output.edges)
    intersect(grid2, input.outer, input.coord, out_path)

SnakeMake From line 86 of sfiles/eems.snake

run:
    # Write the ini file for a regular EEMS run: defaults overridden by the
    # panel's own settings, plus the site count taken from the .bim file.
    from subsetter.eems import create_diffs_file, create_ini_file
    from subsetter.load import load_pop_geo, load_indiv_meta
    from subsetter.intersect import intersect
    import pandas as pd
    import numpy as np

    eems_cfg = config['eems']
    cfg = eems_cfg['__default__'].copy()
    # Updating with an empty dict is a no-op when the panel has no section.
    cfg.update(eems_cfg.get(wildcards.name, {}))

    n_demes = cfg.pop('nDemes')
    # One row per site in the .bim file.
    cfg['nSites'] = np.loadtxt(input.bim, str).shape[0]

    mcmc_dir = 'eemsout/' + wildcards.i + "/" + wildcards.name + '/'
    data_prefix = base(input.coord)
    ini_prefix = base(output.inifile)
    coords = pd.read_table(input.coord, header=None)

    cfg['gridpath'] = data_prefix
    create_ini_file(ini_prefix, mcmc_dir, data_prefix,
        meta_data=coords,
        n_demes=n_demes, **cfg)

SnakeMake Pandas numpy From line 111 of sfiles/eems.snake

run:
    # Hard-link the shared .outer and .diffs files under this run's own
    # prefix, then start the eems executable with the run's ini file,
    # sending stderr to the rule's log.
    run_prefix = 'eems/%s-run%s' % (wildcards.name, wildcards.i)
    link_jobs = [(input.outer, run_prefix + '.outer'),
                 (input.diffs, run_prefix + '.diffs')]
    for src, dst in link_jobs:
        shell('ln -f %s %s ' % (src, dst))
    eems_cmd = " ".join([config['EXE']['eems'], "--params", input.inifile])
    shell(eems_cmd + " 2> {log}")

SnakeMake From line 166 of sfiles/eems.snake

script: "../" + "scripts/get_induced_fst.R"

SnakeMake From line 183 of sfiles/eems.snake

run:
    # Hard-link the shared .outer and .diffs files under this run's own
    # prefix, then start the eems executable with the run's ini file,
    # sending stderr to the rule's log.
    run_prefix = 'eems/%s-run%s' % (wildcards.name, wildcards.i)
    link_jobs = [(input.outer, run_prefix + '.outer'),
                 (input.diffs, run_prefix + '.diffs')]
    for src, dst in link_jobs:
        shell('ln -f %s %s ' % (src, dst))
    eems_cmd = " ".join([config['EXE']['eems'], "--params", input.inifile])
    shell(eems_cmd + " 2> {log}")

SnakeMake From line 239 of sfiles/eems.snake

run:
    # Write the ini file for an EEMS pilot run. Settings are layered in this
    # order: defaults, default pilot overrides, panel-specific settings,
    # panel-specific pilot overrides.
    from subsetter.eems import create_diffs_file, create_ini_file
    from subsetter.load import load_pop_geo, load_indiv_meta
    from subsetter.intersect import intersect
    import pandas as pd
    import numpy as np

    # The panel-specific section is optional; treating a missing section as
    # empty avoids a KeyError in the pilot lookup below.
    name_cfg = config['eems'].get(wildcards.name, {})

    cfg = config['eems']['__default__'].copy()
    if "pilot" in config['eems']['__default__']:
        print("updating with default pilot")
        cfg.update(config['eems']['__default__']['pilot'])

    if wildcards.name in config['eems']:
        print("updating with name")
        cfg.update(name_cfg)

    if "pilot" in name_cfg:
        print("updating with name")
        cfg.update(name_cfg['pilot'])

    n_demes = cfg.pop('nDemes')
    # The site count is the number of rows in the .bim file.
    n_sites = np.loadtxt(input.bim, str).shape[0]
    cfg['nSites'] = n_sites
    mcmcpath='eemspilot/' + wildcards.i + "/" + wildcards.name + '/'
    datapath = base(input.coord)
    ini = base(output.inifile)
    meta_data= pd.read_table(input.coord, header=None)

    # With gridsrc == 'auto' no gridpath is set here.
    if "gridsrc" in cfg and cfg['gridsrc'] == 'auto':
        pass
    else:
        cfg['gridpath'] = datapath
    create_ini_file(ini, mcmcpath, datapath,
        meta_data=meta_data,
        n_demes=n_demes, **cfg)

SnakeMake Pandas numpy From line 262 of sfiles/eems.snake

run:
    # Write the ini file for a full EEMS run, continuing from the pilot run
    # with the highest score.
    import os
    from subsetter.eems import create_diffs_file, create_ini_file
    from subsetter.load import load_pop_geo, load_indiv_meta
    from subsetter.intersect import intersect
    import pandas as pd
    import numpy as np

    # first, get best pilot: the score of a pilot is the second value minus
    # the first value found in its file
    best_posterior = -np.inf
    best_pilot = ''
    for infile in input.prevs:
        pilogl = np.loadtxt(infile)
        print(pilogl)
        posterior = pilogl[1] - pilogl[0]
        if posterior > best_posterior:
            best_posterior = posterior
            best_pilot = os.path.dirname(infile)

    # Fail with a clear message instead of an IndexError on the split below.
    if not best_pilot:
        raise ValueError("no usable pilot run found for %s" % wildcards.name)
    # The run id is the second path component of the pilot directory.
    best_run_id = best_pilot.split("/")[1]

    # then, do same stuff as for regular ini
    # The panel-specific section is optional; a missing section counts as
    # empty so that the 'continue' lookup below cannot raise a KeyError.
    name_cfg = config['eems'].get(wildcards.name, {})
    cfg = config['eems']['__default__'].copy()
    cfg.update(name_cfg)

    n_demes = cfg.pop('nDemes')
    n_sites = np.loadtxt(input.bim, str).shape[0]
    cfg['nSites'] = n_sites
    mcmcpath='eemsout/' + wildcards.i + "/" + wildcards.name + '/'

    # With 'continue' set the run keeps its own data prefix; otherwise it
    # uses the prefix of the best pilot run.
    if 'continue' in name_cfg:
        datapath = 'eems/%s-run%s' % (wildcards.name, wildcards.i)
    else:
        datapath = 'eems/%s-run%s' % (wildcards.name, best_run_id)
    coordfile = '%s.coord' % datapath
    ini = base(output.inifile)
    meta_data= pd.read_table(coordfile, header=None)

    cfg['gridpath'] = datapath
    cfg['prevpath'] = best_pilot
    create_ini_file(ini, mcmcpath, datapath,
        meta_data=meta_data,
        n_demes=n_demes, **cfg)

SnakeMake Pandas numpy From line 336 of sfiles/eems.snake

shell: 'touch {output}'

SnakeMake From line 393 of sfiles/eems.snake

run:
    # Call scripts/eems_plot/make_plots.r for this panel. The grid comes from
    # the panel's eems settings and falls back to the default grid when the
    # panel has no section or no 'grid' entry, instead of raising a KeyError.
    name_cfg = config['eems'].get(wildcards.name, {})
    if 'grid' in name_cfg:
        grid = name_cfg['grid']
    else:
        grid = config['eems']['__default__']['grid']
    # {grid} and the other placeholders are filled in by shell().
    s =  "%s scripts/eems_plot/make_plots.r" % "Rscript" #config['EXE']['R']
    s +=   " {wildcards.nruns} {wildcards.name} {grid}"
    s += " {input.pop_display} {input.pop_geo} {input.indiv_label}"
    shell(s)

SnakeMake From line 430 of sfiles/eems.snake

script: "../" + "scripts/ggeems/run_var.R"

SnakeMake From line 449 of sfiles/eems.snake

script: "../" + "scripts/ggeems/run2.R"

SnakeMake From line 463 of sfiles/eems.snake

script: "../" + "scripts/ggeems/run.R"

SnakeMake From line 477 of sfiles/eems.snake

script: "../scripts/ggeems/run_error.R"

SnakeMake From line 498 of sfiles/eems.snake

run:
    # Assemble the command line for scripts/ggeems/run_scatter.R, including
    # the excluded-sample .fam file and the excluded subset's name; the {...}
    # placeholders are filled in by shell().
    cmd_parts = [
        "%s scripts/ggeems/run_scatter.R" % config['EXE']['R'],
        " {wildcards.nruns} {wildcards.name} ",
        " {input.pop_display} {input.pop_geo} {input.indiv_label} ",
        " {input.diffs} {input.order} ",
        " {input.exfam} {wildcards.exname}",
    ]
    shell("".join(cmd_parts))

SnakeMake From line 520 of sfiles/eems.snake

script:
    "../" + "scripts/ggeems/run_just_map.R"

SnakeMake From line 555 of sfiles/eems.snake

run:
    # Write a headerless, space-separated file with three columns per
    # individual: sampleId, sampleId again, and popId.
    import pandas as pd
    within_columns = ['sampleId', 'sampleId', 'popId']
    samples = pd.read_csv(input.indiv_meta)
    samples[within_columns].to_csv(output.within, sep=" ",
                                   header=False, index=False)

SnakeMake Pandas From line 6 of sfiles/fst.snake

run:
    # Run plink --fst on the panel's fileset with the given --within file,
    # keep the fourth space-separated field of the lines containing "Mean",
    # and write it to the fstall output.
    prefix = wildcards.name
    plink_cmd = " ".join([PLINK_EXE, '--bfile', prefix,
                          '--fst --out', prefix,
                          '--within', input.within])
    extract_mean = " |grep Mean | cut -f4 -d' ' > {output.fstall}"
    shell(plink_cmd + extract_mean)

SnakeMake From line 21 of sfiles/fst.snake

run:
    # Run plink --freq gz on the panel's fileset with the given --within
    # file; the output prefix is the panel name.
    prefix = wildcards.name
    plink_cmd = " ".join([PLINK_EXE, '--bfile', prefix,
                          '--freq gz --out', prefix,
                          '--within', input.within])
    shell(plink_cmd)

SnakeMake From line 36 of sfiles/fst.snake

    script: '../scripts/get_pi_mat.py'
"""

SnakeMake From line 48 of sfiles/fst.snake

script: "../scripts/plot_fst_mat.R"

SnakeMake From line 58 of sfiles/fst.snake

run:
    # Run pbwt in paint mode on the input; the output prefix is the
    # chunkcounts path with base() applied twice.
    paint_prefix = base(base(output.chunkcounts))
    shell("%s -read {input} -paint %s 100" % (config['EXE']['pbwt'], paint_prefix))

SnakeMake From line 9 of sfiles/paintings.snake

script:
    "../scripts/get_excluded.R"

SnakeMake From line 7 of sfiles/paper_figures.snake

shell: "cp {input} {output}"

SnakeMake From line 15 of sfiles/paper_figures.snake

    script: "../" + __script__1

__script__2="scripts/table_panels.R"

SnakeMake From line 28 of sfiles/paper_figures.snake

    script: "../" + __script__2

rule remove_underscore:
    input : "rawtables/{name}.csv"

SnakeMake From line 43 of sfiles/paper_figures.snake

shell : "sed -e 's/_/ /g; ' {input} >{output}"

SnakeMake From line 48 of sfiles/paper_figures.snake

    script: "../" + __script__3

__script__4="scripts/table_loc.R"

SnakeMake From line 58 of sfiles/paper_figures.snake

    script: "../" + __script__4

# Aggregate target: has inputs only, so requesting it asks for the polygon
# plot and the three paper tables listed below.
rule all_tables:
    input:
        "paper/polygon_plot.pdf",
        'paper/table_sources.csv',
        "paper/table_panel.csv",
        'paper/table_loc.csv',

# Creates blurbs/{name}.tex with `touch`, i.e. an empty file if none exists.
rule tex_blurb:
    output: "blurbs/{name}.tex",
    shell: "touch {output}"

SnakeMake From line 77 of sfiles/paper_figures.snake

run:
    # Fill the report template with the figure block and the panel name,
    # write it to the .tex output, then compile it with pdflatex into the
    # reports/ directory.
    tex_source = (reportshell % figuretex).format(name=wildcards.name)
    with open(output.tex, 'w') as tex_file:
        tex_file.write(tex_source)
    shell("pdflatex -aux-directory=reports -output-directory=reports  {output.tex}")

SnakeMake From line 174 of sfiles/paper_figures.snake

    script: "../" + __script__5

rule list_exclusion_rules:
    input:
        pop_display=_POP_DISPLAY_,
    output: "excl/{name}.excl"

SnakeMake From line 193 of sfiles/paper_figures.snake

run:
    # Write the pop_display rows of all populations excluded from this
    # subset, tagged with the subset name in a 'run' column.
    import pandas as pd
    cfg = load_subset_config(config['subset'], wildcards.name)
    print(cfg['exclude_pop'])
    x = pd.read_csv(input.pop_display)
    # .copy() makes the filtered rows an independent frame, so adding the
    # 'run' column below does not trigger pandas' SettingWithCopyWarning.
    x = x[x.popId.isin(cfg['exclude_pop'])].copy()
    x['run'] = wildcards.name
    x.to_csv(output[0], index=False)

SnakeMake Pandas From line 199 of sfiles/paper_figures.snake

script:
    "../" + "scripts/composite_fig.R"

SnakeMake From line 229 of sfiles/paper_figures.snake

shell:
    "tar czvf {output} {input}"

SnakeMake From line 238 of sfiles/paper_figures.snake

run: 
    run_pca(input, output, params, wildcards, config)

SnakeMake From line 23 of sfiles/pca.snake

run: 
    run_pca(input, output, params, wildcards, config)

SnakeMake From line 36 of sfiles/pca.snake

shell:
    "Rscript {input.__script__} {input.loadings} {input.bimfile} "
    "{params.region_bp} {params.abs_cutoff} {output.outliers} "

SnakeMake From line 49 of sfiles/pca.snake

script: "../" +__script__6
#shell: config['EXE']['R']  + " {input.__script__} {input.pc} {input.fam}"

SnakeMake From line 73 of sfiles/pca.snake

    script: "../" +__script__6

# Runs scripts/pca/run_2d.R for one subset {name}.
# Inputs are the flashpca '.pc' and '.pve' files and the 'median' '.pc' file
# for N_PC dimensions, the subset's fam/indiv_meta/pop_order files and the
# global population display/geo tables.
# Outputs are PNG + RDS files for PC=1 and PC=3 (range(1,5,2)) plus a map
# PNG/RDS under figures/paper/.
rule make_2d_pc_plots:
    input:
        pc='pca/flash_{name}_dim' + N_PC + '.pc',
        median='pca/median_{name}_dim' + N_PC + '.pc',
        fam='subset/{name}.fam',
        indiv_meta='subset/{name}.indiv_meta',
        pop_display=_POP_DISPLAY_,
        pop_order="subset/{name}.pop_order",
        pve="pca/flash_{name}_dim" + N_PC + '.pve',
        pop_geo=_POP_GEO_,
        # The script and its library are listed as inputs, so editing either
        # file makes the outputs stale.
        __script__='scripts/pca/run_2d.R',
        _libscript='scripts/ggpca2d.R',
    params:
        wdf=False
    output:
        # name=['{name}'] keeps {name} as a wildcard while expand() only
        # enumerates PC.
        pc2=expand('figures/pca/2d/{name}_pc{PC}.png', 
            PC=range(1,5,2), name=['{name}']),
        pc2rds=expand('figures/pca/2d/{name}_pc{PC}.rds', 
            PC=range(1,5,2), name=['{name}']),
        out_map_rds="figures/paper/map_{name}.rds",
        out_map_png="figures/paper/map_{name}.png",
    script: "../" +"scripts/pca/run_2d.R"

SnakeMake From line 99 of sfiles/pca.snake

script: "../" +"scripts/pca/run_2d.R"

SnakeMake From line 165 of sfiles/pca.snake

script: "../scripts/pca/pve.R"

SnakeMake From line 173 of sfiles/pca.snake

    script: "../" + script_median

# Runs the PCA plotting script (__script__6, defined elsewhere in this file)
# for subset {name} together with the fam file of a second subset {exname}.
# Outputs: 1-d plots for PC 1..20 and 2-d plots for PC 1,3,...,19 under
# figures/pcaex/.
rule make_pc_plots_highlight_excluded:
    input:
        # The pattern used to be 'pca/flash_{name}_dim{NPC, \d+}.pc'. NPC
        # occurs in no output file, so Snakemake could not determine it and
        # the rule was unusable. Use the N_PC-based pattern that the sibling
        # plotting rules use instead.
        # NOTE(review): confirm N_PC covers the 20 PCs requested below.
        pc='pca/flash_{name}_dim' + N_PC + '.pc',
        fam='subset/{name}.fam',
        exfam='subset/{exname}.fam',
        indiv_meta='subset/{name}.indiv_meta',
        pop_display=_POP_DISPLAY_,
        pop_geo=_POP_GEO_,
        pop_order="subset/{name}.pop_order",
        __script__='scripts/ggpca.R'
    params:
        wdf=False
    output:
        # name/exname are re-inserted as literal '{name}'/'{exname}' so that
        # they stay wildcards while expand() enumerates PC.
        pc1=expand('figures/pcaex/pc1d_{name}_ex:{exname}_pc{PC}.png',
            PC=range(1,21), name=['{name}'], exname=['{exname}']),
        pc2=expand('figures/pcaex/pc2d_{name}_ex:{exname}_pc{PC}.png',
            PC=range(1,21,2), name=['{name}'], exname=['{exname}']),
    script: "../" +__script__6

# Path of the R script run by make_loadings_plots (relative to the repo root).
__script__7='scripts/pcaloadings.R'
# Runs scripts/pcaloadings.R on the flashpca '.load' file (N_PC dimensions)
# and the subset's bim file, producing one PNG per PC for PC 1..10.
rule make_loadings_plots:
    input:
        load='pca/flash_{name}_dim' + N_PC + '.load',
        bim='subset/{name}.bim'  ,
        # Same path as __script__7; listing it as an input makes the outputs
        # stale when the script changes.
        __script__='scripts/pcaloadings.R'
    output:
        fig=expand('figures/pca/loadings_{name}_pc{PC}.png', 
            PC=range(1,11), name=['{name}']),
    script: '../' + __script__7
    #shell:
    #    config['EXE']['R'] + " {input.__script__} {input.load} "

SnakeMake From line 186 of sfiles/pca.snake

    script:
        "../" + __script__8


# Path of the R script run by pca_vs_gen (relative to the repo root).
__script__9='scripts/pca_vs_geo.R'
# Runs scripts/pca_vs_geo.R on a flashpca '.pc' file, EEMS files
# (ipmap, diffs, order) and the subset metadata. It writes three PNG figures
# (pcvsdist, pcvsgrid, rsq) and the matching '.rds' files, all keyed by
# {name} and {npcs}.
rule pca_vs_gen:
    input:
        # NOTE(review): the wildcard NPC appears in no output file (outputs
        # use {npcs}), so Snakemake cannot determine it from a requested
        # target. Wildcard constraints are also only meaningful in output
        # patterns. Presumably this should be either '{npcs}' or the N_PC
        # constant used by other rules in this file -- confirm the intent.
        pc='pca/flash_{name}_dim{NPC, \d+}.pc',
        ipmap='eemsout/0/{name}/ipmap.txt',
        fam='subset/{name}.fam',
        indiv_meta='subset/{name}.indiv_meta',
        pop_display=_POP_DISPLAY_,
        pop_order="subset/{name}.pop_order",
        pop_geo='subset/{name}.pop_geo',
        diffs='eems/{name}.diffs',
        order='eems/{name}.order',
        # Same path as __script__9; listed so script edits trigger a re-run.
        __script__='scripts/pca_vs_geo.R',
    output:
        pcvsdist='figures/pcvsdist/{name}_pc1-{npcs}.png',
        pcvsgrid='figures/pcvsgrid/{name}_pc1-{npcs}.png',
        rsq='figures/rsq/{name}_pc1-{npcs}.png',
        ggpcvsdist='figures/pcvsdist/{name}_pc1-{npcs}.rds',
        ggpcvsgrid='figures/pcvsgrid/{name}_pc1-{npcs}.rds',
        ggrsq='figures/rsq/{name}_pc1-{npcs}.rds',
    script: "../" +__script__9


# Runs scripts/run_synthmap.R for subset {name} and writes
# figures/pca/synthmap/{name}_PC1.png.
rule synthmap:
    input:
        # The pattern used to be 'pca/flash_{name}_dim{NPC, \d+}.pc'. NPC
        # occurs in no output file, so Snakemake could not determine it and
        # the rule was unusable. Use the N_PC-based pattern that the other
        # PCA plotting rules in this file use.
        pc='pca/flash_{name}_dim' + N_PC + '.pc',
        fam='subset/{name}.fam',
        indiv_meta='subset/{name}.indiv_meta',
        pop_display=_POP_DISPLAY_,
        pop_order="subset/{name}.pop_order",
        polygon="subset/{name}.polygon",
        pop_geo=_POP_GEO_,
        # Script and library are inputs so that editing them re-triggers
        # the rule.
        __script__='scripts/run_synthmap.R',
        _libscript='scripts/synthmap.R',
    output:
        plot0="figures/pca/synthmap/{name}_PC1.png",
    script: "../" + "scripts/run_synthmap.R"

SnakeMake From line 231 of sfiles/pca.snake

run:
    # Assemble the shell command template for one admixture run.
    # NOTE(review): this snippet only builds the string `s`; the code that
    # fills the '%s' placeholder and executes the command is not shown here.
    name, i, k = wildcards.name, wildcards.i, wildcards.k
    # Seed derived from the run index i and the value k.
    seed = int(i) * 23 + int(k) * 1541
    # Work inside admixture/{name}/{i}; paths below climb three levels back.
    s = 'cd admixture/{name}/{i};'
    s += 'ln -sfr ../../../{input.bed} {name}.bed &&'
    s += 'ln -sfr ../../../{input.fam} {name}.fam &&'
    # Copy the bim file with its first column replaced by the constant 1
    # ({{ }} are escaped braces for the later string formatting).
    s += "awk '{{print 1,$2,$3,$4,$5,$6}}' ../../../{input.bim} > {name}.bim && "
    # '%s' is a placeholder for the executable, to be filled in later.
    s += '%s {name}.bed {k}'
    s += ' --seed={seed} '
    s += ' > ../../../{log}; '
    # Return to the previous directory and keep only log lines starting
    # with 'Logl'.
    s += ' cd - ; grep ^Logl {log} > {output.LL}'

SnakeMake From line 48 of sfiles/pong.snake

run:
    # Write the filemap: one tab-separated line per input file holding
    # a run id, the K value and the path to the file (prefixed with '../').
    with open(output.filemap, 'w') as filemap:
        for q_path in input:
            path_parts = q_path.split("/")
            # The third path component identifies the run.
            run_number = path_parts[2]
            file_name = path_parts[len(path_parts) - 1]
            name_parts = file_name.split(".")
            # K is the second-to-last dot-separated field of the file name.
            k = name_parts[len(name_parts) - 2]

            run_id = (file_name + "_" + run_number).replace(".", "_")
            filemap.write("{}\t{}\t../{}\n".format(run_id, k, q_path))

SnakeMake From line 67 of sfiles/pong.snake

run:
    # Build the two label files:
    #   * output.ind2pop   - one popId per individual (no header), in the
    #                        row order of indiv_meta;
    #   * output.pop_names - tab-separated 'popId<TAB>name' per distinct
    #                        population, sorted by the 'order' column.
    import pandas as pd

    pop_display = pd.read_csv(input.pop_display)
    pop_geo = pd.read_csv(input.pop_geo)
    indiv_meta = pd.read_csv(input.indiv_meta)
    pop_order = pd.read_csv(input.pop_order)

    # Drop the 'order' column of the display table before merging in the
    # pop_order file. The keyword form replaces the positional axis argument
    # (`drop('order', 1)`), which current pandas no longer accepts.
    pop_display = pop_display.drop(columns='order')
    pop_display = pd.merge(pop_display, pop_order, how='left')

    indiv = pd.merge(indiv_meta, pop_display, how='left')
    indiv = pd.merge(indiv, pop_geo, how='left')
    # The left merges must neither reorder nor duplicate individuals.
    assert all(indiv.sampleId == indiv_meta.sampleId)

    indiv.to_csv(output.ind2pop, columns=['popId'], index=False,
        header=False)

    # Chain the operations instead of mutating a column slice in place
    # (avoids SettingWithCopy warnings). sort_values() replaces the removed
    # `sort_index(by=...)` spelling.
    pops = (indiv[['popId', 'name', 'latitude', 'longitude', 'order']]
        .drop_duplicates()
        .sort_values(by=['order'], ascending=[True]))
    pops.to_csv(output.pop_names, sep="\t",
        columns=['popId', 'name'],
        index=False, header=False)

SnakeMake Pandas From line 90 of sfiles/pong.snake

run:
    args= [EXE_PONG, '-fgv -c 0', 
        '--filemap', input.filemap[0],
        '--ind2pop', input.ind2pop,
        '--output_dir', 'pong/' + wildcards.name,

SnakeMake From line 123 of sfiles/pong.snake

run:
    # Reshape the input table with R: keep the CLST, MAC and SNP columns,
    # spread MAC into one column per CLST value, drop SNP and write the
    # result as CSV.
    # TRUE/FALSE are spelled out instead of T/F, which are ordinary
    # (reassignable) variables in R.
    # NOTE(review): spread() is superseded by tidyr::pivot_wider(); it is
    # kept here because the column order of the two may differ.
    r_code = """library(dplyr); library(tidyr);
        x <- read.table('{input}', header=TRUE) %>%
            select(CLST, MAC, SNP) %>%
            spread(key=CLST, value=MAC) %>%
            select(-SNP) %>% write.csv('{output}', row.names=FALSE)
    """
    # `R -e` receives a single argument, so collapse the script onto one line.
    r_code = r_code.replace("\n", " ")
    # shell() fills in {input} and {output} after the %-substitution.
    shell("""R -e "%s" """ % r_code)

SnakeMake tidyr From line 9 of sfiles/spacemix.snake

run:
    # Reshape the input table with R: keep the CLST, NCHROBS and SNP
    # columns, spread NCHROBS into one column per CLST value, drop SNP and
    # write the result as CSV.
    # TRUE/FALSE are spelled out instead of T/F, which are ordinary
    # (reassignable) variables in R.
    # NOTE(review): spread() is superseded by tidyr::pivot_wider(); it is
    # kept here because the column order of the two may differ.
    r_code = """library(dplyr); library(tidyr);
        x <- read.table('{input}', header=TRUE) %>%
            select(CLST, NCHROBS, SNP) %>%
            spread(key=CLST, value=NCHROBS) %>%
            select(-SNP) %>% write.csv('{output}', row.names=FALSE)
    """
    # `R -e` receives a single argument, so collapse the script onto one line.
    r_code = r_code.replace("\n", " ")
    # shell() fills in {input} and {output} after the %-substitution.
    shell("""R -e "%s" """ % r_code)

SnakeMake tidyr From line 26 of sfiles/spacemix.snake

script : "../scripts/run_spacemix.R"

SnakeMake From line 47 of sfiles/spacemix.snake

shell: 'touch {output}'

SnakeMake From line 61 of sfiles/spacemix.snake

script:
    "../scripts/plot_spacemix.R"

SnakeMake From line 72 of sfiles/spacemix.snake

run:
    # Step 1: have plink recode the bed/bim/fam set as an A-transpose
    # table, written next to output.traw.
    recode_cmd = "{exe} --bfile {src} --allow-extra-chr --recode A-transpose --out {dst}".format(
        exe=config['EXE']['plink'],
        src=base(input.bed),
        dst=base(output.traw))
    shell(recode_cmd)
    # Step 2: keep columns 7 onwards, skip the header line, turn every
    # 'NA' into 9 and remove the tab separators.
    shell("cut -f7- {output.traw} | tail -n+2 | sed 's/NA/9/g; s/\t//g'  > {output.tess}")

SnakeMake pLink From line 11 of sfiles/tess.snake

script : "../scripts/make_tess_input.R"

SnakeMake From line 28 of sfiles/tess.snake

run:
    # Seed is a deterministic function of the K and RUN wildcards.
    seed = int(wildcards.K) * 1241 + int(wildcards.RUN) * 31
    # (flag, value) pairs in the order they appear on the command line;
    # the '-y' summary output stays disabled, as before.
    flag_values = [
        ('-K', wildcards.K),
        ('-x', input.geno),
        ('-r', input.coords),
        ('-q', output.Q),
        ('-g', output.G),
        ('-f', output.FST),
        ('-s', str(seed)),
    ]
    cmd = [config['EXE']['tess']]
    for flag, value in flag_values:
        cmd += [flag, value]
    shell(" ".join(cmd))

SnakeMake From line 47 of sfiles/tess.snake

shell: 'touch {output}'

SnakeMake From line 70 of sfiles/tess.snake

script: "../" + "scripts/plot_tess2.R"

SnakeMake From line 84 of sfiles/tess.snake

run:
    # Delegate to the plink2treemix helper (defined outside this snippet),
    # passing the frq_strat input path and the treemix_in output path.
    plink2treemix(input.frq_strat, output.treemix_in)

SnakeMake From line 45 of sfiles/treemix.snake

run:
    # `outname` and `seed` are substituted by name into the command
    # template below, so these two local names must stay as they are.
    outname = base(base(output.cov))
    seed = params.seed_base * 23 + int(wildcards.m) * 19 + int(wildcards.run)
    # Command template; shell() fills in the {...} fields. stderr goes to
    # the log file, stdout is discarded.
    template_parts = [
        config['EXE']['treemix'],
        ' -i {input} ',
        '-m {wildcards.m} ',
        '-o {outname} ',
        '-k {params.blocksize} ',
        '-seed {seed} ',
        '2> {log} > /dev/null',
    ]
    shell("".join(template_parts))

SnakeMake From line 64 of sfiles/treemix.snake

shell: 'touch {output}'

SnakeMake From line 85 of sfiles/treemix.snake

run:
    # Pick the treemix run with the highest final log-likelihood among the
    # inputs, then generate and execute two small R scripts (written to
    # output.p1 / output.p2) that plot its tree and its residuals.
    import numpy as np

    # These locals are substituted *by name* into the R templates below when
    # shell() formats the command, so their names must not change.
    # They used to be assigned with a trailing comma, which made
    # `__script__` and `pop_display` 1-tuples. That only worked because
    # Snakemake's formatter joins sequence elements; plain values are used now.
    p1, p2 = output.p1, output.p2
    __script__ = 'scripts/plot_treemix_lib.R'
    pop_display = _POP_DISPLAY_
    #__script__ = input.lib
    #pop_display = input.pop_display
    print(input)

    # Strip two extensions from every input to get the treemix run prefix.
    run_prefixes = [base(base(path)) for path in input]

    # (prefix, log-likelihood) of the best run seen so far; keeps the first
    # run in case of ties.
    max_llik = 'NONE', -np.inf
    for prefix in run_prefixes:
        with open("%s.llik" % prefix) as llik_file:
            fields = llik_file.read().split()
            print(fields, len(fields))
            # The last whitespace-separated token of the file is the value
            # that gets compared.
            llik = float(fields[-1])
            if llik > max_llik[1]:
                max_llik = prefix, llik

    s = """
        source("{__script__}")
        png(file="{output.treeplot}", width=1600, height=1200)
        plot_tree("{max_llik[0]}")
        dev.off()
        """
    # The template contains no single quotes, so it can be echoed verbatim.
    shell("echo '%s' > {p1}" %s)
    shell("Rscript {p1}")
    #shell("Rscript tmp.R")

    s = """
        source("{__script__}")
        x = read.table(gzfile("{max_llik[0]}.cov.gz"), check.names=F)
        n <- data.frame(popId=gsub("_", " ", names(x)))
        pop_display <- read.csv("{pop_display}")       
        m <- merge(n, pop_display, all.x=T)
        m <- m[order(m$order),"popId"]
        write.table(gsub(" ", "_", m), "{output.tmp}", row.names=F, quote=F, col.names=F)

        png(file="{output.residplot}", width=1600, height=1200)
        plot_resid("{max_llik[0]}", "{output.tmp}")
        dev.off()
    """
    shell("echo '%s' > {p2}" %s)
    shell("Rscript {p2}")
    #shell("Rscript tmp.R")

SnakeMake numpy From line 98 of sfiles/treemix.snake

run:
    # Write a three-column cluster file (sampleId, sampleId, popId) and run
    # plink --freq --within on it.
    import pandas as pd

    clusters = pd.read_csv(input.indiv_meta)
    clusters = clusters[['sampleId', 'sampleId', 'popId']]
    try:
        # Replace spaces in population ids by underscores.
        clusters.popId = clusters.popId.str.replace(' ', '_')
    except AttributeError:
        # popId has no .str accessor (non-string column): leave it as is.
        pass
    clusters.to_csv(output.pops, sep=" ", index=False)
    # Also leaves a copy of the cluster file as 'tmpf' in the working dir.
    shell('cp {output.pops} tmpf')

    plink_args = [
        PLINK_EXE,
        '--bfile', base(input.bed),
        '--freq',
        '--within', output.pops,
        '--out', base(base(output.frq)),
        '--allow-extra-chr',
    ]
    shell(" ".join(plink_args))

SnakeMake Pandas From line 12 of sfiles/utils.snake

run:
    # Recode the plink fileset named by the {name} wildcard as a
    # bgzip-compressed VCF with the same prefix.
    # `n` is substituted by name when shell() formats the command.
    n = wildcards.name
    shell('plink --bfile {n} --recode vcf-iid bgz --out {n}')

SnakeMake pLink From line 38 of sfiles/utils.snake

run:
    # Run the configured pbwt executable: read genotypes from the VCF
    # input and write output.pbwt. {input.vcf} is filled in by shell().
    pbwt_cmd = config['EXE']['pbwt'] + ' -readVcfGT {input.vcf} -write ' + output.pbwt
    shell(pbwt_cmd)

SnakeMake From line 49 of sfiles/utils.snake

run:
    # Join the provenance and label tables on their shared columns
    # (pandas' default inner merge) and write the result as indiv_meta.
    import pandas as pd

    provenance = pd.read_csv(input.indiv_prov)
    labels = pd.read_csv(input.indiv_label)
    provenance.merge(labels).to_csv(output.indiv_meta, index=False)

SnakeMake Pandas From line 60 of sfiles/utils.snake

script: "../scripts/sample_plot.R"

SnakeMake From line 73 of sfiles/utils.snake

script : "../scripts/hwe.R"

SnakeMake From line 80 of sfiles/utils.snake

run:
    # Run plink --hardy on the input fileset; the --out prefix is derived
    # from the expected output.hwe path.
    hardy_cmd = "%s --bfile %s --hardy --out %s" % (
        PLINK_EXE, base(input.bed), base(output.hwe))
    shell(hardy_cmd)

SnakeMake From line 88 of sfiles/utils.snake

run:
    # Read the HWE table and write a single number to output.hwe: the
    # minimum of column 9 over the rows where column 7 exceeds column 8.
    # NOTE(review): presumably columns 7/8/9 are observed het., expected
    # het. and the p-value of plink's .hwe output -- confirm.
    # TRUE is spelled out instead of T, which is a reassignable variable
    # in R.
    s = """a <- read.table("{input.hwe}", as.is=TRUE, header=TRUE)

        cat(min(a[a[,7] > a[,8],9], na.rm=TRUE), file="{output.hwe}")"""
    R(s)

SnakeMake From line 100 of sfiles/utils.snake

run:
    # Left-join the individual metadata with the population geo table and
    # then the population display table, and write the combined table as CSV.
    # library() replaces require(): require() only returns FALSE with a
    # warning when the package is missing, which delays the failure to the
    # first `%>%`. FALSE replaces the reassignable shorthand F.
    R("""library(tidyverse);
        read.csv("{input.indiv_meta}") %>%
            left_join(read.csv("{input.pop_geo}")) %>%
            left_join(read.csv("{input.pop_display}")) %>%
        write.csv("{output.indiv_full}", row.names=FALSE) """)

SnakeMake From line 114 of sfiles/utils.snake

run:
    # Delegate to the snakemake_subsetter helper (defined outside this
    # snippet), passing the rule's input and output objects and the value
    # of the {name} wildcard.
    snakemake_subsetter(input, output, wildcards.name)

SnakeMake From line 242 of master/Snakefile

run:
    # Copy subset_nopca/{name} to subset/{name} with plink --make-bed.
    # When the subset's configuration has a truthy 'no_pca' entry, the SNPs
    # listed in input.outliers are excluded along the way.
    subset_cfg = config['subset'][wildcards.name]
    cmd = ('{PLINK_EXE} --allow-extra-chr --bfile subset_nopca/{wildcards.name} '
           ' --out subset/{wildcards.name} --make-bed')
    if 'no_pca' in subset_cfg and subset_cfg['no_pca']:
        cmd += ' --exclude {input.outliers} '
    # shell() fills in the {...} fields from the rule's context.
    shell(cmd)