Admixtools Multi-qpAdm automation script

Jovialis · Sep 8, 2024

I used ChatGPT to help craft a script that streamlines testing multiple samples to see how well they fit within an existing population model. This approach makes it easier to organize and assess whether a particular sample or group of samples works in combination with another set you're interested in comparing.

For instance, in the example below, I'm testing Steppe_EMBA and Italy_Mesolithic.SG against a range of Neolithic to Early Bronze Age Aegean samples:

Code:

# Sarno et al. 2021 R-script CHG & Iran_N combined Outgroup Version

# Define paths for dataset
prefix = "D:\\Bioinformatics\\01_Admixtools_Dataset\\v54.1.p1_HO_Jovialis_Plink\\v54.1.p1_HO_Jovialis"
my_f2_dir = "D:\\Bioinformatics\\my_f2_dir_Jovialis"

# Load necessary libraries
library(admixtools)
library(tidyverse)

# Define target population
target = c('Jovialis')  # Update this if your sample has a different name in the merged dataset

# Define right populations
right = c('Mota', 'Ust_Ishim', 'Kostenki14', 'MA1_HG', 'Goyet', 'ElMiron',
          'Vestonice16', 'Villabruna', 'EHG', 'Levant_N', 'Anatolia_N',
          'WHG', 'CHG-Iran_N', 'Natufian')

# Generate f2 stats
mypops = c(right, target)
extract_f2(prefix, my_f2_dir, pops = mypops, overwrite = TRUE, maxmiss = 1)
f2_blocks = f2_from_precomp(my_f2_dir, pops = mypops, afprod = TRUE)

# Define the list of optional populations for each set
optional_pops = c(
    'C_Italian_N', 'Italy_Sicily_EN', 'Italy_Sicily_N_Stentinello', 'Italy_Sicily_MN', 'Greece_NeaNikomedeia_EN.SG', 'Greece_N', 'Greece_Peloponnese_N', 'Greece_LN_2.SG', 'Greece_Minoan_Lassithi', 'Greece_Minoan_Odigitria', 'Greece_Minoan_Kephala_Petras.SG', 'Greece_Koufonisi_Cycladic_EBA.SG', 'I2683-Anatolia_EBA_Isparta'
)

# Define the sets
sets = list(
    list(must_use = c('Steppe_EMBA', 'Italy_Mesolithic.SG'))
)

# Initialize list to store all results
all_results_list = list()

# Function to create combinations and run qpadm
run_qpadm_for_set <- function(set_must_use, optional_pops, prefix, right, target) {
    results_list <- list()  # Initialize an empty list to store results within the function
   
    for (pop in optional_pops) {
        left_set = c(set_must_use, pop)
       
        # Run qpadm model
        results = qpadm(prefix, left_set, right, target, allsnps = TRUE)
       
        # Store results in the list
        results_list[[length(results_list) + 1]] <- list(left = left_set, weights = results$weights, popdrop = results$popdrop)
       
        # Print summary of results
        cat("Testing Left Set:", paste(left_set, collapse = ", "), "\n")
        cat("Weights:\n")
        print(results$weights)
        cat("Popdrop Analysis:\n")
        print(results$popdrop)
        cat("----------------------------------------------------\n")
    }
   
    return(results_list)
}

# Iterate over each set and run qpadm for all combinations
for (i in 1:length(sets)) {
    cat("Running Set", i, "\n")
    set_results <- run_qpadm_for_set(sets[[i]]$must_use, optional_pops, prefix, right, target)
   
    # Append the results of the current set to the overall results list
    all_results_list[[i]] <- set_results
}

# all_results_list now contains results from all sets and combinations

 [code/]

Jovialis · Sep 23, 2024

Here is an optimized version that eliminates redunant processing time:

Description-
This script runs a parallelized ancestry analysis using qpAdm, comparing a target population with predefined reference populations and multiple optional populations, leveraging precomputed f2 statistics from a large genomic dataset (Plink format AADR 62.0).

Code:

# ---- Skourtanioti et al. 2023 qpAdm Reference model (Plink format AADR 62.0) ----

# Set the correct dataset path
prefix <- "D:\\Bioinformatics\\01_Admixtools_Dataset\\V62.0_HO_Plink_Merged_Jovialis\\v62.0_Jovialis_Merged"
my_f2_dir <- "D:\\Bioinformatics\\01_Admixtools_Dataset\\V62.0_HO_Plink_Merged_Jovialis\\my_f2_dir_v62.0_Jovialis_Merged"

# Load necessary libraries
library(admixtools)
library(tidyverse)
library(parallel)

# Define target population
target = c('Jovialis')

# Define right populations
right <- c('Ethiopia_4500BP.AG', 'Russia_UstIshim_IUP.DG', 'Russia_Kostenki14_UP.SG',
           'Serbia_IronGates_Mesolithic.AG', 'WHG', 'CHG', 'EEHG',
           'W_Anatolia_N', 'Iran_GanjDareh_N.AG', 'Israel_Natufian.AG', 'ANE')

# Generate f2 stats only once to avoid redundant computations
mypops = c(right, target)
extract_f2(prefix, my_f2_dir, pops = mypops, overwrite = TRUE, maxmiss = 1)
f2_blocks = f2_from_precomp(my_f2_dir, pops = mypops, afprod = TRUE)

# Define the list of optional populations for each set
optional_pops = c('Italy_SanGiovanni_IA.SG', 'Italy_Salapia_Daunian.SG',
                  'Italy_Ordona_Daunian.SG', 'Italy_Sicily_EBA.AG',
                  'Italy_Sicily_IA.AG', 'Italy_Broion_EBA.SG', 'Italy_Broion_BA.SG',
                  'Italy_ReginaMargherita_BA.SG', 'Italy_PianSultano_BA.SG',
                  'Italy_IA_Proto-Villanovan.SG', 'Italy_IA_Republic.SG',
                  'Italy_Bivio_Roman.SG', 'Italy_TarquiniaMonterozzi_IA.SG')

# Define the sets
sets = list(
    list(must_use = c('Italy_Sicily_Himera_409BCE.AG'))
)

# Initialize list to store all results
all_results_list = list()

# Parallelization setup
num_cores <- detectCores() - 1  # Reserve 1 core for system tasks
cl <- makeCluster(num_cores)

# Export necessary objects to the cluster
clusterEvalQ(cl, library(admixtools))
clusterExport(cl, list("prefix", "right", "target", "qpadm", "optional_pops", "sets"))

# Function to create combinations and run qpadm in parallel
run_qpadm_for_set_parallel <- function(set_index, sets, optional_pops, prefix, right, target, cl) {
    set_must_use <- sets[[set_index]]$must_use
   
    results_list <- parLapply(cl, optional_pops, function(pop) {
        left_set = c(set_must_use, pop)
       
        # Run qpadm model and handle errors gracefully
        tryCatch({
            results = qpadm(prefix, left_set, right, target, allsnps = TRUE)
            return(list(left = left_set, weights = results$weights, popdrop = results$popdrop))
        }, error = function(e) {
            return(list(left = left_set, error = e$message))
        })
    })
    return(results_list)
}

# Iterate over each set and run qpadm for all combinations in parallel
for (i in 1:length(sets)) {
    cat("Running Set", i, "\n")
    set_results <- run_qpadm_for_set_parallel(i, sets, optional_pops, prefix, right, target, cl)
   
    # Append the results of the current set to the overall results list
    all_results_list[[i]] <- set_results
}

# Stop the parallel cluster after processing
stopCluster(cl)

# View results
print(all_results_list)

Jalisciense · Nov 5, 2024

What is the process? I mean where do I paste the code? I make a new file .txt in my Bin folder and paste there? Or how?

Ancestry · Nov 5, 2024

Could you make some sort of blog post on the averages for all the samples you've ran?

Admixtools Multi-qpAdm automation script

Jovialis

Advisor

Jovialis

Advisor

Jalisciense

Regular Member

Ancestry

Junior Member