• Don't want to see ads? Install an adblocker like uBlock Origin or use a Europe-based privacy-friendly browser like Vivaldi or Mullvad.

Admixtools Multi-qpAdm automation script

Jovialis

Advisor
Messages
9,888
Reaction score
6,794
Points
113
Ethnic group
Italian
Y-DNA haplogroup
R1b-PF7566>Y227216
mtDNA haplogroup
H6a1b7
I used ChatGPT to help craft a script that streamlines testing multiple samples to see how well they fit within an existing population model. This approach makes it easier to organize and assess whether a particular sample or group of samples works in combination with another set you're interested in comparing.

For instance, in the example below, I'm testing Steppe_EMBA and Italy_Mesolithic.SG against a range of Neolithic to Early Bronze Age Aegean samples:

Code:
# Sarno et al. 2021 R-script CHG & Iran_N combined Outgroup Version

# Define paths for dataset
prefix = "D:\\Bioinformatics\\01_Admixtools_Dataset\\v54.1.p1_HO_Jovialis_Plink\\v54.1.p1_HO_Jovialis"
my_f2_dir = "D:\\Bioinformatics\\my_f2_dir_Jovialis"

# Load necessary libraries
library(admixtools)
library(tidyverse)

# Define target population
target = c('Jovialis')  # Update this if your sample has a different name in the merged dataset

# Define right populations
right = c('Mota', 'Ust_Ishim', 'Kostenki14', 'MA1_HG', 'Goyet', 'ElMiron',
          'Vestonice16', 'Villabruna', 'EHG', 'Levant_N', 'Anatolia_N',
          'WHG', 'CHG-Iran_N', 'Natufian')

# Generate f2 stats
mypops = c(right, target)
extract_f2(prefix, my_f2_dir, pops = mypops, overwrite = TRUE, maxmiss = 1)
f2_blocks = f2_from_precomp(my_f2_dir, pops = mypops, afprod = TRUE)

# Define the list of optional populations for each set
optional_pops = c(
    'C_Italian_N', 'Italy_Sicily_EN', 'Italy_Sicily_N_Stentinello', 'Italy_Sicily_MN', 'Greece_NeaNikomedeia_EN.SG', 'Greece_N', 'Greece_Peloponnese_N', 'Greece_LN_2.SG', 'Greece_Minoan_Lassithi', 'Greece_Minoan_Odigitria', 'Greece_Minoan_Kephala_Petras.SG', 'Greece_Koufonisi_Cycladic_EBA.SG', 'I2683-Anatolia_EBA_Isparta'
)

# Define the sets
sets = list(
    list(must_use = c('Steppe_EMBA', 'Italy_Mesolithic.SG'))
)

# Initialize list to store all results
all_results_list = list()

# Function to create combinations and run qpadm
run_qpadm_for_set <- function(set_must_use, optional_pops, prefix, right, target) {
    results_list <- list()  # Initialize an empty list to store results within the function
   
    for (pop in optional_pops) {
        left_set = c(set_must_use, pop)
       
        # Run qpadm model
        results = qpadm(prefix, left_set, right, target, allsnps = TRUE)
       
        # Store results in the list
        results_list[[length(results_list) + 1]] <- list(left = left_set, weights = results$weights, popdrop = results$popdrop)
       
        # Print summary of results
        cat("Testing Left Set:", paste(left_set, collapse = ", "), "\n")
        cat("Weights:\n")
        print(results$weights)
        cat("Popdrop Analysis:\n")
        print(results$popdrop)
        cat("----------------------------------------------------\n")
    }
   
    return(results_list)
}

# Iterate over each set and run qpadm for all combinations
for (i in 1:length(sets)) {
    cat("Running Set", i, "\n")
    set_results <- run_qpadm_for_set(sets[[i]]$must_use, optional_pops, prefix, right, target)
   
    # Append the results of the current set to the overall results list
    all_results_list[[i]] <- set_results
}

# all_results_list now contains results from all sets and combinations

 [code/]
 
Here is an optimized version that eliminates redunant processing time:

Description-
This script runs a parallelized ancestry analysis using qpAdm, comparing a target population with predefined reference populations and multiple optional populations, leveraging precomputed f2 statistics from a large genomic dataset (Plink format AADR 62.0).

Code:
# ---- Skourtanioti et al. 2023 qpAdm Reference model (Plink format AADR 62.0) ----

# Set the correct dataset path
prefix <- "D:\\Bioinformatics\\01_Admixtools_Dataset\\V62.0_HO_Plink_Merged_Jovialis\\v62.0_Jovialis_Merged"
my_f2_dir <- "D:\\Bioinformatics\\01_Admixtools_Dataset\\V62.0_HO_Plink_Merged_Jovialis\\my_f2_dir_v62.0_Jovialis_Merged"

# Load necessary libraries
library(admixtools)
library(tidyverse)
library(parallel)

# Define target population
target = c('Jovialis')

# Define right populations
right <- c('Ethiopia_4500BP.AG', 'Russia_UstIshim_IUP.DG', 'Russia_Kostenki14_UP.SG',
           'Serbia_IronGates_Mesolithic.AG', 'WHG', 'CHG', 'EEHG',
           'W_Anatolia_N', 'Iran_GanjDareh_N.AG', 'Israel_Natufian.AG', 'ANE')

# Generate f2 stats only once to avoid redundant computations
mypops = c(right, target)
extract_f2(prefix, my_f2_dir, pops = mypops, overwrite = TRUE, maxmiss = 1)
f2_blocks = f2_from_precomp(my_f2_dir, pops = mypops, afprod = TRUE)

# Define the list of optional populations for each set
optional_pops = c('Italy_SanGiovanni_IA.SG', 'Italy_Salapia_Daunian.SG',
                  'Italy_Ordona_Daunian.SG', 'Italy_Sicily_EBA.AG',
                  'Italy_Sicily_IA.AG', 'Italy_Broion_EBA.SG', 'Italy_Broion_BA.SG',
                  'Italy_ReginaMargherita_BA.SG', 'Italy_PianSultano_BA.SG',
                  'Italy_IA_Proto-Villanovan.SG', 'Italy_IA_Republic.SG',
                  'Italy_Bivio_Roman.SG', 'Italy_TarquiniaMonterozzi_IA.SG')

# Define the sets
sets = list(
    list(must_use = c('Italy_Sicily_Himera_409BCE.AG'))
)

# Initialize list to store all results
all_results_list = list()

# Parallelization setup
num_cores <- detectCores() - 1  # Reserve 1 core for system tasks
cl <- makeCluster(num_cores)

# Export necessary objects to the cluster
clusterEvalQ(cl, library(admixtools))
clusterExport(cl, list("prefix", "right", "target", "qpadm", "optional_pops", "sets"))

# Function to create combinations and run qpadm in parallel
run_qpadm_for_set_parallel <- function(set_index, sets, optional_pops, prefix, right, target, cl) {
    set_must_use <- sets[[set_index]]$must_use
   
    results_list <- parLapply(cl, optional_pops, function(pop) {
        left_set = c(set_must_use, pop)
       
        # Run qpadm model and handle errors gracefully
        tryCatch({
            results = qpadm(prefix, left_set, right, target, allsnps = TRUE)
            return(list(left = left_set, weights = results$weights, popdrop = results$popdrop))
        }, error = function(e) {
            return(list(left = left_set, error = e$message))
        })
    })
    return(results_list)
}

# Iterate over each set and run qpadm for all combinations in parallel
for (i in 1:length(sets)) {
    cat("Running Set", i, "\n")
    set_results <- run_qpadm_for_set_parallel(i, sets, optional_pops, prefix, right, target, cl)
   
    # Append the results of the current set to the overall results list
    all_results_list[[i]] <- set_results
}

# Stop the parallel cluster after processing
stopCluster(cl)

# View results
print(all_results_list)
 
What is the process? I mean where do I paste the code? I make a new file .txt in my Bin folder and paste there? Or how?
 
Could you make some sort of blog post on the averages for all the samples you've ran?
 
Back
Top