###Population genetic analyses #2.4 Evaluating Large-scale Population Structure #2.4.3 Effective population size history ##easySFS #load module module load Tools/easySFS #main group 1a python $EASYSFS -i pruned_vcf_file.vcf -p adria.pops.txt --preview -a python $EASYSFS -i pruned_vcf_file.vcf \ -p adria.pops.txt --proj=10,16,44,30,30,30,36,30,36,30,14,36,50,20,24,36,10,66 -a -f #main group 1c python $EASYSFS -i pruned_vcf_file.vcf -p danube.pops.txt --preview -a python $EASYSFS -i pruned_vcf_file.vcf \ -p danube.pops.txt --proj=36,34,40,36,18,40,18,40,30 -a -f #main group 1b python $EASYSFS -i pruned_vcf_file.vcf -p kolpa.pops.txt --preview -a python $EASYSFS -i pruned_vcf_file.vcf \ -p kolpa.pops.txt --proj=46,38,44,42 -a -f #2.4.2 Principal component analysis (PCA) #Package SNPRelate in R #load libraries library(gdsfmt) library(SNPRelate) library(gridExtra) library(ggrepel) library(ggplot2) #set working directory setwd("/your/working/dir/PCA") # Input VCF file plumvcf <- ("/your/working/dir/phox_filtered_final1.recode.vcf") # Read tab-separated popmap ("sample\tgroup") matching names in VCF popfile <- read.csv("/your/working/dir/popmap.txt", header = FALSE, sep = "\t") # Add popmap header colnames(popfile) <- c("sample", "group") # Convert VCF to GDS snpgdsVCF2GDS(plumvcf,"plum.gds", method="biallelic.only") snpgdsSummary("plum.gds") # Read in genotypes genofile <- snpgdsOpen("plum.gds", allow.duplicate=TRUE) # LD pruning option (turned off by default) set.seed(1000) snpset <- snpgdsLDpruning(genofile,autosome.only=FALSE, slide.max.bp = 50) # PCA snpset.id <- unlist(snpset) pca <- snpgdsPCA(genofile,autosome.only=FALSE, snp.id=snpset.id,num.thread=2) pc.percent <- pca$varprop*100 round.pc.percent <- round(pc.percent, 2) # Get % explained and store in vectors below ev1pc <- round.pc.percent[1] ev2pc <- round.pc.percent[2] ev1pc <- paste("PC1 (", ev1pc, "%)", sep ="") ev2pc <- paste("PC2 (", ev2pc, "%)", sep ="") # Prepare PCA results tab <- data.frame(sample.id = pca$sample.id, EV1 = pca$eigenvect[,1], # the first eigenvector EV2 = pca$eigenvect[,2], # the second eigenvector stringsAsFactors = FALSE) tab$sample.id <- sub(".*__", "", tab$sample.id) tabpops <- merge(tab,popfile, by.x = "sample.id" , by.y = "sample" ) # Set plot theme theme <- theme(panel.background = element_blank(),panel.border=element_rect(fill=NA), panel.grid.major = element_blank(),panel.grid.minor = element_blank(), strip.background=element_blank(),axis.text.x=element_text(colour="black"), axis.text.y=element_text(colour="black"),axis.ticks=element_line(colour="black"), plot.margin=unit(c(1,1,1,1),"line")) #plot it p_nolab <- ggplot(tabpops, aes(x = EV1, y = EV2, fill = group)) + geom_point(size = 5, shape = 21, color = "black", stroke = 0.5) + theme + xlab(ev1pc) + ylab(ev2pc) )) # Save plots as svg svg("PCA",width = 10,height = 10) yy <- grid.arrange(p_nolab,nrow=1) op <- par(no.readonly=TRUE) par(op) dev.off() #2.5. Evaluating Fine-scale Population Structure #2.5.1 Evaluating genetic connectivity in karst and non-karst river systems #2.5.1.1 ADMIXTURE analysis ##PLINK2 (prepare input files for ADMIXTURE) # Load PLINK2 source /opt/anaconda3/etc/profile.d/conda.sh conda activate plink2 # Convert VCF to PLINK format plink2 --vcf input.vcf --double-id \ --set-missing-var-ids @:# --allow-extra-chr --make-pgen --sort-vars --out plum_pgen # Perform linkage pruning plink2 --pfile plum_pgen --double-id \ --set-missing-var-ids @:# --allow-extra-chr --indep-pairwise 100 10 0.7 --out plum_100_10_0.7 # Create BED file and extract unlinked SNPs plink2 --pfile plum_pgen --double-id \ --set-missing-var-ids @:# --allow-extra-chr --indep-pairwise 100 10 0.7 \ --extract plum_100_10_0.7.prune.in --remove exclude_ind.txt --make-bed --pca --out plum_unlinked # Adjust chromosome names for ADMIXTURE awk '{$1="0";print $0}' plum_unlinked.bim > plum_unlinked.bim.tmp mv plum_unlinked.bim.tmp plum_unlinked.bim ##ADMIXTURE #load ADMIXTURE module load Tools/admixture_linux-1.3.0 #run ADMIXTURE for k 1-25 for i in {1..25} do admixture --cv /admixture/plum_unlinked3.bed $i > plum_admix_log${i}.out done ##2.5.1.2 Contemporary Migration Rates #BA3-SNPs v3.0.4 #load the tool module load Tools/bayesass-3 module load Tools/BA3-SNPS-autotune #BASNPsautotune to assess parameters $BA3SNPSautotune -i BA3-adria.immanc -l 4047 -b 500000 -g 5000000 -r 50 $BA3SNPSautotune -i BA3-danube.immanc -l 4047 -b 500000 -g 5000000 -r 50 #run BA3-SNPs BA3-SNPS -F BA3-adria.immanc -l 4047 -m 0.1000 -a 0.5500 -f 0.1000 -b 5000000 -i 50000000 -t -u -out BA3_adria BA3-SNPS -F BA3-danube.immanc -l 4047 -m 0.1000 -a 0.3250 -f 0.0375 -b 5000000 -i 50000000 -out BA3_danube -t -u #plot the results in R ###migration plot adria # Load necessary libraries library(circlize) library(readr) # Load the migration matrix migration_matrix_adria <- read_delim( "migration_matrix_adria.csv", delim = ";", escape_double = FALSE, trim_ws = TRUE ) # Convert to matrix and set row names migration_matrix <- as.matrix(migration_matrix_adria[, -1]) # Exclude the first column rownames(migration_matrix) <- migration_matrix_adria[[1]] # Set the first column as row names # Reorder the matrix to match desired order new_order <- c( "OSPO", "RIZA", "BADA", "SOCA", "IDRI", "NADI", "BELS", "BRAN", "VIPA", "MRZL", "PRED", "NANO", "RAKU", "LOGA", "HOTE", "MALI", "RAKO", "CERK" ) ordered_migration_matrix <- migration_matrix[new_order, new_order] # Filter: Exclude small migration rates (< 0.1) filtered_migration_matrix <- ordered_migration_matrix filtered_migration_matrix[filtered_migration_matrix <= 0.08] <- 0 # Define sector colors sector_colors <- c( "OSPO" = "azure3", "RIZA" = "azure3", "BADA" = "azure3", "SOCA" = "linen", "IDRI" = "linen", "NADI" = "linen", "BELS" = "linen", "BRAN" = "aquamarine2", "VIPA" = "aquamarine2", "MRZL" = "lightblue", "PRED" = "green3", "NANO" = "green3", "RAKU" = "green3", "LOGA" = "green3", "HOTE" = "green3", "MALI" = "lightgoldenrod2", "RAKO" = "lightgoldenrod2", "CERK" = "lightgoldenrod2" ) # Ensure filtered_migration_matrix is a matrix filtered_migration_matrix <- as.matrix(filtered_migration_matrix) # Reorder the matrix to match the new order filtered_migration_matrix <- filtered_migration_matrix[new_order, new_order] # Define the file path for the PDF output_file <- "adria_migration.pdf" # Open a PDF device pdf(output_file, width = 10, height = 10) # Adjust width and height as needed # Generate the chord diagram chordDiagram( filtered_migration_matrix, transparency = 0.3, # Adjust transparency for clarity directional = 1, # Add directionality direction.type = c("arrows", "diffHeight"), # Arrows and height link.arr.type = "big.arrow", # Big arrows for direction order = new_order, # Apply the new order grid.col = sector_colors, # Assign sector colors annotationTrack = c("name", "grid"), # Add outer grid and population labels annotationTrackHeight = c(0.05, 0.1),# Adjust the height for labels self.link = 1, # Exclude self-migration rates link.largest = FALSE # Keep links proportional ) dev.off() # Clear after the plot circos.clear() ###BLACK SEA library(circlize) library(readr) # Load the migration matrix migration_matrix_bs <- read_delim( "migration_matrix.csv", delim = ";", escape_double = FALSE, trim_ws = TRUE ) migration_matrix_B <- as.matrix(migration_matrix_bs[, -1]) # Exclude the first column rownames(migration_matrix_B) <- migration_matrix_bs[[1]] # Set the first column as row names # Define the new order for Black Sea groups black_sea_order <- c( "MALI", "RAKO", "CERK", # Group 1 "BLOS", "RASC", "CERJ", # Group 2 "TOJN", "IZIC", # Group 3 "SORA", "KOKR", "BOHI", # Group 4 "KRKA", "CRMO", # Group 5 "RADU", # Group 6 "MOKR", "KOLP", "KUPI" # Group 7 ) # Define the colors for these groups black_sea_colors <- c( "MALI" = "lightgoldenrod2", "RAKO" = "lightgoldenrod2", "CERK" = "lightgoldenrod2", "BLOS" = "magenta4", "RASC" = "magenta4", "CERJ" = "magenta4", "TOJN" = "firebrick4", "IZIC" = "firebrick4", "SORA" = "gray45", "KOKR" = "gray45", "BOHI" = "gray45", "KRKA" = "lightpink2", "CRMO" = "lightpink2", "RADU" = "peachpuff1", "MOKR" = "chocolate2", "KOLP" = "chocolate2", "KUPI" = "chocolate2" ) # Filter the migration matrix: Set values <= 0.08 to 0 filtered_black_sea_matrix <- migration_matrix_B filtered_black_sea_matrix[filtered_black_sea_matrix <= 0.08] <- 0 # Reorder the migration matrix based on the Black Sea groups ordered_black_sea_matrix <- filtered_black_sea_matrix[black_sea_order, black_sea_order] # Save the output as a PDF pdf("black_sea_chord_diagram.pdf", width = 10, height = 10) # Generate the chord diagram chordDiagram( ordered_black_sea_matrix, transparency = 0.3, # Adjust transparency for clarity directional = 1, # Add directionality direction.type = c("arrows", "diffHeight"), # Arrows and height link.arr.type = "big.arrow", # Big arrows for direction order = black_sea_order, # Apply the Black Sea order grid.col = black_sea_colors, # Assign colors for sectors annotationTrack = c("name", "grid"), # Add outer grid and population labels annotationTrackHeight = c(0.05, 0.1),# Adjust label height self.link = 1, # Exclude self-migration rates link.largest = FALSE # Keep links proportional ) # Close the PDF device dev.off() ##2.5.1.3 Pairwise FST # Load the necessary libraries library(hierfstat) library(vcfR) library(adegenet) library(ComplexHeatmap) library(circlize) library(grid) library(ggplot2) library(ecodist) # Set working directory setwd("C:/R/revision") # Load the LD-pruned VCF file vcf_file <- read.vcfR("C:/R/revision/pruned_vcf_file.vcf") # Convert VCF data to a genind object genind_obj <- vcfR2genind(vcf_file) # Load the popmap file (individual ID, population ID) popmap <- read.csv("popmap_final_red.csv", header = FALSE, stringsAsFactors = FALSE, sep=",") # Check the structure of the popmap file head(popmap) # Manually remove the BOM from the first column of the first row (if present) popmap[1, 1] <- sub("", "", popmap[1, 1]) # Ensure it has two columns: first column is individual IDs, second column is population IDs colnames(popmap) <- c("Individual", "Population") # Convert filtered genind object to hierfstat format hf_data <- genind2hierfstat(genind_obj) # Calculate pairwise Fst between populations using Weir & Cockerham's method pairwise_fst <- pairwise.WCfst(hf_data) # Save the filtered pairwise Fst matrix as an rds file saveRDS(pairwise_fs, file = "pairwise_fst_matrix.rds") # Load the filtered pairwise Fst matrix from the rds file pairwise_fst <- readRDS("pairwise_fst_matrix.rds") #change order of populations new_order <- c( "OSPO", "RIZA", "BADA", "BELS", "SOCA", "IDRI", "NADI", "BRAN", "VIPA", "PRED", "NANO", "RAKU", "LOGA", "HOTE", "MRZL", "MALI", "RAKO", "CERK", "BLOS", "RASC", "CERJ", "TOJN", "IZIC", "SORA", "KOKR", "BOHI", "KRKA", "CRMO", "RADU", "MOKR", "KOLP", "KUPI" ) # Define karst populations karst_populations <- c( "BRAN", "VIPA", "PRED", "NANO", "RAKU", "LOGA", "HOTE", "MRZL", "MALI", "RAKO", "CERK", "BLOS", "RASC", "CERJ", "KRKA", "CRMO", "MOKR", "KOLP", "KUPI" ) # Define underground-only populations underground_populations <- c( "PRED", "NANO", "RAKU", "LOGA", "HOTE", "MRZL", "MALI", "RAKO", "CERK", "BLOS", "RASC", "CERJ" ) # Create a function to get colors for labels based on population type highlight_labels <- function(pop_name) { if (pop_name %in% underground_populations) { return("blue") # Blue for underground-only populations } else if (pop_name %in% karst_populations) { return("red") # Red for karst populations } else { return("black") # Default black for non-karst, non-underground populations } } # Apply the highlight_labels function to get the colors for row and column labels row_label_colors <- sapply(rownames(pairwise_fst_filtered), highlight_labels) column_label_colors <- sapply(colnames(pairwise_fst_filtered), highlight_labels) # Generate heatmap pdf("Fst_heatmap_filtered.pdf", width = 4, height = 4) # Create the heatmap with custom label styles Heatmap(pairwise_fst_filtered, name = "Fst", col = colorRamp2(c(0, 0.2, 0.5, 0.7), c("blue", "green", "yellow", "red")), heatmap_legend_param = list(title = "Fst"), show_row_names = TRUE, show_column_names = TRUE, column_title = "Pairwise Fst Between Populations", row_title = "Populations", # Apply custom ordering for rows and columns row_order = new_order, column_order = new_order, # Disable clustering cluster_rows = FALSE, cluster_columns = FALSE, # Highlight row and column labels using the colors derived from highlight_labels row_names_gp = gpar(fontsize = 8, col = row_label_colors), column_names_gp = gpar(fontsize = 8,col = column_label_colors) ) dev.off() #2.5.2 Assessing the impact of historical hydrological connections on population structure ##2.5.2.1 Co-ancestry Analysis with fineRADstructure # Load fineRADstructure source /opt/anaconda3/etc/profile.d/conda.sh conda activate fineradstructure # Generate input file RADpainter hapsFromVCF input.vcf > radpaint.tsv RADpainter paint radpaint.tsv # Run fineRADstructure finestructure -x 100000 -y 100000 -z 1000 radpaint_chunks.out radpaint_chunks.mcmc.xml finestructure -m T -x 10000 radpaint_chunks.out radpaint_chunks.mcmc.xml radpaint_chunks.mcmcTree.xml #the following R script was used for plotting: https://github.com/millanek/fineRADstructure/blob/master/FinestructureLibrary.R #2.5.2.2 Historical Introgression with Dsuite #set working directory cd /your/working/dir/dsuite ##vcftools module load Tools/vcftools_0.1.13 vcftools --vcf /your/working/dir/dsuite/dsuite_final.recode.vcf \ --keep /your/working/dir/dsuite/keep_red \ --recode --recode-INFO-all --out dsuite.red #load the module module load Tools/Dsuite # then start the run: Dsuite Dtrios -c -o dsuite -t treemix.final_constree.newick dsuite.red.recode.vcf pops.txt Dsuite Fbranch treemix.final_constree.newick dsuite_trio_tree.txt > dsuite_Fbranch.txt python3 dtools.py dsuite_Fbranch.txt treemix.final_constree.newick #2.5.2.3 Historical relationships using TreeMix ###Bootstrap Analysis cd /your/working/dir/treemix/ mkdir ./log mkdir ./out module load PopGen/treemix-1.13 for r in {1..1000} do for m in {1..6} do # Generate random seed s=$RANDOM echo "Random seed = ${s}" treemix \ -i /your/working/dir/treemix/phox.treemix.gz \ -o ./out/treemix.${r}.${m} \ -global \ -m ${m} \ -k 300 \ -bootstrap \ -seed ${s} > log/log_treemix.${r}.${m}.txt done done ### Combine Bootstrapped Trees for r in {1..1000} do for m in {1..6} do bootf="treemix_rev.${r}.${m}.treeout.gz" gunzip -c $bootf | head -1 >> "/your/working/dir/treemix/treemix_boottree.tre" done done ### Run Phylip on the bootstrapped trees to obtain a consensus tree cd /your/working/dir/treemix # Create parameters file echo "treemix_boottree.tre" > "treemix_PhylipInputFile" echo "Y" >> "treemix_PhylipInputFile" # Run PHYLIP for Consensus Tree: /opt/bioinformatics/phylip-3.697/exe/consense < "treemix_PhylipInputFile" > screanout ### The output from Phylip will be modified #sed ':a;N;$!ba;s/\n//g' outtree > $outname"_outtree.newick" cat outtree | tr -d "\n" > "treemix_outtree.newick" echo >> "treemix_outtree.newick" ###inspect the output with OptM in R: library(OptM) #set working directory setwd("/your/working/dir/treemix") # Load a folder of simulated test data folder <- file.path("/your/working/dir/treemix/out") # To view the various linear modeling estimates: test.optM = optM(folder, method = "Evanno", tsv="OptM_all.txt") #plot plot_optM(test.optM, method = "Evanno", plot = T, pdf = "OptM.pdf") ##### best m=2 ### Final TreeMix Analysis: #load packages module load PopGen/treemix-1.13 cd /your/working/dir/treemix mkdir bootstrap/final/ mkdir bootstrap/final/log for r in {1..1000} do treemix -i Treemix.txt.gz -m 2 -k 300 \ -bootstrap -global -se \ -tf "treemix_outtree.newick" \ -o bootstrap/final/treemix.m2 > bootstrap/final/log/treemix_m2_${r}.log done ## Visualize Results: #Use the R package `BITE v2` to plot migration edges, residual heatmaps, and genetic drift. library(BITEV2) #tree with migration edges svg("bootstrap.m2.svg", 10, 7) treemix.bootstrap(in.file = "treemix.m2", out.file = "treemix.plot_m2", phylip.file = "treemix_outtree.newick", nboot = 1000, cex=0.5, xmin = -0.005, disp = 0.001, boot.legend.location = "top", xbar = 0.05) dev.off() #genetic drift svg("drift_m2.svg", 10, 7) treemix.drift(in.file = "treemix.m2", pop.order.color = "pop.color") dev.off() #residual heatmaps svg("residuals_m2.svg", 10, 7) plot_resid(stem="treemix.m2", pop_order="pop.color", min = -0.009, max = 0.009, cex = 1, usemax = T, wcols="rb") dev.off()