Add MCMCtree workflow and SNaQ analysis scripts
- Updated `pixi.toml` to include new R package dependencies for MCMCtree workflow. - Modified markdown file to remove unnecessary simulation data discussion. - Introduced `.lintr` configuration for linting R scripts. - Created `mcmctree.ctl` control file for MCMCtree analysis. - Developed `mcmctree_workflow.r` script to orchestrate MCMCtree analysis with model testing and ESS checks. - Added `06.snaq.jl` script for SNaQ analysis with plotting capabilities. - Implemented `07.mcmctree.sh` bash script to run MCMCtree on multiple alignments in parallel.
This commit is contained in:
parent
316017a575
commit
7f4e888e6a
1
.envrc
1
.envrc
|
|
@ -5,3 +5,4 @@ export PUEUE_CONFIG_PATH="$PROJECTHOME/.pueue.yml"
|
|||
watch_file pixi.lock
|
||||
eval "$(pixi shell-hook)"
|
||||
PATH_add $SCRIPTS/bucky/bin
|
||||
PATH_add $SCRIPTS/bpp/bin
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@
|
|||
98.results/*
|
||||
99.scripts/bucky/
|
||||
99.scripts/phyparts/
|
||||
99.scripts/bpp/
|
||||
run.status.json
|
||||
!*/_description.md
|
||||
tmp/
|
||||
|
|
|
|||
|
|
@ -0,0 +1,5 @@
|
|||
linters: linters_with_defaults(
|
||||
line_length_linter(80),
|
||||
commented_code_linter = NULL,
|
||||
indentation_linter(4, hanging_indent_style = "never")
|
||||
)
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
seed = 12345
|
||||
seqfile = input.fa
|
||||
treefile = input.tre
|
||||
outfile = {{PREFIX_PLACEHOLDER}}.mcmctree.out
|
||||
|
||||
ndata = 1
|
||||
seqtype = 0 * 0: nucleotides; 1:codons; 2:AAs
|
||||
usedata = 1 * 0: no data; 1:seq like; 2:use in.BV; 3: out.BV
|
||||
clock = 3 * 1: global clock; 2: independent rates; 3: correlated rates
|
||||
RootAge = <100 * safe constraint on root age, used if no fossil for root.
|
||||
|
||||
model = {{MODEL_PLACEHOLDER}} * 0:JC69, 1:K80, 2:F81, 3:F84, 4:HKY85
|
||||
alpha = {{GAMMA_PLACEHOLDER}} * alpha for gamma rates at sites
|
||||
ncatG = 4 * No. categories in discrete gamma
|
||||
|
||||
cleandata = 0 * remove sites with ambiguity data (1:yes, 0:no)?
|
||||
|
||||
BDparas = 1 1 0.1 C * birth, death, sampling
|
||||
kappa_gamma = 6 2 * gamma prior for kappa
|
||||
alpha_gamma = 1 1 * gamma prior for alpha
|
||||
|
||||
rgene_gamma = 2 2 * gamma prior for overall rates for genes
|
||||
sigma2_gamma = 1 10 * gamma prior for sigma^2 (for clock=2 or 3)
|
||||
|
||||
finetune = 1: 0.1 0.1 0.1 0.01 .5 * auto (0 or 1) : times, musigma2, rates, mixing, paras, FossilErr
|
||||
|
||||
print = 2
|
||||
burnin = 200000
|
||||
sampfreq = 200
|
||||
nsample = 5000
|
||||
|
||||
*** Note: Make your window wider (100 columns) before running the program.
|
||||
|
|
@ -0,0 +1,276 @@
|
|||
#! /usr/bin/env Rscript
|
||||
suppressMessages({
|
||||
library(ape)
|
||||
library(phytools)
|
||||
library(coda)
|
||||
library(argparser)
|
||||
library(processx)
|
||||
library(cli)
|
||||
})
|
||||
|
||||
# Function to check if the specified outgroup is at the root of the tree
|
||||
check_tree_outgroup <- function(tree, outgroup) {
|
||||
if (!is.rooted(tree)) {
|
||||
return(list(
|
||||
judge = FALSE,
|
||||
msg = "The tree is unrooted."
|
||||
))
|
||||
}
|
||||
if (!outgroup %in% tree$tip.label) {
|
||||
return(list(
|
||||
judge = FALSE,
|
||||
msg = "Outgroup not found in tree tip labels."
|
||||
))
|
||||
}
|
||||
root_node <- getMRCA(tree, tree$tip.label)
|
||||
outgroup_parent <- getParent(tree, which(tree$tip.label == outgroup))
|
||||
if (root_node != outgroup_parent) {
|
||||
list(
|
||||
judge = FALSE,
|
||||
msg = "The specified outgroup is not at the root of the tree."
|
||||
)
|
||||
} else {
|
||||
list(
|
||||
judge = TRUE,
|
||||
msg = "The outgroup is correctly placed at the root."
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
# Function to check multiple outgroups sequentially
|
||||
check_outgroup_list <- function(tree, outgroup_list) {
|
||||
tree_iter <- tree
|
||||
iter <- NULL
|
||||
for (outgroup in outgroup_list) {
|
||||
res <- check_tree_outgroup(tree_iter, outgroup)
|
||||
if (!res$judge) {
|
||||
cli_alert_danger(
|
||||
paste(
|
||||
"Outgroup check failed for", outgroup,
|
||||
":", res$msg
|
||||
)
|
||||
)
|
||||
break
|
||||
}
|
||||
iter <- c(iter, outgroup)
|
||||
tree_iter <- drop.tip(tree_iter, outgroup)
|
||||
}
|
||||
cli_alert_success("All outgroups are correctly placed at the root.")
|
||||
res$judge
|
||||
}
|
||||
|
||||
# Function to generate input tree for mcmctree with calibration points
|
||||
gen_input_tre <- function(tree) {
|
||||
tree_bak <- tree
|
||||
tree_bak$node.label <- rep("", tree$Nnode)
|
||||
tree_bak$edge.length <- NULL
|
||||
ntips <- length(tree$tip.label)
|
||||
ep_node <- getParent(tree, which(tree$tip.label == "EP"))
|
||||
tree_bak$node.label[ep_node - ntips] <- "'L(55.8,0.02,0.02)'"
|
||||
zju_node <- getParent(tree, which(tree$tip.label == "Zju"))
|
||||
tree_bak$node.label[zju_node - ntips] <- "'L(99.6,0.02,0.02)'"
|
||||
cli_alert_info("Generated input tree for mcmctree with calibration points:")
|
||||
cat(paste(ntips, "1\n"), file = "input.tre")
|
||||
raw_check_label <- function(x) {
|
||||
# this is a workaround to bypass the checkLabel function in ape
|
||||
x
|
||||
}
|
||||
assignInNamespace("checkLabel", raw_check_label, ns = "ape")
|
||||
write.tree(tree_bak, file = "input.tre", append = TRUE)
|
||||
cli_text(write.tree(tree_bak))
|
||||
}
|
||||
|
||||
# Function to run modeltest-ng on the alignment file
|
||||
run_modeltest <- function(alnfile, prefix) {
|
||||
file.copy(alnfile, "input.fa", overwrite = TRUE)
|
||||
cmd <- "modeltest-ng"
|
||||
args <- c(
|
||||
"-p", 1,
|
||||
"-d", "nt",
|
||||
"-r", 12345,
|
||||
"-m", "JC,K80,F81,HKY",
|
||||
"-h", "ug",
|
||||
"-t", "ml",
|
||||
"-i", "input.fa",
|
||||
"-o", paste0(prefix, ".modeltest")
|
||||
)
|
||||
result <- run(cmd, args = args, echo_cmd = TRUE, echo = TRUE)
|
||||
if (result$status != 0) {
|
||||
cli_alert_danger("modeltest-ng execution failed.")
|
||||
stop("modeltest-ng execution failed.")
|
||||
} else {
|
||||
cli_alert_success("modeltest-ng run completed.")
|
||||
}
|
||||
}
|
||||
|
||||
# Function to parse modeltest-ng output and extract the model & gamma parameter
|
||||
parse_modeltest_output <- function(modeltest_file) {
|
||||
model_map <- c(
|
||||
"JC" = "JC69",
|
||||
"K80" = "K80",
|
||||
"F81" = "F81",
|
||||
"HKY" = "HKY85"
|
||||
)
|
||||
lines <- readLines(modeltest_file)
|
||||
head_line <- grep("Best model according to AICc", lines, value = FALSE)
|
||||
if (length(head_line) == 0) {
|
||||
cli_alert_danger("No model found in modeltest-ng output.")
|
||||
stop("No model found in modeltest-ng output.")
|
||||
}
|
||||
model_line <- lines[head_line + 2]
|
||||
model_out <- strsplit(model_line, ":")[[1]][2]
|
||||
model_out <- unlist(strsplit(trimws(model_out), "+", fixed = TRUE))
|
||||
cli_alert_info(paste("Selected model:", paste(model_out, collapse = "+")))
|
||||
model <- model_map[model_out[1]]
|
||||
if (length(model_out) > 1 && model_out[2] == "G4") {
|
||||
gamma <- 0.5
|
||||
} else {
|
||||
gamma <- 0
|
||||
}
|
||||
list(
|
||||
model = model,
|
||||
gamma = gamma
|
||||
)
|
||||
}
|
||||
|
||||
# Function to render the mcmctree control file from the template
|
||||
render_mcmctree_ctl <- function(template, outputfile, model, gamma, prefix) {
|
||||
ctl_lines <- readLines(template)
|
||||
ctl_lines <- gsub("{{MODEL_PLACEHOLDER}}", model, ctl_lines, fixed = TRUE)
|
||||
ctl_lines <- gsub("{{GAMMA_PLACEHOLDER}}",
|
||||
as.character(gamma),
|
||||
ctl_lines,
|
||||
fixed = TRUE
|
||||
)
|
||||
ctl_lines <- gsub("{{PREFIX_PLACEHOLDER}}",
|
||||
prefix,
|
||||
ctl_lines,
|
||||
fixed = TRUE
|
||||
)
|
||||
writeLines(ctl_lines, con = outputfile)
|
||||
}
|
||||
|
||||
# Function to run mcmctree with the specified control file
|
||||
run_mcmctree <- function(ctlfile) {
|
||||
cmd <- "mcmctree"
|
||||
args <- ctlfile
|
||||
result <- run(cmd, args = args, echo_cmd = TRUE, echo = TRUE)
|
||||
if (result$status != 0) {
|
||||
cli_alert_danger("mcmctree execution failed.")
|
||||
stop("mcmctree execution failed.")
|
||||
} else {
|
||||
cli_alert_success("mcmctree run completed.")
|
||||
}
|
||||
}
|
||||
|
||||
# Function to check ESS values from mcmctree MCMC output
|
||||
check_ess <- function(mcmcfile, check_file, threshold = 200) {
|
||||
mcmc_samples <- read.table(mcmcfile, header = TRUE)[, -1]
|
||||
mcmc_results <- mcmc(mcmc_samples)
|
||||
ess_values <- effectiveSize(mcmc_results)
|
||||
ess_table <- data.frame(
|
||||
Parameter = names(ess_values),
|
||||
ESS = as.numeric(ess_values)
|
||||
)
|
||||
write.table(ess_table,
|
||||
file = paste0(check_file, ".ess.tsv"),
|
||||
sep = "\t",
|
||||
row.names = FALSE,
|
||||
quote = FALSE
|
||||
)
|
||||
if (all(ess_values >= threshold)) {
|
||||
cli_alert_success(paste("All parameters have ESS >=", threshold))
|
||||
file.create(paste0(check_file, ".ess_check.passed"))
|
||||
} else {
|
||||
failed_params <- names(ess_values[ess_values < threshold])
|
||||
names(failed_params) <- rep("x", length(failed_params))
|
||||
cli_alert_warning(paste("Parameters with ESS <", threshold))
|
||||
cli_bullets(failed_params)
|
||||
file.create(paste0(check_file, ".ess_check.failed"))
|
||||
}
|
||||
}
|
||||
|
||||
# Main function to orchestrate the workflow
|
||||
main <- function(treefile, alnfile, template, prefix) {
|
||||
tree <- read.tree(treefile)
|
||||
cli_h1("1. Checking Outgroups in the Tree")
|
||||
outgroup_list <- c("Zju", "EP")
|
||||
if (!check_outgroup_list(tree, outgroup_list)) {
|
||||
return(NULL)
|
||||
}
|
||||
cli_h1("2. Running ModelTest-NG")
|
||||
run_modeltest(alnfile, prefix)
|
||||
modeltest_file <- paste0(prefix, ".modeltest.out")
|
||||
model_info <- parse_modeltest_output(modeltest_file)
|
||||
cli_h1("3. Preparing and Running MCMCTree")
|
||||
gen_input_tre(tree)
|
||||
render_mcmctree_ctl(
|
||||
template,
|
||||
paste0(prefix, ".mcmctree.ctl"),
|
||||
model_info$model,
|
||||
model_info$gamma,
|
||||
prefix
|
||||
)
|
||||
cat("\n\n")
|
||||
run_mcmctree(paste0(prefix, ".mcmctree.ctl"))
|
||||
cli_h1("4. Checking ESS Values")
|
||||
check_ess("mcmc.txt", prefix)
|
||||
cli_h1("Workflow Completed Successfully")
|
||||
}
|
||||
|
||||
p <- arg_parser("MCMCTree Workflow Script", hide.opts = TRUE)
|
||||
p <- add_argument(p, "--treefile",
|
||||
short = "-t",
|
||||
help = "Input tree file in Newick format",
|
||||
)
|
||||
p <- add_argument(p, "--alnfile",
|
||||
short = "-a",
|
||||
help = "Input alignment file in FASTA format"
|
||||
)
|
||||
p <- add_argument(p, "--template",
|
||||
short = "-c",
|
||||
help = "Template control file for mcmctree"
|
||||
)
|
||||
p <- add_argument(p, "--prefix",
|
||||
short = "-p",
|
||||
help = "Prefix for output files"
|
||||
)
|
||||
|
||||
args <- parse_args(p)
|
||||
|
||||
check_param <- c(
|
||||
!is.na(args$treefile),
|
||||
!is.na(args$alnfile),
|
||||
!is.na(args$template),
|
||||
!is.na(args$prefix)
|
||||
)
|
||||
|
||||
if (all(check_param)) {
|
||||
cli_h1("Workflow Parameters:")
|
||||
cli_dl(args[c("treefile", "alnfile", "template", "prefix")])
|
||||
main(args$treefile, args$alnfile, args$template, args$prefix)
|
||||
} else {
|
||||
warnlist <- c(
|
||||
"--treefile, -t",
|
||||
"--alnfile, -a",
|
||||
"--template, -c",
|
||||
"--prefix, -p"
|
||||
)
|
||||
names(warnlist) <- ifelse(check_param, "v", "x")
|
||||
theme <- list(
|
||||
".bullets .bullet-x" = list(
|
||||
"text-exdent" = 4,
|
||||
before = function(x) paste0(" ", col_red(symbol$cross), " ")
|
||||
),
|
||||
".bullets .bullet-v" = list(
|
||||
"text-exdent" = 4,
|
||||
before = function(x) paste0(" ", col_green(symbol$tick), " ")
|
||||
)
|
||||
)
|
||||
cli_div(theme = theme)
|
||||
cli_h1("Missing required arguments:")
|
||||
cli_bullets(warnlist)
|
||||
cli_h1("")
|
||||
cli_end()
|
||||
print(p)
|
||||
}
|
||||
|
|
@ -1,18 +0,0 @@
|
|||
#! /usr/bin/env bash
|
||||
|
||||
if [ "$#" -ne 3 ]; then
|
||||
echo "Usage: $0 <input_fasta_dir> <extension> <output_nexus_dir>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
input_dir=$1
|
||||
extension=$2
|
||||
output_dir=$3
|
||||
mkdir -p "${output_dir}"
|
||||
for f in "${input_dir}"/*."${extension}"; do
|
||||
filename=$(basename -- "${f}")
|
||||
filename_noext="${filename%.*}"
|
||||
output_file="${output_dir}/${filename_noext}.nex"
|
||||
echo "Converting ${f} to ${output_file}"
|
||||
seqmagick convert --output-format nexus --alphabet dna --input-format fasta "${f}" "${output_file}"
|
||||
done
|
||||
|
|
@ -104,10 +104,10 @@ R"dev.off"();
|
|||
|
||||
## expected vs. observed quartet concordance factors
|
||||
using CSV, DataFrames, Distributions, Random, RCall;
|
||||
inputCFfile = joinpath("bucky_1.CFs.csv");
|
||||
inputCFfile = joinpath("..","bucky","bucky_1","input.mb.CFs.csv");
|
||||
inputCF = readtableCF(inputCFfile);
|
||||
net3 = readsnaqnetwork("net3.out");
|
||||
topologymaxQpseudolik!(net3, inputCF);
|
||||
net4 = readsnaqnetwork("net4.out");
|
||||
topologymaxQpseudolik!(net4, inputCF);
|
||||
df_long = fittedquartetCF(inputCF, :long);
|
||||
|
||||
### Adding jitter to the points for better visualization
|
||||
|
|
@ -127,6 +127,6 @@ R"dev.off"();
|
|||
|
||||
# Goodness of fit of the SNaQ networks
|
||||
using QuartetNetworkGoodnessFit;
|
||||
res1 = quarnetGoFtest!(net3, inputCF, true; seed=123, nsim=1000);
|
||||
res1 = quarnetGoFtest!(net4, inputCF, true; seed=123, nsim=1000);
|
||||
res1[[1,2,3]] # p-value, uncorrected z, σ
|
||||
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
#! /bin/bash
|
||||
set -e
|
||||
SCRIPTS=${SCRIPTS:-"$PROJECTHOME/99.scripts"}
|
||||
THREADS=${THREADS:-16}
|
||||
|
||||
if [ "$#" -ne 4 ]; then
|
||||
echo "Usage: $0 <aln_dir> <ml_dir> <out_dir> <alignment_ext>"
|
||||
echo "Run MCMCtree on orthogroup alignments with corresponding ML trees"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
aln_dir=$1
|
||||
ml_dir=$2
|
||||
out_dir=$3
|
||||
ext=$4
|
||||
mkdir -p "$out_dir"
|
||||
echo -n >mcmctree.cmds
|
||||
ctlfile=$(readlink -f "$SCRIPTS/miscs/mcmctree.ctl")
|
||||
for i in "$aln_dir"/*."$ext"; do
|
||||
j=$(basename "$i" ."$ext")
|
||||
aln=$(readlink -f "$i")
|
||||
ml_tree=$(readlink -f "$ml_dir/${j}/${j}.raxml.bestTree")
|
||||
mkdir -p "${out_dir}/${j}"
|
||||
cmd="cd ${out_dir}/${j} && Rscript $SCRIPTS/miscs/mcmctree_workflow.r"
|
||||
params="-a $aln -t $ml_tree -c ${ctlfile} -p ${j}"
|
||||
echo "$cmd $params > mcmctree.log 2>&1" >>mcmctree.cmds
|
||||
done
|
||||
xargs -t -P "$THREADS" -I cmd -a mcmctree.cmds bash -c "cmd"
|
||||
echo "MCMCtree all completed."
|
||||
|
|
@ -52,6 +52,10 @@ paml = ">=4.10.9,<5"
|
|||
beagle-lib = "==3.1.2"
|
||||
mrbayes = ">=3.2.7,<4"
|
||||
cairosvg = ">=2.8.2,<3"
|
||||
r-devtools = ">=2.4.6,<3"
|
||||
r-phytools = ">=2.5_2,<3"
|
||||
r-argparser = ">=0.7.2,<0.8"
|
||||
r-processx = ">=3.8.6,<4"
|
||||
|
||||
[feature.beast.dependencies]
|
||||
beagle-lib = ">=4.0.1,<5"
|
||||
|
|
|
|||
|
|
@ -59,7 +59,6 @@ title: 方法验证不足
|
|||
- 讨论方法的**适用范围与局限性**(如是否适用于大规模物种?能否处理多次introgression?)。
|
||||
- **解决方案:**
|
||||
- ==bpp模拟数据实验==
|
||||
- ==添加其他真实数据集模拟(蛇)==
|
||||
|
||||
---
|
||||
layout: top-title
|
||||
|
|
|
|||
Loading…
Reference in New Issue