biyelunwen/99.scripts/workflow/phylogeny_reconstruction/03.snaq.jl

133 lines
5.9 KiB
Julia
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#! /usr/bin/env julia
## Installing Dependencies
## using Pkg
## Pkg.add("Distributed")
## Pkg.add("DataFrames")
## Pkg.add("CSV")
## Pkg.add("SNaQ")
## Pkg.add("PhyloNetworks")
## Pkg.add("RCall")
## Pkg.add("PhyloPlots")
## Pkg.add("QuartetNetworkGoodnessFit")
# Running SNaQ Analysis
using PhyloNetworks, SNaQ;
using Distributed;
addprocs(5);
@everywhere using PhyloNetworks, SNaQ;
nruns = 100; # number of runs for each hmax
astralfile = joinpath("..", "..", "species_tree", "aster.out");
astraltree = readnewick(astralfile);
### Reading RAxML gene trees and ASTRAL species tree
### running in raxml_snaq/ folder
# raxmltrees = joinpath("..", "..", "species_tree", "all.trees");
# inputCF = readtrees2CF(raxmltrees);
# net0 = snaq!(astraltree, inputCF, hmax=0, filename="net0", seed=123, outgroup="Zju", runs=nruns);
# net1 = snaq!(net0, inputCF, hmax=1, filename="net1", seed=123, outgroup="Zju", runs=nruns);
# net2 = snaq!(net1, inputCF, hmax=2, filename="net2", seed=123, outgroup="Zju", runs=nruns);
# net3 = snaq!(net2, inputCF, hmax=3, filename="net3", seed=123, outgroup="Zju", runs=nruns);
# net4 = snaq!(net3, inputCF, hmax=4, filename="net4", seed=123, outgroup="Zju", runs=nruns);
### Alternatively, reading in the input files from Bucky
### running in input_snaq/ folder
inputCFfile = joinpath("bucky_1.CFs.csv");
inputCF = readtableCF(inputCFfile);
net0 = snaq!(astraltree, inputCF, hmax=0, filename="net0", seed=123, outgroup="Zju", runs=nruns);
net1 = snaq!(net0, inputCF, hmax=1, filename="net1", seed=123, outgroup="Zju", runs=nruns);
net2 = snaq!(net1, inputCF, hmax=2, filename="net2", seed=123, outgroup="Zju", runs=nruns);
net3 = snaq!(net2, inputCF, hmax=3, filename="net3", seed=123, outgroup="Zju", runs=nruns);
net4 = snaq!(net3, inputCF, hmax=4, filename="net4", seed=123, outgroup="Zju", runs=nruns);
# Plotting the SNaQ results
using PhyloPlots, RCall;
## Network scores vs. hmax
scores = [loglik(net0), loglik(net1), loglik(net2), loglik(net3), loglik(net4)];
hmax = collect(0:4);
R"pdf"("snaq_network_scores.pdf", width=12, height=8);
R"plot"(hmax, scores, type="b", ylab="network score", xlab="hmax", col="blue");
R"dev.off"();
## Rerooting and rotating the networks for better visualization
rootatnode!(net1, "Zju");
rootatnode!(net2, "Zju");
rootatnode!(net3, "Zju");
rootatnode!(net4, "Zju");
### rotate!(net1, -2);
### rotate!(net2, -2);
### rotate!(net3, -2);
### rotate!(net4, -2);
## Plotting the networks
R"pdf"("snaq_networks_fulltree.pdf", width=14, height=10);
R"layout(matrix(1:4, 2, 2, byrow=TRUE))"; # to get 4 plots into a single figure: 2 row, 2 columns
R"par"(mar=[0, 0, 1.5, 0]); # for smaller margins
xmin, xmax = PhyloPlots.PhyloPlots.edgenode_coordinates(net1, false, false)[13:14];
xmax += (xmax - xmin) * 0.3;
plot(net1, showgamma=true, tipoffset=0.1, xlim=[xmin, xmax]);
R"mtext"(string("hmax=1, loglik=-", round(loglik(net1), digits=2)), font=2);
xmin, xmax = PhyloPlots.PhyloPlots.edgenode_coordinates(net2, false, false)[13:14];
xmax += (xmax - xmin) * 0.3;
plot(net2, showgamma=true, tipoffset=0.1, xlim=[xmin, xmax]);
R"mtext"(string("hmax=2, loglik=-", round(loglik(net2), digits=2)), font=2);
xmin, xmax = PhyloPlots.PhyloPlots.edgenode_coordinates(net3, false, false)[13:14];
xmax += (xmax - xmin) * 0.3;
plot(net3, showgamma=true, tipoffset=0.1, xlim=[xmin, xmax]);
R"mtext"(string("hmax=3, loglik=-", round(loglik(net3), digits=2)), font=2);
xmin, xmax = PhyloPlots.PhyloPlots.edgenode_coordinates(net4, false, false)[13:14];
xmax += (xmax - xmin) * 0.3;
plot(net4, showgamma=true, tipoffset=0.1, xlim=[xmin, xmax]);
R"mtext"(string("hmax=4, loglik=-", round(loglik(net4), digits=2)), font=2);
R"dev.off"();
R"pdf"("snaq_networks_majortree.pdf", width=14, height=10);
R"layout(matrix(1:4, 2, 2, byrow=TRUE))"; # to get 4 plots into a single figure: 2 row, 2 columns
R"par"(mar=[0, 0, 1.5, 0]); # for smaller margins
xmin, xmax = PhyloPlots.PhyloPlots.edgenode_coordinates(net1, false, false)[13:14];
xmax += (xmax - xmin) * 0.3;
plot(net1, style=:majortree, showgamma=true, tipoffset=0.1, xlim=[xmin, xmax]);
R"mtext"(string("hmax=1, loglik=-", round(loglik(net1), digits=2)), font=2);
xmin, xmax = PhyloPlots.PhyloPlots.edgenode_coordinates(net2, false, false)[13:14];
xmax += (xmax - xmin) * 0.3;
plot(net2, style=:majortree, showgamma=true, tipoffset=0.1, xlim=[xmin, xmax]);
R"mtext"(string("hmax=2, loglik=-", round(loglik(net2), digits=2)), font=2);
xmin, xmax = PhyloPlots.PhyloPlots.edgenode_coordinates(net3, false, false)[13:14];
xmax += (xmax - xmin) * 0.3;
plot(net3, style=:majortree, showgamma=true, tipoffset=0.1, xlim=[xmin, xmax]);
R"mtext"(string("hmax=3, loglik=-", round(loglik(net3), digits=2)), font=2);
xmin, xmax = PhyloPlots.PhyloPlots.edgenode_coordinates(net4, false, false)[13:14];
xmax += (xmax - xmin) * 0.3;
plot(net4, style=:majortree, showgamma=true, tipoffset=0.1, xlim=[xmin, xmax]);
R"mtext"(string("hmax=4, loglik=-", round(loglik(net4), digits=2)), font=2);
R"dev.off"();
## expected vs. observed quartet concordance factors
using CSV, DataFrames, Distributions, Random, RCall;
inputCFfile = joinpath("bucky_1.CFs.csv");
inputCF = readtableCF(inputCFfile);
net3 = readsnaqnetwork("net3.out");
topologymaxQpseudolik!(net3, inputCF);
df_long = fittedquartetCF(inputCF, :long);
### Adding jitter to the points for better visualization
Random.seed!(123);
jitter = 0.005;
df_jittered = deepcopy(df_long);
df_jittered.obsCF .+= rand(Uniform(-jitter / 2, jitter / 2), nrow(df_long));
df_jittered.expCF .+= rand(Uniform(-jitter / 2, jitter / 2), nrow(df_long));
R"pdf"("expCFs_obsvsfitted.pdf", width=6, height=6);
R"plot"(df_jittered.obsCF, df_jittered.expCF, xlab="quartet CF observed in gene trees",
ylab="quartet CF expected from network", pch=16,
col="#008080",
cex=0.7);
R"abline"(0, 1, col="#008080", lwd=0.5);
R"dev.off"();
# Goodness of fit of the SNaQ networks
using QuartetNetworkGoodnessFit;
res1 = quarnetGoFtest!(net3, inputCF, true; seed=123, nsim=1000);
res1[[1,2,3]] # p-value, uncorrected z, σ