diff --git a/.gitignore b/.gitignore index 93d9614..a150ed7 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ 99.scripts/phyparts/ .pueue.yml run.status +!*/_description.md \ No newline at end of file diff --git a/00.rawdata/_description.md b/00.rawdata/_description.md index 8329010..a346932 100644 --- a/00.rawdata/_description.md +++ b/00.rawdata/_description.md @@ -7,4 +7,4 @@ This directory contains raw data used for analysis. - `clean_fastq` This folder contains cleaned FASTQ files after quality control. - `raw_fastq` This folder contains the original raw FASTQ files. - `sra` This folder contains SRA files downloaded from the NCBI database. -- `sra.list` This file lists the SRA accession numbers for the datasets used in this project. +- `sra.list` This file lists the SRA downloading links for the datasets used in this project. diff --git a/99.scripts/workflow/phylogeny_reconstruction/03.snaq.jl b/99.scripts/workflow/phylogeny_reconstruction/03.snaq.jl index 9376967..72b1611 100644 --- a/99.scripts/workflow/phylogeny_reconstruction/03.snaq.jl +++ b/99.scripts/workflow/phylogeny_reconstruction/03.snaq.jl @@ -31,7 +31,7 @@ astraltree = readnewick(astralfile); ### Alternatively, reading in the input files from Bucky ### running in input_snaq/ folder -inputCFfile = joinpath("..", "..", "input", "input.CFs.csv"); +inputCFfile = joinpath("bucky_1.CFs.csv"); inputCF = readtableCF(inputCFfile); net0 = snaq!(astraltree, inputCF, hmax=0, filename="net0", seed=123, outgroup="Zju", runs=nruns); net1 = snaq!(net0, inputCF, hmax=1, filename="net1", seed=123, outgroup="Zju", runs=nruns); @@ -60,7 +60,7 @@ rootatnode!(net4, "Zju"); ### rotate!(net4, -2); ## Plotting the networks -R"pdf"("snaq_networks.pdf", width=14, height=10); +R"pdf"("snaq_networks_fulltree.pdf", width=14, height=10); R"layout(matrix(1:4, 2, 2, byrow=TRUE))"; # to get 4 plots into a single figure: 2 row, 2 columns R"par"(mar=[0, 0, 1.5, 0]); # for smaller margins xmin, xmax = PhyloPlots.PhyloPlots.edgenode_coordinates(net1, false, false)[13:14]; @@ -81,9 +81,52 @@ plot(net4, showgamma=true, tipoffset=0.1, xlim=[xmin, xmax]); R"mtext"(string("hmax=4, loglik=-", round(loglik(net4), digits=2)), font=2); R"dev.off"(); +R"pdf"("snaq_networks_majortree.pdf", width=14, height=10); +R"layout(matrix(1:4, 2, 2, byrow=TRUE))"; # to get 4 plots into a single figure: 2 row, 2 columns +R"par"(mar=[0, 0, 1.5, 0]); # for smaller margins +xmin, xmax = PhyloPlots.PhyloPlots.edgenode_coordinates(net1, false, false)[13:14]; +xmax += (xmax - xmin) * 0.3; +plot(net1, style=:majortree, showgamma=true, tipoffset=0.1, xlim=[xmin, xmax]); +R"mtext"(string("hmax=1, loglik=-", round(loglik(net1), digits=2)), font=2); +xmin, xmax = PhyloPlots.PhyloPlots.edgenode_coordinates(net2, false, false)[13:14]; +xmax += (xmax - xmin) * 0.3; +plot(net2, style=:majortree, showgamma=true, tipoffset=0.1, xlim=[xmin, xmax]); +R"mtext"(string("hmax=2, loglik=-", round(loglik(net2), digits=2)), font=2); +xmin, xmax = PhyloPlots.PhyloPlots.edgenode_coordinates(net3, false, false)[13:14]; +xmax += (xmax - xmin) * 0.3; +plot(net3, style=:majortree, showgamma=true, tipoffset=0.1, xlim=[xmin, xmax]); +R"mtext"(string("hmax=3, loglik=-", round(loglik(net3), digits=2)), font=2); +xmin, xmax = PhyloPlots.PhyloPlots.edgenode_coordinates(net4, false, false)[13:14]; +xmax += (xmax - xmin) * 0.3; +plot(net4, style=:majortree, showgamma=true, tipoffset=0.1, xlim=[xmin, xmax]); +R"mtext"(string("hmax=4, loglik=-", round(loglik(net4), digits=2)), font=2); +R"dev.off"(); + ## expected vs. observed quartet concordance factors -using CSV, DataFrames; +using CSV, DataFrames, Distributions, Random, RCall; +inputCFfile = joinpath("bucky_1.CFs.csv"); +inputCF = readtableCF(inputCFfile); +net3 = readsnaqnetwork("net3.out"); +topologymaxQpseudolik!(net3, inputCF); +df_long = fittedquartetCF(inputCF, :long); + +### Adding jitter to the points for better visualization +Random.seed!(123); +jitter = 0.005; +df_jittered = deepcopy(df_long); +df_jittered.obsCF .+= rand(Uniform(-jitter / 2, jitter / 2), nrow(df_long)); +df_jittered.expCF .+= rand(Uniform(-jitter / 2, jitter / 2), nrow(df_long)); + +R"pdf"("expCFs_obsvsfitted.pdf", width=6, height=6); +R"plot"(df_jittered.obsCF, df_jittered.expCF, xlab="quartet CF observed in gene trees", + ylab="quartet CF expected from network", pch=16, + col="#008080", + cex=0.7); +R"abline"(0, 1, col="#008080", lwd=0.5); +R"dev.off"(); # Goodness of fit of the SNaQ networks using QuartetNetworkGoodnessFit; +res1 = quarnetGoFtest!(net3, inputCF, true; seed=123, nsim=1000); +res1[[1,2,3]] # p-value, uncorrected z, σ