biyelunwen/99.scripts/miscs/phypartspiecharts.py

226 lines
7.9 KiB
Python
Executable File

#!/usr/bin/env python
from xvfbwrapper import Xvfb
import argparse
import re
import json
from ete3 import Tree, TreeStyle, NodeStyle, faces
# Description text for the command-line parser; it is passed to
# argparse.ArgumentParser(description=helptext, ...) further down in the script.
helptext= '''
Generate the "Pie Chart" representation of gene tree conflict from Smith et al. 2015 from
the output of phyparts, the bipartition summary software described in the same paper.
The input files include three files produced by PhyParts, and a file containing a species
tree in Newick format (likely, the tree used for PhyParts). The output is an SVG containing
the phylogeny along with pie charts at each node.
Requirements:
Python 3
ete3
'''
# Start an Xvfb virtual display at import time, before any tree is rendered.
# It is shut down again by the vdisplay.stop() call at the very end of the script.
# NOTE(review): presumably required so ete3 can render on machines without a
# real display server (headless use) -- confirm.
vdisplay = Xvfb()
vdisplay.start()
#Read in species tree and convert to ultrametric
#Match phyparts nodes to ete3 nodes
def get_phyparts_nodes(sptree_fn, phyparts_root):
    """Load the species tree and label its nodes with PhyParts node keys.

    The species tree is read with ete3 and converted to ultrametric.  Each
    non-blank line of ``<phyparts_root>.node.key`` is expected to hold a node
    key followed by a Newick subtree (without the terminating ``;``, which is
    appended here).  Every species-tree node whose ete3 topology id equals the
    topology id of one of those subtrees is renamed to that subtree's key.

    Args:
        sptree_fn: Newick species tree (anything ``ete3.Tree`` accepts).
        phyparts_root: File root used for the PhyParts run.

    Returns:
        A tuple ``(sptree, subtrees_dict, subtrees_topids)`` where
        ``subtrees_dict`` maps node key -> ``Tree`` of the subtree and
        ``subtrees_topids`` maps node key -> topology id of that subtree.
    """
    sptree = Tree(sptree_fn)
    sptree.convert_to_ultrametric()

    subtrees_dict = {}
    # Use a context manager so the key file is closed deterministically.
    with open(phyparts_root + ".node.key") as node_key_file:
        for line in node_key_file:
            fields = line.split()
            if len(fields) < 2:
                # Blank or truncated line: nothing to map.
                continue
            subtrees_dict[fields[0]] = Tree(fields[1] + ";")

    subtrees_topids = {
        key: subtree.get_topology_id() for key, subtree in subtrees_dict.items()
    }

    # Reverse lookup (topology id -> node key) replaces a scan over every
    # subtree for every node.  If several keys share a topology id the last
    # one wins, which matches the result of the exhaustive scan.
    key_by_topid = {topid: key for key, topid in subtrees_topids.items()}
    for node in sptree.traverse():
        matched_key = key_by_topid.get(node.get_topology_id())
        if matched_key is not None:
            node.name = matched_key
    return sptree, subtrees_dict, subtrees_topids
#Summarize concordance and conflict from Phyparts
def get_concord_and_conflict(phyparts_root, subtrees_dict, subtrees_topids):
    """Collect per-node concordance and conflict values from PhyParts.

    Reads the first two lines of ``<phyparts_root>.concon.tre`` as two trees.
    For every node of each tree, the node's ``support`` value is stored under
    each PhyParts node key whose topology id equals the node's topology id.

    Args:
        phyparts_root: File root used for the PhyParts run.
        subtrees_dict: Mapping of node key -> subtree; only its keys are used.
        subtrees_topids: Mapping of node key -> topology id.

    Returns:
        ``(concord_dict, conflict_dict)``: values taken from the first and
        second tree of the file respectively, both keyed by node key.
    """

    def support_by_key(tree):
        # Map each matching node key to the support value found on the tree.
        collected = {}
        for tree_node in tree.traverse():
            topid = tree_node.get_topology_id()
            for key in subtrees_dict:
                if subtrees_topids[key] == topid:
                    collected[key] = tree_node.support
        return collected

    with open(phyparts_root + ".concon.tre") as tree_file:
        concon_tree = Tree(tree_file.readline())
        conflict_tree = Tree(tree_file.readline())

    concord_dict = support_by_key(concon_tree)
    conflict_dict = support_by_key(conflict_tree)
    return concord_dict, conflict_dict
#Generate Pie Chart data
def get_pie_chart_data(phyparts_root, total_genes, concord_dict, conflict_dict):
    """Build the pie-chart percentages for every node in the PhyParts histogram.

    Each non-blank line of ``<phyparts_root>.hist`` is comma separated: a node
    label whose first four characters are dropped to obtain the node key, a
    concordant count, zero or more per-alternative conflict counts, and a
    final total.  The concordant count in the file is ignored; the value from
    ``concord_dict`` is used instead.

    Args:
        phyparts_root: File root used for the PhyParts run.
        total_genes: Total number of gene trees; the denominator of every
            percentage (must be non-zero).
        concord_dict: Node key -> number of concordant genes.
        conflict_dict: Node key -> number of conflicting genes (all conflicts).

    Returns:
        ``(phyparts_dict, phyparts_pies)`` where ``phyparts_dict[node]`` is
        ``[concord, total_on_line - concord]`` rounded to ints and
        ``phyparts_pies[node]`` is the list of percentages
        ``[concordant, top conflict, other conflict, remainder]``.

    Raises:
        KeyError: if a node in the histogram is missing from either dict.
        ValueError: if a count on a line is not a number.
    """
    phyparts_pies = {}
    phyparts_dict = {}
    # Context manager closes the histogram file even if a line fails to parse.
    with open(phyparts_root + ".hist") as hist_file:
        for line in hist_file:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            fields = stripped.split(",")
            tot_genes = float(fields[-1])
            node_name = fields[0][4:]
            # fields[1] (the histogram's own concordant count) is deliberately
            # unused: the concon-tree value in concord_dict takes precedence.
            concord = concord_dict[node_name]
            all_conflict = conflict_dict[node_name]
            conflict_counts = [float(count) for count in fields[2:-1]]
            most_conflict = max(conflict_counts, default=0.0)

            adj_concord = (concord / total_genes) * 100
            adj_most_conflict = (most_conflict / total_genes) * 100
            other_conflict = (all_conflict - most_conflict) / total_genes * 100
            the_rest = (total_genes - concord - all_conflict) / total_genes * 100

            phyparts_pies[node_name] = [
                adj_concord,
                adj_most_conflict,
                other_conflict,
                the_rest,
            ]
            phyparts_dict[node_name] = [
                int(round(concord, 0)),
                int(round(tot_genes - concord, 0)),
            ]
    return phyparts_dict, phyparts_pies
def node_text_layout(mynode):
    """Layout function that draws a node's name to the right of its branch.

    Args:
        mynode: The tree node being laid out; its ``name`` is used as the label.
    """
    name_label = faces.TextFace(mynode.name, fsize=20)
    faces.add_face_to_node(name_label, mynode, column=0, position="branch-right")
#convert internal phypartspiechart.py data files to csv and export to current directory (for use as ggtree tree data in R)
def pie_data_to_csv(phyparts_dict, phyparts_pies):
    """Write the node count and pie-chart tables as CSV files.

    Two files are created in the current working directory:

    * ``phyparts_dist.csv`` with columns ``node,concord,genes-concord``
    * ``phyparts_pies.csv`` with columns
      ``node,adj_concord,adj_most_conflict,other_conflict,the_rest``

    Rows are written directly from the dictionaries instead of rewriting a
    JSON dump with regular expressions, so negative values, exponent-notation
    floats, non-numeric node names and empty tables are all handled.  Lines
    are terminated with ``\\n`` (including the last one).

    Args:
        phyparts_dict: Node key -> ``[concord, genes - concord]``.
        phyparts_pies: Node key -> the four pie-chart percentages.
    """
    # Local import keeps this block self-contained (csv is standard library).
    import csv

    tables = (
        (
            "phyparts_dist.csv",
            ["node", "concord", "genes-concord"],
            phyparts_dict,
        ),
        (
            "phyparts_pies.csv",
            ["node", "adj_concord", "adj_most_conflict", "other_conflict", "the_rest"],
            phyparts_pies,
        ),
    )
    for csv_path, header, table in tables:
        # newline="" lets the csv writer control line endings itself.
        with open(csv_path, "w", newline="") as out_file:
            writer = csv.writer(out_file, lineterminator="\n")
            writer.writerow(header)
            for node_name, values in table.items():
                writer.writerow([node_name, *values])
# ---- Command-line interface -------------------------------------------------
parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('species_tree',help="Newick formatted species tree topology.")
parser.add_argument('phyparts_root',help="File root name used for Phyparts.")
parser.add_argument('num_genes',type=int,default=0,help="Number of total gene trees. Used to properly scale pie charts.")
parser.add_argument('--taxon_subst',help="Comma-delimted file to translate tip names.")
parser.add_argument("--svg_name",help="File name for SVG generated by script",default="pies.svg")
parser.add_argument("--show_nodes",help="Also show tree with nodes labeled same as PhyParts",action="store_true",default=False)
parser.add_argument("--colors",help="Four colors of the pie chart: concordance (blue) top conflict (green), other conflict (red), no signal (gray)",nargs="+",default=["blue","green","red","dark gray"])
parser.add_argument("--no_ladderize",help="Do not ladderize the input species tree.",action="store_true",default=False)
parser.add_argument("--to_csv",help="Output data files to csv for import into ggtree in R",action="store_true",default=False)
args = parser.parse_args()

# num_genes is the denominator of every pie-chart percentage, so reject
# values that would divide by zero or produce meaningless proportions.
if args.num_genes <= 0:
    parser.error("num_genes must be a positive integer")

ladderize = not args.no_ladderize

# ---- Load the PhyParts results ----------------------------------------------
plot_tree, subtrees_dict, subtrees_topids = get_phyparts_nodes(args.species_tree, args.phyparts_root)
concord_dict, conflict_dict = get_concord_and_conflict(args.phyparts_root, subtrees_dict, subtrees_topids)
phyparts_dist, phyparts_pies = get_pie_chart_data(args.phyparts_root, args.num_genes, concord_dict, conflict_dict)

# ---- Optional tip-name translation ------------------------------------------
if args.taxon_subst:
    taxon_subst = {}
    # Plain text mode already uses universal newlines on Python 3; the old
    # 'U' mode flag raises ValueError on Python 3.11 and later.
    with open(args.taxon_subst) as subst_file:
        for line in subst_file:
            fields = line.rstrip().split(",")
            if len(fields) < 2:
                # Blank line or line without a comma: no translation to record.
                continue
            taxon_subst[fields[0]] = fields[1]
    for leaf in plot_tree.get_leaves():
        try:
            leaf.name = taxon_subst[leaf.name]
        except KeyError:
            # Report tips that have no entry in the translation file and
            # leave their names unchanged.
            print(leaf.name)
def phyparts_pie_layout(mynode):
    """Layout function that decorates a node with its pie chart and counts.

    Nodes whose name is a key of the module-level ``phyparts_pies`` get a
    50x50 pie chart (colours from ``args.colors``) to the right of the branch,
    the concordance count above the branch and the conflict count below it.
    Every other node gets its name as an aligned text label.

    Args:
        mynode: The tree node being laid out.
    """
    node_name = mynode.name

    # Nodes without pie data only receive a text label.
    if node_name not in phyparts_pies:
        label = faces.TextFace(node_name, fsize=20)
        faces.add_face_to_node(label, mynode, 0, position="aligned")
        return

    chart = faces.PieChartFace(
        phyparts_pies[node_name],
        colors=args.colors,
        width=50,
        height=50,
    )
    chart.border.width = None
    chart.opacity = 1
    faces.add_face_to_node(chart, mynode, 0, position="branch-right")

    # Counts are shown as whole numbers followed by a spacer.
    above = faces.TextFace(f"{int(concord_dict[node_name])} ", fsize=20)
    below = faces.TextFace(f"{int(conflict_dict[node_name])} ", fsize=20)
    faces.add_face_to_node(above, mynode, 0, position="branch-top")
    faces.add_face_to_node(below, mynode, 0, position="branch-bottom")
#Plot Pie Chart
# Everything that renders runs inside try/finally so the virtual display
# started at the top of the script is always shut down, even when styling,
# CSV export or rendering raises.
try:
    # Tree style for the pie-chart figure: leaf names are drawn by the layout
    # function, not by ete3's default leaf labels.
    ts = TreeStyle()
    ts.show_leaf_name = False
    ts.layout_fn = phyparts_pie_layout

    # Hide the default node markers and zero the vertical line width on every node.
    nstyle = NodeStyle()
    nstyle["size"] = 0
    for n in plot_tree.traverse():
        n.set_style(nstyle)
        n.img_style["vt_line_width"] = 0

    ts.draw_guiding_lines = True
    ts.guiding_lines_color = "black"
    ts.guiding_lines_type = 0
    ts.scale = 30
    ts.branch_vertical_margin = 10
    plot_tree.convert_to_ultrametric()

    # Optional CSV export of the tables behind the figure.
    if args.to_csv:
        pie_data_to_csv(phyparts_dist, phyparts_pies)

    if ladderize:
        plot_tree.ladderize(direction=1)
    my_svg = plot_tree.render(args.svg_name, tree_style=ts, w=595, dpi=300)

    # Optional second figure showing the node labels on the tree.
    if args.show_nodes:
        node_style = TreeStyle()
        node_style.show_leaf_name = False
        node_style.layout_fn = node_text_layout
        plot_tree.render("tree_nodes.pdf", tree_style=node_style)
finally:
    vdisplay.stop()