From dba48339058ec49c986ab699bce403c6539e00e4 Mon Sep 17 00:00:00 2001 From: IvisTang Date: Sat, 20 Dec 2025 01:57:25 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20.gitignore=EF=BC=8C?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0=2006.gene=5Ftrees=20=E7=9B=AE=E5=BD=95?= =?UTF-8?q?=EF=BC=9B=E9=87=8D=E6=9E=84=20macse.sh=20=E8=84=9A=E6=9C=AC?= =?UTF-8?q?=EF=BC=8C=E7=A7=BB=E9=99=A4=20fs=5Flr=20=E5=8F=82=E6=95=B0?= =?UTF-8?q?=EF=BC=9B=E5=88=A0=E9=99=A4=E5=A4=9A=E4=B8=AA=E4=B8=8D=E5=86=8D?= =?UTF-8?q?=E4=BD=BF=E7=94=A8=E7=9A=84=E8=84=9A=E6=9C=AC=EF=BC=9B=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=20check=5Fframeshift.py=20=E5=92=8C=20get=5Fog=5Fseqs?= =?UTF-8?q?.py=20=E8=84=9A=E6=9C=AC=E4=BB=A5=E5=A4=84=E7=90=86=E9=98=85?= =?UTF-8?q?=E8=AF=BB=E6=A1=86=E7=A7=BB=E4=BD=8D=E5=92=8C=E6=8F=90=E5=8F=96?= =?UTF-8?q?=E5=8D=95=E6=8B=B7=E8=B4=9DOG=E5=BA=8F=E5=88=97=EF=BC=9B?= =?UTF-8?q?=E6=9B=B4=E6=96=B0=20pixi.toml=20=E5=92=8C=20pixi.lock=20?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E4=BB=A5=E6=B7=BB=E5=8A=A0=20paml=20?= =?UTF-8?q?=E4=BE=9D=E8=B5=96=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + 99.scripts/miscs/check_frameshift.py | 128 +++++++++++++++ 99.scripts/miscs/get_og_seqs.py | 148 ++++++++++++++++++ 99.scripts/miscs/get_primary_cds.py | 74 +++++++++ 99.scripts/miscs/macse.sh | 5 +- .../orthology_inference/01.hmmer_sc.sh | 51 ------ .../orthology_inference/01.macse_align.sh | 24 +++ .../orthology_inference/02.macse_align.sh | 34 ---- .../orthology_inference/02.trim_alignment.sh | 20 +++ .../orthology_inference/03.treeshrink.sh | 26 +++ .../orthology_inference/06.trim_alignment.sh | 8 - .../orthology_inference/07.fasttree.sh | 8 - .../orthology_inference/08.treeshrink.sh | 11 -- pixi.lock | 10 ++ pixi.toml | 2 +- 15 files changed, 433 insertions(+), 117 deletions(-) create mode 100644 99.scripts/miscs/check_frameshift.py create mode 100644 99.scripts/miscs/get_og_seqs.py create mode 100755 99.scripts/miscs/get_primary_cds.py delete mode 100755 99.scripts/workflow/orthology_inference/01.hmmer_sc.sh create mode 100755 99.scripts/workflow/orthology_inference/01.macse_align.sh delete mode 100755 99.scripts/workflow/orthology_inference/02.macse_align.sh create mode 100755 99.scripts/workflow/orthology_inference/02.trim_alignment.sh create mode 100755 99.scripts/workflow/orthology_inference/03.treeshrink.sh delete mode 100755 99.scripts/workflow/orthology_inference/06.trim_alignment.sh delete mode 100755 99.scripts/workflow/orthology_inference/07.fasttree.sh delete mode 100755 99.scripts/workflow/orthology_inference/08.treeshrink.sh diff --git a/.gitignore b/.gitignore index 4c68590..b4a2e2e 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ 05.reduce_redundancy/* 05.orthology_inference/* 06.phylogeny_reconstruction/* +06.gene_trees/* 10.plastid/* 98.results/* 99.scripts/bucky/ diff --git a/99.scripts/miscs/check_frameshift.py b/99.scripts/miscs/check_frameshift.py new file mode 100644 index 0000000..f72a666 --- /dev/null +++ b/99.scripts/miscs/check_frameshift.py @@ -0,0 +1,128 @@ +#! /usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Check and process frameshift in alignment files. +""" + +import shutil +import pandas as pd +from pathlib import Path +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.Seq import Seq +import sys +import argparse + + +def parse_frameshift_list(fs_list: Path) -> pd.DataFrame: + """ + Parse a frameshift list file into a pandas DataFrame. + + Args: + fs_list (Path): Path to the frameshift list file. + + Returns: + pd.DataFrame: DataFrame containing the frameshift information. + """ + try: + df = pd.read_csv(fs_list, sep=",", header=0) + except Exception as e: + print(f"Error reading frameshift list file {fs_list}: {e}") + sys.exit(1) + return df + + +def get_alignment_files(aln_dir: Path, ext: str) -> list[Path]: + """ + Get a list of alignment files in a directory with a specific extension. + + Args: + aln_dir (Path): Path to the directory containing alignment files. + ext (str): Extension of the alignment files. + + Returns: + list[Path]: List of Paths to the alignment files. + """ + try: + aln_files = list(aln_dir.glob(f"*{ext}")) + except Exception as e: + print(f"Error accessing alignment files in {aln_dir}: {e}") + sys.exit(1) + return aln_files + + +def main(alignment_dir: str, ext: str, frameshift_list: str, outdir: str): + aln_dir = Path(alignment_dir) + fs_list_path = Path(frameshift_list) + out_dir = Path(outdir) + out_dir.mkdir(parents=True, exist_ok=True) + + fs_df = parse_frameshift_list(fs_list_path) + aln_files = get_alignment_files(aln_dir, ext) + good_aln_count = 0 + keep_fs_count = 0 + discard_count = 0 + + for file in aln_files: + if str(file) not in fs_df["alignment_file"].values.tolist(): + try: + shutil.copy(file, out_dir / file.name) + good_aln_count += 1 + continue + except Exception as e: + print(f"Error copying file {file} to {out_dir}: {e}") + sys.exit(1) + if ( + fs_df.loc[ + fs_df["alignment_file"] == str(file), "possible_attributions" + ].values[0] + == "Framshift only in Ziziphus jujuba or Elaeagnus pungens" + ): + try: + records = [] + for record in SeqIO.parse(file, "fasta"): + seq = str(record.seq).replace("!", "-") + id = record.id + records.append(SeqRecord(seq=Seq(seq), id=id, description="")) + print( + f"Keep the alignment file with frameshift: {file}. Due to frameshift only in outgroup." + ) + keep_fs_count += 1 + SeqIO.write(records, out_dir / file.name, "fasta") + except Exception as e: + print(f"Error processing file {file}: {e}") + sys.exit(1) + else: + discard_count += 1 + print("Frameshift processing completed successfully.") + print(f"Number of good alignments copied: {good_aln_count}") + print(f"Number of alignments with frameshift kept: {keep_fs_count}") + print(f"Number of alignments with frameshift discarded: {discard_count}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Check and process frameshift in alignment files." + ) + parser.add_argument( + "-a", + "--alignment_dir", + required=True, + help="Directory containing alignment files", + ) + parser.add_argument( + "-e", "--ext", default=".nal", help="Extension of alignment files" + ) + parser.add_argument( + "-f", + "--frameshift_list", + required=True, + help="CSV file listing frameshift information", + ) + parser.add_argument( + "-o", "--outdir", required=True, help="Output directory for processed files" + ) + args = parser.parse_args() + + main(args.alignment_dir, args.ext, args.frameshift_list, args.outdir) diff --git a/99.scripts/miscs/get_og_seqs.py b/99.scripts/miscs/get_og_seqs.py new file mode 100644 index 0000000..49c7a68 --- /dev/null +++ b/99.scripts/miscs/get_og_seqs.py @@ -0,0 +1,148 @@ +#! /usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Extract Single Copy OG sequences from a comprehensive FASTA file based on OG definitions from results of orthofinder. +""" + +import sys +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.Seq import Seq +from pathlib import Path +import argparse + + +def get_og_fastas(og_dir: Path, ext: str) -> dict[str, list[str]]: + """ + Get a dictionary of OGs and their corresponding sequence IDs from FASTA files in a directory. + + Args: + og_dir (Path): Path to the directory containing OG FASTA files. + ext (str): Extension of the FASTA files. Default is ".fa". + + Returns: + dict: Dictionary with OG names as keys and lists of sequence IDs as values. + """ + og_dict: dict[str, list[str]] = {} + try: + for fasta_file in og_dir.glob(f"*{ext}"): + og_name = fasta_file.stem + seq_ids: list[str] = [] + for record in SeqIO.parse(fasta_file, "fasta"): + seq_ids.append(record.id) + og_dict[og_name] = seq_ids + except Exception as e: + print(f"Error processing OG FASTA files in {og_dir}: {e}") + sys.exit(1) + return og_dict + + +def parse_all_fasta(fasta_file: Path) -> dict[str, Seq]: + """ + Parse a FASTA file and return a dictionary of sequence IDs and their corresponding Seq objects. + + Args: + fasta_file (Path): Path to the FASTA file. + Returns: + dict: Dictionary with sequence IDs as keys and Seq objects as values. + """ + seq_dict: dict[str, Seq] = {} + try: + for record in SeqIO.parse(fasta_file, "fasta"): + seq_dict[record.id] = record.seq + except Exception as e: + print(f"Error parsing FASTA file {fasta_file}: {e}") + sys.exit(1) + return seq_dict + + +def output_og_seqs( + all_seq_dict: dict[str, Seq], og_dict: dict[str, list[str]], output_dir: Path +): + """ + Output sequences for each OG into separate FASTA files, remove gene id and only keep taxon name. + + Args: + all_seq_dict (dict): Dictionary of all sequences with sequence IDs as keys. + og_dict (dict): Dictionary of OGs with OG names as keys and lists of sequence IDs as values. + output_dir (Path): Path to the output directory. + """ + for og_name, seq_ids in og_dict.items(): + og_seqs: list[SeqRecord] = [] + for seq_id in seq_ids: + if seq_id in all_seq_dict: + seq_record = SeqRecord( + all_seq_dict[seq_id], id=seq_id.split("@")[0], description="" + ) + og_seqs.append(seq_record) + else: + print(f"Warning: Sequence ID {seq_id} not found in all sequences.") + output_fasta = output_dir / f"{og_name}.fa" + try: + SeqIO.write(og_seqs, output_fasta, "fasta") + except Exception as e: + print(f"Error writing to FASTA file {output_fasta}: {e}") + sys.exit(1) + + +def write_og_list(og_dict: dict[str, list[str]]): + """ + Write the OG list to a text file. + + Args: + og_dict (dict): Dictionary of OGs with OG names as keys and lists of sequence IDs as values. + """ + try: + with open("og_list.tsv", "w") as f: + for og_name, seq_ids in og_dict.items(): + line = f"{og_name}\t" + "\t".join(seq_ids) + "\n" + f.write(line) + print("OG list written to og_list.tsv") + except Exception as e: + print(f"Error writing OG list to file: {e}") + sys.exit(1) + + +def main(og_dir_path: str, all_fasta_path: str, output_dir_path: str, ext: str = ".fa"): + og_dir = Path(og_dir_path) + all_fasta = Path(all_fasta_path) + output_dir = Path(output_dir_path) + output_dir.mkdir(parents=True, exist_ok=True) + + og_dict = get_og_fastas(og_dir, ext) + all_seqs = parse_all_fasta(all_fasta) + output_og_seqs(all_seqs, og_dict, output_dir) + write_og_list(og_dict) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Extract OG sequences from a comprehensive FASTA file based on OG definitions." + ) + parser.add_argument( + "-d", + "--og_dir", + required=True, + help="Directory containing OG FASTA files.", + ) + parser.add_argument( + "-a", + "--all_fasta", + required=True, + help="FASTA file containing all sequences.", + ) + parser.add_argument( + "-o", + "--output_dir", + required=True, + help="Output directory for OG FASTA files.", + ) + parser.add_argument( + "-e", + "--ext", + default=".fa", + help="Extension of OG FASTA files (default: .fa).", + ) + args = parser.parse_args() + main(args.og_dir, args.all_fasta, args.output_dir, args.ext) diff --git a/99.scripts/miscs/get_primary_cds.py b/99.scripts/miscs/get_primary_cds.py new file mode 100755 index 0000000..cbec199 --- /dev/null +++ b/99.scripts/miscs/get_primary_cds.py @@ -0,0 +1,74 @@ +#! /usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Get primary CDS sequences from a FASTA file containing multiple CDS per gene. +""" + +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +import re +import argparse +import sys + + +def get_primary_cds(input_fasta, output_fasta): + primary_cds_records = [] + gene = "" + length = 0 + seq = None + id = None + try: + for record in SeqIO.parse(input_fasta, "fasta"): + seq_len = len(record.seq) + desc = record.description + match = re.search(r"\[gene=(\S+)\]", desc) + if match: + gene_name = match.group(1) + else: + # Skip if gene name not found + continue + + if gene_name != gene: + # new gene encountered + # print(f"Processing gene: {gene_name}") + if length > 0: + # this is not the first record, save the previous longest record + primary_cds_record = SeqRecord( + seq, id=id, description=f"[gene={gene}]" + ) + primary_cds_records.append(primary_cds_record) + gene = gene_name + seq = record.seq + id = record.id + length = seq_len + else: + # same gene, check length + if seq_len > length: + seq = record.seq + id = record.id + length = seq_len + # after loop, save the last gene + if gene and length > 0: + primary_cds_record = SeqRecord(seq, id=id, description=f"[gene={gene}]") + primary_cds_records.append(primary_cds_record) + SeqIO.write(primary_cds_records, output_fasta, "fasta") + print(f"Primary CDS sequences written to {args.output_fasta}") + except Exception as e: + print(f"Error processing FASTA file {input_fasta}: {e}") + sys.exit(1) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Extract primary CDS sequences from a FASTA file." + ) + parser.add_argument( + "-i", "--input_fasta", help="Input FASTA file containing CDS sequences." + ) + parser.add_argument( + "-o", "--output_fasta", help="Output FASTA file to write primary CDS sequences." + ) + args = parser.parse_args() + + get_primary_cds(args.input_fasta, args.output_fasta) diff --git a/99.scripts/miscs/macse.sh b/99.scripts/miscs/macse.sh index e3b4e9d..4bff049 100644 --- a/99.scripts/miscs/macse.sh +++ b/99.scripts/miscs/macse.sh @@ -1,7 +1,5 @@ #! /bin/bash -FS_LR=7 - if [ "$#" -ne 3 ]; then echo "Usage: $0 " echo "Perform MACSE alignment on given sequences" @@ -16,7 +14,7 @@ stem=$3 echo "Command:" echo "macse -prog alignSequences -seq $seq -seq_lr ${seq_lr}" echo " -out_NT ${stem}.nal -out_AA ${stem}.pal" - echo " -optim 2 -max_refine_iter 3 -local_realign_init 0.2 -fs_lr $FS_LR" + echo " -optim 2 -max_refine_iter 3 -local_realign_init 0.2" } >alignSequences.log macse -prog alignSequences \ -seq "$seq" -seq_lr "${seq_lr}" \ @@ -24,5 +22,4 @@ macse -prog alignSequences \ -optim 2 \ -max_refine_iter 3 \ -local_realign_init 0.2 \ - -fs_lr $FS_LR \ >>alignSequences.log 2>&1 diff --git a/99.scripts/workflow/orthology_inference/01.hmmer_sc.sh b/99.scripts/workflow/orthology_inference/01.hmmer_sc.sh deleted file mode 100755 index 71410af..0000000 --- a/99.scripts/workflow/orthology_inference/01.hmmer_sc.sh +++ /dev/null @@ -1,51 +0,0 @@ -#! /bin/bash -set -e - -if [ "$#" -ne 4 ]; then - echo "Usage: $0 " - echo "search homologous sequences in using HMMs built from orthogroup alignments" - exit 1 -fi - -ogs_dir=$(readlink -f "$1") -outdir=$2 -proteome=$(readlink -f "$3") -threads=$4 - -mkdir -p "$outdir" -cd "$outdir" || exit 1 -echo "Working directory: $(pwd)" -echo "Using OGS directory: $ogs_dir" -echo "Using $threads threads" -echo "" -echo "Starting orthogroup sequence alignment..." -mkdir -p msa -echo -n >mafft.cmds -for i in "$ogs_dir"/*.fa; do - j=$(basename "$i") - echo "linsi --quiet $i > msa/$j" >>mafft.cmds -done -xargs -t -P "$threads" -I cmd -a mafft.cmds bash -c "cmd" -echo "Orthogroup sequence alignment completed." -echo "" -echo "Starting HMM building from alignments..." -mkdir -p hmms -echo -n >hmmbuild.cmds -for i in msa/*.fa; do - j=$(basename "$i") - echo "hmmbuild -o hmms/${j}.hmmbuild.out --amino hmms/${j}.hmm $i" >>hmmbuild.cmds -done -xargs -t -P "$threads" -I cmd -a hmmbuild.cmds bash -c "cmd" -echo "HMM building completed." -echo "" -echo "Starting HMM search against other proteome..." -mkdir -p search -echo -n >hmmsearch.cmds -for i in hmms/*.hmm; do - j=$(basename "$i") - echo "hmmsearch --tblout search/${j}search.tblout $i $proteome > search/${j}search.rawout" >>hmmsearch.cmds -done -xargs -t -P "$threads" -I cmd -a hmmsearch.cmds bash -c "cmd" -echo "HMM search completed." -echo "" -echo "All steps completed successfully." diff --git a/99.scripts/workflow/orthology_inference/01.macse_align.sh b/99.scripts/workflow/orthology_inference/01.macse_align.sh new file mode 100755 index 0000000..632c889 --- /dev/null +++ b/99.scripts/workflow/orthology_inference/01.macse_align.sh @@ -0,0 +1,24 @@ +#! /bin/bash +set -e +THREADS=${THREADS:-12} +EXT=${EXT:-"fa"} + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + echo "Perform MACSE alignment for each orthologous group" + exit 1 +fi + +ogs_dir=$(readlink -f "$1") +out_dir=$2 + +mkdir -p "$out_dir" +echo "Starting MACSE alignment of orthologous groups..." +echo -n >macse.cmds +for og_fasta in "$ogs_dir"/*."$EXT"; do + og_name=$(basename "$og_fasta" ."$EXT") + out_stem="$out_dir/$og_name" + echo "macse -prog alignSequences -seq $og_fasta -out_AA ${out_stem}.pal -out_NT ${out_stem}.nal > ${out_stem}.log 2>&1" >>macse.cmds +done +xargs -t -P "$THREADS" -I cmd -a macse.cmds bash -c "cmd" && + echo "MACSE alignment completed." diff --git a/99.scripts/workflow/orthology_inference/02.macse_align.sh b/99.scripts/workflow/orthology_inference/02.macse_align.sh deleted file mode 100755 index 0e3028e..0000000 --- a/99.scripts/workflow/orthology_inference/02.macse_align.sh +++ /dev/null @@ -1,34 +0,0 @@ -#! /bin/bash -set -e -SCRIPTS=${SCRIPTS:-"$PROJECTHOME/99.scripts"} -THREADS=${THREADS:-12} - -if [ "$#" -ne 5 ]; then - echo "Usage: $0 " - echo "Integrate hmmsearch results to new orthologous groups directory and perform MACSE alignment" - exit 1 -fi - -ogs_dir=$(readlink -f "$1") -search_dir=$(readlink -f "$2") -all_cds=$(readlink -f "$3") -out_dir=$4 -stem=$5 - -echo "Integrating hmmsearch results to new orthologous groups directory..." -python3 "$SCRIPTS"/miscs/hmmsearch_result_to_new_ogs_dir.py \ - -d "$ogs_dir" \ - -t "$search_dir" \ - -f "$all_cds" \ - -o "$out_dir" \ - -s "$stem" -echo "Integration completed." - -echo "Starting MACSE alignment of orthologous groups..." -echo -n >macse.cmds -for og_dir in "$out_dir"/ogs/*; do - j=$(basename "$og_dir") - echo "cd $og_dir && bash $SCRIPTS/miscs/macse.sh ${j}_${stem}.fa ${j}.fa $j" >>macse.cmds -done -xargs -t -P "$THREADS" -I cmd -a macse.cmds bash -c "cmd" -echo "MACSE alignment completed." diff --git a/99.scripts/workflow/orthology_inference/02.trim_alignment.sh b/99.scripts/workflow/orthology_inference/02.trim_alignment.sh new file mode 100755 index 0000000..0be4747 --- /dev/null +++ b/99.scripts/workflow/orthology_inference/02.trim_alignment.sh @@ -0,0 +1,20 @@ +#! /bin/bash +set -e +THREADS=${THREADS:-12} +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " + echo "Trim MACSE nucleotide alignments using trimal" + exit 1 +fi + +nal_dir=$(readlink -f "$1") +out_dir=$(readlink -f "$2") +ext=$3 + +mkdir -p "$out_dir" +echo -n >trimal.cmds +for i in "$nal_dir"/*."$ext"; do + j=$(basename "$i" ."$ext") + echo "trimal -in $i -out $out_dir/${j}.trimed.fa -automated1 -resoverlap 0.5 -seqoverlap 50" >>trimal.cmds +done +xargs -t -P "$THREADS" -I cmd -a trimal.cmds bash -c "cmd" diff --git a/99.scripts/workflow/orthology_inference/03.treeshrink.sh b/99.scripts/workflow/orthology_inference/03.treeshrink.sh new file mode 100755 index 0000000..4d454ab --- /dev/null +++ b/99.scripts/workflow/orthology_inference/03.treeshrink.sh @@ -0,0 +1,26 @@ +#! /bin/bash +set -e +THREADS=${THREADS:-12} + +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " + echo "Build FastTree phylogenetic trees and perform TreeShrink on trimmed MACSE nucleotide alignments" + exit 1 +fi + +aln_dir=$(readlink -f "$1") +out_dir=$(readlink -f "$2") +ext=$3 + +# fasttree +mkdir -p "$out_dir" +echo -n >fasttree.cmds +for i in "$aln_dir"/*."$ext"; do + j=$(basename "$i" ."$ext") + mkdir -p "$out_dir"/"${j}" + cp -s "$i" "$out_dir"/"${j}"/input.fasta + echo "FastTree -nt -gtr -quiet $out_dir/${j}/input.fasta > $out_dir/${j}/input.tree" >>fasttree.cmds +done +xargs -t -P "$THREADS" -I cmd -a fasttree.cmds bash -c "cmd" +# treeshrink +run_treeshrink.py -f -i "$out_dir"/ -t input.tree -a input.fasta >treeshrink.log diff --git a/99.scripts/workflow/orthology_inference/06.trim_alignment.sh b/99.scripts/workflow/orthology_inference/06.trim_alignment.sh deleted file mode 100755 index 80431ba..0000000 --- a/99.scripts/workflow/orthology_inference/06.trim_alignment.sh +++ /dev/null @@ -1,8 +0,0 @@ -#! /usr/bin/env bash -mkdir -p trimed_nal -echo -n > trimal.cmds -for i in cds_aln/*.nal ;do - j=$(basename "$i") - echo "trimal -in $i -out trimed_nal/${j/.nal/.trimed.fa} -automated1 -resoverlap 0.5 -seqoverlap 50" >> trimal.cmds -done -xargs -t -P 4 -I cmd -a trimal.cmds bash -c "cmd" diff --git a/99.scripts/workflow/orthology_inference/07.fasttree.sh b/99.scripts/workflow/orthology_inference/07.fasttree.sh deleted file mode 100755 index 66edf59..0000000 --- a/99.scripts/workflow/orthology_inference/07.fasttree.sh +++ /dev/null @@ -1,8 +0,0 @@ -#! /usr/bin/env bash -mkdir -p fasttree -echo -n > fasttree.cmds -for i in trimed_nal/*.trimed.fa ;do - j=$(basename "$i") - echo "FastTree -nt -gtr -quiet $i > fasttree/${j/.trimed.fa/.tree}" >> fasttree.cmds -done -xargs -t -P 8 -I cmd -a fasttree.cmds bash -c "cmd" diff --git a/99.scripts/workflow/orthology_inference/08.treeshrink.sh b/99.scripts/workflow/orthology_inference/08.treeshrink.sh deleted file mode 100755 index b11bfb7..0000000 --- a/99.scripts/workflow/orthology_inference/08.treeshrink.sh +++ /dev/null @@ -1,11 +0,0 @@ -#! /usr/bin/env bash -mkdir -p treeshrink -for i in trimed_nal/*.trimed.fa; do - j=$(basename "$i") - mkdir -p treeshrink/"${j/.trimed.fa/}" - cd treeshrink/"${j/.trimed.fa/}" || exit 1 - ln -s ../../fasttree/"${j/.trimed.fa/.tree}" input.tree - ln -s ../../"$i" input.fasta - cd ../../ -done -run_treeshrink.py -i treeshrink/ -t input.tree -a input.fasta > treeshrink.log diff --git a/pixi.lock b/pixi.lock index d5fa123..1aa5e90 100644 --- a/pixi.lock +++ b/pixi.lock @@ -338,6 +338,7 @@ environments: - conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/p7zip-16.02-h9c3ff4c_1001.tar.bz2 - conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda - conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/noarch/pal2nal-14.1-pl5321hdfd78af_3.tar.bz2 + - conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/linux-64/paml-4.10.9-h7b50bb2_1.conda - conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pandas-2.3.3-py311hed34c8f_1.conda - conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pandoc-3.8.2.1-ha770c72_0.conda - conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pango-1.56.4-hadf4263_0.conda @@ -5125,6 +5126,15 @@ packages: license_family: GPL size: 22693 timestamp: 1642293349880 +- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/linux-64/paml-4.10.9-h7b50bb2_1.conda + sha256: 81c5237b07796984d7915d29a768068a1e5e6c02b4ae25f27836b0babe84e5aa + md5: ef02b401c5daaf104b3d1091f8ec37f5 + depends: + - libgcc >=13 + license: GPL-3.0-or-later + license_family: GPL3 + size: 679359 + timestamp: 1755706728619 - conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pandas-2.3.3-py311hed34c8f_1.conda sha256: c97f796345f5b9756e4404bbb4ee049afd5ea1762be6ee37ce99162cbee3b1d3 md5: 72e3452bf0ff08132e86de0272f2fbb0 diff --git a/pixi.toml b/pixi.toml index c3eba61..613d196 100644 --- a/pixi.toml +++ b/pixi.toml @@ -29,7 +29,6 @@ cd-hit = ">=4.8.1,<5" orthofinder = ">=3.1.0,<4" pip = ">=25.2,<26" seqkit = ">=2.10.1,<3" -hmmer = ">=3.4,<4" r-rstan = ">=2.32.7,<3" biopython = ">=1.85,<2" pal2nal = ">=14.1,<15" @@ -54,6 +53,7 @@ r-ggplot2 = ">=4.0.1,<5" macse = ">=2.7,<3" td2 = ">=1.0.6,<2" mmseqs2 = ">=18.8cc5c,<19" +paml = ">=4.10.9,<5" [feature.mrbayes.dependencies] beagle-lib = ">=3.1.2,<4"