From dba48339058ec49c986ab699bce403c6539e00e4 Mon Sep 17 00:00:00 2001
From: IvisTang <me@ivistang.com>
Date: Sat, 20 Dec 2025 01:57:25 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20.gitignore=EF=BC=8C?=
 =?UTF-8?q?=E6=B7=BB=E5=8A=A0=2006.gene=5Ftrees=20=E7=9B=AE=E5=BD=95?=
 =?UTF-8?q?=EF=BC=9B=E9=87=8D=E6=9E=84=20macse.sh=20=E8=84=9A=E6=9C=AC?=
 =?UTF-8?q?=EF=BC=8C=E7=A7=BB=E9=99=A4=20fs=5Flr=20=E5=8F=82=E6=95=B0?=
 =?UTF-8?q?=EF=BC=9B=E5=88=A0=E9=99=A4=E5=A4=9A=E4=B8=AA=E4=B8=8D=E5=86=8D?=
 =?UTF-8?q?=E4=BD=BF=E7=94=A8=E7=9A=84=E8=84=9A=E6=9C=AC=EF=BC=9B=E6=B7=BB?=
 =?UTF-8?q?=E5=8A=A0=20check=5Fframeshift.py=20=E5=92=8C=20get=5Fog=5Fseqs?=
 =?UTF-8?q?.py=20=E8=84=9A=E6=9C=AC=E4=BB=A5=E5=A4=84=E7=90=86=E9=98=85?=
 =?UTF-8?q?=E8=AF=BB=E6=A1=86=E7=A7=BB=E4=BD=8D=E5=92=8C=E6=8F=90=E5=8F=96?=
 =?UTF-8?q?=E5=8D=95=E6=8B=B7=E8=B4=9DOG=E5=BA=8F=E5=88=97=EF=BC=9B?=
 =?UTF-8?q?=E6=9B=B4=E6=96=B0=20pixi.toml=20=E5=92=8C=20pixi.lock=20?=
 =?UTF-8?q?=E6=96=87=E4=BB=B6=E4=BB=A5=E6=B7=BB=E5=8A=A0=20paml=20?=
 =?UTF-8?q?=E4=BE=9D=E8=B5=96=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                                    |   1 +
 99.scripts/miscs/check_frameshift.py          | 128 +++++++++++++++
 99.scripts/miscs/get_og_seqs.py               | 148 ++++++++++++++++++
 99.scripts/miscs/get_primary_cds.py           |  74 +++++++++
 99.scripts/miscs/macse.sh                     |   5 +-
 .../orthology_inference/01.hmmer_sc.sh        |  51 ------
 .../orthology_inference/01.macse_align.sh     |  24 +++
 .../orthology_inference/02.macse_align.sh     |  34 ----
 .../orthology_inference/02.trim_alignment.sh  |  20 +++
 .../orthology_inference/03.treeshrink.sh      |  26 +++
 .../orthology_inference/06.trim_alignment.sh  |   8 -
 .../orthology_inference/07.fasttree.sh        |   8 -
 .../orthology_inference/08.treeshrink.sh      |  11 --
 pixi.lock                                     |  10 ++
 pixi.toml                                     |   2 +-
 15 files changed, 433 insertions(+), 117 deletions(-)
 create mode 100644 99.scripts/miscs/check_frameshift.py
 create mode 100644 99.scripts/miscs/get_og_seqs.py
 create mode 100755 99.scripts/miscs/get_primary_cds.py
 delete mode 100755 99.scripts/workflow/orthology_inference/01.hmmer_sc.sh
 create mode 100755 99.scripts/workflow/orthology_inference/01.macse_align.sh
 delete mode 100755 99.scripts/workflow/orthology_inference/02.macse_align.sh
 create mode 100755 99.scripts/workflow/orthology_inference/02.trim_alignment.sh
 create mode 100755 99.scripts/workflow/orthology_inference/03.treeshrink.sh
 delete mode 100755 99.scripts/workflow/orthology_inference/06.trim_alignment.sh
 delete mode 100755 99.scripts/workflow/orthology_inference/07.fasttree.sh
 delete mode 100755 99.scripts/workflow/orthology_inference/08.treeshrink.sh

diff --git a/.gitignore b/.gitignore
index 4c68590..b4a2e2e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@
 05.reduce_redundancy/*
 05.orthology_inference/*
 06.phylogeny_reconstruction/*
+06.gene_trees/*
 10.plastid/*
 98.results/*
 99.scripts/bucky/
diff --git a/99.scripts/miscs/check_frameshift.py b/99.scripts/miscs/check_frameshift.py
new file mode 100644
index 0000000..f72a666
--- /dev/null
+++ b/99.scripts/miscs/check_frameshift.py
@@ -0,0 +1,128 @@
+#! /usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Check and process frameshift in alignment files.
+"""
+
+import shutil
+import pandas as pd
+from pathlib import Path
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import Seq
+import sys
+import argparse
+
+
+def parse_frameshift_list(fs_list: Path) -> pd.DataFrame:
+    """
+    Load the comma-separated frameshift list into a DataFrame.
+
+    Args:
+        fs_list (Path): Location of the frameshift CSV file; its first
+            row is taken as the column header.
+
+    Returns:
+        pd.DataFrame: The parsed table (exits with status 1 on read errors).
+    """
+    try:
+        return pd.read_csv(fs_list, sep=",", header=0)
+    except Exception as err:
+        print(f"Error reading frameshift list file {fs_list}: {err}")
+        sys.exit(1)
+
+
+def get_alignment_files(aln_dir: Path, ext: str) -> list[Path]:
+    """
+    Collect every file in a directory whose name ends with a given suffix.
+
+    Args:
+        aln_dir (Path): Directory that is searched (not recursively).
+        ext (str): Suffix appended to a ``*`` wildcard to form the pattern.
+
+    Returns:
+        list[Path]: Matching paths, in the order the glob yields them.
+    """
+    pattern = f"*{ext}"
+    try:
+        return list(aln_dir.glob(pattern))
+    except Exception as err:
+        print(f"Error accessing alignment files in {aln_dir}: {err}")
+        sys.exit(1)
+
+
+def main(alignment_dir: str, ext: str, frameshift_list: str, outdir: str):
+    """
+    Copy clean alignments to ``outdir`` and triage those listed as frameshifted.
+
+    Files whose path string is absent from the ``alignment_file`` column are
+    copied as-is. Listed files are rewritten (``!`` replaced by ``-``) only when
+    their attribution equals the outgroup-only label; the rest are discarded.
+    """
+    aln_dir = Path(alignment_dir)
+    out_dir = Path(outdir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    fs_df = parse_frameshift_list(Path(frameshift_list))
+    # Build the lookup once instead of rescanning the column for every file;
+    # setdefault keeps the first row when a path is listed more than once.
+    attributions: dict = {}
+    for aln, attr in zip(fs_df["alignment_file"], fs_df["possible_attributions"]):
+        attributions.setdefault(aln, attr)
+    outgroup_only = "Framshift only in Ziziphus jujuba or Elaeagnus pungens"
+    good_aln_count = keep_fs_count = discard_count = 0
+
+    for file in get_alignment_files(aln_dir, ext):
+        if str(file) not in attributions:
+            try:
+                shutil.copy(file, out_dir / file.name)
+            except Exception as e:
+                print(f"Error copying file {file} to {out_dir}: {e}")
+                sys.exit(1)
+            good_aln_count += 1
+        elif attributions[str(file)] == outgroup_only:
+            try:
+                records = [
+                    SeqRecord(Seq(str(rec.seq).replace("!", "-")), id=rec.id, description="")
+                    for rec in SeqIO.parse(file, "fasta")
+                ]
+                SeqIO.write(records, out_dir / file.name, "fasta")
+            except Exception as e:
+                print(f"Error processing file {file}: {e}")
+                sys.exit(1)
+            print(f"Keep the alignment file with frameshift: {file}. Due to frameshift only in outgroup.")
+            keep_fs_count += 1
+        else:
+            discard_count += 1
+    print("Frameshift processing completed successfully.")
+    print(f"Number of good alignments copied: {good_aln_count}")
+    print(f"Number of alignments with frameshift kept: {keep_fs_count}")
+    print(f"Number of alignments with frameshift discarded: {discard_count}")
+
+
+if __name__ == "__main__":  # command-line entry point
+    parser = argparse.ArgumentParser(
+        description="Check and process frameshift in alignment files."
+    )
+    parser.add_argument(
+        "-a",
+        "--alignment_dir",
+        required=True,
+        help="Directory containing alignment files",
+    )
+    parser.add_argument(
+        "-e", "--ext", default=".nal", help="Extension of alignment files"
+    )
+    parser.add_argument(
+        "-f",
+        "--frameshift_list",
+        required=True,
+        help="CSV file listing frameshift information",
+    )
+    parser.add_argument(
+        "-o", "--outdir", required=True, help="Output directory for processed files"
+    )
+    args = parser.parse_args()
+    # main() takes its arguments positionally: directory, extension, list, outdir.
+    main(args.alignment_dir, args.ext, args.frameshift_list, args.outdir)
diff --git a/99.scripts/miscs/get_og_seqs.py b/99.scripts/miscs/get_og_seqs.py
new file mode 100644
index 0000000..49c7a68
--- /dev/null
+++ b/99.scripts/miscs/get_og_seqs.py
@@ -0,0 +1,148 @@
+#! /usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Extract Single Copy OG sequences from a comprehensive FASTA file based on OG definitions from results of orthofinder.
+"""
+
+import sys
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import Seq
+from pathlib import Path
+import argparse
+
+
+def get_og_fastas(og_dir: Path, ext: str) -> dict[str, list[str]]:
+    """
+    Map each OG FASTA file in a directory to the record IDs it contains.
+
+    Args:
+        og_dir (Path): Directory holding one FASTA file per OG.
+        ext (str): File-name suffix used to select the FASTA files.
+
+    Returns:
+        dict: File stem (OG name) -> list of sequence IDs in file order.
+
+    Any error while globbing or parsing is reported and exits with status 1.
+    """
+    members: dict[str, list[str]] = {}
+    try:
+        for og_path in og_dir.glob(f"*{ext}"):
+            members[og_path.stem] = [
+                rec.id for rec in SeqIO.parse(og_path, "fasta")
+            ]
+    except Exception as err:
+        print(f"Error processing OG FASTA files in {og_dir}: {err}")
+        sys.exit(1)
+    return members
+
+
+def parse_all_fasta(fasta_file: Path) -> dict[str, Seq]:
+    """
+    Read a FASTA file into an ID-to-sequence lookup table.
+
+    Args:
+        fasta_file (Path): FASTA file to load.
+
+    Returns:
+        dict: Record ID -> Seq; a repeated ID keeps its last occurrence.
+
+    Prints an error and exits with status 1 if the file cannot be parsed.
+    """
+    try:
+        return {rec.id: rec.seq for rec in SeqIO.parse(fasta_file, "fasta")}
+    except Exception as err:
+        print(f"Error parsing FASTA file {fasta_file}: {err}")
+        sys.exit(1)
+
+
+def output_og_seqs(
+    all_seq_dict: dict[str, Seq], og_dict: dict[str, list[str]], output_dir: Path
+):
+    """
+    Write one ``<OG>.fa`` file per OG, labelling each record by taxon only.
+
+    Record IDs are the text before the first ``@``; unknown IDs are warned about.
+
+    Args:
+        all_seq_dict (dict): Sequence ID -> sequence for every known sequence.
+        og_dict (dict): OG name -> sequence IDs belonging to that OG.
+        output_dir (Path): Directory that receives the per-OG FASTA files.
+    """
+    for og_name, seq_ids in og_dict.items():
+        destination = output_dir / f"{og_name}.fa"
+        records: list[SeqRecord] = []
+        for seq_id in seq_ids:
+            if seq_id not in all_seq_dict:
+                print(f"Warning: Sequence ID {seq_id} not found in all sequences.")
+                continue
+            taxon = seq_id.split("@")[0]
+            records.append(SeqRecord(all_seq_dict[seq_id], id=taxon, description=""))
+        try:
+            SeqIO.write(records, destination, "fasta")
+        except Exception as err:
+            print(f"Error writing to FASTA file {destination}: {err}")
+            sys.exit(1)
+
+
+def write_og_list(og_dict: dict[str, list[str]]):
+    """
+    Write one ``name<TAB>id<TAB>...`` line per OG to ``og_list.tsv`` (cwd).
+
+    Args:
+        og_dict (dict): OG name -> list of sequence IDs.
+    """
+    try:
+        with open("og_list.tsv", "w") as handle:
+            handle.writelines(
+                f"{name}\t" + "\t".join(ids) + "\n" for name, ids in og_dict.items()
+            )
+        print("OG list written to og_list.tsv")
+    except Exception as err:
+        print(f"Error writing OG list to file: {err}")
+        sys.exit(1)
+
+
+def main(og_dir_path: str, all_fasta_path: str, output_dir_path: str, ext: str = ".fa"):
+    """Extract the sequences of every OG and record the OG membership list."""
+    og_dir, all_fasta = Path(og_dir_path), Path(all_fasta_path)
+    destination = Path(output_dir_path)
+    destination.mkdir(parents=True, exist_ok=True)
+
+    membership = get_og_fastas(og_dir, ext)
+    sequences = parse_all_fasta(all_fasta)
+    output_og_seqs(sequences, membership, destination)
+    write_og_list(membership)
+
+
+if __name__ == "__main__":  # command-line entry point
+    parser = argparse.ArgumentParser(
+        description="Extract OG sequences from a comprehensive FASTA file based on OG definitions."
+    )
+    parser.add_argument(
+        "-d",
+        "--og_dir",
+        required=True,
+        help="Directory containing OG FASTA files.",
+    )
+    parser.add_argument(
+        "-a",
+        "--all_fasta",
+        required=True,
+        help="FASTA file containing all sequences.",
+    )
+    parser.add_argument(
+        "-o",
+        "--output_dir",
+        required=True,
+        help="Output directory for OG FASTA files.",
+    )
+    parser.add_argument(
+        "-e",
+        "--ext",
+        default=".fa",
+        help="Extension of OG FASTA files (default: .fa).",
+    )
+    args = parser.parse_args()
+    main(args.og_dir, args.all_fasta, args.output_dir, args.ext)  # positional order matters
diff --git a/99.scripts/miscs/get_primary_cds.py b/99.scripts/miscs/get_primary_cds.py
new file mode 100755
index 0000000..cbec199
--- /dev/null
+++ b/99.scripts/miscs/get_primary_cds.py
@@ -0,0 +1,74 @@
+#! /usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Get primary CDS sequences from a FASTA file containing multiple CDS per gene.
+"""
+
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+import re
+import argparse
+import sys
+
+
+def get_primary_cds(input_fasta, output_fasta):
+    """
+    Write the longest CDS of every gene in ``input_fasta`` to ``output_fasta``.
+
+    Genes are identified by a ``[gene=NAME]`` tag in the record description;
+    untagged records are skipped. Records of one gene must be adjacent in the
+    input, ties keep the earliest record, and zero-length CDS are never written.
+
+    Args:
+        input_fasta: FASTA file holding one or more CDS records per gene.
+        output_fasta: Destination FASTA file for the selected records.
+
+    Prints an error and exits with status 1 if reading or writing fails.
+    """
+    primary_cds_records = []
+    gene = ""
+    best = None  # longest record seen so far for the current gene
+    try:
+        for record in SeqIO.parse(input_fasta, "fasta"):
+            match = re.search(r"\[gene=(\S+)\]", record.description)
+            if match is None:
+                continue  # no gene tag: skip the record
+            gene_name = match.group(1)
+            if gene_name != gene:
+                # New gene: store the winner of the previous one, if any.
+                # (A gene whose records are not adjacent is emitted repeatedly.)
+                if best is not None and len(best.seq) > 0:
+                    primary_cds_records.append(
+                        SeqRecord(best.seq, id=best.id, description=f"[gene={gene}]")
+                    )
+                gene, best = gene_name, record
+            elif len(record.seq) > len(best.seq):
+                best = record
+        # After the loop the last gene is still pending.
+        if gene and best is not None and len(best.seq) > 0:
+            primary_cds_records.append(
+                SeqRecord(best.seq, id=best.id, description=f"[gene={gene}]")
+            )
+        SeqIO.write(primary_cds_records, output_fasta, "fasta")
+        # Report the parameter rather than the script-level ``args`` global so
+        # the function also works when it is imported and called directly.
+        print(f"Primary CDS sequences written to {output_fasta}")
+    except Exception as e:
+        print(f"Error processing FASTA file {input_fasta}: {e}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":  # CLI entry point; both paths are mandatory
+    parser = argparse.ArgumentParser(
+        description="Extract primary CDS sequences from a FASTA file."
+    )
+    parser.add_argument(
+        "-i", "--input_fasta", required=True, help="Input FASTA file containing CDS sequences."
+    )
+    parser.add_argument(
+        "-o", "--output_fasta", required=True, help="Output FASTA file to write primary CDS sequences."
+    )
+    args = parser.parse_args()
+
+    get_primary_cds(args.input_fasta, args.output_fasta)
diff --git a/99.scripts/miscs/macse.sh b/99.scripts/miscs/macse.sh
index e3b4e9d..4bff049 100644
--- a/99.scripts/miscs/macse.sh
+++ b/99.scripts/miscs/macse.sh
@@ -1,7 +1,5 @@
 #! /bin/bash
 
-FS_LR=7
-
 if [ "$#" -ne 3 ]; then
     echo "Usage: $0 <reliable_seq> <less_reliable_seq> <stem>"
     echo "Perform MACSE alignment on given sequences"
@@ -16,7 +14,7 @@ stem=$3
     echo "Command:"
     echo "macse -prog alignSequences -seq $seq -seq_lr ${seq_lr}"
     echo "      -out_NT ${stem}.nal -out_AA ${stem}.pal"
-    echo "      -optim 2 -max_refine_iter 3 -local_realign_init 0.2 -fs_lr $FS_LR"
+    echo "      -optim 2 -max_refine_iter 3 -local_realign_init 0.2"
 } >alignSequences.log
 macse -prog alignSequences \
     -seq "$seq" -seq_lr "${seq_lr}" \
@@ -24,5 +22,4 @@ macse -prog alignSequences \
     -optim 2 \
     -max_refine_iter 3 \
     -local_realign_init 0.2 \
-    -fs_lr $FS_LR \
     >>alignSequences.log 2>&1
diff --git a/99.scripts/workflow/orthology_inference/01.hmmer_sc.sh b/99.scripts/workflow/orthology_inference/01.hmmer_sc.sh
deleted file mode 100755
index 71410af..0000000
--- a/99.scripts/workflow/orthology_inference/01.hmmer_sc.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#! /bin/bash
-set -e
-
-if [ "$#" -ne 4 ]; then
-    echo "Usage: $0 <ogs_dir> <outdir> <proteome> <threads>"
-    echo "search homologous sequences in <proteome> using HMMs built from orthogroup alignments"
-    exit 1
-fi
-
-ogs_dir=$(readlink -f "$1")
-outdir=$2
-proteome=$(readlink -f "$3")
-threads=$4
-
-mkdir -p "$outdir"
-cd "$outdir" || exit 1
-echo "Working directory: $(pwd)"
-echo "Using OGS directory: $ogs_dir"
-echo "Using $threads threads"
-echo ""
-echo "Starting orthogroup sequence alignment..."
-mkdir -p msa
-echo -n >mafft.cmds
-for i in "$ogs_dir"/*.fa; do
-    j=$(basename "$i")
-    echo "linsi --quiet $i > msa/$j" >>mafft.cmds
-done
-xargs -t -P "$threads" -I cmd -a mafft.cmds bash -c "cmd"
-echo "Orthogroup sequence alignment completed."
-echo ""
-echo "Starting HMM building from alignments..."
-mkdir -p hmms
-echo -n >hmmbuild.cmds
-for i in msa/*.fa; do
-    j=$(basename "$i")
-    echo "hmmbuild -o hmms/${j}.hmmbuild.out --amino hmms/${j}.hmm $i" >>hmmbuild.cmds
-done
-xargs -t -P "$threads" -I cmd -a hmmbuild.cmds bash -c "cmd"
-echo "HMM building completed."
-echo ""
-echo "Starting HMM search against other proteome..."
-mkdir -p search
-echo -n >hmmsearch.cmds
-for i in hmms/*.hmm; do
-    j=$(basename "$i")
-    echo "hmmsearch --tblout search/${j}search.tblout $i $proteome > search/${j}search.rawout" >>hmmsearch.cmds
-done
-xargs -t -P "$threads" -I cmd -a hmmsearch.cmds bash -c "cmd"
-echo "HMM search completed."
-echo ""
-echo "All steps completed successfully."
diff --git a/99.scripts/workflow/orthology_inference/01.macse_align.sh b/99.scripts/workflow/orthology_inference/01.macse_align.sh
new file mode 100755
index 0000000..632c889
--- /dev/null
+++ b/99.scripts/workflow/orthology_inference/01.macse_align.sh
@@ -0,0 +1,24 @@
+#! /bin/bash
+set -e
+THREADS=${THREADS:-12} # number of parallel jobs passed to xargs -P (environment override)
+EXT=${EXT:-"fa"} # input extension without the leading dot (environment override)
+
+if [ "$#" -ne 2 ]; then
+    echo "Usage: $0 <ogs_dir> <out_dir>"
+    echo "Perform MACSE alignment for each orthologous group"
+    exit 1
+fi
+
+ogs_dir=$(readlink -f "$1")
+out_dir=$2
+# Queue one macse command per input FASTA in ./macse.cmds, then run the queue in parallel.
+mkdir -p "$out_dir"
+echo "Starting MACSE alignment of orthologous groups..."
+echo -n >macse.cmds
+for og_fasta in "$ogs_dir"/*."$EXT"; do
+    og_name=$(basename "$og_fasta" ."$EXT")
+    out_stem="$out_dir/$og_name"
+    echo "macse -prog alignSequences -seq $og_fasta -out_AA ${out_stem}.pal -out_NT ${out_stem}.nal > ${out_stem}.log 2>&1" >>macse.cmds
+done
+xargs -t -P "$THREADS" -I cmd -a macse.cmds bash -c "cmd" &&
+    echo "MACSE alignment completed."
diff --git a/99.scripts/workflow/orthology_inference/02.macse_align.sh b/99.scripts/workflow/orthology_inference/02.macse_align.sh
deleted file mode 100755
index 0e3028e..0000000
--- a/99.scripts/workflow/orthology_inference/02.macse_align.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#! /bin/bash
-set -e
-SCRIPTS=${SCRIPTS:-"$PROJECTHOME/99.scripts"}
-THREADS=${THREADS:-12}
-
-if [ "$#" -ne 5 ]; then
-    echo "Usage: $0 <ogs_dir> <hmmsearch_result_dir> <all_cds.fa> <output_dir> <homolog_stem>"
-    echo "Integrate hmmsearch results to new orthologous groups directory and perform MACSE alignment"
-    exit 1
-fi
-
-ogs_dir=$(readlink -f "$1")
-search_dir=$(readlink -f "$2")
-all_cds=$(readlink -f "$3")
-out_dir=$4
-stem=$5
-
-echo "Integrating hmmsearch results to new orthologous groups directory..."
-python3 "$SCRIPTS"/miscs/hmmsearch_result_to_new_ogs_dir.py \
-    -d "$ogs_dir" \
-    -t "$search_dir" \
-    -f "$all_cds" \
-    -o "$out_dir" \
-    -s "$stem"
-echo "Integration completed."
-
-echo "Starting MACSE alignment of orthologous groups..."
-echo -n >macse.cmds
-for og_dir in "$out_dir"/ogs/*; do
-    j=$(basename "$og_dir")
-    echo "cd $og_dir && bash $SCRIPTS/miscs/macse.sh ${j}_${stem}.fa ${j}.fa $j" >>macse.cmds
-done
-xargs -t -P "$THREADS" -I cmd -a macse.cmds bash -c "cmd"
-echo "MACSE alignment completed."
diff --git a/99.scripts/workflow/orthology_inference/02.trim_alignment.sh b/99.scripts/workflow/orthology_inference/02.trim_alignment.sh
new file mode 100755
index 0000000..0be4747
--- /dev/null
+++ b/99.scripts/workflow/orthology_inference/02.trim_alignment.sh
@@ -0,0 +1,20 @@
+#! /bin/bash
+set -e
+THREADS=${THREADS:-12} # parallel trimal jobs (xargs -P); override via environment
+if [ "$#" -ne 3 ]; then
+    echo "Usage: $0 <nal_dir> <out_dir> <ext>"
+    echo "Trim MACSE nucleotide alignments using trimal"
+    exit 1
+fi
+
+nal_dir=$(readlink -f "$1")
+# mkdir first: GNU readlink -f fails when intermediate directories are missing
+mkdir -p "$2"
+out_dir=$(readlink -f "$2")
+ext=$3
+echo -n >trimal.cmds
+for i in "$nal_dir"/*."$ext"; do
+    j=$(basename "$i" ."$ext")
+    echo "trimal -in $i -out $out_dir/${j}.trimed.fa -automated1 -resoverlap 0.5 -seqoverlap 50" >>trimal.cmds
+done
+xargs -t -P "$THREADS" -I cmd -a trimal.cmds bash -c "cmd"
diff --git a/99.scripts/workflow/orthology_inference/03.treeshrink.sh b/99.scripts/workflow/orthology_inference/03.treeshrink.sh
new file mode 100755
index 0000000..4d454ab
--- /dev/null
+++ b/99.scripts/workflow/orthology_inference/03.treeshrink.sh
@@ -0,0 +1,26 @@
+#! /bin/bash
+set -e
+THREADS=${THREADS:-12} # parallel FastTree jobs (xargs -P); override via environment
+
+if [ "$#" -ne 3 ]; then
+    echo "Usage: $0 <aln_dir> <out_dir> <ext>"
+    echo "Build FastTree phylogenetic trees and perform TreeShrink on trimmed MACSE nucleotide alignments"
+    exit 1
+fi
+
+aln_dir=$(readlink -f "$1")
+mkdir -p "$2" # before readlink: GNU readlink -f fails if intermediate dirs are missing
+out_dir=$(readlink -f "$2")
+ext=$3
+
+# fasttree: one <out_dir>/<name>/ folder per alignment, holding input.fasta and input.tree
+echo -n >fasttree.cmds
+for i in "$aln_dir"/*."$ext"; do
+    j=$(basename "$i" ."$ext")
+    mkdir -p "$out_dir"/"${j}"
+    ln -sf "$i" "$out_dir"/"${j}"/input.fasta # -f replaces a link left by an earlier run
+    echo "FastTree -nt -gtr -quiet $out_dir/${j}/input.fasta > $out_dir/${j}/input.tree" >>fasttree.cmds
+done
+xargs -t -P "$THREADS" -I cmd -a fasttree.cmds bash -c "cmd"
+# treeshrink: run once over all per-alignment folders; its output goes to treeshrink.log
+run_treeshrink.py -f -i "$out_dir"/ -t input.tree -a input.fasta >treeshrink.log
diff --git a/99.scripts/workflow/orthology_inference/06.trim_alignment.sh b/99.scripts/workflow/orthology_inference/06.trim_alignment.sh
deleted file mode 100755
index 80431ba..0000000
--- a/99.scripts/workflow/orthology_inference/06.trim_alignment.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#! /usr/bin/env bash
-mkdir -p trimed_nal
-echo -n > trimal.cmds
-for i in cds_aln/*.nal ;do
-    j=$(basename "$i")
-    echo "trimal -in $i -out trimed_nal/${j/.nal/.trimed.fa} -automated1 -resoverlap 0.5 -seqoverlap 50" >> trimal.cmds
-done
-xargs -t -P 4 -I cmd -a trimal.cmds bash -c "cmd"
diff --git a/99.scripts/workflow/orthology_inference/07.fasttree.sh b/99.scripts/workflow/orthology_inference/07.fasttree.sh
deleted file mode 100755
index 66edf59..0000000
--- a/99.scripts/workflow/orthology_inference/07.fasttree.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#! /usr/bin/env bash
-mkdir -p fasttree
-echo -n > fasttree.cmds
-for i in trimed_nal/*.trimed.fa ;do
-    j=$(basename "$i")
-    echo "FastTree -nt -gtr -quiet $i > fasttree/${j/.trimed.fa/.tree}" >> fasttree.cmds
-done
-xargs -t -P 8 -I cmd -a fasttree.cmds bash -c "cmd"
diff --git a/99.scripts/workflow/orthology_inference/08.treeshrink.sh b/99.scripts/workflow/orthology_inference/08.treeshrink.sh
deleted file mode 100755
index b11bfb7..0000000
--- a/99.scripts/workflow/orthology_inference/08.treeshrink.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#! /usr/bin/env bash
-mkdir -p treeshrink
-for i in trimed_nal/*.trimed.fa; do
-    j=$(basename "$i")
-    mkdir -p treeshrink/"${j/.trimed.fa/}"
-    cd treeshrink/"${j/.trimed.fa/}" || exit 1
-    ln -s ../../fasttree/"${j/.trimed.fa/.tree}" input.tree
-    ln -s ../../"$i" input.fasta
-    cd ../../
-done
-run_treeshrink.py -i treeshrink/ -t input.tree -a input.fasta > treeshrink.log
diff --git a/pixi.lock b/pixi.lock
index d5fa123..1aa5e90 100644
--- a/pixi.lock
+++ b/pixi.lock
@@ -338,6 +338,7 @@ environments:
       - conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/p7zip-16.02-h9c3ff4c_1001.tar.bz2
       - conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda
       - conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/noarch/pal2nal-14.1-pl5321hdfd78af_3.tar.bz2
+      - conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/linux-64/paml-4.10.9-h7b50bb2_1.conda
       - conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pandas-2.3.3-py311hed34c8f_1.conda
       - conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pandoc-3.8.2.1-ha770c72_0.conda
       - conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pango-1.56.4-hadf4263_0.conda
@@ -5125,6 +5126,15 @@ packages:
   license_family: GPL
   size: 22693
   timestamp: 1642293349880
+- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/linux-64/paml-4.10.9-h7b50bb2_1.conda
+  sha256: 81c5237b07796984d7915d29a768068a1e5e6c02b4ae25f27836b0babe84e5aa
+  md5: ef02b401c5daaf104b3d1091f8ec37f5
+  depends:
+  - libgcc >=13
+  license: GPL-3.0-or-later
+  license_family: GPL3
+  size: 679359
+  timestamp: 1755706728619
 - conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pandas-2.3.3-py311hed34c8f_1.conda
   sha256: c97f796345f5b9756e4404bbb4ee049afd5ea1762be6ee37ce99162cbee3b1d3
   md5: 72e3452bf0ff08132e86de0272f2fbb0
diff --git a/pixi.toml b/pixi.toml
index c3eba61..613d196 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -29,7 +29,6 @@ cd-hit = ">=4.8.1,<5"
 orthofinder = ">=3.1.0,<4"
 pip = ">=25.2,<26"
 seqkit = ">=2.10.1,<3"
-hmmer = ">=3.4,<4"
 r-rstan = ">=2.32.7,<3"
 biopython = ">=1.85,<2"
 pal2nal = ">=14.1,<15"
@@ -54,6 +53,7 @@ r-ggplot2 = ">=4.0.1,<5"
 macse = ">=2.7,<3"
 td2 = ">=1.0.6,<2"
 mmseqs2 = ">=18.8cc5c,<19"
+paml = ">=4.10.9,<5"
 
 [feature.mrbayes.dependencies]
 beagle-lib = ">=3.1.2,<4"