更新 .gitignore,添加 06.gene_trees 目录;重构 macse.sh 脚本,移除 fs_lr 参数;删除多个不再使用的脚本;添加 check_frameshift.py 和 get_og_seqs.py 脚本以处理阅读框移位和提取单拷贝OG序列;更新 pixi.toml 和 pixi.lock 文件以添加 paml 依赖。

This commit is contained in:
IvisTang 2025-12-20 01:57:25 +08:00
parent e8ba2ed962
commit dba4833905
15 changed files with 433 additions and 117 deletions

1
.gitignore vendored
View File

@ -10,6 +10,7 @@
05.reduce_redundancy/*
05.orthology_inference/*
06.phylogeny_reconstruction/*
06.gene_trees/*
10.plastid/*
98.results/*
99.scripts/bucky/

View File

@ -0,0 +1,128 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Check and process frameshift in alignment files.
"""
import shutil
import pandas as pd
from pathlib import Path
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import sys
import argparse
def parse_frameshift_list(fs_list: Path) -> pd.DataFrame:
"""
Parse a frameshift list file into a pandas DataFrame.
Args:
fs_list (Path): Path to the frameshift list file.
Returns:
pd.DataFrame: DataFrame containing the frameshift information.
"""
try:
df = pd.read_csv(fs_list, sep=",", header=0)
except Exception as e:
print(f"Error reading frameshift list file {fs_list}: {e}")
sys.exit(1)
return df
def get_alignment_files(aln_dir: Path, ext: str) -> list[Path]:
"""
Get a list of alignment files in a directory with a specific extension.
Args:
aln_dir (Path): Path to the directory containing alignment files.
ext (str): Extension of the alignment files.
Returns:
list[Path]: List of Paths to the alignment files.
"""
try:
aln_files = list(aln_dir.glob(f"*{ext}"))
except Exception as e:
print(f"Error accessing alignment files in {aln_dir}: {e}")
sys.exit(1)
return aln_files
def main(alignment_dir: str, ext: str, frameshift_list: str, outdir: str):
aln_dir = Path(alignment_dir)
fs_list_path = Path(frameshift_list)
out_dir = Path(outdir)
out_dir.mkdir(parents=True, exist_ok=True)
fs_df = parse_frameshift_list(fs_list_path)
aln_files = get_alignment_files(aln_dir, ext)
good_aln_count = 0
keep_fs_count = 0
discard_count = 0
for file in aln_files:
if str(file) not in fs_df["alignment_file"].values.tolist():
try:
shutil.copy(file, out_dir / file.name)
good_aln_count += 1
continue
except Exception as e:
print(f"Error copying file {file} to {out_dir}: {e}")
sys.exit(1)
if (
fs_df.loc[
fs_df["alignment_file"] == str(file), "possible_attributions"
].values[0]
== "Framshift only in Ziziphus jujuba or Elaeagnus pungens"
):
try:
records = []
for record in SeqIO.parse(file, "fasta"):
seq = str(record.seq).replace("!", "-")
id = record.id
records.append(SeqRecord(seq=Seq(seq), id=id, description=""))
print(
f"Keep the alignment file with frameshift: {file}. Due to frameshift only in outgroup."
)
keep_fs_count += 1
SeqIO.write(records, out_dir / file.name, "fasta")
except Exception as e:
print(f"Error processing file {file}: {e}")
sys.exit(1)
else:
discard_count += 1
print("Frameshift processing completed successfully.")
print(f"Number of good alignments copied: {good_aln_count}")
print(f"Number of alignments with frameshift kept: {keep_fs_count}")
print(f"Number of alignments with frameshift discarded: {discard_count}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Check and process frameshift in alignment files."
)
parser.add_argument(
"-a",
"--alignment_dir",
required=True,
help="Directory containing alignment files",
)
parser.add_argument(
"-e", "--ext", default=".nal", help="Extension of alignment files"
)
parser.add_argument(
"-f",
"--frameshift_list",
required=True,
help="CSV file listing frameshift information",
)
parser.add_argument(
"-o", "--outdir", required=True, help="Output directory for processed files"
)
args = parser.parse_args()
main(args.alignment_dir, args.ext, args.frameshift_list, args.outdir)

View File

@ -0,0 +1,148 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract Single Copy OG sequences from a comprehensive FASTA file based on OG definitions from results of orthofinder.
"""
import sys
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from pathlib import Path
import argparse
def get_og_fastas(og_dir: Path, ext: str) -> dict[str, list[str]]:
"""
Get a dictionary of OGs and their corresponding sequence IDs from FASTA files in a directory.
Args:
og_dir (Path): Path to the directory containing OG FASTA files.
ext (str): Extension of the FASTA files. Default is ".fa".
Returns:
dict: Dictionary with OG names as keys and lists of sequence IDs as values.
"""
og_dict: dict[str, list[str]] = {}
try:
for fasta_file in og_dir.glob(f"*{ext}"):
og_name = fasta_file.stem
seq_ids: list[str] = []
for record in SeqIO.parse(fasta_file, "fasta"):
seq_ids.append(record.id)
og_dict[og_name] = seq_ids
except Exception as e:
print(f"Error processing OG FASTA files in {og_dir}: {e}")
sys.exit(1)
return og_dict
def parse_all_fasta(fasta_file: Path) -> dict[str, Seq]:
"""
Parse a FASTA file and return a dictionary of sequence IDs and their corresponding Seq objects.
Args:
fasta_file (Path): Path to the FASTA file.
Returns:
dict: Dictionary with sequence IDs as keys and Seq objects as values.
"""
seq_dict: dict[str, Seq] = {}
try:
for record in SeqIO.parse(fasta_file, "fasta"):
seq_dict[record.id] = record.seq
except Exception as e:
print(f"Error parsing FASTA file {fasta_file}: {e}")
sys.exit(1)
return seq_dict
def output_og_seqs(
all_seq_dict: dict[str, Seq], og_dict: dict[str, list[str]], output_dir: Path
):
"""
Output sequences for each OG into separate FASTA files, remove gene id and only keep taxon name.
Args:
all_seq_dict (dict): Dictionary of all sequences with sequence IDs as keys.
og_dict (dict): Dictionary of OGs with OG names as keys and lists of sequence IDs as values.
output_dir (Path): Path to the output directory.
"""
for og_name, seq_ids in og_dict.items():
og_seqs: list[SeqRecord] = []
for seq_id in seq_ids:
if seq_id in all_seq_dict:
seq_record = SeqRecord(
all_seq_dict[seq_id], id=seq_id.split("@")[0], description=""
)
og_seqs.append(seq_record)
else:
print(f"Warning: Sequence ID {seq_id} not found in all sequences.")
output_fasta = output_dir / f"{og_name}.fa"
try:
SeqIO.write(og_seqs, output_fasta, "fasta")
except Exception as e:
print(f"Error writing to FASTA file {output_fasta}: {e}")
sys.exit(1)
def write_og_list(og_dict: dict[str, list[str]]):
"""
Write the OG list to a text file.
Args:
og_dict (dict): Dictionary of OGs with OG names as keys and lists of sequence IDs as values.
"""
try:
with open("og_list.tsv", "w") as f:
for og_name, seq_ids in og_dict.items():
line = f"{og_name}\t" + "\t".join(seq_ids) + "\n"
f.write(line)
print("OG list written to og_list.tsv")
except Exception as e:
print(f"Error writing OG list to file: {e}")
sys.exit(1)
def main(og_dir_path: str, all_fasta_path: str, output_dir_path: str, ext: str = ".fa"):
og_dir = Path(og_dir_path)
all_fasta = Path(all_fasta_path)
output_dir = Path(output_dir_path)
output_dir.mkdir(parents=True, exist_ok=True)
og_dict = get_og_fastas(og_dir, ext)
all_seqs = parse_all_fasta(all_fasta)
output_og_seqs(all_seqs, og_dict, output_dir)
write_og_list(og_dict)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Extract OG sequences from a comprehensive FASTA file based on OG definitions."
)
parser.add_argument(
"-d",
"--og_dir",
required=True,
help="Directory containing OG FASTA files.",
)
parser.add_argument(
"-a",
"--all_fasta",
required=True,
help="FASTA file containing all sequences.",
)
parser.add_argument(
"-o",
"--output_dir",
required=True,
help="Output directory for OG FASTA files.",
)
parser.add_argument(
"-e",
"--ext",
default=".fa",
help="Extension of OG FASTA files (default: .fa).",
)
args = parser.parse_args()
main(args.og_dir, args.all_fasta, args.output_dir, args.ext)

View File

@ -0,0 +1,74 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Get primary CDS sequences from a FASTA file containing multiple CDS per gene.
"""
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import re
import argparse
import sys
def get_primary_cds(input_fasta, output_fasta):
primary_cds_records = []
gene = ""
length = 0
seq = None
id = None
try:
for record in SeqIO.parse(input_fasta, "fasta"):
seq_len = len(record.seq)
desc = record.description
match = re.search(r"\[gene=(\S+)\]", desc)
if match:
gene_name = match.group(1)
else:
# Skip if gene name not found
continue
if gene_name != gene:
# new gene encountered
# print(f"Processing gene: {gene_name}")
if length > 0:
# this is not the first record, save the previous longest record
primary_cds_record = SeqRecord(
seq, id=id, description=f"[gene={gene}]"
)
primary_cds_records.append(primary_cds_record)
gene = gene_name
seq = record.seq
id = record.id
length = seq_len
else:
# same gene, check length
if seq_len > length:
seq = record.seq
id = record.id
length = seq_len
# after loop, save the last gene
if gene and length > 0:
primary_cds_record = SeqRecord(seq, id=id, description=f"[gene={gene}]")
primary_cds_records.append(primary_cds_record)
SeqIO.write(primary_cds_records, output_fasta, "fasta")
print(f"Primary CDS sequences written to {args.output_fasta}")
except Exception as e:
print(f"Error processing FASTA file {input_fasta}: {e}")
sys.exit(1)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Extract primary CDS sequences from a FASTA file."
)
parser.add_argument(
"-i", "--input_fasta", help="Input FASTA file containing CDS sequences."
)
parser.add_argument(
"-o", "--output_fasta", help="Output FASTA file to write primary CDS sequences."
)
args = parser.parse_args()
get_primary_cds(args.input_fasta, args.output_fasta)

View File

@ -1,7 +1,5 @@
#! /bin/bash
FS_LR=7
if [ "$#" -ne 3 ]; then
echo "Usage: $0 <reliable_seq> <less_reliable_seq> <stem>"
echo "Perform MACSE alignment on given sequences"
@ -16,7 +14,7 @@ stem=$3
echo "Command:"
echo "macse -prog alignSequences -seq $seq -seq_lr ${seq_lr}"
echo " -out_NT ${stem}.nal -out_AA ${stem}.pal"
echo " -optim 2 -max_refine_iter 3 -local_realign_init 0.2 -fs_lr $FS_LR"
echo " -optim 2 -max_refine_iter 3 -local_realign_init 0.2"
} >alignSequences.log
macse -prog alignSequences \
-seq "$seq" -seq_lr "${seq_lr}" \
@ -24,5 +22,4 @@ macse -prog alignSequences \
-optim 2 \
-max_refine_iter 3 \
-local_realign_init 0.2 \
-fs_lr $FS_LR \
>>alignSequences.log 2>&1

View File

@ -1,51 +0,0 @@
#! /bin/bash
set -e
if [ "$#" -ne 4 ]; then
echo "Usage: $0 <ogs_dir> <outdir> <proteome> <threads>"
echo "search homologous sequences in <proteome> using HMMs built from orthogroup alignments"
exit 1
fi
ogs_dir=$(readlink -f "$1")
outdir=$2
proteome=$(readlink -f "$3")
threads=$4
mkdir -p "$outdir"
cd "$outdir" || exit 1
echo "Working directory: $(pwd)"
echo "Using OGS directory: $ogs_dir"
echo "Using $threads threads"
echo ""
echo "Starting orthogroup sequence alignment..."
mkdir -p msa
echo -n >mafft.cmds
for i in "$ogs_dir"/*.fa; do
j=$(basename "$i")
echo "linsi --quiet $i > msa/$j" >>mafft.cmds
done
xargs -t -P "$threads" -I cmd -a mafft.cmds bash -c "cmd"
echo "Orthogroup sequence alignment completed."
echo ""
echo "Starting HMM building from alignments..."
mkdir -p hmms
echo -n >hmmbuild.cmds
for i in msa/*.fa; do
j=$(basename "$i")
echo "hmmbuild -o hmms/${j}.hmmbuild.out --amino hmms/${j}.hmm $i" >>hmmbuild.cmds
done
xargs -t -P "$threads" -I cmd -a hmmbuild.cmds bash -c "cmd"
echo "HMM building completed."
echo ""
echo "Starting HMM search against other proteome..."
mkdir -p search
echo -n >hmmsearch.cmds
for i in hmms/*.hmm; do
j=$(basename "$i")
echo "hmmsearch --tblout search/${j}search.tblout $i $proteome > search/${j}search.rawout" >>hmmsearch.cmds
done
xargs -t -P "$threads" -I cmd -a hmmsearch.cmds bash -c "cmd"
echo "HMM search completed."
echo ""
echo "All steps completed successfully."

View File

@ -0,0 +1,24 @@
#! /bin/bash
set -e
THREADS=${THREADS:-12}
EXT=${EXT:-"fa"}
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <ogs_dir> <out_dir>"
echo "Perform MACSE alignment for each orthologous group"
exit 1
fi
ogs_dir=$(readlink -f "$1")
out_dir=$2
mkdir -p "$out_dir"
echo "Starting MACSE alignment of orthologous groups..."
echo -n >macse.cmds
for og_fasta in "$ogs_dir"/*."$EXT"; do
og_name=$(basename "$og_fasta" ."$EXT")
out_stem="$out_dir/$og_name"
echo "macse -prog alignSequences -seq $og_fasta -out_AA ${out_stem}.pal -out_NT ${out_stem}.nal > ${out_stem}.log 2>&1" >>macse.cmds
done
xargs -t -P "$THREADS" -I cmd -a macse.cmds bash -c "cmd" &&
echo "MACSE alignment completed."

View File

@ -1,34 +0,0 @@
#! /bin/bash
set -e
SCRIPTS=${SCRIPTS:-"$PROJECTHOME/99.scripts"}
THREADS=${THREADS:-12}
if [ "$#" -ne 5 ]; then
echo "Usage: $0 <ogs_dir> <hmmsearch_result_dir> <all_cds.fa> <output_dir> <homolog_stem>"
echo "Integrate hmmsearch results to new orthologous groups directory and perform MACSE alignment"
exit 1
fi
ogs_dir=$(readlink -f "$1")
search_dir=$(readlink -f "$2")
all_cds=$(readlink -f "$3")
out_dir=$4
stem=$5
echo "Integrating hmmsearch results to new orthologous groups directory..."
python3 "$SCRIPTS"/miscs/hmmsearch_result_to_new_ogs_dir.py \
-d "$ogs_dir" \
-t "$search_dir" \
-f "$all_cds" \
-o "$out_dir" \
-s "$stem"
echo "Integration completed."
echo "Starting MACSE alignment of orthologous groups..."
echo -n >macse.cmds
for og_dir in "$out_dir"/ogs/*; do
j=$(basename "$og_dir")
echo "cd $og_dir && bash $SCRIPTS/miscs/macse.sh ${j}_${stem}.fa ${j}.fa $j" >>macse.cmds
done
xargs -t -P "$THREADS" -I cmd -a macse.cmds bash -c "cmd"
echo "MACSE alignment completed."

View File

@ -0,0 +1,20 @@
#! /bin/bash
set -e
THREADS=${THREADS:-12}
if [ "$#" -ne 3 ]; then
echo "Usage: $0 <nal_dir> <out_dir> <ext>"
echo "Trim MACSE nucleotide alignments using trimal"
exit 1
fi
nal_dir=$(readlink -f "$1")
out_dir=$(readlink -f "$2")
ext=$3
mkdir -p "$out_dir"
echo -n >trimal.cmds
for i in "$nal_dir"/*."$ext"; do
j=$(basename "$i" ."$ext")
echo "trimal -in $i -out $out_dir/${j}.trimed.fa -automated1 -resoverlap 0.5 -seqoverlap 50" >>trimal.cmds
done
xargs -t -P "$THREADS" -I cmd -a trimal.cmds bash -c "cmd"

View File

@ -0,0 +1,26 @@
#! /bin/bash
set -e
THREADS=${THREADS:-12}
if [ "$#" -ne 3 ]; then
echo "Usage: $0 <aln_dir> <out_dir> <ext>"
echo "Build FastTree phylogenetic trees and perform TreeShrink on trimmed MACSE nucleotide alignments"
exit 1
fi
aln_dir=$(readlink -f "$1")
out_dir=$(readlink -f "$2")
ext=$3
# fasttree
mkdir -p "$out_dir"
echo -n >fasttree.cmds
for i in "$aln_dir"/*."$ext"; do
j=$(basename "$i" ."$ext")
mkdir -p "$out_dir"/"${j}"
cp -s "$i" "$out_dir"/"${j}"/input.fasta
echo "FastTree -nt -gtr -quiet $out_dir/${j}/input.fasta > $out_dir/${j}/input.tree" >>fasttree.cmds
done
xargs -t -P "$THREADS" -I cmd -a fasttree.cmds bash -c "cmd"
# treeshrink
run_treeshrink.py -f -i "$out_dir"/ -t input.tree -a input.fasta >treeshrink.log

View File

@ -1,8 +0,0 @@
#! /usr/bin/env bash
mkdir -p trimed_nal
echo -n > trimal.cmds
for i in cds_aln/*.nal ;do
j=$(basename "$i")
echo "trimal -in $i -out trimed_nal/${j/.nal/.trimed.fa} -automated1 -resoverlap 0.5 -seqoverlap 50" >> trimal.cmds
done
xargs -t -P 4 -I cmd -a trimal.cmds bash -c "cmd"

View File

@ -1,8 +0,0 @@
#! /usr/bin/env bash
mkdir -p fasttree
echo -n > fasttree.cmds
for i in trimed_nal/*.trimed.fa ;do
j=$(basename "$i")
echo "FastTree -nt -gtr -quiet $i > fasttree/${j/.trimed.fa/.tree}" >> fasttree.cmds
done
xargs -t -P 8 -I cmd -a fasttree.cmds bash -c "cmd"

View File

@ -1,11 +0,0 @@
#! /usr/bin/env bash
mkdir -p treeshrink
for i in trimed_nal/*.trimed.fa; do
j=$(basename "$i")
mkdir -p treeshrink/"${j/.trimed.fa/}"
cd treeshrink/"${j/.trimed.fa/}" || exit 1
ln -s ../../fasttree/"${j/.trimed.fa/.tree}" input.tree
ln -s ../../"$i" input.fasta
cd ../../
done
run_treeshrink.py -i treeshrink/ -t input.tree -a input.fasta > treeshrink.log

10
pixi.lock generated
View File

@ -338,6 +338,7 @@ environments:
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/p7zip-16.02-h9c3ff4c_1001.tar.bz2
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/noarch/pal2nal-14.1-pl5321hdfd78af_3.tar.bz2
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/linux-64/paml-4.10.9-h7b50bb2_1.conda
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pandas-2.3.3-py311hed34c8f_1.conda
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pandoc-3.8.2.1-ha770c72_0.conda
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pango-1.56.4-hadf4263_0.conda
@ -5125,6 +5126,15 @@ packages:
license_family: GPL
size: 22693
timestamp: 1642293349880
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/linux-64/paml-4.10.9-h7b50bb2_1.conda
sha256: 81c5237b07796984d7915d29a768068a1e5e6c02b4ae25f27836b0babe84e5aa
md5: ef02b401c5daaf104b3d1091f8ec37f5
depends:
- libgcc >=13
license: GPL-3.0-or-later
license_family: GPL3
size: 679359
timestamp: 1755706728619
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pandas-2.3.3-py311hed34c8f_1.conda
sha256: c97f796345f5b9756e4404bbb4ee049afd5ea1762be6ee37ce99162cbee3b1d3
md5: 72e3452bf0ff08132e86de0272f2fbb0

View File

@ -29,7 +29,6 @@ cd-hit = ">=4.8.1,<5"
orthofinder = ">=3.1.0,<4"
pip = ">=25.2,<26"
seqkit = ">=2.10.1,<3"
hmmer = ">=3.4,<4"
r-rstan = ">=2.32.7,<3"
biopython = ">=1.85,<2"
pal2nal = ">=14.1,<15"
@ -54,6 +53,7 @@ r-ggplot2 = ">=4.0.1,<5"
macse = ">=2.7,<3"
td2 = ">=1.0.6,<2"
mmseqs2 = ">=18.8cc5c,<19"
paml = ">=4.10.9,<5"
[feature.mrbayes.dependencies]
beagle-lib = ">=3.1.2,<4"