更新 .gitignore,添加 06.gene_trees 目录;重构 macse.sh 脚本,移除 fs_lr 参数;删除多个不再使用的脚本;添加 check_frameshift.py 和 get_og_seqs.py 脚本以处理阅读框移位和提取单拷贝OG序列;更新 pixi.toml 和 pixi.lock 文件以添加 paml 依赖。
This commit is contained in:
parent
e8ba2ed962
commit
dba4833905
|
|
@ -10,6 +10,7 @@
|
||||||
05.reduce_redundancy/*
|
05.reduce_redundancy/*
|
||||||
05.orthology_inference/*
|
05.orthology_inference/*
|
||||||
06.phylogeny_reconstruction/*
|
06.phylogeny_reconstruction/*
|
||||||
|
06.gene_trees/*
|
||||||
10.plastid/*
|
10.plastid/*
|
||||||
98.results/*
|
98.results/*
|
||||||
99.scripts/bucky/
|
99.scripts/bucky/
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,128 @@
|
||||||
|
#! /usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
Check and process frameshift in alignment files.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import shutil
|
||||||
|
import pandas as pd
|
||||||
|
from pathlib import Path
|
||||||
|
from Bio import SeqIO
|
||||||
|
from Bio.SeqRecord import SeqRecord
|
||||||
|
from Bio.Seq import Seq
|
||||||
|
import sys
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def parse_frameshift_list(fs_list: Path) -> pd.DataFrame:
    """
    Load the frameshift list into a pandas DataFrame.

    The file is read as comma-separated text whose first row supplies the
    column names. If reading fails for any reason, an error message is
    printed and the process exits with status 1.

    Args:
        fs_list (Path): Path to the frameshift list file.

    Returns:
        pd.DataFrame: The parsed frameshift table.
    """
    try:
        return pd.read_csv(fs_list, sep=",", header=0)
    except Exception as err:
        print(f"Error reading frameshift list file {fs_list}: {err}")
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def get_alignment_files(aln_dir: Path, ext: str) -> list[Path]:
    """
    Collect the alignment files directly inside a directory.

    Files are matched with the glob pattern ``*<ext>``; subdirectories are
    not searched. If the lookup raises, an error message is printed and the
    process exits with status 1.

    Args:
        aln_dir (Path): Directory containing alignment files.
        ext (str): Filename suffix to match (appended to ``*``).

    Returns:
        list[Path]: Paths of the matching files, in glob order.
    """
    pattern = f"*{ext}"
    try:
        matches = [path for path in aln_dir.glob(pattern)]
    except Exception as err:
        print(f"Error accessing alignment files in {aln_dir}: {err}")
        sys.exit(1)
    return matches
|
||||||
|
|
||||||
|
|
||||||
|
def main(alignment_dir: str, ext: str, frameshift_list: str, outdir: str):
    """
    Sort alignments into kept and discarded sets using a frameshift list.

    Alignments that are not named in the frameshift list are copied to
    ``outdir`` unchanged. Listed alignments whose ``possible_attributions``
    value equals the outgroup-only label are rewritten to ``outdir`` with
    every ``!`` in the sequences replaced by ``-``. All other listed
    alignments are skipped. A summary of the three counts is printed at the
    end. Any copy or rewrite failure prints an error and exits with status 1.

    Args:
        alignment_dir (str): Directory containing alignment files.
        ext (str): Suffix of the alignment files to process.
        frameshift_list (str): CSV file with ``alignment_file`` and
            ``possible_attributions`` columns.
        outdir (str): Output directory; created if it does not exist.
    """
    # The spelling "Framshift" is kept on purpose: the value must equal what is
    # stored in the frameshift list (presumably produced upstream; verify
    # before correcting it here).
    outgroup_only_label = "Framshift only in Ziziphus jujuba or Elaeagnus pungens"

    aln_dir = Path(alignment_dir)
    fs_list_path = Path(frameshift_list)
    out_dir = Path(outdir)
    out_dir.mkdir(parents=True, exist_ok=True)

    fs_df = parse_frameshift_list(fs_list_path)
    aln_files = get_alignment_files(aln_dir, ext)

    # Index the table once instead of rescanning the column for every file.
    # setdefault keeps the first row when a path is listed more than once,
    # matching a positional "first match" lookup.
    attribution_by_file = {}
    for listed_path, attribution in zip(
        fs_df["alignment_file"], fs_df["possible_attributions"]
    ):
        attribution_by_file.setdefault(listed_path, attribution)

    good_aln_count = 0
    keep_fs_count = 0
    discard_count = 0

    for file in aln_files:
        # Paths are compared as strings, so the list must use the same path
        # form as the one built from ``alignment_dir``.
        file_key = str(file)

        if file_key not in attribution_by_file:
            try:
                shutil.copy(file, out_dir / file.name)
            except Exception as e:
                print(f"Error copying file {file} to {out_dir}: {e}")
                sys.exit(1)
            good_aln_count += 1
            continue

        if attribution_by_file[file_key] != outgroup_only_label:
            discard_count += 1
            continue

        try:
            records = [
                SeqRecord(
                    seq=Seq(str(record.seq).replace("!", "-")),
                    id=record.id,
                    description="",
                )
                for record in SeqIO.parse(file, "fasta")
            ]
            print(
                f"Keep the alignment file with frameshift: {file}. Due to frameshift only in outgroup."
            )
            keep_fs_count += 1
            SeqIO.write(records, out_dir / file.name, "fasta")
        except Exception as e:
            print(f"Error processing file {file}: {e}")
            sys.exit(1)

    print("Frameshift processing completed successfully.")
    print(f"Number of good alignments copied: {good_aln_count}")
    print(f"Number of alignments with frameshift kept: {keep_fs_count}")
    print(f"Number of alignments with frameshift discarded: {discard_count}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: collect the four options and hand them to main().
    cli = argparse.ArgumentParser(
        description="Check and process frameshift in alignment files."
    )
    cli.add_argument(
        "-a",
        "--alignment_dir",
        help="Directory containing alignment files",
        required=True,
    )
    cli.add_argument(
        "-e",
        "--ext",
        help="Extension of alignment files",
        default=".nal",
    )
    cli.add_argument(
        "-f",
        "--frameshift_list",
        help="CSV file listing frameshift information",
        required=True,
    )
    cli.add_argument(
        "-o",
        "--outdir",
        help="Output directory for processed files",
        required=True,
    )
    opts = cli.parse_args()

    main(opts.alignment_dir, opts.ext, opts.frameshift_list, opts.outdir)
|
||||||
|
|
@ -0,0 +1,148 @@
|
||||||
|
#! /usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
Extract Single Copy OG sequences from a comprehensive FASTA file based on OG definitions from results of orthofinder.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from Bio import SeqIO
|
||||||
|
from Bio.SeqRecord import SeqRecord
|
||||||
|
from Bio.Seq import Seq
|
||||||
|
from pathlib import Path
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def get_og_fastas(og_dir: Path, ext: str) -> dict[str, list[str]]:
    """
    Map each OG FASTA file in a directory to the sequence IDs it contains.

    Files are matched with the glob pattern ``*<ext>``; the OG name is the
    file name without its final suffix. If anything fails while listing or
    parsing, an error message is printed and the process exits with status 1.

    Args:
        og_dir (Path): Directory containing OG FASTA files.
        ext (str): Filename suffix of the OG FASTA files.

    Returns:
        dict: OG names as keys, lists of sequence IDs (in file order) as values.
    """
    members_by_og: dict[str, list[str]] = {}
    try:
        for og_path in og_dir.glob(f"*{ext}"):
            members_by_og[og_path.stem] = [
                entry.id for entry in SeqIO.parse(og_path, "fasta")
            ]
    except Exception as err:
        print(f"Error processing OG FASTA files in {og_dir}: {err}")
        sys.exit(1)
    return members_by_og
|
||||||
|
|
||||||
|
|
||||||
|
def parse_all_fasta(fasta_file: Path) -> dict[str, Seq]:
    """
    Read a FASTA file into a dictionary keyed by sequence ID.

    When the same ID occurs more than once, the last record wins. If parsing
    fails, an error message is printed and the process exits with status 1.

    Args:
        fasta_file (Path): Path to the FASTA file.

    Returns:
        dict: Sequence IDs as keys and the corresponding Seq objects as values.
    """
    try:
        sequences: dict[str, Seq] = {
            entry.id: entry.seq for entry in SeqIO.parse(fasta_file, "fasta")
        }
    except Exception as err:
        print(f"Error parsing FASTA file {fasta_file}: {err}")
        sys.exit(1)
    return sequences
|
||||||
|
|
||||||
|
|
||||||
|
def output_og_seqs(
    all_seq_dict: dict[str, Seq], og_dict: dict[str, list[str]], output_dir: Path
):
    """
    Write one FASTA file per OG, labelling each sequence by the part of its
    ID before the first ``@``.

    IDs missing from ``all_seq_dict`` are reported with a warning and left
    out. Each OG is written to ``<output_dir>/<og_name>.fa`` even if no
    sequence was found for it. A write failure prints an error and exits
    with status 1.

    Args:
        all_seq_dict (dict): All sequences, keyed by sequence ID.
        og_dict (dict): OG names as keys, lists of sequence IDs as values.
        output_dir (Path): Directory that receives the OG FASTA files.
    """
    for og_name, member_ids in og_dict.items():
        records: list[SeqRecord] = []
        for member_id in member_ids:
            if member_id not in all_seq_dict:
                print(f"Warning: Sequence ID {member_id} not found in all sequences.")
                continue
            # Keep only the text before the first "@" as the record ID.
            short_id = member_id.split("@")[0]
            records.append(
                SeqRecord(all_seq_dict[member_id], id=short_id, description="")
            )

        output_fasta = output_dir / f"{og_name}.fa"
        try:
            SeqIO.write(records, output_fasta, "fasta")
        except Exception as err:
            print(f"Error writing to FASTA file {output_fasta}: {err}")
            sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def write_og_list(og_dict: dict[str, list[str]], output_file: str = "og_list.tsv"):
    """
    Write the OG membership table to a tab-separated text file.

    Each line holds the OG name followed by its sequence IDs, separated by
    tabs. If writing fails, an error message is printed and the process
    exits with status 1.

    Args:
        og_dict (dict): OG names as keys, lists of sequence IDs as values.
        output_file (str): Destination path. Defaults to ``og_list.tsv`` in
            the current working directory, as before.
    """
    try:
        with open(output_file, "w") as f:
            f.writelines(
                f"{og_name}\t" + "\t".join(seq_ids) + "\n"
                for og_name, seq_ids in og_dict.items()
            )
        print(f"OG list written to {output_file}")
    except Exception as e:
        print(f"Error writing OG list to file: {e}")
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def main(og_dir_path: str, all_fasta_path: str, output_dir_path: str, ext: str = ".fa"):
    """
    Extract the sequences of every OG and record the OG membership list.

    Creates the output directory if needed, reads the OG definitions and the
    combined FASTA file, writes one FASTA per OG, then writes the OG list.

    Args:
        og_dir_path (str): Directory containing OG FASTA files.
        all_fasta_path (str): FASTA file containing all sequences.
        output_dir_path (str): Output directory for the per-OG FASTA files.
        ext (str): Suffix of the OG FASTA files.
    """
    source_dir, combined_fasta, destination = (
        Path(og_dir_path),
        Path(all_fasta_path),
        Path(output_dir_path),
    )
    destination.mkdir(parents=True, exist_ok=True)

    orthogroups = get_og_fastas(source_dir, ext)
    sequences = parse_all_fasta(combined_fasta)
    output_og_seqs(sequences, orthogroups, destination)
    write_og_list(orthogroups)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: collect the options and hand them to main().
    cli = argparse.ArgumentParser(
        description="Extract OG sequences from a comprehensive FASTA file based on OG definitions."
    )
    cli.add_argument(
        "-d",
        "--og_dir",
        help="Directory containing OG FASTA files.",
        required=True,
    )
    cli.add_argument(
        "-a",
        "--all_fasta",
        help="FASTA file containing all sequences.",
        required=True,
    )
    cli.add_argument(
        "-o",
        "--output_dir",
        help="Output directory for OG FASTA files.",
        required=True,
    )
    cli.add_argument(
        "-e",
        "--ext",
        help="Extension of OG FASTA files (default: .fa).",
        default=".fa",
    )
    opts = cli.parse_args()
    main(opts.og_dir, opts.all_fasta, opts.output_dir, opts.ext)
|
||||||
|
|
@ -0,0 +1,74 @@
|
||||||
|
#! /usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
Get primary CDS sequences from a FASTA file containing multiple CDS per gene.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from Bio import SeqIO
|
||||||
|
from Bio.SeqRecord import SeqRecord
|
||||||
|
import re
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def get_primary_cds(input_fasta, output_fasta):
    """
    Keep the longest CDS per gene and write the selection to a FASTA file.

    The gene name is taken from a ``[gene=NAME]`` tag in each record's
    description; records without that tag are skipped. Records are grouped
    by consecutive runs of the same gene name, so the records of one gene
    must be adjacent in the input to be compared with each other. Within a
    run the first record of maximal length is kept. On any failure an error
    message is printed and the process exits with status 1.

    Args:
        input_fasta: Input FASTA file containing CDS sequences.
        output_fasta: Output FASTA file for the selected sequences.
    """
    gene_tag = re.compile(r"\[gene=(\S+)\]")
    primary_cds_records = []
    current_gene = ""
    best_len = 0
    best_seq = None
    best_id = None

    try:
        for record in SeqIO.parse(input_fasta, "fasta"):
            match = gene_tag.search(record.description)
            if match is None:
                # Skip if gene name not found
                continue
            gene_name = match.group(1)
            seq_len = len(record.seq)

            if gene_name != current_gene:
                # A new gene starts: flush the best record of the previous
                # one. best_len is 0 before the first gene, or when every
                # record of the previous gene was empty.
                if best_len > 0:
                    primary_cds_records.append(
                        SeqRecord(
                            best_seq, id=best_id, description=f"[gene={current_gene}]"
                        )
                    )
                current_gene = gene_name
                best_seq = record.seq
                best_id = record.id
                best_len = seq_len
            elif seq_len > best_len:
                # Same gene, strictly longer isoform.
                best_seq = record.seq
                best_id = record.id
                best_len = seq_len

        # After the loop, flush the last gene.
        if current_gene and best_len > 0:
            primary_cds_records.append(
                SeqRecord(best_seq, id=best_id, description=f"[gene={current_gene}]")
            )

        SeqIO.write(primary_cds_records, output_fasta, "fasta")
        # Report the path passed to this function, not a module-level global,
        # so the function also works when imported.
        print(f"Primary CDS sequences written to {output_fasta}")
    except Exception as e:
        print(f"Error processing FASTA file {input_fasta}: {e}")
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point. Both paths are mandatory so that a missing
    # option is reported by argparse instead of reaching the FASTA parser as
    # None.
    parser = argparse.ArgumentParser(
        description="Extract primary CDS sequences from a FASTA file."
    )
    parser.add_argument(
        "-i",
        "--input_fasta",
        required=True,
        help="Input FASTA file containing CDS sequences.",
    )
    parser.add_argument(
        "-o",
        "--output_fasta",
        required=True,
        help="Output FASTA file to write primary CDS sequences.",
    )
    args = parser.parse_args()

    get_primary_cds(args.input_fasta, args.output_fasta)
|
||||||
|
|
@ -1,7 +1,5 @@
|
||||||
#! /bin/bash
|
#! /bin/bash
|
||||||
|
|
||||||
FS_LR=7
|
|
||||||
|
|
||||||
if [ "$#" -ne 3 ]; then
|
if [ "$#" -ne 3 ]; then
|
||||||
echo "Usage: $0 <reliable_seq> <less_reliable_seq> <stem>"
|
echo "Usage: $0 <reliable_seq> <less_reliable_seq> <stem>"
|
||||||
echo "Perform MACSE alignment on given sequences"
|
echo "Perform MACSE alignment on given sequences"
|
||||||
|
|
@ -16,7 +14,7 @@ stem=$3
|
||||||
echo "Command:"
|
echo "Command:"
|
||||||
echo "macse -prog alignSequences -seq $seq -seq_lr ${seq_lr}"
|
echo "macse -prog alignSequences -seq $seq -seq_lr ${seq_lr}"
|
||||||
echo " -out_NT ${stem}.nal -out_AA ${stem}.pal"
|
echo " -out_NT ${stem}.nal -out_AA ${stem}.pal"
|
||||||
echo " -optim 2 -max_refine_iter 3 -local_realign_init 0.2 -fs_lr $FS_LR"
|
echo " -optim 2 -max_refine_iter 3 -local_realign_init 0.2"
|
||||||
} >alignSequences.log
|
} >alignSequences.log
|
||||||
macse -prog alignSequences \
|
macse -prog alignSequences \
|
||||||
-seq "$seq" -seq_lr "${seq_lr}" \
|
-seq "$seq" -seq_lr "${seq_lr}" \
|
||||||
|
|
@ -24,5 +22,4 @@ macse -prog alignSequences \
|
||||||
-optim 2 \
|
-optim 2 \
|
||||||
-max_refine_iter 3 \
|
-max_refine_iter 3 \
|
||||||
-local_realign_init 0.2 \
|
-local_realign_init 0.2 \
|
||||||
-fs_lr $FS_LR \
|
|
||||||
>>alignSequences.log 2>&1
|
>>alignSequences.log 2>&1
|
||||||
|
|
|
||||||
|
|
@ -1,51 +0,0 @@
|
||||||
#! /bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
if [ "$#" -ne 4 ]; then
|
|
||||||
echo "Usage: $0 <ogs_dir> <outdir> <proteome> <threads>"
|
|
||||||
echo "search homologous sequences in <proteome> using HMMs built from orthogroup alignments"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
ogs_dir=$(readlink -f "$1")
|
|
||||||
outdir=$2
|
|
||||||
proteome=$(readlink -f "$3")
|
|
||||||
threads=$4
|
|
||||||
|
|
||||||
mkdir -p "$outdir"
|
|
||||||
cd "$outdir" || exit 1
|
|
||||||
echo "Working directory: $(pwd)"
|
|
||||||
echo "Using OGS directory: $ogs_dir"
|
|
||||||
echo "Using $threads threads"
|
|
||||||
echo ""
|
|
||||||
echo "Starting orthogroup sequence alignment..."
|
|
||||||
mkdir -p msa
|
|
||||||
echo -n >mafft.cmds
|
|
||||||
for i in "$ogs_dir"/*.fa; do
|
|
||||||
j=$(basename "$i")
|
|
||||||
echo "linsi --quiet $i > msa/$j" >>mafft.cmds
|
|
||||||
done
|
|
||||||
xargs -t -P "$threads" -I cmd -a mafft.cmds bash -c "cmd"
|
|
||||||
echo "Orthogroup sequence alignment completed."
|
|
||||||
echo ""
|
|
||||||
echo "Starting HMM building from alignments..."
|
|
||||||
mkdir -p hmms
|
|
||||||
echo -n >hmmbuild.cmds
|
|
||||||
for i in msa/*.fa; do
|
|
||||||
j=$(basename "$i")
|
|
||||||
echo "hmmbuild -o hmms/${j}.hmmbuild.out --amino hmms/${j}.hmm $i" >>hmmbuild.cmds
|
|
||||||
done
|
|
||||||
xargs -t -P "$threads" -I cmd -a hmmbuild.cmds bash -c "cmd"
|
|
||||||
echo "HMM building completed."
|
|
||||||
echo ""
|
|
||||||
echo "Starting HMM search against other proteome..."
|
|
||||||
mkdir -p search
|
|
||||||
echo -n >hmmsearch.cmds
|
|
||||||
for i in hmms/*.hmm; do
|
|
||||||
j=$(basename "$i")
|
|
||||||
echo "hmmsearch --tblout search/${j}search.tblout $i $proteome > search/${j}search.rawout" >>hmmsearch.cmds
|
|
||||||
done
|
|
||||||
xargs -t -P "$threads" -I cmd -a hmmsearch.cmds bash -c "cmd"
|
|
||||||
echo "HMM search completed."
|
|
||||||
echo ""
|
|
||||||
echo "All steps completed successfully."
|
|
||||||
|
|
@ -0,0 +1,24 @@
|
||||||
|
#! /bin/bash
# Align every orthologous-group FASTA in <ogs_dir> with MACSE, in parallel.
# Environment: THREADS (parallel jobs, default 12), EXT (input suffix without
# the dot, default "fa").
# Outputs per group in <out_dir>: <name>.pal, <name>.nal and <name>.log.
# The generated command list is left in ./macse.cmds.
set -e
THREADS=${THREADS:-12}
EXT=${EXT:-"fa"}

if [ "$#" -ne 2 ]; then
    echo "Usage: $0 <ogs_dir> <out_dir>"
    echo "Perform MACSE alignment for each orthologous group"
    exit 1
fi

ogs_dir=$(readlink -f "$1")
out_dir=$2

mkdir -p "$out_dir"
echo "Starting MACSE alignment of orthologous groups..."
echo -n >macse.cmds
for og_fasta in "$ogs_dir"/*."$EXT"; do
    og_name=$(basename "$og_fasta" ."$EXT")
    out_stem="$out_dir/$og_name"
    # %q shell-escapes each path so names with spaces or metacharacters
    # survive being re-parsed by `bash -c` below.
    printf 'macse -prog alignSequences -seq %q -out_AA %q -out_NT %q > %q 2>&1\n' \
        "$og_fasta" "${out_stem}.pal" "${out_stem}.nal" "${out_stem}.log" >>macse.cmds
done
# -d '\n' makes xargs take each line verbatim (no quote/backslash processing),
# so the %q escapes reach bash intact.
xargs -t -P "$THREADS" -d '\n' -I cmd -a macse.cmds bash -c "cmd" &&
    echo "MACSE alignment completed."
|
||||||
|
|
@ -1,34 +0,0 @@
|
||||||
#! /bin/bash
|
|
||||||
set -e
|
|
||||||
SCRIPTS=${SCRIPTS:-"$PROJECTHOME/99.scripts"}
|
|
||||||
THREADS=${THREADS:-12}
|
|
||||||
|
|
||||||
if [ "$#" -ne 5 ]; then
|
|
||||||
echo "Usage: $0 <ogs_dir> <hmmsearch_result_dir> <all_cds.fa> <output_dir> <homolog_stem>"
|
|
||||||
echo "Integrate hmmsearch results to new orthologous groups directory and perform MACSE alignment"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
ogs_dir=$(readlink -f "$1")
|
|
||||||
search_dir=$(readlink -f "$2")
|
|
||||||
all_cds=$(readlink -f "$3")
|
|
||||||
out_dir=$4
|
|
||||||
stem=$5
|
|
||||||
|
|
||||||
echo "Integrating hmmsearch results to new orthologous groups directory..."
|
|
||||||
python3 "$SCRIPTS"/miscs/hmmsearch_result_to_new_ogs_dir.py \
|
|
||||||
-d "$ogs_dir" \
|
|
||||||
-t "$search_dir" \
|
|
||||||
-f "$all_cds" \
|
|
||||||
-o "$out_dir" \
|
|
||||||
-s "$stem"
|
|
||||||
echo "Integration completed."
|
|
||||||
|
|
||||||
echo "Starting MACSE alignment of orthologous groups..."
|
|
||||||
echo -n >macse.cmds
|
|
||||||
for og_dir in "$out_dir"/ogs/*; do
|
|
||||||
j=$(basename "$og_dir")
|
|
||||||
echo "cd $og_dir && bash $SCRIPTS/miscs/macse.sh ${j}_${stem}.fa ${j}.fa $j" >>macse.cmds
|
|
||||||
done
|
|
||||||
xargs -t -P "$THREADS" -I cmd -a macse.cmds bash -c "cmd"
|
|
||||||
echo "MACSE alignment completed."
|
|
||||||
|
|
@ -0,0 +1,20 @@
|
||||||
|
#! /bin/bash
# Trim every alignment <nal_dir>/*.<ext> with trimal, in parallel.
# Environment: THREADS (parallel jobs, default 12).
# Output: <out_dir>/<name>.trimed.fa for each input.
# The generated command list is left in ./trimal.cmds.
set -e
THREADS=${THREADS:-12}
if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <nal_dir> <out_dir> <ext>"
    echo "Trim MACSE nucleotide alignments using trimal"
    exit 1
fi

nal_dir=$(readlink -f "$1")
out_dir=$(readlink -f "$2")
ext=$3

mkdir -p "$out_dir"
echo -n >trimal.cmds
for i in "$nal_dir"/*."$ext"; do
    j=$(basename "$i" ."$ext")
    # %q shell-escapes each path so names with spaces or metacharacters
    # survive being re-parsed by `bash -c` below.
    printf 'trimal -in %q -out %q -automated1 -resoverlap 0.5 -seqoverlap 50\n' \
        "$i" "$out_dir/${j}.trimed.fa" >>trimal.cmds
done
# -d '\n' makes xargs take each line verbatim (no quote/backslash processing),
# so the %q escapes reach bash intact.
xargs -t -P "$THREADS" -d '\n' -I cmd -a trimal.cmds bash -c "cmd"
|
||||||
|
|
@ -0,0 +1,26 @@
|
||||||
|
#! /bin/bash
# Build a FastTree tree for every alignment <aln_dir>/*.<ext>, then run
# TreeShrink over the results.
# Environment: THREADS (parallel FastTree jobs, default 12).
# Layout: <out_dir>/<name>/input.fasta (symlink to the alignment) and
# <out_dir>/<name>/input.tree. The command list (./fasttree.cmds) and
# ./treeshrink.log are written to the current directory.
set -e
THREADS=${THREADS:-12}

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <aln_dir> <out_dir> <ext>"
    echo "Build FastTree phylogenetic trees and perform TreeShrink on trimmed MACSE nucleotide alignments"
    exit 1
fi

aln_dir=$(readlink -f "$1")
out_dir=$(readlink -f "$2")
ext=$3

# fasttree
mkdir -p "$out_dir"
echo -n >fasttree.cmds
for i in "$aln_dir"/*."$ext"; do
    j=$(basename "$i" ."$ext")
    mkdir -p "$out_dir"/"${j}"
    # -f replaces a link left by an earlier run; without it a re-run fails
    # here and `set -e` aborts, although TreeShrink below is told to overwrite.
    # $i is absolute (aln_dir comes from readlink -f), so the link resolves.
    ln -sf "$i" "$out_dir"/"${j}"/input.fasta
    # %q shell-escapes each path so names with spaces or metacharacters
    # survive being re-parsed by `bash -c` below.
    printf 'FastTree -nt -gtr -quiet %q > %q\n' \
        "$out_dir/${j}/input.fasta" "$out_dir/${j}/input.tree" >>fasttree.cmds
done
# -d '\n' makes xargs take each line verbatim (no quote/backslash processing),
# so the %q escapes reach bash intact.
xargs -t -P "$THREADS" -d '\n' -I cmd -a fasttree.cmds bash -c "cmd"
# treeshrink
run_treeshrink.py -f -i "$out_dir"/ -t input.tree -a input.fasta >treeshrink.log
|
||||||
|
|
@ -1,8 +0,0 @@
|
||||||
#! /usr/bin/env bash
|
|
||||||
mkdir -p trimed_nal
|
|
||||||
echo -n > trimal.cmds
|
|
||||||
for i in cds_aln/*.nal ;do
|
|
||||||
j=$(basename "$i")
|
|
||||||
echo "trimal -in $i -out trimed_nal/${j/.nal/.trimed.fa} -automated1 -resoverlap 0.5 -seqoverlap 50" >> trimal.cmds
|
|
||||||
done
|
|
||||||
xargs -t -P 4 -I cmd -a trimal.cmds bash -c "cmd"
|
|
||||||
|
|
@ -1,8 +0,0 @@
|
||||||
#! /usr/bin/env bash
|
|
||||||
mkdir -p fasttree
|
|
||||||
echo -n > fasttree.cmds
|
|
||||||
for i in trimed_nal/*.trimed.fa ;do
|
|
||||||
j=$(basename "$i")
|
|
||||||
echo "FastTree -nt -gtr -quiet $i > fasttree/${j/.trimed.fa/.tree}" >> fasttree.cmds
|
|
||||||
done
|
|
||||||
xargs -t -P 8 -I cmd -a fasttree.cmds bash -c "cmd"
|
|
||||||
|
|
@ -1,11 +0,0 @@
|
||||||
#! /usr/bin/env bash
|
|
||||||
mkdir -p treeshrink
|
|
||||||
for i in trimed_nal/*.trimed.fa; do
|
|
||||||
j=$(basename "$i")
|
|
||||||
mkdir -p treeshrink/"${j/.trimed.fa/}"
|
|
||||||
cd treeshrink/"${j/.trimed.fa/}" || exit 1
|
|
||||||
ln -s ../../fasttree/"${j/.trimed.fa/.tree}" input.tree
|
|
||||||
ln -s ../../"$i" input.fasta
|
|
||||||
cd ../../
|
|
||||||
done
|
|
||||||
run_treeshrink.py -i treeshrink/ -t input.tree -a input.fasta > treeshrink.log
|
|
||||||
|
|
@ -338,6 +338,7 @@ environments:
|
||||||
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/p7zip-16.02-h9c3ff4c_1001.tar.bz2
|
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/p7zip-16.02-h9c3ff4c_1001.tar.bz2
|
||||||
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda
|
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda
|
||||||
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/noarch/pal2nal-14.1-pl5321hdfd78af_3.tar.bz2
|
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/noarch/pal2nal-14.1-pl5321hdfd78af_3.tar.bz2
|
||||||
|
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/linux-64/paml-4.10.9-h7b50bb2_1.conda
|
||||||
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pandas-2.3.3-py311hed34c8f_1.conda
|
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pandas-2.3.3-py311hed34c8f_1.conda
|
||||||
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pandoc-3.8.2.1-ha770c72_0.conda
|
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pandoc-3.8.2.1-ha770c72_0.conda
|
||||||
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pango-1.56.4-hadf4263_0.conda
|
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pango-1.56.4-hadf4263_0.conda
|
||||||
|
|
@ -5125,6 +5126,15 @@ packages:
|
||||||
license_family: GPL
|
license_family: GPL
|
||||||
size: 22693
|
size: 22693
|
||||||
timestamp: 1642293349880
|
timestamp: 1642293349880
|
||||||
|
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/linux-64/paml-4.10.9-h7b50bb2_1.conda
|
||||||
|
sha256: 81c5237b07796984d7915d29a768068a1e5e6c02b4ae25f27836b0babe84e5aa
|
||||||
|
md5: ef02b401c5daaf104b3d1091f8ec37f5
|
||||||
|
depends:
|
||||||
|
- libgcc >=13
|
||||||
|
license: GPL-3.0-or-later
|
||||||
|
license_family: GPL3
|
||||||
|
size: 679359
|
||||||
|
timestamp: 1755706728619
|
||||||
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pandas-2.3.3-py311hed34c8f_1.conda
|
- conda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/linux-64/pandas-2.3.3-py311hed34c8f_1.conda
|
||||||
sha256: c97f796345f5b9756e4404bbb4ee049afd5ea1762be6ee37ce99162cbee3b1d3
|
sha256: c97f796345f5b9756e4404bbb4ee049afd5ea1762be6ee37ce99162cbee3b1d3
|
||||||
md5: 72e3452bf0ff08132e86de0272f2fbb0
|
md5: 72e3452bf0ff08132e86de0272f2fbb0
|
||||||
|
|
|
||||||
|
|
@ -29,7 +29,6 @@ cd-hit = ">=4.8.1,<5"
|
||||||
orthofinder = ">=3.1.0,<4"
|
orthofinder = ">=3.1.0,<4"
|
||||||
pip = ">=25.2,<26"
|
pip = ">=25.2,<26"
|
||||||
seqkit = ">=2.10.1,<3"
|
seqkit = ">=2.10.1,<3"
|
||||||
hmmer = ">=3.4,<4"
|
|
||||||
r-rstan = ">=2.32.7,<3"
|
r-rstan = ">=2.32.7,<3"
|
||||||
biopython = ">=1.85,<2"
|
biopython = ">=1.85,<2"
|
||||||
pal2nal = ">=14.1,<15"
|
pal2nal = ">=14.1,<15"
|
||||||
|
|
@ -54,6 +53,7 @@ r-ggplot2 = ">=4.0.1,<5"
|
||||||
macse = ">=2.7,<3"
|
macse = ">=2.7,<3"
|
||||||
td2 = ">=1.0.6,<2"
|
td2 = ">=1.0.6,<2"
|
||||||
mmseqs2 = ">=18.8cc5c,<19"
|
mmseqs2 = ">=18.8cc5c,<19"
|
||||||
|
paml = ">=4.10.9,<5"
|
||||||
|
|
||||||
[feature.mrbayes.dependencies]
|
[feature.mrbayes.dependencies]
|
||||||
beagle-lib = ">=3.1.2,<4"
|
beagle-lib = ">=3.1.2,<4"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue