biyelunwen/99.scripts/miscs/check_and_translate_outgrou...

145 lines
5.2 KiB
Python
Executable File

#!/usr/bin/env python3
"""
CDS to Protein Converter with Internal Stop Codon Filtering
This script processes CDS sequences from a FASTA file, translates them to protein sequences,
checks for internal stop codons, and outputs clean CDS and protein sequences.
"""
import sys
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
def translate_cds_and_filter(
    input_fasta, output_clean_cds, output_clean_protein, translation_table=1
):
    """Translate each CDS record and keep only those without internal stops.

    Every record parsed from ``input_fasta`` is dropped (and counted as
    removed) when any of the following holds:

    * its length is not a multiple of three,
    * its translation contains ``*`` anywhere except the last position,
    * translating it or building the protein record raises an exception.

    Retained CDS records are written unchanged; the matching protein records
    have a trailing ``*`` stripped.  When no record is retained, neither
    output file is written.

    Parameters:
        input_fasta: Path to the input CDS FASTA file.
        output_clean_cds: Path for the retained CDS sequences (FASTA).
        output_clean_protein: Path for the translated proteins (FASTA).
        translation_table: Genetic code passed as ``table=`` to
            ``Seq.translate`` (default 1, the standard code).

    Returns:
        Tuple ``(retained_count, removed_count)``.
    """
    kept_cds = []
    kept_proteins = []
    n_dropped = 0
    n_seen = 0

    print(f"Processing file: {input_fasta}")

    for n_seen, entry in enumerate(SeqIO.parse(input_fasta, "fasta"), start=1):
        nucleotides = entry.seq
        name = entry.id

        # A CDS that is not a whole number of codons cannot be translated cleanly.
        if len(nucleotides) % 3:
            print(f"Warning: Sequence {name} length is not multiple of 3, skipping.")
            n_dropped += 1
            continue

        try:
            # to_stop=False keeps '*' in the product so stops can be located.
            peptide = nucleotides.translate(table=translation_table, to_stop=False)
            peptide_text = str(peptide)

            # Any '*' before the final residue is an internal stop codon.
            if "*" in peptide_text[:-1]:
                print(
                    f"Warning: Removing sequence {name}: Internal stop codon detected"
                )
                n_dropped += 1
                continue

            # Only a terminal stop may remain at this point; strip it if present.
            trimmed = peptide[:-1] if peptide_text.endswith("*") else peptide
            protein_entry = SeqRecord(
                seq=trimmed, id=name, description=entry.description
            )
            kept_cds.append(entry)
            kept_proteins.append(protein_entry)
        except Exception as err:
            # Best effort per record: report the failure and move on.
            print(f"Error processing sequence {name}: {err}")
            n_dropped += 1

    if not kept_cds:
        print("Warning: No sequences passed filtering. Please check input file format.")
        return 0, n_dropped

    SeqIO.write(kept_cds, output_clean_cds, "fasta")
    SeqIO.write(kept_proteins, output_clean_protein, "fasta")

    report = (
        "\nProcessing completed successfully!",
        f"Total input sequences: {n_seen}",
        f"Sequences retained: {len(kept_cds)}",
        f"Sequences removed: {n_dropped}",
        f"Clean CDS sequences saved to: {output_clean_cds}",
        f"Protein sequences saved to: {output_clean_protein}",
    )
    for line in report:
        print(line)
    return len(kept_cds), n_dropped
def main():
    """Command-line entry point: ``<script> input.fasta output_stem``.

    Filters and translates the CDS FASTA named by the first argument, writing
    the retained CDS to ``<output_stem>.cds.fa`` and the proteins to
    ``<output_stem>.pep.fa``.

    Usage text and error messages are written to stderr so they are not mixed
    with the progress report that processing prints on stdout.  The process
    exits with status 1 on a wrong argument count, an unreadable input file,
    or any exception raised during processing.
    """
    if len(sys.argv) != 3:
        print(
            "Usage: python cds_to_protein_filter.py input.fasta output_stem",
            file=sys.stderr,
        )
        print("Arguments:", file=sys.stderr)
        print(" input.fasta Input CDS sequences FASTA file", file=sys.stderr)
        print(" output_stem Stem for output files ", file=sys.stderr)
        sys.exit(1)

    input_file = sys.argv[1]
    output_stem = sys.argv[2]
    output_cds_file = f"{output_stem}.cds.fa"
    output_protein_file = f"{output_stem}.pep.fa"

    # Verify the input can be opened before starting, so the user gets a
    # specific message instead of the generic fatal-error report below.
    try:
        with open(input_file, "r"):
            pass
    except FileNotFoundError:
        print(f"Error: Input file {input_file} not found!", file=sys.stderr)
        sys.exit(1)
    except OSError as e:
        print(f"Error reading input file {input_file}: {e}", file=sys.stderr)
        sys.exit(1)

    # Top-level boundary: report any failure and exit with a non-zero status.
    try:
        translate_cds_and_filter(input_file, output_cds_file, output_protein_file)
    except Exception as e:
        print(f"Fatal error during processing: {e}", file=sys.stderr)
        sys.exit(1)
# Run the command-line interface only when this file is executed as a script,
# so importing the module does not trigger argument parsing or exit.
if __name__ == "__main__":
    main()