#!/usr/bin/env python3
|
|
"""
|
|
CDS to Protein Converter with Internal Stop Codon Filtering
|
|
|
|
This script processes CDS sequences from a FASTA file, translates them to protein sequences,
|
|
checks for internal stop codons, and outputs clean CDS and protein sequences.
|
|
"""
|
|
|
|
import sys
|
|
from Bio import SeqIO
|
|
from Bio.SeqRecord import SeqRecord
|
|
|
|
|
|
def translate_cds_and_filter(
    input_fasta, output_clean_cds, output_clean_protein, translation_table=1
):
    """
    Translate each CDS in a FASTA file and keep only those without internal stops.

    A record is dropped (and counted as removed) when its length is not a
    multiple of 3, when its translation contains a '*' anywhere other than
    the final position, or when processing it raises an exception. For every
    retained record the original CDS record is kept unchanged and a protein
    record is built with the same id and description, with a trailing '*'
    stripped if present.

    The two output files are written only when at least one record is
    retained; otherwise a warning is printed and nothing is written.

    Parameters:
        input_fasta: Path to input CDS sequences FASTA file
        output_clean_cds: Path for output clean CDS sequences
        output_clean_protein: Path for output protein sequences
        translation_table: Genetic code table number (default: 1 = Standard)

    Returns:
        Tuple of (clean_cds_count, removed_count)
    """
    kept_cds = []  # CDS records that passed every check
    kept_proteins = []  # Translations matching kept_cds, index for index
    n_seen = 0  # Records read from the input
    n_dropped = 0  # Records rejected for any reason

    print(f"Processing file: {input_fasta}")

    for entry in SeqIO.parse(input_fasta, "fasta"):
        n_seen += 1
        nucleotides = entry.seq
        name = entry.id

        # A CDS must consist of whole codons.
        if len(nucleotides) % 3:
            print(f"Warning: Sequence {name} length is not multiple of 3, skipping.")
            n_dropped += 1
            continue

        try:
            # Translate the full length so stop codons show up as '*'.
            protein = nucleotides.translate(table=translation_table, to_stop=False)
            residues = str(protein)

            # Any '*' before the last residue is an internal stop.
            if "*" in residues[:-1]:
                print(
                    f"Warning: Removing sequence {name}: Internal stop codon detected"
                )
                n_dropped += 1
                continue

            # Drop the terminal stop, if there is one, from the protein output.
            trimmed = protein[:-1] if residues.endswith("*") else protein

            protein_entry = SeqRecord(
                seq=trimmed, id=name, description=entry.description
            )

            kept_cds.append(entry)
            kept_proteins.append(protein_entry)

        except Exception as err:
            print(f"Error processing sequence {name}: {err}")
            n_dropped += 1
            continue

    # Nothing survived: report it and leave the output paths untouched.
    if not kept_cds:
        print("Warning: No sequences passed filtering. Please check input file format.")
        return 0, n_dropped

    SeqIO.write(kept_cds, output_clean_cds, "fasta")
    SeqIO.write(kept_proteins, output_clean_protein, "fasta")

    print("\nProcessing completed successfully!")
    print(f"Total input sequences: {n_seen}")
    print(f"Sequences retained: {len(kept_cds)}")
    print(f"Sequences removed: {n_dropped}")
    print(f"Clean CDS sequences saved to: {output_clean_cds}")
    print(f"Protein sequences saved to: {output_clean_protein}")

    return len(kept_cds), n_dropped
|
|
|
|
|
|
def main():
    """
    Main command-line interface function.

    Expects exactly two arguments in sys.argv: the input CDS FASTA path and
    an output stem. Results are written to "<output_stem>.cds.fa" and
    "<output_stem>.pep.fa" by translate_cds_and_filter, using its default
    translation table.

    Usage text and error messages are written to stderr so they do not mix
    with the normal progress output on stdout. The process exits with
    status 1 when the argument count is wrong, when the input file cannot be
    opened, or when processing raises an exception.
    """
    if len(sys.argv) != 3:
        print(
            "Usage: python cds_to_protein_filter.py input.fasta output_stem",
            file=sys.stderr,
        )
        print("Arguments:", file=sys.stderr)
        print(" input.fasta Input CDS sequences FASTA file", file=sys.stderr)
        print(" output_stem Stem for output files ", file=sys.stderr)
        sys.exit(1)

    input_file = sys.argv[1]
    output_stem = sys.argv[2]
    output_cds_file = f"{output_stem}.cds.fa"
    output_protein_file = f"{output_stem}.pep.fa"

    # Open the input once up front so a bad path gives a specific message
    # instead of the generic fatal-error handler below.
    try:
        with open(input_file, "r"):
            pass
    except FileNotFoundError:
        print(f"Error: Input file {input_file} not found!", file=sys.stderr)
        sys.exit(1)
    except IOError as e:
        print(f"Error reading input file {input_file}: {e}", file=sys.stderr)
        sys.exit(1)

    # Top-level boundary: report any failure and turn it into a non-zero exit.
    try:
        translate_cds_and_filter(input_file, output_cds_file, output_protein_file)
    except Exception as e:
        print(f"Fatal error during processing: {e}", file=sys.stderr)
        sys.exit(1)
|
|
|
|
|
|
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|