#!/usr/bin/env python3
"""
CDS to Protein Converter with Internal Stop Codon Filtering

This script processes CDS sequences from a FASTA file, translates them to
protein sequences, checks for internal stop codons, and outputs clean CDS
and protein sequences.
"""

import sys

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord


def _has_internal_stop(protein_str):
    """Return True if a stop symbol ('*') occurs before the final residue."""
    # Dropping the last residue means a terminal stop is never counted;
    # an empty or single-residue string yields an empty slice -> False.
    return "*" in protein_str[:-1]


def translate_cds_and_filter(
    input_fasta, output_clean_cds, output_clean_protein, translation_table=1
):
    """
    Translate CDS sequences and keep only those without internal stop codons.

    A record is removed (and counted in ``removed_count``) when:
      * its length is not a multiple of 3,
      * translating it raises an exception, or
      * its translation contains a stop symbol anywhere but the last position.

    For retained records the CDS is written unchanged, while a terminal stop
    symbol (if present) is stripped from the protein sequence. Progress,
    per-record warnings and the final summary are printed to stdout.

    If no record passes filtering, neither output file is written (any files
    already present at those paths are left untouched) and the summary is
    replaced by a single warning.

    Parameters:
        input_fasta: Path to input CDS sequences FASTA file
        output_clean_cds: Path for output clean CDS sequences
        output_clean_protein: Path for output protein sequences
        translation_table: Genetic code table number (default: 1 = Standard)

    Returns:
        Tuple of (clean_cds_count, removed_count)
    """
    clean_cds_records = []  # CDS records without internal stop codons
    clean_protein_records = []  # Matching protein records, same order
    removed_count = 0
    total_count = 0

    print(f"Processing file: {input_fasta}")

    for record in SeqIO.parse(input_fasta, "fasta"):
        total_count += 1
        seq_id = record.id
        cds_seq = record.seq

        # A CDS must consist of whole codons to be translated.
        if len(cds_seq) % 3 != 0:
            print(f"Warning: Sequence {seq_id} length is not multiple of 3, skipping.")
            removed_count += 1
            continue

        # Only the translation itself is guarded. The catch is deliberately
        # broad: one untranslatable record is reported and dropped instead of
        # aborting the whole run.
        try:
            # to_stop=False keeps every '*' so internal stops can be detected.
            protein_seq = cds_seq.translate(table=translation_table, to_stop=False)
            protein_str = str(protein_seq)
        except Exception as e:
            print(f"Error processing sequence {seq_id}: {e}")
            removed_count += 1
            continue

        if _has_internal_stop(protein_str):
            print(f"Warning: Removing sequence {seq_id}: Internal stop codon detected")
            removed_count += 1
            continue

        # Strip the terminal stop symbol (if any) from the protein only.
        if protein_str.endswith("*"):
            protein_seq_clean = protein_seq[:-1]
        else:
            protein_seq_clean = protein_seq

        clean_cds_records.append(record)
        clean_protein_records.append(
            SeqRecord(
                seq=protein_seq_clean, id=seq_id, description=record.description
            )
        )

    if not clean_cds_records:
        print("Warning: No sequences passed filtering. Please check input file format.")
        return 0, removed_count

    SeqIO.write(clean_cds_records, output_clean_cds, "fasta")
    SeqIO.write(clean_protein_records, output_clean_protein, "fasta")

    print("\nProcessing completed successfully!")
    print(f"Total input sequences: {total_count}")
    print(f"Sequences retained: {len(clean_cds_records)}")
    print(f"Sequences removed: {removed_count}")
    print(f"Clean CDS sequences saved to: {output_clean_cds}")
    print(f"Protein sequences saved to: {output_clean_protein}")

    return len(clean_cds_records), removed_count


def main():
    """
    Command-line entry point.

    Expects exactly two arguments: the input FASTA path and an output stem.
    Output paths are ``<stem>.cds.fa`` and ``<stem>.pep.fa``. Usage and fatal
    error messages go to stderr and the process exits with status 1.
    """
    if len(sys.argv) != 3:
        print(
            "Usage: python cds_to_protein_filter.py input.fasta output_stem",
            file=sys.stderr,
        )
        print("Arguments:", file=sys.stderr)
        print(" input.fasta Input CDS sequences FASTA file", file=sys.stderr)
        print(" output_stem Stem for output files ", file=sys.stderr)
        sys.exit(1)

    input_file = sys.argv[1]
    output_stem = sys.argv[2]

    output_cds_file = f"{output_stem}.cds.fa"
    output_protein_file = f"{output_stem}.pep.fa"

    # Probe the input up front so a missing/unreadable file gets a specific
    # message rather than the generic fatal-error handler below.
    try:
        with open(input_file, "r"):
            pass
    except FileNotFoundError:
        print(f"Error: Input file {input_file} not found!", file=sys.stderr)
        sys.exit(1)
    except IOError as e:
        print(f"Error reading input file {input_file}: {e}", file=sys.stderr)
        sys.exit(1)

    # Top-level boundary: report any unexpected failure and exit non-zero.
    try:
        translate_cds_and_filter(input_file, output_cds_file, output_protein_file)
    except Exception as e:
        print(f"Fatal error during processing: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()