#! /usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Get primary CDS sequences from a FASTA file containing multiple CDS per gene.

The primary CDS of a gene is taken to be its longest CDS record. Gene names
are read from the ``[gene=NAME]`` tag in each FASTA description line.
"""
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import re
import argparse
import sys

# Compiled once instead of on every record; matches the "[gene=NAME]" tag.
_GENE_TAG = re.compile(r"\[gene=(\S+)\]")


def get_primary_cds(input_fasta, output_fasta):
    """Write the longest CDS record of every gene to a new FASTA file.

    Each record of ``input_fasta`` is assigned to the gene named in its
    ``[gene=NAME]`` description tag. For every gene the longest sequence is
    kept (the first one wins on ties) and written to ``output_fasta`` with the
    original record id and a description of ``[gene=NAME]``. Genes appear in
    the output in the order they are first seen in the input.

    Records without a gene tag and records with an empty sequence are
    ignored, so a gene whose records are all empty is not written.

    Args:
        input_fasta: Path (or handle) of the FASTA file to read.
        output_fasta: Path (or handle) of the FASTA file to write.

    Raises:
        SystemExit: With exit status 1 if reading, parsing, or writing fails;
            the error is reported on stderr first.
    """
    # Gene name -> longest record seen so far. Keying by gene (rather than
    # comparing with the previous record only) also handles inputs in which
    # the records of one gene are not adjacent; dicts keep insertion order.
    longest_by_gene = {}

    try:
        for record in SeqIO.parse(input_fasta, "fasta"):
            match = _GENE_TAG.search(record.description)
            if match is None:
                # Skip if gene name not found
                continue

            seq_len = len(record.seq)
            if seq_len == 0:
                # An empty sequence can never be a primary CDS.
                continue

            gene_name = match.group(1)
            current = longest_by_gene.get(gene_name)
            # Strict ">" keeps the first record when lengths are equal.
            if current is None or seq_len > len(current.seq):
                longest_by_gene[gene_name] = record

        primary_cds_records = [
            SeqRecord(
                record.seq, id=record.id, description=f"[gene={gene_name}]"
            )
            for gene_name, record in longest_by_gene.items()
        ]
        SeqIO.write(primary_cds_records, output_fasta, "fasta")
    except Exception as e:
        print(
            f"Error processing FASTA file {input_fasta}: {e}", file=sys.stderr
        )
        sys.exit(1)

    # Report the path we were actually given; this function must not depend
    # on a module-level ``args`` that only exists when run as a script.
    print(f"Primary CDS sequences written to {output_fasta}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Extract primary CDS sequences from a FASTA file."
    )
    parser.add_argument(
        "-i",
        "--input_fasta",
        required=True,
        help="Input FASTA file containing CDS sequences.",
    )
    parser.add_argument(
        "-o",
        "--output_fasta",
        required=True,
        help="Output FASTA file to write primary CDS sequences.",
    )
    args = parser.parse_args()

    get_primary_cds(args.input_fasta, args.output_fasta)