biyelunwen/99.scripts/miscs/get_primary_cds.py

#! /usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Get primary CDS sequences from a FASTA file containing multiple CDS per gene.
"""

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import re
import argparse
import sys


def get_primary_cds(input_fasta, output_fasta):
    """Write the longest CDS of every gene in a FASTA file to a new FASTA file.

    The gene of each record is read from a ``[gene=NAME]`` tag in its
    description line; records without such a tag are ignored. For every gene
    the longest non-empty sequence is kept (the first one seen wins on ties)
    and written with its original record id and the description
    ``[gene=NAME]``. Genes are written in order of first appearance, and the
    records of one gene do not need to be adjacent in the input.

    Args:
        input_fasta: Input FASTA, passed unchanged to ``SeqIO.parse``.
        output_fasta: Output FASTA, passed unchanged to ``SeqIO.write``.

    Any exception raised while reading or writing is reported on stderr and
    the process exits with status 1.
    """
    # Compiled once instead of on every record.
    gene_tag = re.compile(r"\[gene=(\S+)\]")
    # gene name -> longest record seen so far. Dict insertion order gives
    # "first appearance" output order and guarantees one entry per gene even
    # when a gene's records are scattered through the file.
    longest_by_gene = {}
    try:
        for record in SeqIO.parse(input_fasta, "fasta"):
            match = gene_tag.search(record.description)
            if match is None:
                # Skip if gene name not found
                continue
            if len(record.seq) == 0:
                # An empty sequence can never be a primary CDS.
                continue
            gene_name = match.group(1)
            best = longest_by_gene.get(gene_name)
            # Strict ">" keeps the earliest record among equally long ones.
            if best is None or len(record.seq) > len(best.seq):
                longest_by_gene[gene_name] = record

        primary_cds_records = [
            SeqRecord(best.seq, id=best.id, description=f"[gene={gene_name}]")
            for gene_name, best in longest_by_gene.items()
        ]
        SeqIO.write(primary_cds_records, output_fasta, "fasta")
        # Use the parameter, not the script-level ``args`` global, so the
        # function also works when imported and called directly.
        print(f"Primary CDS sequences written to {output_fasta}")
    except Exception as e:
        print(f"Error processing FASTA file {input_fasta}: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    # Command-line entry point: parse the input/output paths and run the
    # extraction. Both options are required so that a missing path is
    # reported by argparse as a usage error instead of being passed on as
    # None.
    parser = argparse.ArgumentParser(
        description="Extract primary CDS sequences from a FASTA file."
    )
    parser.add_argument(
        "-i",
        "--input_fasta",
        required=True,
        help="Input FASTA file containing CDS sequences.",
    )
    parser.add_argument(
        "-o",
        "--output_fasta",
        required=True,
        help="Output FASTA file to write primary CDS sequences.",
    )
    args = parser.parse_args()

    get_primary_cds(args.input_fasta, args.output_fasta)