biyelunwen/99.scripts/miscs/get_primary_cds.py

75 lines
2.4 KiB
Python
Executable File

#! /usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Get primary CDS sequences from a FASTA file containing multiple CDS per gene.
"""
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import re
import argparse
import sys
def get_primary_cds(input_fasta, output_fasta):
    """Write the longest CDS of each gene in ``input_fasta`` to ``output_fasta``.

    Records are grouped by the gene name found in a ``[gene=NAME]`` tag of the
    FASTA description. Grouping is by *consecutive* records: a new group
    starts whenever the gene name differs from the previous tagged record, so
    the input is expected to list all CDS of a gene next to each other.

    For every group the longest sequence is kept (the first one wins on
    ties) and written with its original record id and a description reduced
    to ``[gene=NAME]``. Records without a ``[gene=...]`` tag are skipped, and
    a group whose longest sequence is empty is not written.

    Args:
        input_fasta: Path or handle of the input FASTA, passed to
            ``SeqIO.parse``.
        output_fasta: Path or handle of the output FASTA, passed to
            ``SeqIO.write``.

    Raises:
        SystemExit: With status 1 if reading, processing or writing fails;
            the error message is printed to stderr.
    """
    gene_pattern = re.compile(r"\[gene=(\S+)\]")
    primary_cds_records = []
    current_gene = None
    longest = None  # longest record seen so far for current_gene

    def save_longest():
        # Emit the finished group, unless there is none yet or it is empty.
        if longest is not None and len(longest.seq) > 0:
            primary_cds_records.append(
                SeqRecord(
                    longest.seq,
                    id=longest.id,
                    description=f"[gene={current_gene}]",
                )
            )

    try:
        for record in SeqIO.parse(input_fasta, "fasta"):
            match = gene_pattern.search(record.description)
            if match is None:
                # Skip if gene name not found
                continue
            gene_name = match.group(1)
            if gene_name != current_gene:
                # New gene encountered: store the previous gene's winner.
                save_longest()
                current_gene = gene_name
                longest = record
            elif len(record.seq) > len(longest.seq):
                # Same gene; strict comparison keeps the first of equal lengths.
                longest = record
        # After the loop, the last gene is still pending.
        save_longest()
        SeqIO.write(primary_cds_records, output_fasta, "fasta")
    except Exception as e:
        print(f"Error processing FASTA file {input_fasta}: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        # Use the parameter, not a module-level ``args``, so the function
        # also works when imported and called from other code.
        print(f"Primary CDS sequences written to {output_fasta}")
if __name__ == "__main__":
    # Command-line entry point. Both paths are mandatory: marking them as
    # required makes argparse print a usage error when one is missing instead
    # of passing None on to the FASTA reader/writer.
    parser = argparse.ArgumentParser(
        description="Extract primary CDS sequences from a FASTA file."
    )
    parser.add_argument(
        "-i",
        "--input_fasta",
        required=True,
        help="Input FASTA file containing CDS sequences.",
    )
    parser.add_argument(
        "-o",
        "--output_fasta",
        required=True,
        help="Output FASTA file to write primary CDS sequences.",
    )
    args = parser.parse_args()
    get_primary_cds(args.input_fasta, args.output_fasta)