75 lines
2.4 KiB
Python
Executable File
75 lines
2.4 KiB
Python
Executable File
#! /usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
Get primary CDS sequences from a FASTA file containing multiple CDS per gene.
|
|
"""
|
|
|
|
from Bio import SeqIO
|
|
from Bio.SeqRecord import SeqRecord
|
|
import re
|
|
import argparse
|
|
import sys
|
|
|
|
|
|
def get_primary_cds(input_fasta, output_fasta):
    """Write the longest CDS of each gene in ``input_fasta`` to ``output_fasta``.

    Records are read in file order. The gene name is taken from a
    ``[gene=NAME]`` tag in the record description; records without such a
    tag are skipped. For each run of consecutive records sharing a gene
    name, the longest sequence is kept (the first one wins on ties) and
    written with the original record id and a description reduced to
    ``[gene=NAME]``.

    Notes:
        * Records of the same gene must be adjacent in the input. A gene
          that reappears after a different gene starts a new run and is
          written again.
        * A run whose longest sequence is empty (length 0) is not written.

    Args:
        input_fasta: FASTA input, passed as-is to ``SeqIO.parse``.
        output_fasta: FASTA output, passed as-is to ``SeqIO.write``.

    Raises:
        SystemExit: with status 1 if any ``Exception`` occurs while reading,
            selecting or writing records (the error is printed first).
    """
    # Compiled once; the pattern is applied to every record description.
    gene_tag = re.compile(r"\[gene=(\S+)\]")

    primary_cds_records = []
    # State of the gene run currently being scanned.
    gene = ""
    length = 0
    seq = None
    seq_id = None  # not named ``id`` to avoid shadowing the builtin

    try:
        for record in SeqIO.parse(input_fasta, "fasta"):
            match = gene_tag.search(record.description)
            if match is None:
                # Skip if gene name not found
                continue
            gene_name = match.group(1)
            seq_len = len(record.seq)

            if gene_name != gene:
                # New gene encountered: flush the best record of the previous
                # run. ``length > 0`` is false before the first tagged record
                # and for runs that only contained empty sequences.
                if length > 0:
                    primary_cds_records.append(
                        SeqRecord(seq, id=seq_id, description=f"[gene={gene}]")
                    )
                gene = gene_name
                seq = record.seq
                seq_id = record.id
                length = seq_len
            elif seq_len > length:
                # Same gene, strictly longer sequence: it becomes the candidate.
                seq = record.seq
                seq_id = record.id
                length = seq_len

        # After the loop, save the last gene run.
        if gene and length > 0:
            primary_cds_records.append(
                SeqRecord(seq, id=seq_id, description=f"[gene={gene}]")
            )

        SeqIO.write(primary_cds_records, output_fasta, "fasta")
        # Report the parameter, not the CLI-only global ``args``, so the
        # function also works when imported and called from other code.
        print(f"Primary CDS sequences written to {output_fasta}")
    except Exception as e:
        print(f"Error processing FASTA file {input_fasta}: {e}")
        sys.exit(1)
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the input/output paths and run the
    # extraction. Both options are required so that a missing path is
    # reported by argparse as a usage error instead of reaching the FASTA
    # parser as ``None``.
    parser = argparse.ArgumentParser(
        description="Extract primary CDS sequences from a FASTA file."
    )
    parser.add_argument(
        "-i",
        "--input_fasta",
        required=True,
        help="Input FASTA file containing CDS sequences.",
    )
    parser.add_argument(
        "-o",
        "--output_fasta",
        required=True,
        help="Output FASTA file to write primary CDS sequences.",
    )
    args = parser.parse_args()

    get_primary_cds(args.input_fasta, args.output_fasta)
|