biyelunwen/99.scripts/miscs/rename_trinity_fasta.py

86 lines
2.5 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Trinity FASTA Sequence Renaming Script
Function: Rename sequences in a FASTA file to the format <prefix>@mrna_<sequence_number>
and write a TSV file (<output_file>.tsv) mapping each original name to its new name.
"""
import sys
import os
def rename_fasta_sequences(input_file, prefix, output_file=None):
    """
    Rename sequence headers in a FASTA file.

    Every header line (a line starting with ">") is replaced by
    ">{prefix}@mrna_{n}", where n counts headers from 1 in file order.
    All other lines are copied unchanged. A tab-separated mapping file
    named "{output_file}.tsv" is written as well, with a header row and
    one "original name<TAB>new name" row per sequence. The original name
    is the first whitespace-delimited token of the old header.

    Parameters:
        input_file: Input FASTA filename
        prefix: Prefix for sequence names
        output_file: Output filename (optional, defaults to
            <input base>_renamed<input extension>)

    Exits the process with status 1 (via sys.exit) when the output or TSV
    path is the same file as the input, when the input is missing, or when
    any other error occurs while processing.
    """
    # Set output filename
    if output_file is None:
        file_base, file_ext = os.path.splitext(input_file)
        output_file = f"{file_base}_renamed{file_ext}"
    match_tsv = f"{output_file}.tsv"

    print(f"Input file: {input_file}")
    print(f"Output file: {output_file}")
    print(f"Naming format: {prefix}@mrna_<number>")
    print(f"Match TSV file: {match_tsv}")

    # Opening an output in "w" mode truncates it immediately, so writing
    # onto the input would destroy the data before it is read.
    source_path = os.path.realpath(input_file)
    for target in (output_file, match_tsv):
        if os.path.realpath(target) == source_path:
            print(f"Error: Output path '{target}' is the same file as the input")
            sys.exit(1)

    # Counter for sequences
    seq_count = 0
    try:
        with (
            open(input_file, "r") as fin,
            open(output_file, "w") as fout,
            open(match_tsv, "w") as tsvout,
        ):
            tsvout.write("Original_Name\tNew_Name\n")
            for line in fin:
                if line.startswith(">"):
                    # Sequence header line: rename it
                    seq_count += 1
                    # A bare ">" header has no tokens; record an empty
                    # original name instead of failing on index 0.
                    header_fields = line[1:].split()
                    original_name = header_fields[0] if header_fields else ""
                    # Keep the name free of a newline so each writer adds
                    # exactly one line terminator (no blank TSV rows).
                    new_name = f"{prefix}@mrna_{seq_count}"
                    fout.write(f">{new_name}\n")
                    tsvout.write(f"{original_name}\t{new_name}\n")
                else:
                    # Sequence data line: write as-is
                    fout.write(line)
        print(f"Successfully renamed {seq_count} sequences")
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found")
        sys.exit(1)
    except Exception as e:
        print(f"Error processing file: {e}")
        sys.exit(1)
def main():
    """Command-line entry point: parse arguments, validate the input path, and run the renaming."""
    cli_args = sys.argv[1:]

    # Two positional arguments are mandatory; the third is optional.
    if len(cli_args) < 2:
        print("Usage: python script.py <fasta_file> <prefix> [output_file]")
        print("Example: python script.py sequences.fasta Gene new_sequences.fasta")
        sys.exit(1)

    # Pad with None so a missing output filename unpacks as None;
    # anything past the third argument is ignored.
    input_file, prefix, output_file = (cli_args + [None])[:3]

    # Bail out early when the input path is not an existing regular file
    if os.path.isfile(input_file):
        rename_fasta_sequences(input_file, prefix, output_file)
        return

    print(f"Error: File '{input_file}' does not exist")
    sys.exit(1)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()