189 lines
4.1 KiB
Perl
189 lines
4.1 KiB
Perl
#!/usr/bin/env perl
|
|
|
|
use strict;
|
|
use warnings;
|
|
use Carp;
|
|
use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
|
|
use lib "$ENV{TRINITY_HOME}/PerlLib";
|
|
use Fasta_reader;
|
|
use List::Util qw (shuffle);
|
|
#use Math::Random;
|
|
|
|
|
|
my $usage = <<__EOUSAGE__;
|
|
|
|
##################################################################
|
|
#
|
|
# --fasta <string> fasta filename
|
|
#
|
|
# --subst_rate <float> rate of single base substitutions
|
|
#
|
|
# --insert_rate <float> rate of insertion
|
|
#
|
|
# --insert_size <int> default: 1
|
|
#
|
|
# --delete_rate <float> rate for deletions
|
|
#
|
|
# --delete_size <int> default: 1
|
|
#
|
|
# * note, all rates must be 0 <= x <= 0.25
|
|
#
|
|
###################################################################
|
|
|
|
|
|
__EOUSAGE__
|
|
|
|
;
|
|
|
|
my $fasta_file;
|
|
my $subst_rate = 0;
|
|
my $insert_rate = 0;
|
|
my $insert_size = 1;
|
|
my $delete_rate = 0;
|
|
my $delete_size = 1;
|
|
|
|
|
|
&GetOptions (
|
|
'fasta=s' => \$fasta_file,
|
|
'subst_rate=f' => \$subst_rate,
|
|
|
|
'insert_rate=f' => \$insert_rate,
|
|
'insert_size=i' => \$insert_size,
|
|
|
|
'delete_rate=f' => \$delete_rate,
|
|
'delete_size=i' => \$delete_size,
|
|
|
|
);
|
|
|
|
|
|
unless ($fasta_file) {
|
|
die $usage;
|
|
}
|
|
unless ($subst_rate || $insert_rate || $delete_rate) {
|
|
die $usage;
|
|
}
|
|
|
|
|
|
foreach my $info_aref ( ['subst_rate', $subst_rate ],
|
|
['insert_rate', $insert_rate ],
|
|
['delete_rate', $delete_rate ] ) {
|
|
|
|
my ($rate_type, $val) = @$info_aref;
|
|
if ($val > 0.25) {
|
|
die "Error, --$rate_type $val exceeds max val of 0.25 ";
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
main: {
|
|
|
|
my $fasta_reader = new Fasta_reader($fasta_file);
|
|
|
|
my %fasta_seqs = $fasta_reader->retrieve_all_seqs_hash();
|
|
|
|
foreach my $acc (keys %fasta_seqs) {
|
|
|
|
|
|
my $seq = $fasta_seqs{$acc};
|
|
|
|
my @seqarray = &convert_to_seqarray($seq);
|
|
|
|
my %seen;
|
|
|
|
|
|
if ($subst_rate) {
|
|
@seqarray = &mutate_seq(\@seqarray, \%seen, $subst_rate, 'substitution');
|
|
}
|
|
if ($insert_rate) {
|
|
@seqarray = &mutate_seq(\@seqarray, \%seen, $insert_rate, 'insertion');
|
|
}
|
|
if ($delete_rate) {
|
|
@seqarray = &mutate_seq(\@seqarray, \%seen, $delete_rate, 'deletion');
|
|
}
|
|
|
|
my $mutated_seq = join("", @seqarray);
|
|
print ">$acc mutated\n$mutated_seq\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
exit(0);
|
|
|
|
}
|
|
|
|
####
|
|
sub convert_to_seqarray {
|
|
my ($seq) = @_;
|
|
|
|
my @chars = split(//, uc $seq);
|
|
|
|
return(@chars);
|
|
}
|
|
|
|
####
|
|
sub mutate_seq {
|
|
my ($seqarray_aref, $seen_href, $mut_rate, $mut_type) = @_;
|
|
|
|
my $seqlen = $#$seqarray_aref + 1;
|
|
|
|
my $num_mutations = int($mut_rate * $seqlen + 0.5);
|
|
|
|
for (1..$num_mutations) {
|
|
|
|
my $pos = -1;
|
|
do {
|
|
$pos = int(rand($seqlen));
|
|
|
|
} while ($seen_href->{$pos});
|
|
|
|
$seen_href->{$pos} = 1;
|
|
|
|
## Substitutions
|
|
if ($mut_type eq 'substitution') {
|
|
|
|
my $char = $seqarray_aref->[$pos];
|
|
|
|
my @mut_chars = grep { $_ ne $char } qw(G A T C);
|
|
|
|
my $mut_base = $mut_chars[ int(rand(3)) ];
|
|
|
|
$seqarray_aref->[$pos] = lc $mut_base;
|
|
}
|
|
|
|
## Deletions
|
|
elsif ($mut_type eq 'deletion') {
|
|
for (my $i = $pos; $i < $pos + $delete_size && $i < $seqlen; $i++) {
|
|
|
|
$seqarray_aref->[$i] = "";
|
|
$seen_href->{$i} = 1;
|
|
}
|
|
}
|
|
|
|
## Insertions
|
|
elsif ($mut_type eq 'insertion') {
|
|
|
|
## create insertion
|
|
my $insertion_seq = "";
|
|
my @bases = qw(G A T C);
|
|
for (my $i = 0; $i < $insert_size; $i++) {
|
|
my $base = $bases[int(rand(4))];
|
|
$insertion_seq .= $base;
|
|
}
|
|
$seqarray_aref->[$pos] .= lc $insertion_seq;
|
|
}
|
|
else {
|
|
confess "Error, do not understand mutation type: $mut_type ";
|
|
}
|
|
|
|
}
|
|
|
|
|
|
return (@$seqarray_aref);
|
|
|
|
}
|
|
|
|
|