biyelunwen/99.scripts/trinity_utils/util/misc/allele_simulator.pl

114 lines
2.1 KiB
Perl

#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
use lib("$FindBin::RealBin/../../PerlLib");
use Fasta_reader;
use List::Util qw(min max);
use Data::Dumper;
my $usage = "usage: $0 transcriptome.fasta polymorphRatePercentage\n\n";
my $transcriptome = $ARGV[0] or die $usage;
my $poly_rate = $ARGV[1] or die $usage;
if ($poly_rate < 1) {
print STDERR "\n\n** WARNING: polymorphRatePercentage expects a percentage, and your input value is quite small: $poly_rate ** \n\n";
}
my %mutations;
main: {
my $fasta_reader = new Fasta_reader($transcriptome);
my $counter = 0;
while (my $seq_obj = $fasta_reader->next()) {
my $acc = $seq_obj->get_accession();
my $sequence = $seq_obj->get_sequence();
my $num_snps = int($poly_rate/100 * length($sequence) + 0.5);
if ($num_snps < 1) { next; } # exclude since not helpful here.
&print_fasta("aleA;$acc", $sequence);
$sequence = &mutate($sequence, $num_snps);
&print_fasta("aleB;$acc", $sequence);
$counter++;
print STDERR "\r[$counter] ";
}
print STDERR "\n\n";
#print STDERR Dumper(\%mutations);
exit(0);
}
####
sub print_fasta {
my ($acc, $sequence) = @_;
$sequence =~ s/(\S{60})/$1\n/g;
chomp $sequence;
print ">$acc\n$sequence\n";
return;
}
####
sub mutate {
my ($sequence, $num_snps) = @_;
my %seen;
my @chars = qw(G A T C);
my @seq = split(//, uc $sequence);
for (1..$num_snps) {
my $pos; # select unique sites (sampling w/o replacement)
do {
$pos = int(rand(length($sequence)));
} while ($seen{$pos});
$seen{$pos} = 1;
my $nuc = uc $seq[$pos];
my @others = grep { $_ ne $nuc } @chars;
my $substitution = $others[ int(rand(scalar(@others))) ];
#print STDERR "$nuc -> $substitution\n";
#$mutations{"$nuc,$substitution"}++;
$seq[$pos] = lc $substitution;
}
$sequence = join("", @seq);
return($sequence);
}