biyelunwen/99.scripts/trinity_utils/PerlLib/EM.pm

package EM;

use strict;
use warnings;
use SAM_entry;
use Carp;

use vars qw($TOKEN);
BEGIN {
	$TOKEN = "$;";
}

my $MIN_EFF_TRANS_LENGTH = 10; # // large enough to keep abundance estimates from going absurdly high for short transcripts.

####
sub  new {
	my $packagename = shift;
	my ($transcript_seqs_href, $fragment_length) = @_;

    unless ($fragment_length) {
        $fragment_length = 1; # negligible with respect to the length of the transcript
    }


	# transcript_seqs_href:  ( seq_acc => sequence, ...)
	# trans_reads_aref: ( [trans_accA, read_accA], [trans_accB, read_accB], ...)

	unless ($transcript_seqs_href) {
		confess "Error, need param: transcript_seqs_href";
	}


	my $self = {

		# mapping info
		_trans_to_multi_map_counts => {}, # trans => { transA\$;transB\$;transC... } => count_of_reads

		# model params
		_ENt => {}, # trans => expected read count
		_theta => {}, # trans => fraction of all reads

		# transcript info
		_trans_lengths => {}, # trans => length
        _frag_length => $fragment_length,


		# misc
		_total_reads => 0, # total read count

	};


	bless ($self, $packagename);

	$self->_init_trans_lengths($transcript_seqs_href);

	return($self);

}


####
sub _init_trans_lengths {
	my $self = shift;
	my ($transcript_seqs_href) = @_;

	foreach my $transcript (keys %$transcript_seqs_href) {

		my $sequence = $transcript_seqs_href->{$transcript} or confess "Error, no sequence for transcript: $transcript";

		$self->{_trans_lengths}->{$transcript} = length($sequence);

	}

	return;
}

sub run {
	my $self = shift;
	my (%settings) = @_;

	my $max_iterations = $settings{max_iterations} || 1000;
	my $min_delta_ML = $settings{min_delta_ML} || 0.01;
    my $verbose_flag = $settings{verbose} || 0;


	$self->_init_theta();

	#########################
	## Do the EM:

	my $prev_ML_val = undef;

	my $round = 0;
	my $delta = 1;

	print STDERR "\tEM started\n";
	while (1) {

		$round++;

		$self->_E_step();

		$self->_M_step();


		my $ml = $self->_compute_Max_Likelihood();

		if (defined $prev_ML_val) {
			$delta = $ml - $prev_ML_val;
		}

		print STDERR "\rML[$round]: $ml       $delta       ";

		$prev_ML_val = $ml;

        if ($verbose_flag) {
            print "\n\nEM_results, round: $round:\n";
            $self->report_results();
            print "\n";
        }

		if ($round >= $max_iterations || $delta  < $min_delta_ML) {

			last;
		}

	}
	print STDERR "\n\tEM finished.\n";

	return;
}


####
sub _init_theta {
	my $self = shift;

	###########################
	## init theta:
	##   assume mutliply mapped reads equally divided among their mapped locations for init

	my @transcripts = $self->_get_all_transcripts();

	my $total_reads = $self->{_total_reads};

	foreach my $transcript (@transcripts) {

		my $read_count_sum = 0;

		my @combos = $self->_get_multi_map_list_including_transcript($transcript);

		foreach my $combo (@combos) {

			my @other_trans = $self->_get_transcripts_in_combo($combo);

			my $num_other = scalar(@other_trans);
			my $count = $self->_get_multi_map_read_count_for_combo($transcript, $combo);

			$read_count_sum += ($count/$num_other);
		}

		$self->{_theta}->{$transcript} = $read_count_sum / $total_reads;

	}

	return;
}

####
sub _get_all_transcripts {
	my $self = shift;

	my @transcripts = keys %{$self->{_trans_to_multi_map_counts}};

	return(@transcripts);
}


####
sub _get_reads_mapped_to_transcript {
	my $self = shift;
	my ($transcript) = @_;

	if (exists $self->{_trans_to_reads_href}->{$transcript}) {
		my @reads = @{$self->{_trans_to_reads_href}->{$transcript}};
		return(@reads);
	}
	else {
		confess "Error, no reads mapped to transcript $transcript";
	}
}

####
sub _get_transcript_length {
	my $self = shift;
	my ($transcript) = @_;

	return($self->{_trans_lengths}->{$transcript});
}


####
sub _E_step {
	my $self = shift;

	my @transcripts = $self->_get_all_transcripts();

	## do the E
	foreach my $transcript (@transcripts) {

		my $expected_read_count = 0;


		my $trans_length = $self->_get_transcript_length($transcript) or die "Error, no length for transcript: $transcript";
        my $eff_trans_length = $trans_length - $self->{_frag_length} + 1;
        if ($eff_trans_length < $MIN_EFF_TRANS_LENGTH) {
            $eff_trans_length = $MIN_EFF_TRANS_LENGTH;
        }


		my $theta_t = $self->{_theta}->{$transcript};
		if (! defined $theta_t) {
			die "Error, no theta($transcript), $theta_t";
		}


		my @combos = $self->_get_multi_map_list_including_transcript($transcript);

		#print "Combos: @combos\n";


	    foreach my $combo (@combos) {

			my $combo_count = $self->_get_multi_map_read_count_for_combo($transcript, $combo);

			#print "Got combo: $combo, count: $combo_count\n";

			my @other_transcripts = split(/$TOKEN/, $combo);
			if (scalar @other_transcripts == 1) {
				# only current transcript:
				unless ($other_transcripts[0] eq $transcript) {
					confess "Error, mapped single transcript should be the current transcript";
				}
				$expected_read_count += $combo_count;
			}
			else {
				## compute partial mapping.


				my $numerator = (1 / $eff_trans_length) * $theta_t;
                #my $numerator = $theta_t;


				# demonator: sum for all t: P(r|t) * theta(t)
				my $denominator = 0;


				#print STDERR "Other transcripts: @other_transcripts\n";

				foreach my $other_transcript (@other_transcripts) {  # other transcripts includes the current transcript as well.

					my $other_trans_len = $self->_get_transcript_length($other_transcript);
					unless ($other_trans_len) {
						die "Error, no trans length for $other_transcript";
					}

                    my $eff_other_trans_length = $other_trans_len - $self->{_frag_length} + 1;
                    if ($eff_other_trans_length < $MIN_EFF_TRANS_LENGTH) {
                        $eff_other_trans_length = $MIN_EFF_TRANS_LENGTH;
                    }

					my $other_theta = $self->{_theta}->{$other_transcript};

					if (! defined $other_theta) {
						die "no theta for $other_transcript, $other_theta";
					}

					my $val = (1 / $eff_other_trans_length) * $other_theta;
                    #my $val = $other_theta;

					$denominator += $val;
				}

				#print "Fractional read count for $transcript = $numerator / $denominator\n";

				my $fractional_read_count = $combo_count * ($numerator/$denominator);


				$expected_read_count += $fractional_read_count;
			}
		}

		$self->{_ENt}->{$transcript} = $expected_read_count;
	}

	return;
}


####
sub _M_step {
	my $self = shift;

	my $sum_NT = 0;

	foreach my $E (values %{$self->{_ENt}}) {
		$sum_NT += $E;
	}

	## theta = ENt(t) / sum( for all t: ENt(t) )


	foreach my $transcript ($self->_get_all_transcripts()) {
		my $new_theta = $self->{_ENt}->{$transcript} / $sum_NT;

		$self->{_theta}->{$transcript} = $new_theta;
	}

	return;
}


####
sub _compute_Max_Likelihood {
	my $self = shift;

	# ML = sum:t (ENt * log(theta(t)))

	my $ML = 0;

	foreach my $transcript ($self->_get_all_transcripts()) {

		my $theta = $self->{_theta}->{$transcript};

		if ($theta > 0) {

			my $ENt = $self->{_ENt}->{$transcript};

			$ML += $ENt * log($theta);

		}
	}

	return($ML);
}


####
sub get_results {
	my $self = shift;

	my $total_reads = $self->{_total_reads};

    my @results;

	foreach my $transcript (sort $self->_get_all_transcripts()) {

		my $num_frags = $self->{_ENt}->{$transcript};
		my $len = $self->_get_transcript_length($transcript);

		$num_frags = sprintf("%.2f", $num_frags);

		my $fpkm = $num_frags / ($len/1e3) / ($total_reads/1e6);
		$fpkm = sprintf("%.2f", $fpkm);

		my ($num_unique_reads, $num_multi_map_reads) = $self->count_reads_mapped_to_transcript($transcript);

        my $struct = { trans_id => $transcript,
                       length => $len,
                       unique_map => $num_unique_reads,
                       multi_map => $num_multi_map_reads,
                       expected_map => $num_frags,
                       FPKM => $fpkm,
        };

        push (@results, $struct);


	}


	return (@results);
}


sub print_results {
    my $self = shift;

    print $self->report_results();
    return;
}


sub report_results {
    my ($self) = shift;

    my @results = $self->get_results();

    my $out_text = join("\t", "#transcript", "trans_length", "unique_map", "multi_map", "EM_frag_count", "FPKM") . "\n";

    foreach my $result (@results) {

        $out_text .= join("\t",
                          $result->{trans_id},
                          $result->{length},
                          $result->{unique_map},
                          $result->{multi_map},
                          $result->{expected_map},
                          $result->{FPKM}) . "\n";
    }

    $out_text .= "\n"; # last spacer

    return ($out_text);

}


####
sub _get_multi_map_list_including_transcript {
	my $self = shift;
	my ($transcript) = @_;

	my @combos = keys %{$self->{_trans_to_multi_map_counts}->{$transcript}};
	return(@combos);
}

####
sub _get_multi_map_read_count_for_combo {
	my $self = shift;
	my ($transcript, $combo) = @_;

	my $val = $self->{_trans_to_multi_map_counts}->{$transcript}->{$combo};

	return($val);
}

####
sub _get_transcripts_in_combo {
	my $self = shift;
	my ($combo) = @_;

	my @transcripts = split(/$TOKEN/, $combo);
	return(@transcripts);
}


####
sub _create_combo {
	my $self = shift;
	my @transcripts = @_;

	my $combo = join($TOKEN, sort @transcripts);

	return($combo);
}

####
sub add_read { # really add_fragment: count pairs only once.
	my $self = shift;
	my @transcripts = @_;

	my $combo = $self->_create_combo(@transcripts);

	foreach my $transcript (@transcripts) {
		$self->{_trans_to_multi_map_counts}->{$transcript}->{$combo}++;
	}

	$self->{_total_reads}++;

	return;
}

####
sub count_reads_mapped_to_transcript {
	my $self = shift;
	my ($transcript) = @_;

	my $unique_read_count = 0;
	my $multi_map_count = 0;


	my @combos = $self->_get_multi_map_list_including_transcript($transcript);

	foreach my $combo (@combos) {

		my $combo_count = $self->_get_multi_map_read_count_for_combo($transcript, $combo);


		my @other_transcripts = $self->_get_transcripts_in_combo($combo);

		if (scalar @other_transcripts == 1) {
			# only current transcript:
			$unique_read_count += $combo_count;
		}
		else {
			$multi_map_count += $combo_count;
		}
	}

	return($unique_read_count, $multi_map_count);
}

1; #EOM