biyelunwen/99.scripts/trinity_utils/PerlLib/BHStats.pm

318 lines
5.3 KiB
Perl

package BHStats;
require Exporter;
our @ISA = qw (Exporter);
our @EXPORT = qw (binomial_probability_sum_k_to_n
binomial_probability_sum_k_to_0
binomial_probability
binomial_coefficient
factorial
stDev
stdErr
median
avg
CorrelationCoeff
geometric_mean
min
max
sum
tukey_biweight
);
use strict;
sub binomial_probability_sum_k_to_n {
my ($n,$k,$p) = @_;
my $sum = 0;
for (my $i = $k; $i <= $n; $i++) {
my $binProb = binomial_probability($n,$i,$p);
$sum += $binProb;
}
return ($sum);
}
sub binomial_probability_sum_k_to_0 {
my ($n,$k,$p) = @_;
my $sum = 0;
for (my $i = $k; $i >= 0; $i--) {
my $binProb = binomial_probability($n,$i,$p);
$sum += $binProb;
}
return ($sum);
}
sub binomial_probability {
my ($n_observations, $k_successes, $p_probability) = @_;
my ($n, $k, $p) = ($n_observations, $k_successes, $p_probability);
### Given B(n,p), find P(X=k)
my $binomial_prob = binomial_coefficient($n,$k) * ($p**$k) * (1-$p)**($n-$k);
return ($binomial_prob);
}
sub binomial_coefficient {
my ($n_things, $k_at_a_time) = @_;
my $number_of_k_arrangements = (factorial($n_things)) / ( factorial($k_at_a_time) * factorial($n_things-$k_at_a_time) );
return ($number_of_k_arrangements);
}
sub factorial {
my $x = shift;
$x = int($x);
my $factorial = 1;
while ($x > 1) {
$factorial *= $x;
$x--;
}
return ($factorial);
}
sub stDev {
# standard deviation calculation
my @nums = @_;
@nums = sort {$a<=>$b} @nums;
my $avg = avg(@nums);
my $count_eles = scalar(@nums);
## sum up the sqr of diff from avg
my $sum_avg_diffs_sqr = 0;
foreach my $num (@nums) {
my $diff = $num - $avg;
my $sqr = $diff**2;
$sum_avg_diffs_sqr += $sqr;
}
my $stdev = sqrt ($sum_avg_diffs_sqr/($count_eles-1));
return ($stdev);
}
####
sub stdErr {
my @vals = @_;
my $stdev = &stDev(@vals);
my $num_vals = scalar(@vals);
my $stdErr = $stdev / sqrt($num_vals);
return($stdErr);
}
sub median {
my @nums = @_;
@nums = sort {$a<=>$b} @nums;
my $count = scalar (@nums);
if ($count %2 == 0) {
## even number:
my $half = $count / 2;
return (avg ($nums[$half-1], $nums[$half]));
}
else {
## odd number. Return middle value
my $middle_index = int($count/2);
return ($nums[$middle_index]);
}
}
sub avg {
my @nums = @_;
my $total = $#nums + 1;
my $sum = 0;
foreach my $num (@nums) {
$sum += $num;
}
my $avg = $sum/$total;
return ($avg);
}
sub CorrelationCoeff {
my ($x_aref, $y_aref) = @_;
my @x = @$x_aref;
my @y = @$y_aref;
my $total = $#x + 1;
my $avg_x = avg(@x);
my $avg_y = avg(@y);
my $stdev_x = stDev(@x);
my $stdev_y = stDev(@y);
# sum part of equation
my $summation = 0;
for (my $i = 0; $i < $total; $i++) {
my $x_val = $x[$i];
my $y_val = $y[$i];
my $x_part = ($x_val - $avg_x)/$stdev_x;
my $y_part = ($y_val - $avg_y)/$stdev_y;
$summation += ($x_part * $y_part);
}
my $cor = (1/($total-1)) * $summation;
return ($cor);
}
####
sub geometric_mean {
my @entries = @_;
my $num_entries = scalar (@entries);
unless ($num_entries) {
return (undef);
}
## All entries must be > 0
my $logsum = 0;
foreach my $entry (@entries) {
unless ($entry > 0) {
return (undef);
}
$logsum += log ($entry);
}
my $geo_mean = exp ( (1/$num_entries) * $logsum);
return ($geo_mean);
}
####
sub min {
my @vals = @_;
@vals = sort {$a<=>$b} @vals;
my $min_val = shift @vals;
return ($min_val);
}
####
sub max {
my @vals = @_;
@vals = sort {$a<=>$b} @vals;
my $max_val = pop @vals;
return ($max_val);
}
####
sub sum {
my @vals = @_;
my $x = 0;
foreach my $val (@vals) {
$x += $val;
}
return ($x);
}
=Rcode
> tukey.biweight
function (x, c = 5, epsilon = 1e-04)
{
m <- median(x)
s <- median(abs(x - m))
u <- (x - m)/(c * s + epsilon)
w <- rep(0, length(x))
i <- abs(u) <= 1
w[i] <- ((1 - u^2)^2)[i]
t.bi <- sum(w * x)/sum(w)
return(t.bi)
}
=cut
####
sub tukey_biweight {
my (@x) = @_;
my $m = median(@x);
my $s;
{
my @y;
foreach my $val (@x) {
my $t = abs($val - $m);
push (@y, $t);
}
$s = median(@y);
}
my @u;
{
my $epsilon = 1e-4;
my $c = 5;
foreach my $val (@x) {
my $t = $val - $m;
$t /= ($c * $s + $epsilon);
push (@u, $t);
}
}
my @i;
{
foreach my $val (@u) {
my $t = (abs($val) <= 1) ? 1:0;
push (@i, $t);
}
}
my @w;
{
foreach my $val (@u) {
my $i = shift @i;
my $t = ( (1 - $val**2) **2) * $i;
push (@w, $t);
}
}
my $bi;
{
my $sum_w = 0;
for (my $i = 0; $i < @w; $i++) {
my $w = $w[$i];
my $x = $x[$i];
$bi += $w * $x;
$sum_w += $w;
}
$bi /= $sum_w;
}
return($bi);
}
1; #EOM