#!/usr/bin/perl

use strict;
use warnings;

package Jls::Wilcoxon;

=head1 NAME

Jls::Wilcoxon - Perl module for calculating p-values of the Wilcoxon statistic.

=head1 SYNOPSIS

use Jls::Wilcoxon;

$w=Wilcoxon->new (\@a_, \@b_);

# Initializes the Wilcoxon statistic.

sub new
# \@a_ // a reference to an array of real numbers
# \@b_ // a reference to an array of real numbers

$h = Jls::Wilcoxon::string_to_hash_of_array_references ($string_)
$w = new Jls::Wilcoxon ($h->{a}, $h->{b});

# Converts a string to 2 array references in a hash.

sub string_to_hash_of_array_references 
# $string_ // Contains 2 arrays as strings on 2 lines.

$w->get_statistic ();

# Returns the statistic.

sub get_statistic

$w->get_pvalue ();

# Returns the pvalue.

sub get_pvalue

# Returns the p-value of the Wilcoxon statistic.
#
# Tests one-sidedly whether @b_ is significantly larger than @a_.
#
# The p-value is the probability over all permutations that 
#     the sum of @b_'s ranks is larger than the actual values. 

=head1 DESCRIPTION

This Perl module calculates p-values (5 significant digits) of the Wilcoxon statistic for comparing the ranks of two samples, which are supposed independent under the null hypothesis. 

For more information, see:

S. Siegel & N.J. Castellan
Nonparametric statistics for the behavioral sciences
(1988) McGraw-Hill, p.128

=head1 BUGS 

This version has been released after testing against examples in Siegel and Castellan. Nevertheless, if you find any bugs or oddities, please do inform the author. 

=head1 INSTALLATION 

See perlmodinstall for information and options on installing Perl modules. 

=head1 AVAILABILITY 

The latest version of this module is available from John L. Spouge.

=head1 AUTHOR

John L. Spouge <spouge@ncbi.nlm.nih.gov>

=head1 COPYRIGHT 

Copyright 2010 John L. Spouge. All rights reserved. 

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. 

=head1 SEE ALSO

Statistics::Distribution (Copyright 2003 Michael Kospach. All rights reserved.), perl(1).

=cut

$Jls::Wilcoxon::_k_normal = 10; # Calculates the normal p-value for k > _k_normal.

# Converts a file to 2 arrays.

sub string_to_hash_of_array_references 
# $string_ // Contains 2 arrays as strings on 2 lines.
{
    my $string_ = shift;

    my @data = split ('\n', $string_);
    my @reduced = ();
    
    foreach my $s (@data)
    {
        $s =~ s/^\s+|\s+$//;
        if ($s =~ m/\S/) { push (@reduced, $s); }
    }
    
    if (scalar (@reduced) != 2)
    {
        my $msg = 'The string should contain exactly 2 non-blank lines.';
        $msg .= '\nDied';
        
        die $msg;
    }

    my @a = split (/\s+/, $reduced [0]);
    Jls::Wilcoxon::check_float (\@a);

    my @b = split (/\s+/, $reduced [1]);
    Jls::Wilcoxon::check_float (\@b);

    my $h = 
    {
        a => \@a,
        b => \@b,
    };
            
    return $h;
}

sub check_float
# \@a_ // a reference to an array of real numbers
{
    use Jls::File;
    my $r_a = $_ [0];

    foreach my $v (@$r_a) 
    {    
        package Jls::File;

        if (! is_float ($v)) 
        {
            my $msg; 
    
            $msg = 'The following entry was not a real number :\n';
            $msg .= $v;
            $msg .= '\nDied';

            die $msg;
        }
    }
}

sub new
# \@a_ // a reference to an array of real numbers
# \@b_ // a reference to an array of real numbers
{
    my $class = shift;

    my $a_ = $_ [0];
    my $b_ = $_ [1];

    my $total = scalar (@$a_) + scalar (@$b_);
    my @array = ();
    
    my $i = 0;
    
    for ($i = 0; $i < $total; $i++)
    {
        my %hash = ();

        $hash {'type'}  = $i < scalar (@$a_) ? 0 :  1;
        $hash {'value'} = $i < scalar (@$a_) ? $a_->[$i] : 
                                               $b_->[$i - scalar (@$a_)];
        push (@array, \%hash);
    }
    
    my @sorted = sort { $a->{'value'} <=> $b->{'value'} } @array;
    
    my $begin = 0;
    
    for ($i = 0; $i <= $total; $i++)
    {
        if ($i == 0) {}
        elsif ($i == $total || 
               $sorted [$i]->{'value'} != $sorted [$i - 1]->{'value'})
        {
            my $rank = ($i * ($i + 1) - $begin * ($begin + 1));
            $rank /= 2.0 * ($i - $begin);
            
            for (my $j = $begin; $j < $i; $j++)
            {
                $sorted [$j]->{'rank'} = $rank;
            }
            
            $begin = $i;
        }
    }
    
    my $self = 
    {
        _a => $a_,
        _b => $b_,
        _sorted => \@sorted,
    };

    return bless $self, $class;
}

# Returns n, the total number of numbers in sets 0 and 1.

sub _get_n
{
    return scalar (@{$_ [0]->{'_sorted'}});
}

# Returns k, the total number of numbers in sets 0 and 1.

sub get_k
{
    return scalar (@{$_ [0]->get_array (0)});
}

# Returns a reference to array 0 or 1 of numbers (unsorted)

sub get_array
# $i_ // 0 or 1
{
    my $i_ = $_ [1];
    
    if ($i_ == 0) { return $_ [0]->{'_a'}; }
    elsif ($i_ == 1) { return $_ [0]->{'_b'}; }
    else { kill "Wilcoxon->get_array : argument $i_ is impossible." } 
}

# Returns a reference to an array of hashes sorted on {'value'}.
# my $h
# {
#     type => 0 or 1;
#     value => value of corresponding element;
#     rank => adjusted for ties;
# }

sub get_sorted
{
    return $_ [0]->{'_sorted'};
}

# Returns the Wilcoxon statistic, the sum of ranks, with ties accounted.

sub get_statistic
{
    my $sum = 0.0;
    
    for (my $i = 0; $i < $_ [0]->_get_n (); $i++)
    {
        if ($_ [0]->get_sorted ()->[$i]->{'type'} == 0) 
        { 
            $sum += $_ [0]->get_sorted ()->[$i]->{'rank'}; 
        }
    }
    
    return $sum;
}

# Returns an array reference to the number in each group of ties.
# The array is empty, if there are no ties.

sub _get_number_of_ties
{
    my $a = $_ [0]->get_sorted ();
    my $n = $_ [0]->_get_n ();

    my @ties = ();
    
    my $begin = 0;
    
    for (my $i = 0; $i <= $n; $i++)
    {
        if ($i == 0) {}
        elsif ($i == $n || 
               $a->[$i]->{'value'} != $a->[$i - 1]->{'value'})
        {
            if ($i - $begin != 1) { push (@ties, $i - $begin); }
            
            $begin = $i;
        }
    }

    return \@ties;
}

# Returns the permutation probability.

sub get_permutation_probability
{
    use Jls::Combination;
    
    my $sorted = $_ [0]->get_sorted ();
    my $n = $_ [0]->_get_n ();
    my $k = $_ [0]->get_k ();

    ($n > 0) || die 'Wilcoxon::get_permutation_quantile : ! ($n > 0)';
    ($k > 0) || die 'Wilcoxon::get_permutation_quantile : ! ($k > 0)';
    
    my $c = new Jls::Combination ($n, $k);

    my $s0 = $_ [0]->get_statistic ();
    my $count = 0.0;
    
    do
    {
        my $s = 0.0;
        
        for (my $i = 0; $i < $k; $i++)
        {
            $s += $sorted->[$c->get_array ()->[$i]]->{'rank'}; 
        }

        if ($s == $s0) { $count++; } 

    } while ($c->increment ());
    
    return $count / Jls::Combination::binomial ($n, $k);
}

# Returns the pvalue by permutation.

sub _get_permutation_pvalue
{
    use Jls::Combination;
    
    my $sorted = $_ [0]->get_sorted ();
    my $n = $_ [0]->_get_n ();
    my $k = $_ [0]->get_k ();

    ($n > 0) || die 'Wilcoxon::_get_permutation_pvaluee : ! ($n > 0)';
    ($k > 0) || die 'Wilcoxon::_get_permutation_pvalue : ! ($k > 0)';
    
    my $c = new Jls::Combination ($n, $k);

    my $s0 = $_ [0]->get_statistic ();
    my $count = 0.0;
    
    do
    {
        my $s = 0.0;
        
        for (my $i = 0; $i < $k; $i++)
        {
            $s += $sorted->[$c->get_array ()->[$i]]->{'rank'}; 
        }

        if ($s <= $s0) { $count++; } 

    } while ($c->increment ());
    
    return $count / Jls::Combination::binomial ($n, $k);
}

# Returns a normal p-value by normal approximation.

sub _normal_pvalue
# delta_ // 0.5 (quantile) or -0.5 (p-value)
{
    use lib '..';
    use Statistics::Distributions;
    
    my $n = $_ [0]->_get_n ();
    my $k = $_ [0]->get_k ();
    my $delta_ = $_ [1];
    
    # ($n > 1) : Otherwise, variance is 0.0.
    ($n > 1) || die 'Jls::Wilcoxon::_normal_quantile : ! ($n > 1)';
    ($k > 0) || die 'Jls::Wilcoxon::_normal_quantile : ! ($k > 0)';
    (abs ($delta_) == 0.5) || 
        die 'Jls::Wilcoxon::_normal_quantile : ! (abs ($delta) == 0.5)';
    
    my $mean = $k * ($n + 1.0) / 2.0; 
    my $var = $n * ($n * $n - 1.0);
    
    my $ties = $_ [0]->_get_number_of_ties ();

    for (my $i = 0; $i < scalar (@$ties); $i++)
    {
        my $t = $ties->[$i];
        $var -= $t * ($t * $t - 1.0)
    }
    
    $var *= $k * ($n - $k);
    $var /= 12.0 * $n * ($n - 1);
    
    ($var > 0.0) || die 'Jls::Wilcoxon::_get_normal_pvalue : ! ($var > 0.0)';
    
    my $s0 = $_ [0]->get_statistic ();
    
    # Adds Sheppard's continuity correction.
    
    $s0 += $delta_; 

    # Calculates the z-score, where small is the direction of deviation.
    
    my $z = ($s0 - $mean) / sqrt ($var);

    return Statistics::Distributions::uprob (-$z); # p-value of -$z
}

# Returns the probability by normal approximation.

sub _get_normal_probability
{
    return $_ [0]->_normal_pvalue (0.5) - $_ [0]->_normal_pvalue (-0.5);
}

# Returns the pvalue by normal approximation.

sub _get_normal_pvalue
{
    return $_ [0]->_normal_pvalue (0.5);
}

# Returns the p-value of the statistic.
# The p-value is the permutation probability that 
#     a sum of ranks is smaller than @a_'s.

sub _get_probability
{
    my $k0 = $_ [0]->get_k ();
    my $k1 = $_ [0]->_get_n () - $k0;

    if ($k0 <= $Jls::Wilcoxon::_k_normal && $k1 <= $Jls::Wilcoxon::_k_normal) 
    { 
        return $_ [0]->get_permutation_probability (); 
    }

    return $_ [0]->_get_normal_probability ();
}

# Returns the pvalue.

sub get_pvalue
{
    my $k0 = $_ [0]->get_k ();
    my $k1 = $_ [0]->_get_n () - $k0;

    if ($k0 <= $Jls::Wilcoxon::_k_normal && $k1 <= $Jls::Wilcoxon::_k_normal) 
    { 
        return $_ [0]->_get_permutation_pvalue (); 
    }

    return $_ [0]->_get_normal_pvalue ();
}

1;
