#!/usr/bin/perl

use strict;
use warnings;

package Jls::TapCalculate;

use Exporter;
our @ISA = 'Exporter';
our @EXPORT = 'precompute_for_taps taps_for_retrieval_lists descending non_monotonic ascending equal classify_monotonicity';

=head 1

# The retrieval lists are subject to checks before computation.
# 
# The retrieval lists must be all non-increasing or all non-decreasing.
# At least one retrieval list must contain two different values among its scores or E-values.

# The routines in this file are based on the following objects:

# $retrieval_lists = an array reference \@list, 
#     whose elements all have the form of the following hash reference.
#
# $retrieval_list =
# {
#     id, 
#     weight, 
#     number_of_relevant_records_stated,
#     list => \@list, // a list of hashes to the next structure
# };
#
# @list contains
#
# my $hash = 
# {
#     is_relevant, // 0 : irrelevant ; 1 : relevant
#     score, // score or E-value
# };

# Must be called before average_tap_for_retrieval_lists.

# Must be called before average_tap_for_retrieval_lists.
# $EQUAL returns an error, if all lists have fewer than 2 distinct scores.
# $EQUAL returns an error, if all lists have fewer than 2 distinct scores.

sub precompute_for_taps 
# $retrieval_lists_
# \$r_monotonicity_ : (equal ()) ascending (), descending (), or equal ()
#                     Returns as non_monotonic () on internal file problems.
#                     Returns as non_monotonic () on internal file problems.

# Calculates the $tap for a given score (or E-value) threshold.
# Requires that precompute_for_taps ($retrieval_lists_) be called first.
# Returns the $tap hash reference.

sub taps_for_retrieval_lists
# \@retrieval_lists_
# $threshold_ : (if not present, permits all records to enter the tap calculation)
# $unweighted_ : (0) ? Enforce an unweighted calculation ?

# $taps = a hash reference \%hash, 
#
# %hash =
# {
#     threshold,
#     is_weighted, // 0 = unweighted average ; 1 = weighted average
#     average_tap, // Averages the taps in the following array.
#     taps => \@list, // a list of hash references, below
# };
#
# @list contains hash references to the following structure
#
# my $hash = 
# {
#     id, // the retrieval list id
#     normalized_weight, // normalized weight of the corresponding retrieval
#     weight, // weight of the corresponding retrieval
#     tap, // the tap
# };

# Returns the numerical value corresponding to a descending set of retrieval lists.

sub descending

=cut

# Classifies the monotonicity of @scores.
# Returns
#     -1 : descending
#      0 : non-monotonic
#      1 : ascending
#      2 : all numbers equal (or list empty).

my $DESCENDING = -1;
my $NON_MONOTONIC = 0;
my $ASCENDING = 1;
my $EQUAL = 2;

# Returns the numerical value indicating a descending set of retrieval lists.

sub descending
{
    return $DESCENDING;
}

sub non_monotonic
{
    return $NON_MONOTONIC;
}

sub ascending
{
    return $ASCENDING;
}

sub equal
{
    return $EQUAL;
}

sub classify_monotonicity
# @scores_
{
    my @scores = @_;
    my $is_ascend = $EQUAL;
    my $old;

    for (my $i = 0; $i <= $#scores; $i++)
    {
        if ($is_ascend == $EQUAL && $i != 0 && $old != $scores [$i]) 
        {
            $is_ascend = 0.0 < $_ [$i] - $old ? $ASCENDING : $DESCENDING;
        }
        elsif ($i != 0 && $old != $scores [$i]) 
        {
            if ($is_ascend * ($scores [$i] - $old) < 0.0) { return $NON_MONOTONIC; }
        }

        $old = $scores [$i];
    }

    return $is_ascend;
}

# Must be called before average_tap_for_retrieval_lists.
# $EQUAL returns an error, if all lists have fewer than 2 distinct scores.

sub precompute_for_taps 
# $retrieval_lists_
# \$r_monotonicity_ : ($EQUAL) $ASCENDING, $DESCENDING, or $EQUAL
{
    my $retrieval_lists_ = $_ [0];
    my $monotonicity = $EQUAL;
    my $r_monotonicity_ = 2 <= scalar (@_) ? $_ [1] : \$monotonicity;
    
    my $total_weight = 0.0;
    
    foreach my $retrieval_list (@$retrieval_lists_)
    {
        my @scores = ();
        my $number_of_relevant_records_found = 0;

        foreach my $item (@{$retrieval_list->{list}}) 
        { 
            if ($item->{'is_relevant'}) { $number_of_relevant_records_found++; }
            push (@scores, $item->{'score'}); 
        }
        
        $retrieval_list->{'monotonicity'} = classify_monotonicity (@scores);
        $retrieval_list->{'number_of_relevant_records_found'} = $number_of_relevant_records_found;
        $total_weight += $retrieval_list->{'weight'};
    }

    foreach my $retrieval_list (@$retrieval_lists_)
    {
        $retrieval_list->{'normalized_weight'} = $retrieval_list->{'weight'} / $total_weight;
    }

    my $msg = msg_retrieval_lists ($retrieval_lists_, $r_monotonicity_);
    if ($msg) { die $msg; }
    
    return $retrieval_lists_;
}

sub msg_number_of_relevant_records 
# $retrieval_list_
{
    my $retrieval_list = $_ [0];
    
    my $msg = "\nThe truncated retrieval list with id \n";
    $msg .= $retrieval_list->{id};
    $msg .= "\n should have contained fewer than ";
    $msg .= $retrieval_list->{number_of_relevant_records_found};
    $msg .= " records.";
    $msg .= "\nThe complete retrieval list supposedly on contains ";
    $msg .= $retrieval_list->{number_of_relevant_records_stated};
    $msg .= " records.";
    $msg .= "\nDied";

    return $msg;
}

sub msg_non_monotonic 
# $retrieval_list_
{
    my $retrieval_list = $_ [0];
    
    my $msg = "\nThe truncated retrieval list with id \n";
    $msg .= $retrieval_list->{id};
    $msg .= "\n should have been contained its scores or E-values in ascending or descending order.";
    $msg .= "\nDied";

    return $msg;
}

sub msg_different_monotonicities
{
    my $monotonicity_ = $_ [0];

    my $msg = "\nThe scores or E-values in the retrieval lists";
    $msg .= "\nascended in some and descended in others.";
    $msg .= "\nDied";

    return $msg;
}

sub msg_no_monotonicity
{
    my $msg = "\nEvery retrieval list was empty or repeated the same score or E-value,";
    $msg .= "\nin which case the program input should have contained";
    $msg .= "\nwhether the lists are ascending or descending.";
    $msg .= "\nDied";

    return $msg;
}

sub msg_conflict_monotonicity
# $monotonicity_ : $ASCENDING, $DESCENDING
{
    my $monotonicity_ = $_ [0];

    my $msg = "\nThe scores or E-values in the retrieval lists were ";
    $msg .= $monotonicity_ == $ASCENDING ? "a" : "de";
    $msg .= "scending, but ";
    $msg .= "\nthey were supposed to be ";
    $msg .= $monotonicity_ == $ASCENDING ? "de" : "a";
    $msg .= "scending.";
    $msg .= "\nDied";

    return $msg;
}

sub msg_retrieval_lists 
# \@retrieval_lists_
# \$r_monotonicity_ : $ASCENDING, $DESCENDING, or $EQUAL
{
    my $retrieval_lists_ = $_ [0];
    my $r_monotonicity_ = $_ [1];
    my $monotonicity = $EQUAL;

    foreach my $retrieval_list (@$retrieval_lists_)
    {
        if ($retrieval_list->{number_of_relevant_records_stated} <
            $retrieval_list->{number_of_relevant_records_found})
        {
            return msg_number_of_relevant_records ($retrieval_list);
        }
        
        if ($retrieval_list->{monotonicity} == $NON_MONOTONIC)
        {
            $$r_monotonicity_ = $NON_MONOTONIC;
            return msg_non_monotonic ($retrieval_list);
        }
        elsif ($retrieval_list->{monotonicity} == $EQUAL) {}        
        elsif ($monotonicity == $EQUAL) 
        { 
            $monotonicity = $retrieval_list->{monotonicity}; 
        }        
        elsif ($monotonicity != $retrieval_list->{monotonicity}) 
        {
            $$r_monotonicity_ = $NON_MONOTONIC;
            return msg_different_monotonicities ($retrieval_list);
        }
    }

    if ($$r_monotonicity_ == $EQUAL) 
    { 
        if ($monotonicity == $EQUAL) { return msg_no_monotonicity (); }
    }
    elsif ($monotonicity != $EQUAL && $monotonicity != $$r_monotonicity_)
    { 
        return msg_conflict_monotonicity ($monotonicity); 
    }
    else  { $monotonicity = $$r_monotonicity_; }
    
    foreach my $retrieval_list (@$retrieval_lists_)
    {
        $retrieval_list->{monotonicity} = $monotonicity;
    }

    return '';
}

# Are the weights different?

sub is_weighted
# \@taps_
{
    my $taps_ = $_ [0];
    my $old;

    for (my $i = 0; $i < scalar (@$taps_); $i++)
    {
        if ($i != 0) { if ($old != $taps_->[$i]->{'weight'}) { return 1; } } 
        $old = $taps_->[$i]->{'weight'}; 
    }

    return 0;
}

# Calculates the average TAP for a given score (or E-value) threshold.

sub average_tap_for_retrieval_lists
# \@taps_
# $unweighted_ : (0) ? Enforce an unweighted calculation ?
{
    (my $taps_, my $unweighted_) = @_;

    if (! $unweighted_) { $unweighted_ = 0; }
    
    my $weight;
    my $i = 0;
    
    my $average_tap = 0.0;

    foreach my $hash_r (@$taps_)
    {
        if (! $unweighted_) { $weight = $hash_r->{'normalized_weight'}; }
        elsif ($i == 0) 
        { 
            $weight = 1.0 / scalar (@$taps_); 
            $i++;
        }
        $average_tap += $hash_r->{'tap'} * $weight;
    }

    return $average_tap;
}

# Calculates the $tap for a given score (or E-value) threshold.
# Requires that precompute_for_taps ($retrieval_lists_) be called first.
# Returns the $tap hash reference.

sub taps_for_retrieval_lists
# $retrieval_lists_
# $threshold_ : (if not present, permits all records to enter the tap calculation)
#             : 'NaN' pads out every list with trailing 0's, i.e., $pad_0_ = 1
# $unweighted_ : (0) ? Enforce an unweighted calculation ?
{
    (my $retrieval_lists_, my $threshold_, my $unweighted_) = @_;

    if (! $unweighted_) { $unweighted_ = 0; }
    
    my @taps = ();

    foreach my $retrieval_list (@$retrieval_lists_)
    {
        my $total_retrieved = 0;
        my $fp_retrieved = 0;
        my $tp_retrieved = 0;

        my $list_ref = $retrieval_list->{'list'};
        my $tps = $retrieval_list->{'number_of_relevant_records_stated'};

        my $tap = 0.0;

        foreach my $hash_ref (@$list_ref)
        {
            if (defined $threshold_ && $threshold_ ne 'NaN' && 
                0 < $retrieval_list->{monotonicity} * 
                     ($hash_ref->{'score'} - $threshold_)) { last; }

            $total_retrieved++;

            if ($hash_ref->{'is_relevant'}) 
            {
                $tp_retrieved++;
                $tap += $tp_retrieved / $total_retrieved;
            }
            else { $fp_retrieved++; }
        }

        if ($total_retrieved == 0)
        {
            $tap = 0.0;
        }
        elsif (! defined $threshold_ || $threshold_ ne 'NaN')
        {
            $tap += $tp_retrieved / $total_retrieved;
        }

        $tap /= $tps + 1.0;

        my %hash;
        my $hash_ref = \%hash;
        $hash_ref->{'id'} = $retrieval_list->{'id'};
        $hash_ref->{'normalized_weight'} = $retrieval_list->{'normalized_weight'};
        $hash_ref->{'weight'} = $retrieval_list->{'weight'};
        $hash_ref->{'tap'} = $tap;
        push (@taps, $hash_ref);
    }

    my %hash;
    my $taps = \%hash;
    $taps->{'threshold'} = $threshold_;
    $taps->{'taps'} = \@taps;
    $taps->{'is_weighted'} = ! $unweighted_ && is_weighted (\@taps);
    $taps->{'average_tap'} = 
        average_tap_for_retrieval_lists (\@taps, ! $taps->{'is_weighted'});

    return $taps;
}

1;
