#!/usr/bin/perl -w

#############################################################
# this script takes user submitted runs
# and then generate input for TAP-k computation
# additionally, it will
# a) give 0 to docs without any returned gene ids
# b) find the cut-off score when less than half have k errors
# c) make sure scores are sorted monotonically
#
# input: 
#  *gold-standard gene annotations
#  *submitted runs for GN task
#  *k errors
# 
# output: tap-k scores
#############################################################

use strict;
use POSIX qw(ceil);

if (@ARGV != 3) {
	print "\nabout:\ta wrapper for taking GN submission results to TAP computation\n";
	print "usage:\teval.pl <gold_standard> <submission file> <K errors per query>\n";
	print "pltry:\teval.pl ex.gold ex.txt 5\n";
	print "\tprints results and produces an intermediate file ex.txt.tap.in\n";
	die "\n";
}


########################################
# loading gold standard to harsh G(old) 
########################################
my %G = ();
my %E = ();
my $file = $ARGV[0]; 
open(FILE, $file) or die "couldn't open $file for reading: $!\n";
while(<FILE>) {
	chomp;
	my ($pmcid, $gid) = split(/\t/, $_);
	$G{$pmcid}{$gid}++ if $gid ne '-';
	$E{$pmcid}++ if $gid eq '-';
}
close FILE;

########################################
# loading results to harsh R(esult)
########################################
my %R = ();
my $i = 0;
my $min = 0;
my $max = 0;
$file = $ARGV[1];
open(FILE, $file) or die "couldn't open $file for reading: $!\n";
while(<FILE>) {
	chomp;
	my ($pmcid, $gid, $score) = split(/\t/, $_);
	if (exists $G{$pmcid} || exists $E{$pmcid}) {
		$i++;
		$min = $score if $i == 1;
		$R{$pmcid}{$gid} = $score;
		$min = $score if $min > $score;
		$max = $score if $max < $score;
	}
}
close FILE;

###########################################
# generating input file for tap computation
###########################################

my @a = ();	# number of false positives in each returned doc 

my $out = "$file.tap.in";
open(OUT, ">", $out) or die "couldn't open $out for writing: $!\n";

foreach my $pmcid (keys %G) {
	print OUT "$pmcid\n"; 						
	print OUT scalar keys %{$G{$pmcid}}, "\n";
	
	if (exists $R{$pmcid}) {

		my $fp = 0;								
		
		# get gids sorted by their scores in descending order
		# if all scores are identical, substract 0.001 from the last score
		my @gids = sort {$R{$pmcid}{$b} <=> $R{$pmcid}{$a}} keys %{$R{$pmcid}};
		if (@gids >= 2 && $R{$pmcid}{$gids[0]} == $R{$pmcid}{$gids[-1]}) {
			$R{$pmcid}{$gids[-1]} -= 0.001;
			$min -= 0.001 if $min > $R{$pmcid}{$gids[-1]};
		}
		
		foreach my $gid (@gids) {
			if (exists $G{$pmcid}{$gid}) {
				print OUT "1\t$R{$pmcid}{$gid}\n";
			} else {
				$fp++;
				print OUT "0\t$R{$pmcid}{$gid}\n";
			}
		}

		if (scalar @gids == 1) {
			print OUT "0\t0.0\n";
		}

		push @a, $fp;
		print OUT "\n";
	
	} else {
		print OUT "0\t", $min, "\n";
		print OUT "\n";
	}
		

}


# artificial results for docs with no annotations
foreach my $pmcid (keys %E) {
	print OUT "$pmcid\n";
	print OUT "1\n";
	if (exists $R{$pmcid}) {
		print OUT "0\t", $min, "\n";
		print OUT "0\t", $min-0.001, "\n";
	} else {
		print OUT "1\t", $max, "\n";
		print OUT "0\t", $min-0.001, "\n";
	}
	print OUT "\n";
}

close OUT;

# compute tap k
my $k = $ARGV[2];

# number of lists with more than k errors 
my $n = 0;	
foreach (@a) {
	$n++ if $_ >= $k;
}

my $total = scalar keys %G;
$total   += scalar keys %E;

# if there are fewer than 0.5 of the retrieval lists have k errors
# we use as the threshold the minimal score among submitted results
print "\nGN statistics:\n";
print "Total documents in gold standard: $total\n";
print "Total documents returned by participant: ", scalar keys %R, "\n";
if ($n >= ceil($total/2)) {
	system("./tap.pl -i $out -k $k");
} else {	
	print "Fewer than half of the returned docs have $k errors\n";
	system("./tap.pl -i $out -t $min");
}
