#!/usr/bin/env perl

use strict;
use warnings;
use Getopt::Long;
use File::Find;
use Cwd qw(realpath);
use Data::Dump qw(dump);
use Statistics::Descriptive;
use Cwd qw(realpath);

my $el = "\n";

# get the total number of arguments.
my $totalArguments = @ARGV;

# usage summary; shown when no arguments are given or the options cannot
# be parsed.
my $usage = "-d scoreDirectory -o overallScoresFile -i individualScoresFile\n";

# dump the help message if no arguments, and exit.
if ($totalArguments == 0)
{
  print $usage;
  exit 0;
}

# default directory containing the score files; override with -d.
my $scoreDirectory = '/g01/conundrum/corpora/evaluations/wikipedia_fa/unstemmed_stemmed/scores';

# default output files; override with -o (overall) and -i (individual).
my $overallScoresFile = '/tmp/scores.txt';
my $individualScoresFile = '/tmp/iscores.txt';

# parse the command line exactly once.  GetOptions removes the options it
# recognizes from @ARGV, so a second call would never see them.
my $result = GetOptions ("d=s" => \$scoreDirectory, "o=s" => \$overallScoresFile, "i=s" => \$individualScoresFile);
unless ($result)
{
  die "invalid command line options.\nusage: $usage";
}

# sample row (dataset,technique,stemmed,language,size,type,method,score,file):
#wikipedia_fa,human,stemmed,it,400,f-measure,rouge-2,0.04807,/.ccs/g01/conundrum/corpora/evaluations/wikipedia_fa/stemmed/scores/sumbody/400/it-400.txt

# get the directory containing the score files.  realpath returns undef
# when the path cannot be resolved, so fail with a clear message instead
# of handing an undefined value to find.
my $resolvedScoreDirectory = realpath ($scoreDirectory);
unless (defined ($resolvedScoreDirectory) && (-e $resolvedScoreDirectory))
{
  die "score directory '$scoreDirectory' does not exist.\n";
}
$scoreDirectory = $resolvedScoreDirectory;

# records collected by processDuc02ScoreFile while find walks the tree:
# aggregate (average) scores and per-document scores respectively.
my @scoreInfo = ();
my @singleDocumentScores = ();
find (\&processDuc02ScoreFile, $scoreDirectory);

# Order the aggregate scores by dataset, technique, language, stemming and
# finally summary size.  Missing text fields compare as the empty string
# (no "uninitialized" warnings), and the size is compared numerically so
# that, for example, 200 sorts before 1000.
my $textField = sub
{
  my ($record, $field) = @_;
  return defined ($record->{$field}) ? $record->{$field} : '';
};
my $sizeField = sub
{
  my $size = $_[0]->{size};
  # sizes are stored as non-negative integers; anything else sorts as 0.
  return (defined ($size) && ($size =~ /^\d+$/)) ? $size : 0;
};
@scoreInfo = sort
  {
    ($textField->($a, 'dataset') cmp $textField->($b, 'dataset')) ||
    ($textField->($a, 'technique') cmp $textField->($b, 'technique')) ||
    ($textField->($a, 'language') cmp $textField->($b, 'language')) ||
    ($textField->($a, 'stemmed') cmp $textField->($b, 'stemmed')) ||
    ($sizeField->($a) <=> $sizeField->($b))
  }
@scoreInfo;

# Writes the given score records to $outputFile, one comma separated line
# per record, with the columns in the order listed in $columns.
#
#   $outputFile - path of the file to create (written as utf8).
#   $columns    - array reference of hash keys, in output order.
#   $records    - array reference of score hash references.
#
# Each record is normalized in place before it is written: techniques
# containing 'human' or 'classy' are collapsed to that word, and a size of
# 0 (or a missing size) becomes 'full'.  Missing or undefined columns are
# written as empty fields.  Dies if the file cannot be opened, written or
# closed.
sub writeScoreRecords
{
  my ($outputFile, $columns, $records) = @_;

  my $fh;
  unless (open ($fh, '>:utf8', $outputFile))
  {
    die "could not open file '$outputFile' for writing: $!\n";
  }

  foreach my $record (@$records)
  {
    my $technique = $record->{technique};
    if (defined ($technique))
    {
      if ($technique =~ /human/) { $record->{technique} = 'human'; }
      elsif ($technique =~ /classy/) { $record->{technique} = 'classy'; }
    }

    # 'full' is checked first so an already normalized record does not
    # trigger a non-numeric comparison warning.
    my $size = $record->{size};
    if (!defined ($size) || ($size eq 'full') || ($size == 0)) { $record->{size} = 'full'; }

    my @values = map { defined ($record->{$_}) ? $record->{$_} : '' } @$columns;
    unless (print $fh join (',', @values) . $el)
    {
      die "could not write to file '$outputFile': $!\n";
    }
  }

  # buffered write errors only surface when the handle is closed.
  unless (close ($fh))
  {
    die "could not close file '$outputFile': $!\n";
  }

  return;
}

# reduce the jackknifed flag of every aggregate score to 0 or 1.
foreach my $score (@scoreInfo)
{
  $score->{jackknifed} = $score->{jackknifed} ? 1 : 0;
}

# write the aggregate (average) scores.
my @order = qw(dataset technique stemmed language size type method score);
writeScoreRecords ($overallScoresFile, \@order, \@scoreInfo);

# write the per-document scores.
@order = qw(dataset technique stemmed language size method docId recall precision fmeasure);
writeScoreRecords ($individualScoresFile, \@order, \@singleDocumentScores);


#computeStats (\@scoreInfo);


# Prints descriptive statistics (count, mean, standard deviation, median,
# max and min) of the rouge-2 precision scores of the human techniques,
# first for the unstemmed scores and then for the stemmed ones.
#
#   $_[0] - array reference of score hash references; the keys read are
#           technique, stemmed, method, type and score.
#
# Returns the result of the last print.
sub computeStats
{
  my ($scoreInfo) = @_;

  # one accumulator per stemming variant.
  my %precisionFor =
  (
    unstemmed => Statistics::Descriptive::Full->new(),
    stemmed => Statistics::Descriptive::Full->new(),
  );

  foreach my $entry (@$scoreInfo)
  {
    foreach my $variant ('stemmed', 'unstemmed')
    {
      next unless $entry->{technique} =~ /human/i;
      next unless $entry->{stemmed} eq $variant;
      next unless $entry->{method} eq 'rouge-2';
      next unless $entry->{type} eq 'precision';
      $precisionFor{$variant}->add_data ($entry->{score});
    }
  }

  # report each statistic by calling the accumulator method of that name.
  my $printed;
  foreach my $variant ('unstemmed', 'stemmed')
  {
    my $stats = $precisionFor{$variant};
    foreach my $statistic (qw(count mean standard_deviation median max min))
    {
      $printed = print "human $variant $statistic: ", $stats->$statistic(), $el;
    }
  }

  return $printed;
}

# File::Find callback: parses one score file and appends its records to
# the file-level @scoreInfo and @singleDocumentScores arrays.
#
# Reads $_ (the file name within the directory find has changed into) and
# $File::Find::name (the complete path).  Only regular files ending in
# '.txt' whose complete path contains 'scores' are processed; everything
# else is skipped.  A file that cannot be opened is reported with a
# warning and skipped.  Returns nothing.
sub processDuc02ScoreFile
{
  my $file = $_;

  return unless $File::Find::name =~ /scores/i;

  # if not ending in .txt return now.
  return unless $file =~ /\.txt$/;

  # if not a file return now.
  return unless -f $file;

  my $fh;
  unless (open ($fh, "<", $file))
  {
    warn "could not open file '$File::Find::name' for reading: $!\n";
    return;
  }

  # resolve the path once, relative to the directory find has changed into,
  # so the result does not depend on the start directory being absolute.
  my $resolvedPath = realpath ($file);
  $resolvedPath = $File::Find::name unless defined $resolvedPath;

  # the information derived from the path is shared by every line of the
  # file; each line gets its own copy to extend.
  my $baseEvalInfo = getPathInfo ($resolvedPath);
  $baseEvalInfo->{file} = $resolvedPath;
  while (defined (my $line = <$fh>))
  {
    chomp $line;
    my $evalInfo = getScoreInfo ($line, {%$baseEvalInfo});
    push @scoreInfo, $evalInfo if exists $evalInfo->{score};
    push @singleDocumentScores, $evalInfo if $evalInfo->{singleDocumentScore};
  }

  close $fh;
  return;
}


# Derives evaluation attributes from the path of a score file.
#
#   $_[0] - path of the score file; it is lower-cased before matching.
#   $_[1] - optional hash reference to fill in; a new hash is used when
#           it is not given.
#
# Returns the hash reference with dataset and stemmed always set, and
# technique, language, size and jackknifed set when the path reveals them.
sub getPathInfo
{
  my $path = lc $_[0];
  my $info = exists ($_[1]) ? $_[1] : {};

  # dataset: wikipedia_fa unless the path names duc02 (which also implies
  # english); an explicit wikipedia_fa in the path wins over duc02.
  $info->{dataset} = 'wikipedia_fa';
  if ($path =~ /duc02/)
  {
    $info->{dataset} = 'duc02';
    $info->{language} = 'en';
  }
  $info->{dataset} = 'wikipedia_fa' if ($path =~ /wikipedia_fa/);

  # technique: every marker found in the path is applied in this order, so
  # the last matching marker wins.
  my @techniqueMarkers =
  (
    ['classy', 'classy'],
    ['sumbody', 'lead'],
    ['marcu', 'marcu'],
    ['rouge1', 'rouge1'],
    ['rouge2', 'rouge2'],
    ['lead', 'lead'],
  );
  foreach my $marker (@techniqueMarkers)
  {
    my ($needle, $technique) = @$marker;
    $info->{technique} = $technique if (index ($path, $needle) >= 0);
  }

  # stemming: 'unstemmed' contains 'stemmed', so it has to be ruled out
  # first; paths mentioning neither count as unstemmed.
  my $isStemmed = ($path !~ /unstemmed/) && ($path =~ /stemmed/);
  $info->{stemmed} = $isStemmed ? 'stemmed' : 'unstemmed';

  # a trailing number in the file name is the size.
  if ($path =~ /[\-_](\d+)\.txt$/)
  {
    $info->{size} = $1 + 0;
  }

  if ($path =~ /\-j\-/)
  {
    $info->{jackknifed} = 1;
  }

# wikipedia_fa,no-peer-id,stemmed,id,400,precision,rouge-su4,0.06141,
# /g01/conundrum/corpora/evaluations/oracles/stemmed/lead/scores/400/tr_400.txt
  if (my ($technique, $size, $language) = $path =~ m|/([a-z]+)/scores/([124]00)/([a-z][a-z])_[124]00\.txt$|)
  {
    $info->{technique} = $technique;
    $info->{size} = $size;
    $info->{language} = $language;
  }

  # file names of the form <prefix>-<size>.txt
  if (my ($prefix, $size) = $path =~ m|/([a-z\-_]+)\-([124]00)\.txt$|)
  {
    $info->{size} = $size;
    $info->{language} = $prefix;
  }

  # file names of the form <prefix>-full.txt; size 0 stands for 'full'.
  if (my ($prefix) = $path =~ m|/([a-z\-_]+)\-full\.txt$|)
  {
    $info->{size} = 0;
    $info->{language} = $prefix;
  }

  # file names starting with a two letter code, optionally followed by
  # dashes, underscores and digits (this also covers a bare xx.txt).
  if ($path =~ m|/([a-z][a-z])[\-\_\d]*\.txt$|)
  {
    $info->{language} = $1;
  }

  return $info;
}


# Parses one line of a score file.
#
#   $_[0] - the line; it is lower-cased before matching.
#   $_[1] - optional hash reference to fill in (typically a copy of the
#           path-derived information); a new hash is used when not given.
#
# Two kinds of lines are recognized:
#   - single document scores, ending in ' r:<n> p:<n> f:<n>', which set
#     singleDocumentScore to 1 plus recall, precision, fmeasure, stemmed,
#     docId and method;
#   - averages, '<name> rouge-(1|2|su4) average_<x>: <n>', which set name,
#     method, type and score, and refine technique, stemmed, size,
#     jackknifed and language from the name.
# Any other line only sets singleDocumentScore to 0.  Returns the hash
# reference.
sub getScoreInfo
{
  my $line = lc $_[0];
  my $info = exists ($_[1]) ? $_[1] : {};
  $info->{singleDocumentScore} = 0;

  # line is a single document score.
  # lead-stemmed-en ROUGE-1 Eval 0A1AB42FAB68B7E8ED4E6B12BEEDDB2A.lead-stemmed-en R:0.38000 P:0.38000 F:0.38000
  if (my ($recall, $precision, $fmeasure) = $line =~ / r:([\d\.]+) p:([\d\.]+) f:([\d\.]+)\s*$/i)
  {
    $info->{singleDocumentScore} = 1;

    my @fields = split (/\s+/, $line);

    # 'unstemmed' contains 'stemmed', so it has to be ruled out first;
    # lines mentioning neither count as unstemmed.
    my $isStemmed = ($line !~ m/unstemmed/) && ($line =~ m/stemmed/);

    # the document id is the 32 digit hex string in the fourth field; it
    # stays undefined when there is none.
    my ($docId) = $fields[3] =~ m/([a-f0-9]{32})/;

    $info->{recall} = $recall + 0;
    $info->{precision} = $precision + 0;
    $info->{fmeasure} = $fmeasure + 0;
    $info->{stemmed} = $isStemmed ? 'stemmed' : 'unstemmed';
    $info->{docId} = $docId;
    $info->{method} = $fields[1];
    return $info;
  }

  # anything that is not an average line carries no further information.
  my ($name, $variant, $typeCode, $score) =
    $line =~ /^(.*?)\s+rouge\-(1|2|su4)\s+average_(.):\s+([\.\d\+\-]+)/;
  return $info unless defined $name;

  my %typeNameFor = ('r' => 'recall', 'p' => 'precision', 'f' => 'f-measure');

  $info->{name} = $name;
  $info->{method} = 'rouge-' . $variant;
  $info->{type} = exists ($typeNameFor{$typeCode}) ? $typeNameFor{$typeCode} : $typeCode;
  $info->{score} = $score + 0;

  # technique hints in the name override whatever was already recorded,
  # except for peers which only fill in a missing technique.
  if ($name =~ /\-clust/)
  {
    $info->{technique} = 'classy-clustering';
  }
  elsif ($name =~ /classy/)
  {
    $info->{technique} = 'classy';
  }
  elsif (!exists ($info->{technique}) && ($name =~ /(peer\-.)/))
  {
    $info->{technique} = $1;
  }

  if ($name =~ /\-unstem/)
  {
    $info->{stemmed} = 'unstemmed';
  }
  elsif ($name =~ /\-stem/)
  {
    $info->{stemmed} = 'stemmed';
  }

  # make sure the size is numeric, then refine the record from the name.
  $info->{size} += 0;
  if ($name =~ /\-j\-/)
  {
    $info->{jackknifed} = 1;
  }
  if ($name =~ /\-(\d+)\-/)
  {
    $info->{size} = $1 + 0;
  }
  if (!exists ($info->{language}) && ($name =~ /\-([a-z][a-z])\-/))
  {
    $info->{language} = $1;
  }
  # NOTE(review): the technique fallback is guarded by the *language* key,
  # not the technique key; this may be a copy-paste slip -- confirm the
  # intent before changing it.
  if (!exists ($info->{language}) && ($name =~ /^([a-z\d\-]+)/))
  {
    $info->{technique} = $1;
  }
  # peers are always english (names starting with 'oracle' never match).
  if ($name =~ /^peer/)
  {
    $info->{language} = 'en';
  }

  return $info;
}
