#!/usr/bin/env perl

use strict;
use warnings;
use Getopt::Long;
use Data::Dump qw(dump);
use File::Find;
use File::Spec;
use Cwd qw(abs_path);

# Terminology (DUC/ROUGE): a "model" is a human-written reference summary;
# a "peer" is the summary being scored against the models.

# get the total number of arguments.
my $totalArguments = @ARGV;

# print the usage line when no arguments are given. The script deliberately
# does NOT exit here (the exit is commented out), so it carries on with the
# built-in defaults below.
if ($totalArguments == 0)
{
  print "-c computerSummaryDirectory -h humanSummaryDirectory -o outputFile -s idsToSkipFile -j \n";
  #exit 0;
}

# defaults for the test (computer) and reference (human) summary directories,
# the output config file, the jackknifing flag and the optional skip-list file.
my $computerSummaryDirectory = '/data/wiki/aclpilot2013eval/submits-cleaned/lancaster/ar';
my $humanSummaryDirectory = '/data/wiki/aclpilot2013eval/dataset/summary-cleaned/ar';
my $configFile = '/scr/config.txt';
my $jackknifing = 0;
my $idsToSkipFile;

# The usage line advertises -s for the skip-list file, but only -b used to be
# registered; accept both so the documented flag works and -b keeps working.
GetOptions (
  "c=s"   => \$computerSummaryDirectory,
  "h=s"   => \$humanSummaryDirectory,
  "o=s"   => \$configFile,
  "j"     => \$jackknifing,
  "b|s=s" => \$idsToSkipFile,
);

# abort early when the directory of reference (human) summaries is missing.
die "reference directory '$humanSummaryDirectory' does not exist.\n"
  unless -d $humanSummaryDirectory;

# collect every reference file found below that directory.
#$humanSummaryDirectory = abs_path ($humanSummaryDirectory);

my $listOfReferenceFiles = getListOfReferenceFiles ($humanSummaryDirectory);

# group the reference files by the id of the document they summarise:
# document id => array ref of reference file paths.
my %listOfReferenceSummaries;
for my $referenceFile (@{$listOfReferenceFiles})
{
  # only the file name part of the path is needed to derive the id.
  my $basename = (File::Spec->splitpath ($referenceFile))[2];

  # try the naming schemes in order; the first one that matches wins:
  #   1. DUC style      D<digits>_<anything up to the first dot>
  #   2. hash style     <hex>_body.txt or <hex>_summary.txt
  #   3. anything else  everything up to the first dot
  my ($matched) = $basename =~ /^(D\d+\_[^\.]+)/i;
  ($matched) = $basename =~ /^([a-f\d]+)\_(body|summary)\.txt$/i if ! defined $matched;
  ($matched) = $basename =~ /^([^\.]+)/i if ! defined $matched;

  # ids are lower-cased; a name matching no scheme is used unchanged.
  my $docId = defined $matched ? lc $matched : $basename;

  # autovivification creates the per-document list on first use.
  push @{$listOfReferenceSummaries{$docId}}, $referenceFile;
}


# read in the ids to skip and delete them from %listOfReferenceSummaries.
# Problems with the skip file are reported as warnings and the script carries
# on without skipping anything. This is top-level code, not a subroutine, so
# it must not use "return".
if (defined ($idsToSkipFile))
{
  if (! -f $idsToSkipFile)
  {
    warn "$idsToSkipFile is not a file.\n";
  }
  elsif (open (my $fh, '<:utf8', $idsToSkipFile))
  {
    while (defined (my $docId = <$fh>))
    {
      chomp $docId;

      # tolerate CRLF line endings and stray padding around the id.
      $docId =~ s/^\s+|\s+$//g;
      $docId = lc $docId;

      # when the line contains a 32 character hex digest, that is the id.
      $docId = $1 if ($docId =~ /([a-f0-9]{32})/);

      # deleting a key that does not exist is harmless.
      delete $listOfReferenceSummaries{$docId};
    }
    close $fh;
  }
  else
  {
    warn "could not open file '$idsToSkipFile' for reading: $!\n";
  }
}


# get the list of test files.
#$computerSummaryDirectory = abs_path ($computerSummaryDirectory);
my (undef, undef, $testName) = File::Spec->splitpath ($computerSummaryDirectory);
my $listOfTestFiles = getListOfTestFiles ($computerSummaryDirectory);

# for each test file, look up the reference files of the same document and
# record one evaluation: { evalid, testfiles, referenceFiles }.
my @listOfEvalInfo;
foreach my $testFile (@$listOfTestFiles)
{
  # split the file path to get the basename.
  my (undef, undef, $basename) = File::Spec->splitpath ($testFile);

  # derive the document id from the basename (same schemes as used for the
  # reference files); it stays undefined when no scheme matches.
  my $docId;
  if ($basename =~ /^(D\d+\_[^\.]+)/i)
  {
    $docId = lc $1;
  }
  elsif ($basename =~ /^([a-f\d]+)\_(body|summary)\.txt$/i)
  {
    $docId = lc $1;
  }
  elsif ($basename =~ /^([^\.]+)/i)
  {
    $docId = lc $1;
  }

  # test files without a usable id or without any reference are ignored.
  next if ! defined $docId;
  next if ! exists ($listOfReferenceSummaries{$docId});

  # map model id => reference file. getModelId can return the same id for
  # several files (e.g. 'no-model-id' for unrecognised paths); keying the
  # hash on it directly would silently drop all but one of them, so later
  # duplicates get a numeric suffix (-2, -3, ...).
  my %referenceFiles;
  foreach my $referenceFile (@{$listOfReferenceSummaries{$docId}})
  {
    my $baseId = getModelId ($referenceFile);
    my $modelId = $baseId;
    my $suffix = 1;
    $modelId = $baseId . '-' . ++$suffix while exists $referenceFiles{$modelId};
    $referenceFiles{$modelId} = $referenceFile;
  }

  my %evaluation;
  $evaluation{evalid} = $docId;
  $evaluation{testfiles} = {$testName => $testFile};
  $evaluation{referenceFiles} = \%referenceFiles;
  push @listOfEvalInfo, \%evaluation;
}


# pick the writer: -j selects the jackknifed configuration, otherwise the
# plain one; both take the evaluation list and the output file name.
my $writeConfig = $jackknifing
  ? \&writeRougeJackknifeConfigFile
  : \&writeRougeConfigFile;
$writeConfig->(\@listOfEvalInfo, $configFile);

exit;


# Writes a ROUGE 1.5.5 XML configuration file.
#
# Parameters:
#   $listOfEvalInfo - array ref of evaluations; each is a hash ref with
#                     evalid         => value for the EVAL ID attribute,
#                     testfiles      => hash ref, name => peer (test) file path,
#                     referenceFiles => hash ref, model id => reference file path.
#   $outputFile     - path of the configuration file to create or overwrite.
#
# A reference whose path is also listed as a test file of the same evaluation
# is left out of <MODELS>. Peers and models are written in sorted key order so
# that the same input always produces the same file.
#
# Returns 1 on success; dies when the file cannot be opened or closed.
sub writeRougeConfigFile
{
  my $el = "\n";
  my ($listOfEvalInfo, $outputFile) = @_;

  # open the output file for writing.
  open (my $fhOut, '>', $outputFile)
    or die "could not open file '$outputFile' for writing: $!\n";

  print $fhOut '<ROUGE_EVAL version="1.5.5">' . $el;

  foreach my $evaluation (@$listOfEvalInfo)
  {
    my $testFiles = $evaluation->{testfiles} || {};
    my $referenceFiles = $evaluation->{referenceFiles} || {};

    # print the eval id.
    print $fhOut '<EVAL ID="' . $evaluation->{evalid} . '">' . $el;

    # print the model (reference) root directory; files are given by full
    # path, so the root is simply '/'.
    print $fhOut '<MODEL-ROOT>/' . $el;
    print $fhOut '</MODEL-ROOT>' . $el;

    # print the peer (test) root directory.
    print $fhOut '<PEER-ROOT>/' . $el;
    print $fhOut '</PEER-ROOT>' . $el;

    # print the text input format.
    print $fhOut '<INPUT-FORMAT TYPE="SPL">' . $el .  '</INPUT-FORMAT>' . $el;

    # write the test files; the peer id is derived from the file path.
    print $fhOut '<PEERS>' . $el;
    foreach my $name (sort keys %$testFiles)
    {
      my $testFile = $testFiles->{$name};
      my $ident = getPeerId ($testFile);
      print $fhOut '<P ID="' . $ident . '">' . $testFile . '</P>' . $el;
    }
    print $fhOut '</PEERS>' . $el;

    # set of test file paths, built once per evaluation. A lookup table is
    # used rather than a nested each() loop: leaving an each() loop early
    # keeps the hash iterator half-way, so the next scan would start in the
    # middle and could miss a match.
    my %isTestFile = map { $_ => 1 } values %$testFiles;

    # write the reference files, skipping any that is also a test file.
    print $fhOut '<MODELS>' . $el;
    foreach my $id (sort keys %$referenceFiles)
    {
      my $referenceFile = $referenceFiles->{$id};
      next if $isTestFile{$referenceFile};
      print $fhOut '<M ID="' . $id . '">' . $referenceFile . '</M>' . $el;
    }
    print $fhOut '</MODELS>' . $el;

    print $fhOut '</EVAL>' . $el;
  }

  print $fhOut '</ROUGE_EVAL>' . $el;

  # buffered write errors (e.g. a full disk) only surface at close.
  close ($fhOut)
    or die "could not close file '$outputFile' after writing: $!\n";

  return 1;
}


# Expands the evaluations for jackknifing and writes them with
# writeRougeConfigFile.
#
# An evaluation with more than one reference file becomes one evaluation per
# reference: that reference is added to the test files (under its model id)
# and removed from the references. An evaluation with zero or one reference
# is copied as is. The new evaluations are numbered 0, 1, 2, ... in the order
# they are generated; the original evalid is not kept.
#
# Parameters:
#   $listOfEvalInfo - array ref of evaluation hash refs
#                     (evalid, testfiles, referenceFiles).
#   $outputFile     - path of the configuration file to write.
sub writeRougeJackknifeConfigFile
{
  my ($listOfEvalInfo, $outputFile) = @_;

  my $nextEvalId = 0;
  my @jackknifed;

  for my $evaluation (@{$listOfEvalInfo})
  {
    my $models = $evaluation->{referenceFiles};
    my @modelIds = keys %{$models};

    # nothing to hold out: keep a shallow copy of the evaluation.
    if (@modelIds <= 1)
    {
      push @jackknifed, {
        evalid         => $nextEvalId++,
        testfiles      => {%{$evaluation->{testfiles}}},
        referenceFiles => {%{$models}},
      };
      next;
    }

    # hold out each reference in turn.
    for my $heldOutId (@modelIds)
    {
      # the held-out reference is scored as an additional peer ...
      my %peers = %{$evaluation->{testfiles}};
      $peers{$heldOutId} = $models->{$heldOutId};

      # ... against all the other references.
      my %remainingModels = %{$models};
      delete $remainingModels{$heldOutId};

      push @jackknifed, {
        evalid         => $nextEvalId++,
        testfiles      => \%peers,
        referenceFiles => \%remainingModels,
      };
    }
  }

  return writeRougeConfigFile (\@jackknifed, $outputFile);
}


# gets the list of all full paths of the reference summaries.
#
# The enclosing bare block gives both subs a private, shared accumulator.
{
  my @listOfReferenceFiles;

  # Searches the given directories recursively (File::Find) and returns an
  # array ref with the paths of all plain files whose name ends in ".txt"
  # (case-insensitive), sorted so the result does not depend on the order
  # in which the file system lists directory entries.
  sub getListOfReferenceFiles
  {
    @listOfReferenceFiles = ();
    find(\&addReferenceFiles,  @_);
    return [sort @listOfReferenceFiles];
  }

  # File::Find callback: $_ is the current entry's name and
  # $File::Find::name its path. Collects matching files; the return value
  # is not used by File::Find.
  sub addReferenceFiles
  {
    return 0 if ! -f $_;
    return 0 if $_ !~ /\.txt$/i;
    my $file = $File::Find::name;

    # drop a leading "/.ccs" from the recorded path.
    $file =~ s|^/\.ccs||;
    push @listOfReferenceFiles, $file;
    return 1;
  }
}


# gets the list of all full paths of the summaries to test.
#
# Both subs share the accumulator declared in this bare block.
{
  my @listOfTestFiles;

  # Searches the given directories recursively (File::Find) and returns an
  # array ref with the paths of all accepted test files, in traversal order.
  sub getListOfTestFiles
  {
    @listOfTestFiles = ();
    find (\&addTestFiles, @_);
    return [@listOfTestFiles];
  }

  # File::Find callback. A plain file is accepted when its name
  #   - starts with D<digits>_ , or
  #   - ends with CLASSY followed by optional digits, or
  #   - ends with .txt
  # (all case-insensitive). A leading "/.ccs" is removed from the recorded
  # path.
  sub addTestFiles
  {
    my $name = $_;
    return 0 unless -f $name;

    my $accepted = $name =~ /^D\d+\_/i
      || $name =~ /CLASSY\d*$/i
      || $name =~ /\.txt$/i;
    return 0 unless $accepted;

    (my $file = $File::Find::name) =~ s|^/\.ccs||;
    push @listOfTestFiles, $file;
    return 1;
  }
}

# Derives the peer id written to the ROUGE config from a test file path.
#
# The path is lower-cased and checked against the rules below in order; the
# first matching rule builds the id from its captures. Returns 'no-peer-id'
# when no rule matches.
sub getPeerId
{
  my $filePath = lc $_[0];

  # each rule: [pattern, builder called with the pattern's captures].
  my @rules = (
    [qr|([^/]+)/([^/]+)/[^/]+_summary.txt|,
      sub { $_[0] . '-' . $_[1] }],
    [qr/\/classy\/stemmed\/(\d\d\d)\/(..)/,
      sub { 'classy-stemmed-' . $_[0] . '-' . $_[1] }],
    [qr/\/classy\/unstemmed\/(\d\d\d)\/(..)/,
      sub { 'classy-unstemmed-' . $_[0] . '-' . $_[1] }],
    [qr/\/classy\/split\/(\d\d\d)\/(..)/,
      sub { 'classy-unstemmed-' . $_[0] . '-' . $_[1] }],
    [qr/wikipedia_fa_txt\/(..)\/summary/,
      sub { 'human-summary-unstemmed-' . $_[0] }],
    [qr/wikipedia-fa\/splitsummstxt\/(..)/,
      sub { 'human-summary-stemmed-' . $_[0] }],
    [qr/oracles\/([^\/]+)\/([^\/]+)\/(..)/,
      sub { 'oracle-' . $_[1] . '-' . $_[2] . '-' . $_[0] }],
    [qr|/stemmed/wikipedia-fa/splitbodiestype1/(..)|,
      sub { 'lead-stemmed-' . $_[0] }],
    [qr|/unstemmed/wikipedia-fa/splitbodiestype1/(..)|,
      sub { 'lead-unstemmed-' . $_[0] }],
    [qr|wikipedia_fa_type1_txt/(..)/body|,
      sub { 'lead-unstemmed-' . $_[0] }],
    [qr|unstemmed_stemmed\/(\d\d\d)\/|,
      sub { 'unstemmed-stemmed-' . $_[0] }],

    # last resort: a single character after the final dot is the id.
    [qr/\.(.)$/,
      sub { $_[0] }],
  );

  for my $rule (@rules)
  {
    my ($pattern, $buildId) = @{$rule};

    # every pattern has at least one capture group, so the list is
    # non-empty exactly when the pattern matched.
    my @captures = $filePath =~ $pattern;
    return $buildId->(@captures) if @captures;
  }

  return 'no-peer-id';
}

# Derives the model id written to the ROUGE config from a reference file
# path.
#
# The path is lower-cased and checked against the rules below in order; the
# first matching rule builds the id from its captures. Returns 'no-model-id'
# when no rule matches.
sub getModelId
{
  my $filePath = lc $_[0];

  # each rule: [pattern, builder called with the pattern's captures].
  my @rules = (
    [qr/wikipedia_fa_txt\/(..)\/summary/,
      sub { 'human-summary-unstemmed-' . $_[0] }],
    [qr/wikipedia_fa_txt\/(..)\/body/,
      sub { 'human-body-unstemmed-' . $_[0] }],
    [qr/wikipedia-fa\/splitsummstxt\/(..)/,
      sub { 'human-summary-stemmed-' . $_[0] }],
    [qr/wikipedia-fa\/splitbodiestxt\/(..)/,
      sub { 'human-body-stemmed-' . $_[0] }],
    [qr/oracles\/([^\/]+)\/([^\/]+)\/(..)/,
      sub { 'oracle-' . $_[1] . '-' . $_[2] . '-' . $_[0] }],
    [qr|/stemmed/wikipedia-fa/splitsummstype1/(..)|,
      sub { 'human-stemmed-' . $_[0] }],
    [qr|/unstemmed/wikipedia-fa/splitsummstype1/(..)|,
      sub { 'human-unstemmed-' . $_[0] }],
    [qr|wikipedia_fa_type1_txt/(..)/summary|,
      sub { 'human-summary-unstemmed-' . $_[0] }],
  );

  for my $rule (@rules)
  {
    my ($pattern, $buildId) = @{$rule};

    # every pattern has at least one capture group, so the list is
    # non-empty exactly when the pattern matched.
    my @captures = $filePath =~ $pattern;
    return $buildId->(@captures) if @captures;
  }

  return 'no-model-id';
}








