#!/usr/bin/env perl

use strict;
use warnings;
use Data::Dump qw(dump);
use XML::Simple;
use Encode;
use Getopt::Long;
use File::Path;
use File::Find;
use Forks::Super;

my $maxProcesses = 1;
$Forks::Super::MAX_PROC = $maxProcesses;
$Forks::Super::ON_BUSY  = 'block';
  
# process the command line parameters.
my $type = 'rouge2';
my $inputDirectory = '/g01/conundrum/corpora/stemmed/wikipedia-fa/splitbodies';
my $outputDirectory = '/g01/conundrum/corpora/oracles/stemmed/';
my $result = GetOptions("t=s" => \$type, "i=s" => \$inputDirectory, "o=s" => \$outputDirectory, "p=n" => \$maxProcesses);
$maxProcesses = int abs $maxProcesses;
$maxProcesses = 1 if ($maxProcesses < 1);

# make the output directory.
$outputDirectory = File::Spec->catfile ($outputDirectory, $type);
mkpath ($outputDirectory, 1, 0755);

# create the oracle files in parallel.
processFilesInParallel ($inputDirectory, $outputDirectory, $type, $maxProcesses);

exit;

sub processFilesInParallel
{
  my ($InputDirectory, $OutputDirectory, $Type, $MaxProcesses) = @_;

  # get the maximum number of processes to fork.
  $Forks::Super::MAX_PROC = $MaxProcesses;
  $Forks::Super::ON_BUSY  = 'block';

  # get the list of all the html files to process.
  my $listOfBodyFiles = getListOfBodyFiles($InputDirectory);
  
  # randomly shuffle the files.
  my $totalFiles = $#$listOfBodyFiles + 1;
  for (my $i = 0; $i < $totalFiles; $i++)
  {
    my $index = int rand $totalFiles;
    ($listOfBodyFiles->[$i], $listOfBodyFiles->[$index]) = ($listOfBodyFiles->[$index], $listOfBodyFiles->[$i]);
  }

  # get the number of files each process should process.
  my $filesPerProcess = int((scalar(@$listOfBodyFiles) + $maxProcesses - 1) / $MaxProcesses);

  # create the sublist of files each process will work on.
  my $listOfHtmlFilesForProcess = [];
  for (my $p = 0 ; $p < $maxProcesses ; $p++)
  {
    $listOfHtmlFilesForProcess->[$p] = [ splice(@$listOfBodyFiles, 0, $filesPerProcess) ];
  }

  for (my $p = 0 ; $p < $maxProcesses ; $p++)
  {
    if ($maxProcesses == 1)
    {
      processListOfFiles($listOfHtmlFilesForProcess->[$p], $Type, $OutputDirectory);
    }
    else
    {
      my $pid =
        fork { sub => \&processListOfFiles, args => [$listOfHtmlFilesForProcess->[$p], $Type, $OutputDirectory] };
    }
  }

  # wait for the forked processes to finish.
  waitall if ($MaxProcesses != 1);

  return 1;
}


{
  my $listOfBodyFiles;
  
  sub getListOfBodyFiles
  {
    # initialize the list to hold the files.
    $listOfBodyFiles = [];
    
    # find all the files.
    find (\&collectBodyFiles, @_);
    
    # randomly shuffle the files.
    shuffle ($listOfBodyFiles);
    
    # return the list of files.
    return $listOfBodyFiles;
  }

  sub collectBodyFiles
  { 
    # skip if not a file.
    return 0 unless -f $File::Find::name;
    
    # skip if not an xml file.
    return 0 unless ($File::Find::name =~ /\.xml$/);
    
    # store the file.
    push @$listOfBodyFiles, $File::Find::name;
  }
}


sub shuffle
{
  my $deck = shift;
  return undef unless @$deck;

  my $i = @$deck;
  while (--$i) 
  {
    my $j = int rand ($i+1);
    @$deck[$i,$j] = @$deck[$j,$i];
  }
  
  return undef;
}


sub processListOfFiles
{
  my ($ListOfBodyFiles, $Type, $OutputDirectory) = @_;
  
  foreach my $fullPathBodyFile (@$ListOfBodyFiles)
  {
    processFile ($fullPathBodyFile, $Type, $OutputDirectory);
  }
}


{
  my %directoryExists;
  
  sub processFile
  {
    my ($FullPathBodyFile, $Type, $OutputDirectory) = @_;
    $Type = lc $Type;
    
    # get the path to the summary file.
    my $fullPathSummaryFile = $FullPathBodyFile;
    if ($fullPathSummaryFile =~ m/alllangsdocbodies/)
    {
      $fullPathSummaryFile =~ s/alllangsdocbodies/alllangssumms/;
    }
    else
    {
      $fullPathSummaryFile =~ s/splitbodies/splitsumms/;
    }
    
    # if the summary file does not exist return now.
    return undef unless -f $fullPathSummaryFile;
    
    # make the output directory if it does not exist.
    unless (exists $directoryExists{$OutputDirectory})
    {
      mkpath ($OutputDirectory, 1, 0755);
      $directoryExists{$OutputDirectory} = 1;
    }
    
    # get the new order of the body sentences based on the summary.
    my $summarySentences = getSentencesOfFile ($fullPathSummaryFile);
    my $bodySentences = getSentencesOfFile ($FullPathBodyFile);
    my $order;
    
    if ($Type eq 'rouge1')
    {
      $order = rankSentencesByOneGrams ($summarySentences, $bodySentences);
    }
    elsif ($Type eq 'rouge2')
    {
      $order = rankSentencesByTwoGrams ($summarySentences, $bodySentences);
    }
    elsif ($Type eq 'marcu')
    {
      $order = rankSentencesByMarcu ($summarySentences, $bodySentences);
    }
    elsif ($Type eq 'marcu1')
    {
      $order = rankSentencesByMarcuOneGrams ($summarySentences, $bodySentences);
    }
    else
    {
      die "Type '$Type' is not valid.\n";
    }
    

    # make a copy of the sentences in their new order.
    my @sentences = map {$bodySentences->[$_]} @$order;
    
    # get the path name of the text file to write the sentences.
    my $outputFile = $FullPathBodyFile;
    $outputFile =~ s/^.*?alllangsdocbodies//;
    $outputFile =~ s/\.xml$/.txt/i;
    $outputFile = File::Spec->catfile ($OutputDirectory, $outputFile);
    my ($volume, $path, $file) = File::Spec->splitpath ($outputFile);
    my $outputFileDirectory = File::Spec->catfile ($volume, $path);
    unless (exists $directoryExists{$outputFileDirectory})
    {
      mkpath ($outputFileDirectory, 1, 0755);
      $directoryExists{$outputFileDirectory} = 1;
    }
    
    # write the rearrange sentences to the file.
    my $fh;
    if (open ($fh, '>:utf8', $outputFile))
    {
      print $fh join ("\n", @sentences) . "\n";
      close $fh;
    }
    else
    {
      warn "could not open file '$outputFile' for writing.\n";
    }
  }
}




sub rankSentencesByMarcu
{
  # get the list of sentences in the summary and content.
  my ($ListOfSummarySentences, $ListOfContentSentences) = @_;

  # put all the summary words into a hash.
  my %summaryVector;
  for (my $i = 0; $i < @$ListOfSummarySentences; $i++)
  {
    my $listOfWords = getListOfWords ($ListOfSummarySentences->[$i]);
    foreach my $word (@$listOfWords)
    {
      ++$summaryVector{$word};
    }
  }
 
  # normalize the hash of the summary to 1.
  my $norm = 0;
  foreach my $value (values %summaryVector)
  {
    $norm += $value * $value;
  }
  $norm = sqrt $norm;
  $norm = 1 if ($norm <= 0);

  foreach my $word (keys %summaryVector)
  {
    $summaryVector{$word} /= $norm;
  }
  
  # put each content sentence into a hash.
  my @listOfContentVectors;
  for (my $i = 0; $i < @$ListOfContentSentences; $i++)
  {
    my $listOfWords = getListOfWords ($ListOfContentSentences->[$i]);
    
    # get the size of the sentence.
    my %sentenceVector;
    
    # compute the score of the sentence against the summary.
    my $totalSize = 0;
    foreach my $word (@$listOfWords)
    {
      ++$sentenceVector{$word};
      ++$totalSize;
    }
    
    # normalize the vector to sum to one.
    $totalSize = 1 if ($totalSize <= 0);
    foreach my $word (keys %sentenceVector)
    {
      $sentenceVector{$word} /= $totalSize;
    }
    
    push @listOfContentVectors, [\%sentenceVector, $i];
  }
  
  my @sentenceOrder;
  while (@listOfContentVectors)
  {
    my $maxDiffColumn = getMinVectorIndex (\%summaryVector, \@listOfContentVectors);
    push @sentenceOrder, $maxDiffColumn->[1];
    splice @listOfContentVectors, $maxDiffColumn->[2], 1, ();
  }
  
  return [reverse @sentenceOrder];
}


sub getMinVectorIndex
{
  my ($Vector, $Matrix) = @_;
  
  # compute dot product of each matrix column with Vector.
  my @vectorMatrix;
  my @vectorKeys = keys %$Vector;
  foreach my $columnIndex (@$Matrix)
  {
    my $column = $columnIndex->[0];
    my $product = 0;
    foreach my $key (@vectorKeys)
    {
      my $columnValue = 0;
      $columnValue = $column->{$key} if exists $column->{$key};
      $product += $Vector->{$key} * $columnValue;
    }
    push @vectorMatrix, $product;
  }
  
  my $vectorMatrixSum = 0;
  foreach my $value (@vectorMatrix)
  {
    $vectorMatrixSum += $value;
  }
  
  # sum all the columns in the $Matrix.
  my %sumOfColumns;
  foreach my $columnIndex (@$Matrix)
  {
    while (my ($key, $value) = each %{$columnIndex->[0]})
    {
      $sumOfColumns{$key} += $value;
    }
  }
  
  my $minColumn;
  for (my $i = 0; $i < @$Matrix; $i++)
  {
    # get the product of the vector and matrix with i-th column removed.
    my $newVectorMatrixSum = $vectorMatrixSum - $vectorMatrix[$i];
    
    # compute the norm of the sum of the columns with the i-th column removed.
    my %newSumOfColumns = %sumOfColumns;
    my $column = $Matrix->[$i][0];
    while (my ($key, $value) = each %$column)
    {
      $newSumOfColumns{$key} -= $value;
    }

    # get the norm of $newSumOfColumns.
    my $normOfNewSumOfColumns = 0;
    foreach my $value (values %newSumOfColumns)
    {
      $normOfNewSumOfColumns += $value * $value;
    }
    $normOfNewSumOfColumns = sqrt $normOfNewSumOfColumns;
    $normOfNewSumOfColumns = 1 if ($normOfNewSumOfColumns <= 0); 
    
    # compute the score the vector and sum of columns with the i-th column removed.
    my $similarityScore = $newVectorMatrixSum / $normOfNewSumOfColumns;
    
    if (defined ($minColumn))
    {
      $minColumn = [$similarityScore, $Matrix->[$i][1], $i] if ($similarityScore > $minColumn->[0]);
    }
    else
    {
      $minColumn = [$similarityScore, $Matrix->[$i][1], $i];
    }
  }

  return $minColumn;
}

sub rankSentencesByMarcuOneGrams
{
  # get the list of sentences in the summary and content.
  my ($ListOfSummarySentences, $ListOfContentSentences) = @_;
  
  # put all the summary words into a hash.
  my %summary;
  for (my $i = 0; $i < @$ListOfSummarySentences; $i++)
  {
    foreach my $word (@{getListOfWords ($ListOfSummarySentences->[$i])})
    {
      ++$summary{$word};
    }
  }
  
  # get the words for each sentence in the content.
  my @listOfContentSentences;
  for (my $i = 0; $i < @$ListOfContentSentences; $i++)
  {
    push @listOfContentSentences, [$i, getListOfWords ($ListOfContentSentences->[$i])];
  }
  
  # rank the content sentences.
  my @sentenceIndex;
  while (@listOfContentSentences)
  {
    # score each sentence.
    my $chosenSentence;
    for (my $i = 0; $i < @listOfContentSentences; $i++)
    {
      my $sentence = $listOfContentSentences[$i]->[1];
      
      # get the size of the sentence.
      my $sentenceSize = $#$sentence + 1;
      $sentenceSize = 1 if ($sentenceSize < 1);
      
      # compute the score of the sentence against the summary.
      my $sentenceScore = 0;
      foreach my $word (@$sentence)
      {
        $sentenceScore += exists ($summary{$word});
      }
      $sentenceScore /= $sentenceSize;
      
      # store the sentences index and size.
      unless (defined $chosenSentence)
      {
        $chosenSentence = [$i, $sentenceScore, $sentenceSize];
      }
      elsif ($chosenSentence->[1] > $sentenceScore)
      {
        $chosenSentence = [$i, $sentenceScore, $sentenceSize];
      }
    }
    
    # store the min sentence index.
    push @sentenceIndex, $listOfContentSentences[$chosenSentence->[0]]->[0];
    
    # remove the words in the summary that are in the sentence.
    foreach my $word (@{$listOfContentSentences[$chosenSentence->[0]]->[1]})
    {
      delete $summary{$word};
    }
    
    # remove the minumum sentence.
    splice @listOfContentSentences,$chosenSentence->[0],1,();
  }

  # rank by higher score and total words.
  return \@sentenceIndex;
}


sub rankSentencesByOneGrams
{
  # get the list of sentences in the summary and content.
  my ($ListOfSummarySentences, $ListOfContentSentences) = @_;
  
  # put all the summary words into a hash.
  my %summary;
  for (my $i = 0; $i < @$ListOfSummarySentences; $i++)
  {
    foreach my $word (@{getListOfWords ($ListOfSummarySentences->[$i])})
    {
      ++$summary{$word};
    }
  }

  # score each sentence.
  my @sentenceIndexScore;
  for (my $i = 0; $i < @$ListOfContentSentences; $i++)
  {
    my $sentence = getListOfWords ($ListOfContentSentences->[$i]);
    
    # get the size of the sentence.
    my $sentenceSize = $#$sentence + 1;
    $sentenceSize = 1 if ($sentenceSize < 1);
    
    # compute the score of the sentence against the summary.
    my $sentenceScore = 0;
    foreach my $word (@$sentence)
    {
      $sentenceScore += exists ($summary{$word});
    }
    $sentenceScore /= $sentenceSize;
    
    # store the sentences index and size.
    push @sentenceIndexScore, [$i, $sentenceScore, $sentenceSize]
  }
  
  # rank by higher score and total words.
  @sentenceIndexScore = sort {($b->[1] <=> $a->[1]) || ($b->[2] <=> $a->[2])} @sentenceIndexScore;
  return [map {$_->[0]} @sentenceIndexScore];
}


sub rankSentencesByTwoGrams
{
  # get the list of sentences in the summary and content.
  my ($ListOfSummarySentences, $ListOfContentSentences) = @_;
  
  # put all the summary words into a hash.
  my %summary;
  my @allSummaryWords = map {@{getListOfWords($_)}} @$ListOfSummarySentences;
  for (my $i = 1; $i < @allSummaryWords; $i++)
  {
    my $word = $allSummaryWords[$i - 1] . ' ' . $allSummaryWords[$i];
    ++$summary{$word};
  }

  # score each sentence.
  my @sentenceIndexScore;
  for (my $i = 0; $i < @$ListOfContentSentences; $i++)
  {
    my $sentence = getListOfWords ($ListOfContentSentences->[$i]);
    
    # get the size of the sentence.
    my $sentenceSize = $#$sentence;
    $sentenceSize = 1 if ($sentenceSize < 1);
    
    # compute the score of the sentence against the summary.
    my $sentenceScore = 0;
    for (my $j = 1; $j < @$sentence; $j++)
    {
      my $word = $sentence->[$j - 1] . ' ' . $sentence->[$j];
      $sentenceScore += exists ($summary{$word});
    }
    $sentenceScore /= $sentenceSize;
    
    # store the sentences index and size.
    push @sentenceIndexScore, [$i, $sentenceScore, $sentenceSize]
  }

  # rank by higher score and total words.  
  @sentenceIndexScore = sort {($b->[1] <=> $a->[1]) || ($b->[2] <=> $a->[2])} @sentenceIndexScore;
  return [map {$_->[0]} @sentenceIndexScore];
}


sub getSentencesOfFile
{
  my $inputFile     = $_[0];

  # if not a file return now.
  return 0 if !-f $inputFile;

  my $sentences; 
  my $article;
  my $result = eval { $article = XMLin($inputFile, ForceArray => 1); return 1; };
  if (!defined($result) || $@)
  {
    warn "xml parsing errors with file '$inputFile': $@\n";
    $sentences = getSentencesViaString ($inputFile);
  }
  else
  {
    # replace all sentences with the stemmed sentences.
    $sentences = getSentences ($article);
  }
  
  foreach my $sentence (@$sentences)
  {
    $sentence =~ s/\p{C}+/ /g;
  }

  return $sentences;
}


sub getSentences
{
  my $article = $_[0];
  my $allSentences = 0;
  my $sentences = [];
  $sentences = $_[1] if @_ > 1;
  
  if (ref($article) eq 'HASH')
  {
    while (my ($key, $value) = each %$article)
    {
      if (($key eq 's') && (ref ($value) eq 'ARRAY'))
      {
        # stem the words in the sentences.
        foreach my $sentence (@$value)
        {
          next if !exists $sentence->{content};
          
          if (!$allSentences)
          {
            next if !exists $sentence->{stype};
            next if $sentence->{stype} != 1;
          }
    
          # the strings returned should be utf8 encoded, if not, make them so.
          unless (Encode::is_utf8($sentence->{content}, 1))
          {
            my $reults = eval { $sentence->{content} = decode_utf8($sentence->{content}, 0); return 1; };
            if (!defined($result) || $@)
            {
              warn "problem with UTF8 encoding of sentence; skipping it.\n";
              $sentence->{content} = '';
              next;
            }
          }
    
          push @$sentences, $sentence->{content};
        }
      }
      else
      {
        $sentences = getSentences ($value, $sentences, $allSentences);
      }
    }
  }
  elsif (ref ($article) eq 'ARRAY')
  {
    foreach my $item (@$article)
    {
      $sentences = getSentences ($item, $sentences, $allSentences);
    }
  }
  elsif (ref ($article) eq 'REF')
  {
    $sentences = getSentences ($$article, $sentences, $allSentences);
  }
  return $sentences;
}


sub getSentencesViaString
{
  my $inputFile = $_[0];
  my $allSentences = 0;
  
  my $fh;
  if (! open ($fh, '<:utf8', $inputFile))
  {
    warn "could not open file '$inputFile' for reading.\n";
    return undef;
  }
  
  my $contents;
  read ($fh, $contents, -s $inputFile);
  close $fh;
  
  # split the file on the sentences.
  my @content = split (/(<\s*s[^>]*>[^<]+<\s*\/s\s*>)/ix, $contents);
  
  my @sentences;
  foreach my $sentence (@content)
  {
    next if (!$allSentences && ($sentence !~ /stype=\"1\"/));
    
    if ($sentence =~ /<\s*s[^>]*>([^<]+)<\s*\/s\s*>/ix)
    {
      push @sentences, $1;
    }
  }
  
  return \@sentences;
}

# given a string returns the list of words in the string.
sub getListOfWords
{
  my @listOfWords;
  
  # split the string on the non-alnum chacacters.
  foreach my $word (split(/\P{Alnum}+/x, lc $_[0]))
  {
    # skip the word if it does not contain at least one letter.
    next unless $word =~ /\p{Letter}/;
    
    # save the word.
    push @listOfWords, $word;
  }
  
  # return the list of words.
  return \@listOfWords;
}




