#!/usr/bin/env perl

use strict;
use warnings;

use File::Find;
use File::Path qw(make_path);
use File::Spec;

use Data::Dump qw(dump);
use File::Slurp;
use Lingua::EN::Sentence qw(get_sentences);
use XML::Simple;

# Tree that is searched for 'perdocs' summary files.
my $inDirectory = '/g01/conundrum/corpora/duc02/data/test/summaries/summaries';

# Directory that receives one text file per extracted summary.
my $outDirectory = '/g01/conundrum/corpora/duc02/data/test/summaries/single2';

# Create the output directory (plus any missing parents), requesting
# owner-only permissions.
make_path(
  $outDirectory,
  { mode => 0700 },
);

# Walk the input tree; the callback itself decides which entries to process.
find(
  { wanted => \&makeIndividualSummaryFiles },
  $inDirectory,
);


# File::Find callback that splits a 'perdocs' file into one text file per
# summary.
#
# Input (set by File::Find, which by default chdir()s into the directory
# being visited):
#   $_                 - name of the current entry, relative to that directory.
#   $File::Find::name  - full path of the current entry (used for reporting).
#
# Behavior:
#   * Entries that are not plain files named exactly 'perdocs' are skipped.
#   * The file content is wrapped in a <DOC> element and parsed with
#     XML::Simple; on a parse failure the path is printed and the file is
#     skipped.
#   * Every <SUM> element is written to
#       $outDirectory/<DOCSET uppercased>_<DOCREF>.<SUMMARIZER>.txt
#     with one sentence per line (as split by Lingua::EN::Sentence).
#   * An existing output file is never overwritten; a numbered variant of the
#     name is used instead (see _unusedSummaryFilePath).
#
# Returns 1 when the file was processed, 0 when it was skipped or could not
# be parsed. (File::Find itself ignores the return value.)
sub makeIndividualSummaryFiles
{
  # get the file to parse.
  my $file = $_;
  return 0 if ! -f $file;
  return 0 if $file ne 'perdocs';

  # make the xml parser; ForceArray makes {SUM} an array ref even when the
  # file holds a single summary.
  my $parser = XML::Simple->new (ForceArray => 1);

  # get the content of the file and add doc tags so that the parser is
  # handed a single enclosing element.
  my $content = read_file($file);
  $content = '<DOC>' . $content . '</DOC>';

  # blank out every ampersand before parsing.
  # NOTE(review): presumably the files contain bare '&' characters that
  # would otherwise make the XML parse fail - confirm against the corpus.
  $content =~ s/\&/ /g;

  # get the list of summaries in the file.
  my $summaries;
  my $result = eval {$summaries = $parser->XMLin ($content); return 1;};
  if ($@ || ! defined $result)
  {
    print "errors: $File::Find::name\n";
    return 0;
  }

  # write each summary to a separate file.
  foreach my $summary (@{$summaries->{SUM}})
  {
    my $docset = uc $summary->{DOCSET};
    my $filename = $docset . '_' . $summary->{DOCREF} . '.' . $summary->{SUMMARIZER} . '.txt';

    # a <SUM> element without text has no 'content' key; treat it as empty
    # instead of running the substitutions on undef.
    my $text = $summary->{content};
    $text = '' if ! defined $text;

    # trim, then collapse every run of control characters/whitespace into a
    # single space so that the sentence splitter sees one line of text.
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/[\x00-\x20]+/ /g;

    # defensive: never dereference undef, in case the splitter returns
    # nothing for empty text.
    my $sentences = get_sentences ($text);
    $sentences = [] if ! defined $sentences;

    my $filePath = _unusedSummaryFilePath (File::Spec->catfile ($outDirectory, $filename));
    write_file ($filePath, join ("\n", @$sentences));
  }

  return 1;
}


# Returns $filePath unchanged when nothing exists at that path; otherwise
# warns once and returns the first unused '<base>.<n>.txt' variant
# (n = 1, 2, ...), where <base> is $filePath without its '.txt' suffix.
sub _unusedSummaryFilePath
{
  my ($filePath) = @_;
  return $filePath if ! -e $filePath;

  # strip only the extension, so a numeric summarizer id that precedes it
  # stays part of the name.
  (my $basePath = $filePath) =~ s/\.txt$//;

  # advance the index until a free name is found; this is what guarantees
  # termination when earlier numbered variants already exist.
  my $fileIndex = 1;
  $fileIndex++ while -e "$basePath.$fileIndex.txt";

  my $newPath = "$basePath.$fileIndex.txt";
  warn "warning: file '$filePath' already exists, writing to '$newPath' instead.\n";
  return $newPath;
}






