#!/usr/bin/env perl

use strict;
use warnings;
use Carp qw(cluck croak);
use File::Spec;
use XML::Simple;
use Data::Dump qw(dump);
use Encode;
use Getopt::Long;

# process the command line parameters; the initial values below are the
# defaults used when an option is not given.
#   -i file   XML file to read.
#   -o file   text file to write.
#   -a        keep every sentence instead of only those with stype 1.
my $inputFile = '/g01/conundrum/corpora/unstemmed/alllangsdocbodies/af/0a8c6d18f36e58e71c12a13687cfc5c6.xml';
my $outputFile = '/tmp/tmp.txt';
my $allSentences = 0;
my $result = GetOptions("i=s" => \$inputFile, "o=s" => \$outputFile, 'a' => \$allSentences);

# GetOptions returns false when the command line could not be parsed (it has
# already warned about the offending option); stop instead of silently
# running with the defaults.
if (!$result)
{
  die "usage: $0 [-i inputFile] [-o outputFile] [-a]\n";
}

# make sure the input file was defined.
if (! defined $inputFile)
{
  die "input file parameter '-i' not defined.\n";
}

# make sure the input file exists and is a plain file.
if (!-f $inputFile)
{
  die "file '$inputFile' does not exist.\n";
}

# make sure the output file was defined.
if (! defined $outputFile)
{
  die "output file parameter '-o' not defined.\n";
}

# work with absolute paths from here on.
$inputFile = File::Spec->rel2abs ($inputFile);
$outputFile = File::Spec->rel2abs ($outputFile);

createFileOfSentences ($inputFile, $outputFile, $allSentences);

# Extracts the sentences of one XML article file and writes them to a text
# file, separated by newlines (no trailing newline is written).
#
# The file is parsed with XMLin(); if that fails, a warning is issued and the
# sentences are pulled out of the raw text with getSentencesViaString()
# instead. Every run of characters in the Unicode "Other" category (\p{C},
# which includes control characters such as newlines) inside a sentence is
# replaced by a single space before writing.
#
# Arguments:
#   $_[0]  path of the XML file to read.
#   $_[1]  path of the text file to write (overwritten if present).
#   $_[2]  true to keep every sentence, false to keep only stype 1 sentences.
#
# Returns 0 when the input path is not a plain file and undef in every other
# case (success included); problems are reported with warn().
sub createFileOfSentences
{
  my ($inputFile, $outputFile, $allSentences) = @_;

  # if not a file return now.
  return 0 if !-f $inputFile;

  # the trailing 1 makes the eval return true only when XMLin did not die.
  my $article;
  my $parsed = eval { $article = XMLin ($inputFile, ForceArray => 1); 1; };

  my $sentences;
  if ($parsed)
  {
    # collect the sentences from the parsed structure.
    $sentences = getSentences ($article, [], $allSentences);
  }
  else
  {
    warn "xml parsing errors with file '$inputFile': $@\n";
    $sentences = getSentencesViaString ($inputFile, $allSentences);
  }

  # the fallback returns undef when it cannot open the file; treat that as
  # "no sentences" so an empty output file is still produced.
  $sentences = [] if !defined $sentences;

  foreach my $sentence (@$sentences)
  {
    next if !defined $sentence;
    $sentence =~ s/\p{C}+/ /g;
  }

  my $fh;
  if (! open ($fh, '>:utf8', $outputFile))
  {
    warn "could not open file '$outputFile' for writing: $!\n";
    return undef;
  }

  if (! print {$fh} join ("\n", @$sentences))
  {
    warn "could not write to file '$outputFile': $!\n";
    close $fh;
    return undef;
  }

  # buffered output is flushed on close, so write errors can surface here.
  if (! close ($fh))
  {
    warn "could not close file '$outputFile': $!\n";
  }

  return undef;
}


# Recursively walks a data structure such as the one returned by XMLin() and
# collects the text of every sentence stored under an 's' key.
#
# Arguments:
#   $_[0]  node to examine: a hash ref, an array ref or a reference to a
#          reference; any other value is ignored.
#   $_[1]  optional array ref the sentences are appended to; a new array is
#          created when it is omitted or undefined.
#   $_[2]  true to collect every sentence, false to collect only sentences
#          whose 'stype' value is numerically equal to 1.
#
# Returns the array ref holding the collected sentence strings.
sub getSentences
{
  my ($article, $sentences, $allSentences) = @_;
  $sentences = [] if !defined $sentences;

  my $kind = ref ($article);
  if ($kind eq 'HASH')
  {
    # visit the keys in sorted order so the order of the collected sentences
    # does not depend on perl's internal hash ordering.
    foreach my $key (sort keys %$article)
    {
      my $value = $article->{$key};
      if (($key eq 's') && (ref ($value) eq 'ARRAY'))
      {
        foreach my $sentence (@$value)
        {
          my $text = _sentenceText ($sentence, $allSentences);
          push @$sentences, $text if defined $text;
        }
      }
      else
      {
        getSentences ($value, $sentences, $allSentences);
      }
    }
  }
  elsif ($kind eq 'ARRAY')
  {
    foreach my $item (@$article)
    {
      getSentences ($item, $sentences, $allSentences);
    }
  }
  elsif ($kind eq 'REF')
  {
    getSentences ($$article, $sentences, $allSentences);
  }

  return $sentences;
}

# Returns the decoded text of one entry of an 's' list, or undef (empty list
# in list context) when the entry has to be skipped.
sub _sentenceText
{
  my ($sentence, $allSentences) = @_;

  my $isHash = (ref ($sentence) eq 'HASH');
  my $content;
  if ($isHash)
  {
    return if !exists $sentence->{content};

    if (!$allSentences)
    {
      return if !exists $sentence->{stype};
      return if $sentence->{stype} != 1;
    }

    $content = $sentence->{content};
  }
  elsif (!ref ($sentence))
  {
    # XML::Simple folds an element that has text but no attributes into a
    # plain string; such a sentence has no stype, so it only qualifies when
    # every sentence is wanted.
    return if !$allSentences;
    $content = $sentence;
  }

  return if !defined $content;

  # the strings returned should be utf8 encoded, if not, make them so.
  unless (Encode::is_utf8 ($content, 1))
  {
    my $decoded;
    my $ok = eval { $decoded = decode_utf8 ($content, 0); 1; };
    if (!$ok || !defined $decoded)
    {
      cluck "problem with UTF8 encoding of sentence; skipping it.\n";
      $sentence->{content} = '' if $isHash;
      return;
    }

    # keep the parsed structure in step with what is returned.
    $sentence->{content} = $decoded if $isHash;
    $content = $decoded;
  }

  return $content;
}


# Fallback extractor: pulls the sentences out of the raw text of a file with
# a regular expression instead of an XML parser.
#
# A sentence is an <s ...>text</s> element whose text contains no '<'. The
# tag name is matched case-insensitively.
#
# Arguments:
#   $_[0]  path of the file to read; it is decoded as UTF-8.
#   $_[1]  true to return every sentence, false to return only sentences
#          whose opening tag carries the attribute stype="1" (or stype='1').
#
# Returns a reference to an array of sentence strings, in file order, or
# undef when the file cannot be opened (a warning is issued in that case).
#
# NOTE(review): the text is returned exactly as it appears in the file, so
# entities such as &amp; are not expanded here.
sub getSentencesViaString
{
  my ($inputFile, $allSentences) = @_;

  # :encoding(UTF-8) validates the input, unlike the plain :utf8 layer.
  my $fh;
  if (! open ($fh, '<:encoding(UTF-8)', $inputFile))
  {
    warn "could not open file '$inputFile' for reading: $!\n";
    return undef;
  }

  # read the whole file at once.
  my $contents = do { local $/; <$fh> };
  close $fh;
  $contents = '' if !defined $contents;

  my @sentences;

  # capture 1 is the attribute text of the opening tag (undefined when the
  # tag has no attributes), capture 2 is the sentence text. The whitespace
  # required after the tag name keeps other tags that merely start with "s"
  # from being treated as sentences.
  while ($contents =~ m{<\s*s(?:\s+([^>]*))?>([^<]+)<\s*/s\s*>}gi)
  {
    my ($attributes, $text) = ($1, $2);

    if (!$allSentences)
    {
      # look for stype only among the attributes, never in the sentence text.
      next if !defined $attributes;
      next if $attributes !~ /(?:\A|\s)stype\s*=\s*(["'])1\1/;
    }

    push @sentences, $text;
  }

  return \@sentences;
}























