#!/usr/bin/env perl

use strict;
use warnings;
use Carp qw(cluck croak);
use File::Spec;
use File::Path qw(make_path);
use XML::Simple;
use Data::Dump qw(dump);
use Lingua::Stem::Snowball;
use Encode;
use Getopt::Long;
use File::Find;

# get the number of command line options present.
my $totalOptions = @ARGV;
if ($totalOptions == 0)
{
	#exit 0;
}

# process the command line parameters.
my $langCode = 'en';
my $inputDirectory = '/g01/conundrum/DUC/DUC02/2010shallow';
my $outputDirectory = '/home/jmkubin/tmp/duc02s';
my $result = GetOptions("l=s" => \$langCode, "i=s" => \$inputDirectory, "o=s" => \$outputDirectory);


# get a hash of the language codes that can be stemmed.
my %stemmingLangsCode = map { (lc $_, 1) } Lingua::Stem::Snowball::stemmers();
$langCode = lc $langCode;
if (!exists($stemmingLangsCode{$langCode}))
{
	die "cannot stem language with code '$langCode'.\n";
}
my $stemmer = Lingua::Stem::Snowball->new(lang => $langCode, encoding => 'UTF-8');

# make sure the inputDirectory was defined.
if (! defined $inputDirectory)
{
  die "input directory parameter '-i' not defined.\n";
}

# make sure the input directory exists.
if (!-d $inputDirectory)
{
	die "directory '$inputDirectory' does not exist.\n";
}

# make the outputDirectory was defined.
if (! defined $outputDirectory)
{
	die "output directory parameter '-o' not defined.\n";
}

$inputDirectory = File::Spec->rel2abs ($inputDirectory);
$outputDirectory = File::Spec->rel2abs ($outputDirectory);

my %cacheOfCreatedDirectories;
find(\&createStemmedXmlFiles, $inputDirectory);


sub createStemmedXmlFiles
{
	my $localDir = $File::Find::dir;
	my $inputFile     = $_;

	# if not a file return now.
	return 0 if !-f $inputFile;
	
	# if not an xml file return now.
	return 0 if $inputFile !~ /\.(s|xml)$/i;

  # make the directory to put the stemmed file into.
  my $relativeOutDirectory = File::Spec->abs2rel ($File::Find::dir, $inputDirectory);
  my $outputFileDirectory = File::Spec->catfile($outputDirectory, $relativeOutDirectory);
  if (!exists $cacheOfCreatedDirectories{$outputFileDirectory})
  {
    make_path($outputFileDirectory, { verbose => 0, mode => oct '777' });
    $cacheOfCreatedDirectories{$outputFileDirectory} = 1;
    die "could not create directory $outputFileDirectory.\n" if (!-d $outputFileDirectory);
  } 

  # get the full path of the file to write the stemmed article to.
  my $outputFile = File::Spec->catfile($outputFileDirectory, $inputFile);

	my $article;
	my $result = eval { $article = XMLin($inputFile, ForceArray => 1); return 1; };
	if (!defined($result) || $@)
	{
		warn "xml parsing errors with file '$File::Find::name': $@\n";
    stemViaString ($inputFile, $outputFile);
		return undef;
	}
	
	# replace all sentences with the stemmed sentences.
	stemAllSentences ($article);

  # write the stemmed sentences to a file.
  XMLout($article, RootName => 'DOC', OutputFile => $outputFile);
  
  return undef;
}


sub stemAllSentences
{
  my $article = $_[0];
  
  if (ref($article) eq 'HASH')
  {
    while (my ($key, $value) = each %$article)
    {
      if (($key eq 's') && (ref ($value) eq 'ARRAY'))
      {
        # stem the words in the sentences.
        foreach my $sentence (@$value)
        {
          next if !exists $sentence->{content};
    
          # the strings returned should be utf8 encoded, if not, make them so.
          unless (Encode::is_utf8($sentence->{content}, 1))
          {
            my $reults = eval { $sentence->{content} = decode_utf8($sentence->{content}, 0); return 1; };
            if (!defined($result) || $@)
            {
              cluck "problem UTF8 encoding sentence; skipping it.\n";
              $sentence->{content} = '';
              next;
            }
          }
    
          $sentence->{content} = getStemmedSentence ($sentence->{content});
        }
      }
      else
      {
        stemAllSentences ($value);
      }
    }
  }
  elsif (ref($article) eq 'ARRAY')
  {
    foreach my $item (@$article)
    {
      stemAllSentences ($item);
    }
  }
  elsif (ref ($article) eq 'REF')
  {
    stemAllSentences ($$article);
  }
  return undef;
}


sub stemViaString
{
  my $inputFile = $_[0];
  
  my $fh;
  if (! open ($fh, '<:utf8', $inputFile))
  {
    warn "could not open file '$inputFile' for reading.\n";
    return undef;
  }
  
  my $contents;
  read ($fh, $contents, -s $inputFile);
  close $fh;
  
  # split the file on the sentences.
  my @content = split (/(<\s*s[^>]*>[^<]+<\s*\/s\s*>)/ix, $contents);
  
  foreach my $sentence (@content)
  {
  	if ($sentence =~ /(<\s*s[^>]*>)([^<]+)(<\s*\/s\s*>)/ix)
  	{
  	  my $prefix = $1;
  	  my $middle = getStemmedSentence ($2);
  	  my $suffix = $3;
  	  $sentence = $prefix . $middle . $suffix;
  	}
  }

  my $outputFile = $_[1];
  if (! open ($fh, '>:utf8', $outputFile))
  {
    warn "could not open file '$outputFile' for writing.\n";
    return undef;
  }
  
  print $fh join ('', @content);
  close $fh;
  
  return undef;
}


sub getStemmedSentence
{
  my $sentence = lc $_[0];
  
  # get the words in the sentence as lowercase.
  my @words = split(/\P{Alnum}+/x, $sentence);
  
  # stem the words in place.
  $stemmer->stem_in_place(\@words);
  $sentence = join(' ', @words);
  $sentence =~ s/^\s+//;
  $sentence =~ s/\s+$//;
  $sentence =~ s/\s\s+/ /;
  return $sentence;
}





















