#!/usr/bin/env perl

use strict;
use warnings;
use diagnostics;
use Carp;
use Encode ();

use Text::Engrams;
use Data::Dump qw(dump);
use Text::Corpus::Util;
use Local::Util;
use Getopt::Long;
use File::Path qw(make_path);
use File::Find;
use File::Spec;
use XML::Simple;

# line terminator appended to the per-file progress output.
my $el = "\n";

# switch STDOUT to autoflush so output appears immediately; the handle that
# was selected before is selected again afterwards.
select ((select (STDOUT), $| = 1)[0]);

# without any arguments, show the available options. the run then continues
# with the built-in defaults (the exit that used to follow is disabled).
my $totalArguments = scalar @ARGV;
print "-i inputDirectory -d databaseDirectory -s ngramSize -t ngramType -n corpusName\n" unless $totalArguments;

# get the commandline parameters. every option has a default except the
# corpus name, which is derived from the input directory when not given.
my $inputDirectory = '/data/multilang/corpora/split/alllangsdocbodies/en';
my $databaseDirectory = '/data/multilang/corpora/db';
my $sizeOfNgrams = 1;
my $typeOfNgrams = 'words';
my $corpusName;
my $result = GetOptions
  (
    "i=s" => \$inputDirectory,
    "d:s" => \$databaseDirectory,
    "s:i" => \$sizeOfNgrams,
    "t:s" => \$typeOfNgrams,
    "n:s" => \$corpusName,
  );

# GetOptions returns false (after printing its own warnings) when the command
# line could not be parsed; stop instead of silently running on the defaults.
if (! $result)
{
  die "invalid command line; expected: -i inputDirectory -d databaseDirectory -s ngramSize -t ngramType -n corpusName\n";
}

# make sure the inputDirectory was defined; checked before its first use.
if ((! defined $inputDirectory) || (! length $inputDirectory))
{
  die "input directory parameter '-i' not defined.\n";
}

# make sure the input directory exists.
if (! -d $inputDirectory)
{
  die "directory '$inputDirectory' does not exist.\n";
}

# set the default corpus name if not set (or set to an empty string): the last
# component of the input directory. canonpath removes a trailing slash first;
# with one, splitpath would report an empty final component.
unless (defined ($corpusName) && length ($corpusName))
{
  (undef, undef, $corpusName) = File::Spec->splitpath (File::Spec->canonpath ($inputDirectory));
  $corpusName = 'en' unless defined ($corpusName) && length ($corpusName);
}

# initialize the logging; the log directory lives below the database directory.
my $logBaseDir = File::Spec->catfile ($databaseDirectory, 'log');
make_path ($logBaseDir, { verbose => 0, mode => 0777 });
initializeLogger ($logBaseDir);

# set the size of the ngrams: a non-negative integer, raised to at least 1.
$sizeOfNgrams = int abs $sizeOfNgrams;
$sizeOfNgrams = 1 if $sizeOfNgrams < 1;

# get the engrammer. the sqlite extension library carries a platform specific
# suffix: 'dylib' on macOS ($^O is 'darwin'), 'so' elsewhere. previously the
# '.so' path was always overwritten by the '.dylib' path.
my $sqliteExtensionSuffix = ($^O eq 'darwin') ? 'dylib' : 'so';
my $pathToSqliteExtensions = $ENV{HOME} . '/workspace/sqliteExtension/libsqlitefunctions.' . $sqliteExtensionSuffix;
my %engramParameters =
  (
    hashNgrams => 0,
    sizeOfNgrams => $sizeOfNgrams,
    typeOfNgrams => $typeOfNgrams,
    normalizeText => 1,
    databaseDirectory => File::Spec->catfile ($databaseDirectory, 'db', $corpusName),
    storeNgramPositions => 1,
    pathToSqliteExtensions => $pathToSqliteExtensions,
  );
my $engram = Text::Engrams->new (%engramParameters);

# walk the input directory; processFile is called for every entry found.
find (\&processFile, $inputDirectory);

# compute the statistics on all the ngrams.
$engram->createTablesOfBaseStats ();
$engram->createPopulateTable_tokenWeights ();

# callback for find: passes every plain file whose name ends in 'xml' (case
# insensitive) on to addSentencesInFileToDb together with the engrammer.
# the entry to examine is taken from $_. returns 0 for entries that are
# skipped, otherwise whatever addSentencesInFileToDb returns.
# NOTE(review): the name taken from $_ is also what ends up as the document
# uri; presumably that is the bare file name without its directory - confirm
# that names are unique across sub directories.
sub processFile
{
  my $candidate = $_;

  # only plain files with a name ending in xml are of interest.
  my $isXmlFile = (-f $candidate) && ($candidate =~ /xml$/i);
  return 0 unless $isXmlFile;

  return addSentencesInFileToDb ($candidate, $engram);
}

{

# number of files added so far; only visible to the sub below.
my $totalFilesAdded;

# reads the sentences of one xml file, normalizes them and adds them as one
# document through the given engrammer.
#   $_[0]  name of the xml file; also used as the uri of the document
#   $_[1]  object whose addDocument method receives the sentences
# the file is parsed with XMLin; if that fails a warning is issued and the
# sentences are extracted from the raw text by getSentencesViaString instead.
# prints the file name when done and always returns 1.
sub addSentencesInFileToDb
{
  my ($inputFile, $engram) = @_;

  # start the counter at zero on the first call.
  $totalFilesAdded = 0 if !defined $totalFilesAdded;

  # first choice: take the sentences from the parsed xml structure.
  my $article;
  my $parsedOk = eval { $article = XMLin($inputFile, ForceArray => 1); 1; };

  my $sentences;
  if (defined ($parsedOk) && !$@)
  {
    $sentences = getSentences ($article);
  }
  else
  {
    # fall back to pattern matching on the raw file contents.
    warn "xml parsing errors with file '$inputFile': $@\n";
    $sentences = getSentencesViaString ($inputFile);
  }

  # normalize in place: every run of characters that are neither letters nor
  # digits becomes a single space, then the sentence is lowercased.
  foreach my $text (@$sentences)
  {
    $text =~ s/[^\p{L}\p{N}]+/ /g;
    $text = lc $text;
  }

  $engram->addDocument (uri => $inputFile, text => $sentences);

  # report the file just added.
  print "$inputFile$el";
  $totalFilesAdded += 1;

  return 1;
}
}


# recursively collects the text of all sentence elements (key 's') from a
# nested structure of hashes and arrays, such as the one returned by XMLin.
#   $_[0]  node to search: hash ref, array ref or ref to a ref; any other
#          value contributes nothing
#   $_[1]  optional array ref the sentences are appended to; a new array is
#          created when it is not given
# returns the array ref holding the sentence strings.
sub getSentences
{
  my $article = $_[0];
  my $sentences = [];
  $sentences = $_[1] if @_ > 1;
  
  if (ref ($article) eq 'HASH')
  {
    foreach my $key (keys %$article)
    {
      my $value = $article->{$key};
      if (($key eq 's') && (ref ($value) eq 'ARRAY'))
      {
        foreach my $sentence (@$value)
        {
          # a sentence is either a hash holding its text under 'content', or a
          # plain string (per the XML::Simple documentation, what an element
          # without attributes parses to unless ForceContent is set).
          # dereferencing the plain string as a hash would die under strict.
          my $isHash = (ref ($sentence) eq 'HASH');
          my $text;
          if ($isHash)
          {
            next if !exists $sentence->{content};
            $text = $sentence->{content};
          }
          elsif (!ref ($sentence))
          {
            $text = $sentence;
          }
          
          # nothing usable: no text at all, or nested data instead of text.
          next if !defined ($text) || ref ($text);
    
          # the strings returned should be utf8 encoded, if not, make them so.
          # assumes a string without the utf8 flag holds UTF-8 octets - TODO confirm.
          unless (Encode::is_utf8 ($text, 1))
          {
            # the outcome of the eval is checked through its own variable;
            # with CHECK set to 0 malformed input is substituted, not fatal.
            my $decoded = eval { Encode::decode_utf8 ($text, 0) };
            if (!defined ($decoded) || $@)
            {
              warn "problem with UTF8 encoding of sentence; skipping it.\n";
              $sentence->{content} = '' if $isHash;
              next;
            }
            $text = $decoded;
            
            # keep the decoded text in the structure as well.
            $sentence->{content} = $text if $isHash;
          }
    
          push @$sentences, $text;
        }
      }
      else
      {
        $sentences = getSentences ($value, $sentences);
      }
    }
  }
  elsif (ref ($article) eq 'ARRAY')
  {
    foreach my $item (@$article)
    {
      $sentences = getSentences ($item, $sentences);
    }
  }
  elsif (ref ($article) eq 'REF')
  {
    $sentences = getSentences ($$article, $sentences);
  }
  return $sentences;
}


# extracts the text of all sentence elements (<s ...>text</s>) from a file by
# pattern matching on its raw contents, without parsing the xml.
#   $_[0]  name of the file to read
# returns a ref to the array of sentence texts (empty if none are found), or
# undef, after a warning, if the file could not be opened.
sub getSentencesViaString
{
  my $inputFile = $_[0];
  
  # :encoding(UTF-8) validates the input while decoding it; the plain :utf8
  # layer accepts malformed sequences unchecked.
  my $fh;
  if (! open ($fh, '<:encoding(UTF-8)', $inputFile))
  {
    warn "could not open file '$inputFile' for reading: $!\n";
    return undef;
  }
  
  # read the whole file in one go.
  my $contents = do { local $/; <$fh> };
  close $fh;
  $contents = '' unless defined $contents;
  
  # collect the text between each opening and closing sentence tag; only
  # sentences without nested markup match.
  my @sentences;
  while ($contents =~ /<\s*s[^>]*>([^<]+)<\s*\/s\s*>/ixg)
  {
    push @sentences, $1;
  }
  
  return \@sentences;
}


# placeholder for generating the summary of a document; so far only the named
# parameters are unpacked and no summary is produced.
#   engramDatabase  database of the documents and their ngrams
#   uri             uri of the document
sub generateSummaryOfDocument
{
  my %options = @_;

  # database of the documents and their ngrams.
  my $ngramStore = $options{engramDatabase};

  # uri of the document; being the final statement, this assignment is also
  # the (implicit) return value of the sub.
  my $documentUri = $options{uri};
}










