#!/usr/bin/env perl

use strict;
use warnings;
use Carp qw(cluck croak);
use File::Spec;
use XML::Simple;
use Data::Dump qw(dump);
use Encode;
use Getopt::Long;
use File::Find;
use Statistics::Descriptive;

# Switch on autoflush for both STDERR and STDOUT so that progress output and
# warnings appear immediately and in order; the previously selected default
# output handle is restored afterwards.
{
	my $originalHandle = select(STDERR);
	$| = 1;
	select(STDOUT);
	$| = 1;
	select($originalHandle);
}
	
# Number of raw command line arguments supplied. Nothing below reads it; it is
# kept only so that any code relying on the name keeps compiling.
my $totalOptions = @ARGV;

# Command line parameters and their defaults:
#   -d dir [dir ...]  corpus directories to scan (defaults applied below)
#   -o file           stored in $outputFile and handed to createFileOfSentences
#   -a                count all sentences, not only those with stype == 1
#   -b file           file listing the document ids to exclude
my @corpusDirectory;
my $outputFile = '/tmp/tmp.txt';
my $allSentences = 0;
my $idsToSkipFile = '/g01/conundrum/corpora/evaluations/wikipedia_fa/info/badIds.txt';
my $result = GetOptions("d=s{,}" => \@corpusDirectory, "o=s" => \$outputFile, 'a' => \$allSentences, "b=s" => \$idsToSkipFile);

# GetOptions returns false when the command line could not be parsed; stop
# here instead of silently continuing with a half-applied configuration.
if (!$result)
{
  die "usage: $0 [-d corpusDirectory ...] [-o outputFile] [-a] [-b idsToSkipFile]\n";
}

# fall back to the default corpus directories when no '-d' was given.
unless (@corpusDirectory)
{
  @corpusDirectory = ('/g01/conundrum/corpora/split/alllangsdocbodies', '/g01/conundrum/corpora/split/alllangssumms');
}

# make sure the output file was defined.
if (! defined $outputFile)
{
  die "output file parameter '-o' not defined.\n";
}

foreach my $corpusDirectory (@corpusDirectory)
{
  # make sure the input directory exists.
  if (!-d $corpusDirectory)
  {
    die "file '$corpusDirectory' does not exist.\n";
  }

  # the loop variable aliases the array element, so this stores the absolute
  # path back into @corpusDirectory.
  $corpusDirectory = File::Spec->rel2abs ($corpusDirectory);
}

# Read the ids of the documents to skip into %badIds (keys are lower-cased).
# When a line contains a 32 character hexadecimal id only that id is kept;
# otherwise the whole lower-cased line is used as the key. A missing or
# unreadable file only produces a warning: the script then runs with an empty
# skip list.
my %badIds;
if (defined ($idsToSkipFile))
{
  my $fh;
  if (!-f $idsToSkipFile)
  {
    warn "$idsToSkipFile is not a file.\n";
  }
  elsif (! open ($fh, '<:utf8', $idsToSkipFile))
  {
    # this is top-level code, so there is no subroutine to return from;
    # warn and carry on without a skip list.
    warn "could not open file '$idsToSkipFile' for reading.\n";
  }
  else
  {
    while (defined (my $docId = <$fh>))
    {
      chomp $docId;
      $docId = lc $docId;
      $docId = $1 if ($docId =~ /([a-f0-9]{32})/);

      # blank lines carry no id.
      next if $docId eq '';
      $badIds{$docId} = 1;
    }
    close $fh;
  }
}

# %sizeInfo is filled by processFile while File::Find walks the corpus:
#   $sizeInfo{$lang}->{$id}->{body|summary} = [sentences, tokens, chars]
my %sizeInfo;
find (\&processFile, @corpusDirectory);

# %statInfo holds one Statistics::Descriptive::Full object per language, per
# kind (body, summary, ratio) and per unit (sentences, tokens, chars).
my %statInfo;

foreach my $lang (keys %sizeInfo)
{
  # position of each unit inside the [sentences, tokens, chars] triples.
  my @chunkTypes = qw(sentences tokens chars);

  my $langInfo = $statInfo{$lang} = {};
  foreach my $type (qw(body summary ratio))
  {
    $langInfo->{$type} = {};
    foreach my $chunkType (@chunkTypes)
    {
      $langInfo->{$type}->{$chunkType} = Statistics::Descriptive::Full->new();
    }
  }

  foreach my $id (keys %{$sizeInfo{$lang}})
  {
    my $docInfo = $sizeInfo{$lang}->{$id};

    # only documents having both a body and a summary are counted, and
    # documents listed in %badIds are left out.
    next unless exists ($docInfo->{summary});
    next unless exists ($docInfo->{body});
    next if exists $badIds{lc $id};

    foreach my $index (0 .. $#chunkTypes)
    {
      my $chunkType = $chunkTypes[$index];

      # a missing count is treated as zero.
      my $bodySize = $docInfo->{body}->[$index];
      $bodySize = 0 unless defined $bodySize;
      my $summarySize = $docInfo->{summary}->[$index];
      $summarySize = 0 unless defined $summarySize;

      $langInfo->{body}->{$chunkType}->add_data ($bodySize);
      $langInfo->{summary}->{$chunkType}->add_data ($summarySize);

      # the summary/body ratio is undefined for an empty body; leave it out
      # rather than dying with "Illegal division by zero".
      next if $bodySize == 0;
      $langInfo->{ratio}->{$chunkType}->add_data ($summarySize / $bodySize);
    }
  }
}


# Print one comma separated row per language on STDOUT: the language code, the
# number of data points in the body sentence statistics, and then for each of
# body, summary and ratio (in that order) the sum, mean and standard deviation
# of the sentence, token and character counts. Every field, including the last
# one, is followed by a comma.
while (my ($lang, $langInfo) = each %statInfo)
{
	print $lang . ',';
	print $langInfo->{body}->{sentences}->count () . ',';
	foreach my $type (qw(body summary ratio))
	{
		foreach my $chunkType (qw(sentences tokens chars))
		{
			my $stats = $langInfo->{$type}->{$chunkType};
			foreach my $measure (qw(sum mean standard_deviation))
			{
				print $stats->$measure () . ',';
			}
		}
	}
	print "\n";
}


# File::Find callback: records the size of one corpus file in %sizeInfo.
#
# $_ holds the file name within the directory being visited and
# $File::Find::name the full path. Only plain files whose name ends in 'xml'
# are considered. The language is the first two lower-case letter directory
# component of the full path, the document id is the first run of 32
# hexadecimal characters in the file name, and the file counts as a 'summary'
# when its path contains 'alllangssumms' and as a 'body' otherwise.
#
# Files without a recognisable language or id are skipped before any parsing
# is done, so they can never end up in %sizeInfo under an undefined key.
# The return value is not meaningful.
sub processFile
{
  my $file = $_;
  return unless -f $file;
  return unless ($file =~ m/xml$/);

  my $fullFile = $File::Find::name;

  return unless ($fullFile =~ m|/([a-z][a-z])/|);
  my $lang = $1;

  return unless ($file =~ m/([a-f0-9]{32})/);
  my $id = $1;

  my $type = ($fullFile =~ m|alllangssumms|) ? 'summary' : 'body';

  my ($totalSentences, $totalTokens, $totalChars) = createFileOfSentences ($file, $outputFile, $allSentences);

  # the intermediate hashes are created on demand by autovivification.
  $sizeInfo{$lang}->{$id}->{$type} = [$totalSentences, $totalTokens, $totalChars];
  return;
}


# Counts the sentences, tokens and characters of one XML document.
#
# Parameters:
#   $inputFile    - path of the XML file to read.
#   $outputFile   - accepted for interface compatibility; not used here.
#   $allSentences - when true every sentence is counted, otherwise only the
#                   sentences selected by getSentences / getSentencesViaString.
#
# The file is parsed with XMLin; when that fails a warning is issued and the
# sentences are extracted from the raw text by getSentencesViaString instead.
# Runs of Unicode "Other" characters (\p{C}) in a sentence are replaced by a
# single space before it is measured.
#
# Returns the list (sentences, tokens, chars). A path that is not a plain file
# yields (0, 0, 0).
sub createFileOfSentences
{
  my ($inputFile, $outputFile, $allSentences) = @_;

  # if not a file return now; all three counts are zero so that list
  # assignments in the caller never receive undefined values.
  return (0, 0, 0) if !-f $inputFile;

  my $sentences;
  my $article;
  my $parsedOk = eval { $article = XMLin($inputFile, ForceArray => 1); 1; };
  if (!$parsedOk || $@)
  {
    warn "xml parsing errors with file '$inputFile': $@\n";
    $sentences = getSentencesViaString ($inputFile, $allSentences);
  }
  else
  {
    $sentences = getSentences ($article, [], $allSentences);
  }

  # getSentencesViaString returns undef when the file cannot be opened.
  $sentences = [] unless ref ($sentences) eq 'ARRAY';

  my $totalSentences = 0;
  my $totalTokens = 0;
  my $totalChars = 0;
  foreach my $sentence (@$sentences)
  {
    $sentence =~ s/\p{C}+/ /g;
    ++$totalSentences;
    $totalChars += length $sentence;
    my @tokens = getListOfTokensInText ($sentence);
    $totalTokens += scalar @tokens;
  }

  return ($totalSentences, $totalTokens, $totalChars);
}

# Splits a text into its lower-cased alphanumeric tokens.
#
# Parameter: the text to tokenize.
# Returns the list of tokens, i.e. the maximal runs of alphanumeric
# characters. split keeps an empty leading field when the text starts with a
# separator; such empty strings are removed so they are not counted as tokens.
sub getListOfTokensInText
{
  my @words = grep { length $_ } split (/\P{Alnum}+/, lc $_[0]);
  return @words;
}


# Recursively collects the text of the sentence elements of a parsed article.
#
# Parameters:
#   $_[0] - the structure to search: nested hashes and arrays, or a reference
#           to one of them.
#   $_[1] - optional array reference the sentences are appended to; a new
#           array is used when it is not given.
#   $_[2] - when true every sentence is collected; otherwise only sentences
#           whose 'stype' value equals 1.
#
# A sentence is an element of an array stored under the hash key 's'. It is
# either a hash with a 'content' entry or a plain string (assumed to be how
# the parser represents an element without attributes, which therefore has no
# 'stype'). Content that is not flagged as UTF-8 is passed through
# decode_utf8.
#
# Returns the array reference holding the collected sentences.
sub getSentences
{
  my ($article, $sentences, $allSentences) = @_;
  $sentences = [] unless defined $sentences;

  if (ref($article) eq 'HASH')
  {
    foreach my $key (keys %$article)
    {
      my $value = $article->{$key};

      if (($key eq 's') && (ref ($value) eq 'ARRAY'))
      {
        foreach my $sentence (@$value)
        {
          my $isHash = (ref ($sentence) eq 'HASH');
          my ($content, $stype);
          if ($isHash)
          {
            next if !exists $sentence->{content};
            $content = $sentence->{content};
            $stype = $sentence->{stype} if exists $sentence->{stype};
          }
          elsif (!ref ($sentence))
          {
            # plain string sentence: it cannot be dereferenced as a hash
            # under 'strict refs'.
            $content = $sentence;
          }
          else
          {
            next;
          }

          # only scalar text can be measured.
          next if !defined ($content) || ref ($content);

          if (!$allSentences)
          {
            next if !defined $stype;
            next if $stype != 1;
          }

          # the strings returned should be utf8 encoded, if not, make them so.
          unless (Encode::is_utf8($content, 1))
          {
            my $decoded = eval { decode_utf8($content, 0) };
            if (!defined($decoded) || $@)
            {
              cluck "problem with UTF8 encoding of sentence; skipping it.\n";
              $sentence->{content} = '' if $isHash;
              next;
            }
            $content = $decoded;
            $sentence->{content} = $content if $isHash;
          }

          push @$sentences, $content;
        }
      }
      else
      {
        $sentences = getSentences ($value, $sentences, $allSentences);
      }
    }
  }
  elsif (ref ($article) eq 'ARRAY')
  {
    foreach my $item (@$article)
    {
      $sentences = getSentences ($item, $sentences, $allSentences);
    }
  }
  elsif (ref ($article) eq 'REF')
  {
    $sentences = getSentences ($$article, $sentences, $allSentences);
  }
  return $sentences;
}


# Extracts sentences from the raw text of a file with a regular expression,
# without parsing it as XML.
#
# Parameters:
#   $_[0] - path of the file to read (opened with the ':utf8' layer).
#   $_[1] - when true every <s>...</s> element is returned; otherwise only
#           elements whose text contains stype="1".
#
# Only elements whose content holds no '<' character are recognised.
# Returns a reference to the array of sentence texts in file order, or undef
# (after a warning) when the file cannot be opened.
sub getSentencesViaString
{
  my ($inputFile, $allSentences) = @_;

  my $fh;
  unless (open ($fh, '<:utf8', $inputFile))
  {
    warn "could not open file '$inputFile' for reading.\n";
    return undef;
  }

  my $contents;
  read ($fh, $contents, -s $inputFile);
  close $fh;

  # walk over the sentence elements from left to right.
  my @sentences;
  while ($contents =~ /<\s*s[^>]*>([^<]+)<\s*\/s\s*>/gix)
  {
    # copy the capture and the complete element first: the stype match
    # below resets the match variables.
    my $text = $1;
    my $element = substr ($contents, $-[0], $+[0] - $-[0]);

    next if (!$allSentences && ($element !~ /stype=\"1\"/));
    push @sentences, $text;
  }

  return \@sentences;
}























