#!/usr/bin/env perl

use strict;
use warnings;
use Data::Dump qw(dump);
use File::Find;
use Getopt::Long;
# line terminator appended to every record written to the output files.
my $el = "\n";

my $corpusDirectory;
my $badIdsFile;
my $sizesFile;

# get the corpus directory (-d) and the optional output files: the list of
# bad ids (-b) and the table of sizes (-s).
my $usage = "usage: $0 -d <corpusDirectory> [-b <badIdsFile>] [-s <sizesFile>]\n";
GetOptions ("d=s" => \$corpusDirectory, "b=s" => \$badIdsFile, "s=s" => \$sizesFile)
  or die $usage;

# without a corpus directory there is nothing to scan.
die $usage unless defined $corpusDirectory;

# scan the corpus once; both reports are derived from the same data.
my $fileInfo = getFileSizes ($corpusDirectory);

writeBadIds ($fileInfo, $badIdsFile) if defined $badIdsFile;
writeIdsSizes ($fileInfo, $sizesFile) if defined $sizesFile;

# Writes the ids of the unusable corpus entries to a file, one id per line.
#
# An id is reported when any of the following holds:
#   - its body or its summary is missing;
#   - its summary has fewer than $MinSummaryTokens tokens;
#   - its body has fewer than $MinBodyToSummaryRatio times the tokens of its
#     summary.
#
# Parameters:
#   $FileInfo              - hash ref mapping an id to a hash ref whose 'body'
#                            and 'summary' entries, when present, are hash
#                            refs holding a 'tokens' count.
#   $OutputFile            - path of the file to write; it is overwritten.
#   $MinSummaryTokens      - optional, defaults to 100.
#   $MinBodyToSummaryRatio - optional, defaults to 2.
#
# Returns 1 on success. Warns and returns undef if the file cannot be opened
# or if closing it reports an error.
sub writeBadIds
{
  my ($FileInfo, $OutputFile, $MinSummaryTokens, $MinBodyToSummaryRatio) = @_;
  $MinSummaryTokens = 100 unless defined $MinSummaryTokens;
  $MinBodyToSummaryRatio = 2 unless defined $MinBodyToSummaryRatio;

  my $fh;
  if (! open ($fh, '>:utf8', $OutputFile))
  {
    warn "could not open file '$OutputFile' for writing: $!\n";
    return undef;
  }

  # visit the ids in sorted order so the output is the same on every run;
  # the order of a hash changes from one run to the next.
  foreach my $id (sort keys %$FileInfo)
  {
    my $sizes = $FileInfo->{$id};

    # the existence tests come first so that the token counts are only read
    # when both parts are present.
    my $isBad =
         !exists ($sizes->{body})
      || !exists ($sizes->{summary})
      || $sizes->{summary}->{tokens} < $MinSummaryTokens
      || $sizes->{body}->{tokens} < $MinBodyToSummaryRatio * $sizes->{summary}->{tokens};

    print $fh $id . $el if $isBad;
  }

  # buffered write errors (a full disk for example) only surface at close.
  if (! close ($fh))
  {
    warn "could not write file '$OutputFile': $!\n";
    return undef;
  }

  return 1;
}


# Writes the sizes collected for every id to a file as comma separated records.
#
# For each id up to three records are written, in this order:
#   id,language,body,tokens,chars,bytes      when the id has a body;
#   id,language,summary,tokens,chars,bytes   when the id has a summary;
#   id,language,ratio,value                  when the id has both, where value
#                                            is body tokens / summary tokens,
#                                            or 0 for a summary without tokens.
#
# Parameters:
#   $FileInfo   - hash ref mapping an id to a hash ref with a 'language' entry
#                 and optional 'body' and 'summary' entries, each a hash ref
#                 holding 'tokens', 'chars' and 'bytes'.
#   $OutputFile - path of the file to write; it is overwritten.
#
# Returns 1 on success. Warns and returns undef if the file cannot be opened
# or if closing it reports an error.
sub writeIdsSizes
{
  my ($FileInfo, $OutputFile) = @_;

  my $fh;
  if (! open ($fh, '>:utf8', $OutputFile))
  {
    warn "could not open file '$OutputFile' for writing: $!\n";
    return undef;
  }

  # visit the ids in sorted order so the output is the same on every run;
  # the order of a hash changes from one run to the next.
  foreach my $id (sort keys %$FileInfo)
  {
    my $sizes = $FileInfo->{$id};
    my $language = $sizes->{language};

    # one size record for each part of the entry that was found.
    foreach my $type (qw (body summary))
    {
      next unless exists ($sizes->{$type});

      my $info = $sizes->{$type};
      print $fh join (',', $id, $language, $type, $info->{tokens}, $info->{chars}, $info->{bytes}) . $el;
    }

    # the ratio needs both parts.
    next unless exists ($sizes->{body}) && exists ($sizes->{summary});

    # a summary without tokens gets a ratio of 0 rather than a division by zero.
    my $summaryTokens = $sizes->{summary}->{tokens};
    my $ratio = $summaryTokens > 0 ? $sizes->{body}->{tokens} / $summaryTokens : 0;
    print $fh join (',', $id, $language, 'ratio', $ratio) . $el;
  }

  # buffered write errors (a full disk for example) only surface at close.
  if (! close ($fh))
  {
    warn "could not write file '$OutputFile': $!\n";
    return undef;
  }

  return 1;
}



{
  # results of the scan in progress, shared by the two subs below. Maps an id
  # to a hash ref with the keys 'body' and/or 'summary' (the sizes of that
  # file) and 'language'.
  my $fileInfo;

  # Scans the directory tree given as first argument and returns the hash ref
  # of the sizes collected for every id found. Every call starts from an
  # empty result.
  sub getFileSizes
  {
    my ($directory) = @_;

    $fileInfo = {};
    find (\&processFiles, $directory);

    return $fileInfo;
  }

  # Callback of find: records the sizes of the current file in $fileInfo.
  # find supplies the name of the entry in $_ and its path in
  # $File::Find::name. Entries that are not plain files, that carry no id in
  # their name, or whose sizes cannot be measured are skipped.
  sub processFiles
  {
    my ($baseName, $pathName) = ($_, $File::Find::name);

    # only plain files are measured.
    if (! -f $baseName)
    {
      return undef;
    }

    # the id is the first run of 32 hexadecimal digits in the file name.
    my ($id) = $baseName =~ /([A-Fa-f0-9]{32})/;
    return undef unless defined $id;

    # a file below a directory named 'summary' is a summary, any other file
    # is a body.
    my $type = ($pathName =~ m|[\/\\]summary[\/\\]|) ? 'summary' : 'body';

    # the language is the first directory of the path whose name is two
    # lower case letters.
    my $language = ($pathName =~ m|[\/\\]([a-z][a-z])[\/\\]|) ? $1 : 'unknown';

    my $sizes = getTokensCharactersBytesInFile ($baseName);
    if (!defined $sizes)
    {
      return undef;
    }

    # create the entry of the id on first sight, then fill it in.
    my $entry = ($fileInfo->{$id} ||= {});
    $entry->{$type} = $sizes;
    $entry->{language} = $language;
  }
}


# Measures a text file in tokens, characters and bytes.
#
# Parameters:
#   the path of the file to measure; its content is decoded as UTF-8.
#
# Returns a hash ref with the keys 'tokens', 'chars' and 'bytes', all 0 for
# an empty file, or undef if the path is undefined or is not a plain file.
# The bytes are the size of the file on disk; the tokens and characters are
# counted line by line on the decoded text, line terminators included in the
# characters.
#
# Dies if the file exists but cannot be opened for reading.
sub getTokensCharactersBytesInFile
{
  my ($InputFile) = @_;

  # if not a file return undef;
  return undef unless defined $InputFile && -f $InputFile;

  # initialize the sizes to all zeros.
  my %sizes = (tokens => 0, chars => 0, bytes => 0);

  # -s yields an empty string, not 0, for an empty file; map it to 0 so the
  # bytes of an empty file are reported as a number.
  $sizes{bytes} = (-s $InputFile) || 0;

  # return all zeros if file is empty.
  return \%sizes unless $sizes{bytes};

  # :encoding(UTF-8) validates the input, the plain :utf8 layer does not.
  my $fh;
  if (! open ($fh, '<:encoding(UTF-8)', $InputFile))
  {
    die "could not open file '$InputFile' for reading: $!\n";
  }

  # count the items by line.
  while (my $line = <$fh>)
  {
    my @listOfTokens = getListOfTokensInText ($line);
    $sizes{tokens} += scalar @listOfTokens;
    $sizes{chars} += length $line;
  }
  close $fh;

  return \%sizes;
}


# Returns the tokens of a text, in order of appearance. A token is a maximal
# run of alphanumeric characters (\p{Alnum}); everything else separates tokens.
#
# The runs are matched directly instead of splitting on the separators: split
# yields an empty leading field when the text starts with a separator, which
# counted one token too many for every line beginning with white space or
# punctuation.
#
# Returns the list of tokens, or their number in scalar context. An undefined
# text has no tokens.
sub getListOfTokensInText
{
  my ($text) = @_;

  my @tokens = defined ($text) ? ($text =~ /\p{Alnum}+/g) : ();
  return @tokens;
}
