#!/usr/bin/perl -w 
#
# morogram.5.8.pl (N-gram tool for Perl 5.8.x)
# by Shigeki Moro (s-moro@hanazono.ac.jp)
#
# $Id: morogram.5.8.pl,v 1.3 2008/08/28 13:00:02 moroshigeki Exp $
#
# This file is part of "morogram".
# "morogram" is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
# 
# このスクリプトは以下の論文にあるアルゴリズムに基づいています。
# Makoto NAGAO and Shinsuke MORI.
# 	"A New Method of N-gram Statistics for Large Number of n and Automatic
# 	Extraction of Words and Phrases from Large Text Data of Japanese",
# 	In Proceedings of the 15th International Conference on Computational
# 	Linguistics (1994), pp.611-615.
# 	http://www-lab25.kuee.kyoto-u.ac.jp/member/mori/postscript/Coling94.ps


##########  初期設定 ##########
use strict;
use utf8;
use Encode::Unicode;
# Start time of the run; the elapsed time is reported at the very end.
my $now = time;

# Program identity shown in the banner.
my ($version, $email) = ('0.7.3', 's-moro@hanazono.ac.jp');

# switches (defaults; the command line may override them)
my $frequency_min = 2;                  # --f : minimum frequency
my ($gram_min, $gram_max) = (1, 256);   # --g : shortest / longest gram
# The three on/off switches (--e, --p, --BOM) all start out as 'off'.
my ($entity2characters, $delete_punctuation, $print_BOM) = ('off') x 3;
# Characters removed from the input when --p is given.
my $punctuations = q#-!"'(),./:;?[]_{}|¡«»¿‘’‚‛“”„‟‥…‹›‼、。〈〉《》「」『』【】〔〕〖〗〘〙〚〛〞〟｡｢｣､･！（），－．／：；？［＼］＿｛｜｝#;

# Not referenced anywhere else in this script.
my $packformat = 'N';

# Temporary files.  This script writes only the OFFSET file itself; the
# other two are merely deleted at the end (presumably they are produced
# by the external morogram-sort program -- TODO confirm).
my ($OFFSET_FILE, $POINTER_FILE, $COINCIDENCE_FILE)
    = map { "morogram.$_.bin" } qw(offset pointer coincidence);

# Banner: program name, version and author.
my $title1 = <<"END_OF_BANNER";

morogram: N-gram tool version $version,
    by Shigeki Moro ($email).
END_OF_BANNER

# Usage text; it interpolates the default values declared above.
my $title2 = <<"END_OF_USAGE";

Usage: perl morogram.pl [switches] input_file > output_file
    --help       Display this help.
    --f=n        Set minimum frequency (default: n=$frequency_min).
    --g=min,max  Set minimum and maximal gram (default: min=$gram_min, max=$gram_max).
    --p          Delete punctuations.
    --e          Regard &Mnnnnnn; as a charcter.
    --BOM        Print Byte Order Mark (BOM).
END_OF_USAGE

# Parse the command line.
# With no arguments at all, or when any argument contains "--help", the
# banner and the usage text are printed to STDERR and the program exits.
my @options = @ARGV;
if (scalar @options == 0 or scalar grep(/--help/i, @options) > 0) {
    print STDERR $title1, $title2;
    exit;
}

# Recognised switches update the defaults; any other argument is taken as
# the input file name (when several are given, the last one wins).
my $fullpath = '';
for my $arg (@options) {
    if ($arg =~ /^--f=(\d+)$/) {
        $frequency_min = $1;
    } elsif ($arg =~ /^--g=(\d+),(\d+)$/) {
        $gram_min = $1;
        $gram_max = $2;
    } elsif ($arg =~ /^--e$/) {
        $entity2characters = 'on';
    } elsif ($arg =~ /^--p$/) {
        $delete_punctuation = 'on';
    } elsif ($arg =~ /^--BOM$/) {
        $print_BOM = 'on';
    } elsif ($arg =~ /^--/ and not -e $arg) {
        # Looks like a switch but matches none of the above (for example
        # "--f=abc").  Report it instead of silently treating it as the
        # input file name.  An existing file whose name starts with "--"
        # is still accepted by the branch below.
        print STDERR $title2;
        die "Unknown or malformed switch: $arg\n";
    } else {
        $fullpath = $arg;
    }
}

# Only switches were given: fail here with a clear message rather than
# later with "Can't open :".
if ($fullpath eq '') {
    print STDERR $title2;
    die "No input file specified.\n";
}

# A gram range whose minimum exceeds its maximum cannot select anything.
if ($gram_min > $gram_max) {
    die "Invalid --g range: minimum ($gram_min) is larger than maximum ($gram_max).\n";
}

# Echo the effective settings to STDERR.
print STDERR
    "$title1\n",
    "\tminimum number of frequency: $frequency_min\n",
    "\tminimum number of gram     : $gram_min\n",
    "\tlargest number of gram     : $gram_max\n",
    "\tfilename                   : $fullpath\n",
    "\n";

# Input file, read through the :utf8 layer so that each line is handled
# as a string of characters.
open my $input_fh, '<:utf8', $fullpath
    or die "Can't open $fullpath: $!";

########## First Stage (1) ##########

# Create the OFFSET file (UCS4 based) and count the characters.
# The normalised text is written as UTF-32BE, i.e. four bytes per
# character, so the character count is simply the file size divided by 4.
print STDERR "------- First Stage -------\n";
print STDERR "\tcreating OFFSET file...";
open my $offset_fh, '>:encoding(UTF-32BE)', $OFFSET_FILE
    or die "Can't create $OFFSET_FILE: $!";
while (my $line = <$input_fh>) {
    chomp $line;
    # Remove a BOM (first U+FEFF on the line).
    $line =~ s/\x{FEFF}//;
    # With --e, replace each &Mnnnnnn; entity by the single character
    # whose code point is 0xEFFFF + nnnnnn, so it counts as one character.
    $line =~ s/\&M(\d\d\d\d\d\d);/chr(0xEFFFF+$1)/ge if $entity2characters eq 'on';
    # Remove whitespace, including the ideographic space.
    $line =~ s/[\s　]//g;
    # With --p, remove punctuation.
    $line =~ s#[\Q$punctuations\E]##g if $delete_punctuation eq 'on';
    # Fold ASCII upper case to lower case.
    $line =~ tr/A-Z/a-z/;
    print {$offset_fh} $line
        or die "Can't write to $OFFSET_FILE: $!";
}
close $input_fh;
# Buffered write errors only surface at close, so check it.
close $offset_fh
    or die "Can't finish writing $OFFSET_FILE: $!";

my $length_input_file = (-s $OFFSET_FILE) / 4;
# A gram cannot be longer than the whole text.
$gram_max = $length_input_file if ($gram_max > $length_input_file);
print STDERR "done.\n\tnumber of characters: $length_input_file.\n";

# Stop when the text is too short for N-gram statistics: a single
# character, or no characters at all (empty input, or input that became
# empty after whitespace/punctuation removal).
if ($length_input_file <= 1) {
    print STDERR "\t*** too few item for N-gram statistics ***\n";
    # The normal clean-up at the end of the script is skipped by this
    # early exit, so remove the temporary OFFSET file here.
    unlink $OFFSET_FILE;
    exit;
}

# sort & print
# The external morogram-sort program is run with the character count, the
# gram range and the minimum frequency as its arguments; it shares this
# process's STDOUT.
if ($print_BOM eq 'on') {
    # Unbuffer STDOUT (the currently selected handle) so the BOM is
    # certain to reach the output before anything the child writes.
    $| = 1;
    # U+FEFF is a wide character; the :utf8 layer emits the same UTF-8
    # bytes as before but without a "Wide character in print" warning.
    binmode STDOUT, ':utf8';
    print STDOUT "\x{FEFF}";
}

# List form: the arguments are passed directly, never through a shell.
my @sort_command = ('morogram-sort', $length_input_file, $gram_min, $gram_max, $frequency_min);
if (system(@sort_command) != 0) {
    # Report the failure but carry on, so the temporary files still get
    # cleaned up below.
    if ($? == -1) {
        print STDERR "\n\t*** could not run morogram-sort: $! ***\n";
    } else {
        printf STDERR "\n\t*** morogram-sort failed (wait status %d) ***\n", $?;
    }
}

##########  Clean-up ##########

# Delete the temporary files and report whether all three were removed.
print STDERR "\n\n\tdeleting temporary files...";
my @temporary_files = ($OFFSET_FILE, $POINTER_FILE, $COINCIDENCE_FILE);
# unlink returns the number of files it actually deleted.
my $unlinked = unlink @temporary_files;
print STDERR ($unlinked == 3 ? "done.\n" : "failed\n");

# Report the elapsed time.
# $now held the start time until here; from now on it holds the elapsed
# number of seconds.
$now = time - $now;
my $elapsed_hours   = int($now / 3600);
my $elapsed_minutes = int(($now % 3600) / 60);
my $elapsed_seconds = $now % 60;
printf STDERR ("\tTotal time: %d hour(s) %d minute(s) %d second(s)\n",
    $elapsed_hours, $elapsed_minutes, $elapsed_seconds);

__END__
