#!/usr/local/bin/perl
#
# $Id: morogram-0.7.1x.pl,v 1.35 2004/06/13 03:47:11 dune Exp $
#
use strict;
use Encode qw(from_to decode encode);
use Encode::Guess;
use charnames ':full';	# \pM etc.
use utf8;



### 定数
###-----------------------------------------------------------------//
my $VERSION			= '0.7.1';
my($REVISION)		= q($Revision: 1.35 $) =~ m/\s([\d.]+)\s/;
my $FREQ_MIN		= 2;
my $FREQ_MAX		= 0;
my $GRAM_MIN		= 1;
my $GRAM_MAX		= 256;
my $OFFSET_FILE		= 'morogram.offset.bin';
my $POINTER_FILE	= 'morogram.pointer.bin';
my $COINCID_FILE	= 'morogram.coincidence.bin';
my $UTF8TEMP_FILE	= 'morogram.utf8';
my $DEFAULT_CODE	= 'utf8';
my $SORTER			= "morogram-sort-${VERSION}x.exe";



### 著作権表示用文字列
###-----------------------------------------------------------------//
my $AUTHOR_INFO	=<< "__END__";
$0: N-gram tool version $VERSION
  by Shigeki Moro (moro\@ya.sakura.ne.jp).
__END__



### 使い方
###-----------------------------------------------------------------//
my $USAGE_INFO	=<< "__END__";
Usage: $0 [switches] input_file >output_file
  --help, --?  Display this help.
  --f=min,max  Set min and max frequency
               (default: min=$FREQ_MIN, max=$FREQ_MAX).
  --g=min,max  Set min and max gram
               (default: min=$GRAM_MIN, max=$GRAM_MAX).
  --p          Delete punctuations.
  --e          Regard &Mnnnnnn; as a charcter.
  --BOM        Print Byte Order Mark (BOM).
  --c          Case sensitive.
  --w          Word recognition (default: letter).
  --I=encoding Input char encoding (default: utf8).
  --O=encoding Output char encoding.
  --V          Show version and available encodings.
__END__



### 引数なしで起動したときは GUI を表示する。
###----------------------------------------------------------------//
my($input_file,$output_file);
my $gui_mode = 0;
eval q|
	# Windows only
	while(@ARGV == 0){	# Windows only
		use Win32::FileOp qw/
					OpenDialog SaveAsDialog
					OFN_FILEMUSTEXIST OFN_OVERWRITEPROMPT/;
	
		my $file;
	
		# 入力ファイル名
		$file = OpenDialog({options => OFN_FILEMUSTEXIST});
		-e $file ? push(@ARGV,$file) : last;
	
		# 出力ファイル名
		$file = SaveAsDialog({options => OFN_OVERWRITEPROMPT});
		if(not defined $file){	@ARGV = ();	last;	}
		$output_file = $file;
	
		# GUI 使用時は文字コードを自動判別
		$gui_mode = 1;
		last;
	}
|;



### ヘルプ表示／対応文字コード一覧表示
###-----------------------------------------------------------------//
my @options	= @ARGV;
if(scalar @options == 0 or grep(/[-\/]+[hH?]/,@options)){
	print STDERR "$AUTHOR_INFO\n$USAGE_INFO";
	exit 8;
}elsif(scalar @options == 1 and $options[0] eq '--V'){
	print $AUTHOR_INFO;
	print qq(Win32 freestanding executable: Rev.$REVISION\n);
	print qq(  by Gokuaku (FZH01112\@nifty.com)\n);
	print qq(Available encodings:\n);
	my $i;
	
	foreach(Encode->encodings(":all")){
		printf("%-20s\t",$_);
		if(++$i % 3 == 0){ $i = 0; print "\n"; }
	}
	
	print "\n" if $i % 3;
	print "\n";
	exit 8;
}



### sorter の選択
###-----------------------------------------------------------------//
my $sorter;

eval{
	# Perl2Exe only
	no strict qw(subs);
	$sorter	= PerlApp::extract_bound_file($SORTER);
};

$sorter	||= $SORTER;



### 引数の解釈
###-----------------------------------------------------------------//
my($input_code,$output_code);
my($fmin,$fmax) = ($FREQ_MIN,$FREQ_MAX);
my($gmin,$gmax) = ($GRAM_MIN,$GRAM_MAX);
my $entity2char		= 0;
my $del_punct		= 0;
my $print_BOM		= 0;
my $ignore_case		= 1;
my $word_mode		= 0;

foreach(@options){

	# 頻度設定
	if(m/^--f=(\d+),(\d+)$/){
		$fmin		= $1;
		$fmax		= $2;
	}elsif(m/^--f=,(\d+)$/){		$fmax		= $1	}
	elsif(m/^--f=(\d+),?$/){		$fmin		= $1	}

	# 文字数設定
	elsif(m/^--g=(\d+),(\d+)$/){
		$gmin		= $1;
		$gmax		= $2;
	}elsif(m/^--g=,(\d+)$/){		$gmax		= $1	}
	elsif(m/^--g=(\d+),?$/){		$gmin		= $1	}

	# その他
	elsif(m/^--e$/){				$entity2char	= 1		}
	elsif(m/^--p$/){				$del_punct		= 1		}
	elsif(m/^--c$/){				$ignore_case	= 0		}
	elsif(m/^--BOM$/){				$print_BOM		= 1		}
	elsif(m/^--w$/){				$word_mode		= 1		}

	# 入出力文字コード
	elsif(m/^--I=(.+)$/){			$input_code		= $1	}
	elsif(m/^--O=(.+)$/){			$output_code	= $1	}

	elsif(defined $input_file or (!-e $_ and m/^--?[A-Z]/i)){
		print STDERR "invalid argument: $_\n";
		exit 255;
	}elsif(!-f $_){
		print STDERR "No such file or directory: $_\n";
		exit 255;
	}else{
		$input_file		= $_;
	}
}

$gmax	= $gmin if($gmax < $gmin and $gmax);
$fmax	= $fmin if($fmax < $fmin and $fmax);



### 入出力文字コードの設定
###-----------------------------------------------------------------//
if($input_code =~ m/^Guess$/i or $gui_mode){
	open(FILE,"<",$input_file) or die qq($input_file : $!\n);
	local $/;
	local $_ = <FILE>;
	close FILE;

	my $suspects = $0;
	my @suspects;

	while($suspects =~ s/[CJKT]//){
		$& eq 'C' and push(@suspects,qw/euc-cn/);
		$& eq 'J' and push(@suspects,qw/shiftjis euc-jp 7bit-jis/);
		$& eq 'K' and push(@suspects,qw/euc-kr/);
		$& eq 'T' and push(@suspects,qw/big5-eten/);
	}

	my $enc = guess_encoding($_,@suspects);
	$input_code = ref($enc) ? $enc->name : $DEFAULT_CODE;

	# 後に行単位で読み込むので、２行目以降、ファイル先頭の
	# BOM はあてにできない。代わりにここで BE/LE を確定させる。
	if($input_code =~ m/^u.+\d/i){
		if(m/^\xFE\xFF/){		$input_code .= "BE" }
		elsif(m/^\xFF\xFE/){	$input_code .= "LE" }
	}
}

$input_code ||= $DEFAULT_CODE;
$output_code ||= $input_code;



### オプションの表示
###-----------------------------------------------------------------//
print STDERR $AUTHOR_INFO,"\n";
print STDERR << "__END__";
  frequency range  : [$fmin .. $fmax]
  gram size range  : [$gmin .. $gmax]
  delete punct     : @{[$del_punct   ? 'yes' : 'no']}
  regard &Mnnnnnn; : @{[$entity2char ? 'yes' : 'no']}
  print BOM        : @{[$print_BOM   ? 'yes' : 'no']}
  case sensitive   : @{[$ignore_case ? 'no' : 'yes']}
  word recognition : @{[$word_mode   ? 'yes' : 'no']}
  input encoding   : $input_code
  output encoding  : $output_code
  filename         : $input_file
__END__
print STDERR "\n";



### 前処理
###-----------------------------------------------------------------//
print STDERR "------- First Stage -------\n";
print STDERR "\tcreating offset table...";
open(OFFSET,">",$OFFSET_FILE) or die "can't open $OFFSET_FILE\n";
binmode OFFSET;
open(FILE,"<",$input_file) or die "can't open $input_file\n";
binmode FILE;
my @utf8;

if($word_mode){
	local $/;
	if($input_code =~ m/u.+32LE$/i){
		$/ = "\x0A\x00\x00\x00";
	}elsif($input_code =~ m/u.+16LE$/i or $input_code =~ m/u.+2LE$/i){
		$/ = "\x0A\x00";
	}

	my(%id,$id);
	while(<FILE>){
		$_ = decode($input_code,$_);	# $input_code => internal
		s/\x{FEFF}//g;
		s/&M(\d{6});/chr(0x0EFFFF+$1)/ge	if $entity2char;
		s/　/\x20/g;	s/[\x0D\x0A\x09\x20]+/\x20/g;
		if($ignore_case){	tr/A-Z/a-z/;	tr/Ａ-Ｚ/ａ-ｚ/;	}

		if($del_punct){
			s/\pM/\x20/go;
			s/\pP/\x20/go;
			s/\pS/\x20/go;
			s/\pZ/\x20/go;
		}

		foreach my $utf8 (split m/\x20+/){
			if(not exists $id{$utf8}){
				next unless length $utf8;
				$utf8[++$id] = encode($DEFAULT_CODE,$utf8);
				$id{$utf8} = $id;
				# $id == 0xFFFFFFFF なら ID オーバフローだが、
				# そんな巨大なデータはあり得ない。
			}
			print OFFSET pack('N',$id{$utf8});
		}
	}
}else{
	local $/;
	if($input_code =~ m/u.+32LE$/i){
		$/ = "\x0A\x00\x00\x00";
	}elsif($input_code =~ m/u.+16LE$/i or $input_code =~ m/u.+2LE$/i){
		$/ = "\x0A\x00";
	}

	while(<FILE>){
		$_ = decode($input_code,$_);	# $input_code => internal
		s/^\x{FEFF}//;
		s/&M(\d{6});/chr(0x0EFFFF+$1)/ge	if $entity2char;
		s/　//g;	tr/\x0D\x0A\x09\x20//d;
		if($ignore_case){	tr/A-Z/a-z/;	tr/Ａ-Ｚ/ａ-ｚ/;	}

		if($del_punct){
			s/\pM//go;
			s/\pP//go;
			s/\pS//go;
			s/\pZ//go;
		}

		print OFFSET encode('UTF-32BE',$_)	# utf8 => ucs4
	}
}

close FILE;
close OFFSET;

my $byte4	= (-s $OFFSET_FILE) / 4;
$gmax	= $byte4	if($gmax > $byte4 or !$gmax);
$gmin	= $byte4	if $gmin > $byte4;
$fmax	= $byte4	if($fmax > $byte4 or !$fmax);
$fmin	= $byte4	if $fmin > $byte4;
printf STDERR "done.\n\tnumber of characters is $byte4.\n";

if($byte4 == 1){
	print STDERR "\t*** too few items for N-gram statistics ***\n";
	exit 255;
}



### 主処理
###-----------------------------------------------------------------//
print STDERR qq(\n);
print STDERR qq(------- Second Stage ------\n);

if(defined $output_file){
	open(STDOUT,">",$output_file) or die "$! $output_file\n";
}

print "\x{FEFF}" if $print_BOM;

if($word_mode){
	system "$sorter -$byte4 $gmin $gmax $fmin $fmax";
	open(FILE,"<",$UTF8TEMP_FILE) or die "$! $UTF8TEMP_FILE\n";
	binmode FILE;
	while(<FILE>){
		s/^(\d+\t)((?:....)+?)(\t\d+)\D*/$1@{[
			join(" ",@utf8[unpack('N*',$2)])
		]}$3\n/s or do{ $_ .= <FILE>; redo; };

		from_to($_,$DEFAULT_CODE => $output_code);
		print $_;
	}
	close FILE;
}else{
	system "$sorter +$byte4 $gmin $gmax $fmin $fmax >$UTF8TEMP_FILE";
	open(FILE,"<",$UTF8TEMP_FILE) or die "$! $UTF8TEMP_FILE\n";
	while(<FILE>){
		from_to($_,$DEFAULT_CODE => $output_code);
		print $_;
	}
	close FILE;
}

print STDERR "\n";



### 後処理
###-----------------------------------------------------------------//
END{
	my $first_message	= "\tdeleting temporary file(s)...";
	my $isok	= 1;

	foreach my $file ($OFFSET_FILE, $POINTER_FILE, $COINCID_FILE,
						$UTF8TEMP_FILE){
		if(-e $file){
			$isok	*= unlink $file;
			if(defined $first_message){
				print STDERR $first_message;
				undef $first_message;
			}
		}
	}

	exit 0 if $first_message;
	print STDERR $isok ? "done.\n" : "failed\n";
	my $now = time - $^T;
	printf STDERR
			("\tTotal time: %d hour(s) %d minute(s) %d second(s)\n",
			$now / 3600, $now % 3600 / 60, $now % 60);

	exit 0;
}

__END__
