/*
  morogram-sort.c
  Copyright (C) 2001 Shigeki Moro
  $Id: morogram-sort.c,v 1.4 2008/08/28 13:02:17 moroshigeki Exp $

  This file is part of "morogram".
  "morogram" is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License
  as published by the Free Software Foundation; either version 2
  of the License, or (at your option) any later version.
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  
  The algorithm of this program is based on the following paper:
  Makoto NAGAO and Shinsuke MORI.
   "A New Method of N-gram Statistics for Large Number of n and Automatic
   Extraction of Words and Phrases from Large Text Data of Japanese",
   In Proceedings of the 15th International Conference on Computational
   Linguistics (1994), pp.611-615.
   http://www-lab25.kuee.kyoto-u.ac.jp/member/mori/postscript/Coling94.ps
*/

#include <stdio.h>
#include <stdlib.h>

/*
 * Working files, all resolved relative to the current directory.
 *   OFFSET_FILE      - opened read-only by main(); read as fixed 4-byte
 *                      big-endian entries, one per character of the text.
 *   POINTER_FILE     - created by main(); one entry per text position,
 *                      reordered in place by the sort.
 *   COINCIDENCE_FILE - created by main(); entry i is the number of leading
 *                      characters shared by the i-th and (i+1)-th sorted
 *                      positions.
 */
#define OFFSET_FILE      "morogram.offset.bin"
#define POINTER_FILE     "morogram.pointer.bin"
#define COINCIDENCE_FILE "morogram.coincidence.bin"

/* Shared stream handles: fp = offset file, fpp = pointer file,
   fpc = coincidence file.  ucs4() reads through fp directly. */
FILE *fp, *fpp, *fpc;

/*
 * Read one fixed-width, big-endian unsigned integer from a table file.
 *
 *   fp      - stream to read from (it is repositioned by this call)
 *   pointer - index of the entry; the entry starts at byte pointer * offset
 *   offset  - width of one entry in bytes
 *
 * Returns the decoded value, or -1 when the seek fails or the entry lies
 * (even partly) beyond the end of the file.  Accumulating in an unsigned
 * type avoids shifting a negative value, which the previous version did
 * as soon as fgetc() returned EOF.
 *
 * Note: where long is 32 bits wide, a 4-byte entry of 0xFFFFFFFF cannot be
 * told apart from the -1 failure value.
 */
long read_offset(FILE *fp, long pointer, int offset) {
  unsigned long value = 0;
  int i, byte;

  if (fseek(fp, pointer * offset, SEEK_SET) != 0) {
    return -1;
  }
  for (i = 0; i < offset; i++) {
    byte = fgetc(fp);
    if (byte == EOF) {
      return -1;
    }
    value = (value << 8) | (unsigned long) byte;
  }
  return (long) value;
}

/*
 * Write one fixed-width, big-endian integer into a table file.
 *
 *   fp      - stream to write to (it is repositioned by this call)
 *   pointer - index of the entry; the entry starts at byte pointer * offset
 *   item    - value to store; only its low `offset` bytes are kept
 *   offset  - width of one entry in bytes
 *
 * Returns the number of bytes actually written: `offset` on success, fewer
 * (possibly 0) when the seek or a write fails, and 0 when offset <= 0.
 *
 * The stream is positioned once and the bytes are emitted front to back,
 * instead of seeking separately for every byte and writing them in reverse.
 */
int write_offset(FILE *fp, long pointer, long item, int offset) {
  const unsigned long bits = (unsigned long) item;
  const int value_bits = (int) (8 * sizeof bits);
  /* Bytes above the width of long repeat the sign, as an arithmetic
     right shift of `item` would produce. */
  const int fill = (item < 0) ? 0xff : 0x00;
  int i, shift, byte, written = 0;

  if (offset <= 0) {
    return 0;
  }
  if (fseek(fp, pointer * offset, SEEK_SET) != 0) {
    return 0;
  }
  for (i = 0; i < offset; i++) {
    shift = 8 * (offset - 1 - i);
    byte = (shift < value_bits) ? (int) ((bits >> shift) & 0xff) : fill;
    if (fputc(byte, fp) == EOF) {
      break;
    }
    written++;
  }
  return written;
}

/*
 * Print `gram` consecutive characters of the offset file to stdout.
 *
 *   pointer - index of the first character (4-byte entries read via the
 *             global stream fp)
 *   gram    - number of characters to print
 *
 * Each entry is treated as a code point:
 *   - up to 0xEFFFF it is written as UTF-8 (1 to 4 bytes);
 *   - 0xF0000..0x10FFFF it is written as "&Mnnnnnn;", where nnnnnn is the
 *     code point minus 0xEFFFF, zero-padded to six decimal digits
 *     (NOTE(review): presumably a Mojikyo-style character reference -
 *     confirm against the tool that produces the offset file);
 *   - anything else, including a negative value from a failed read, is
 *     skipped.
 *
 * Always returns 0.
 */
int ucs4(long pointer, long gram) {
  long i;
  long code;

  for (i = 0; i < gram; i++) {
    code = read_offset(fp, pointer + i, 4);
    if (code < 0) {
      /* Not a code point (failed read): emit nothing rather than a
         stray byte. */
      continue;
    }
    if (code <= 0x7F) {
      putchar((int) code);
    } else if (code <= 0x7FF) {
      putchar((int) (0xC0 | (code >> 6)));
      putchar((int) (0x80 | (code & 0x3F)));
    } else if (code <= 0xFFFF) {
      putchar((int) (0xE0 | (code >> 12)));
      putchar((int) (0x80 | ((code >> 6) & 0x3F)));
      putchar((int) (0x80 | (code & 0x3F)));
    } else if (code <= 0xEFFFF) {
      putchar((int) (0xF0 | (code >> 18)));
      putchar((int) (0x80 | ((code >> 12) & 0x3F)));
      putchar((int) (0x80 | ((code >> 6) & 0x3F)));
      putchar((int) (0x80 | (code & 0x3F)));
    } else if (code <= 0x10FFFF) {
      /* %ld: the argument is a long. */
      fprintf(stdout, "&M%06ld;", code - 0xEFFFF);
    }
  }
  return 0;
}

/*
 * Compare the suffixes of the text that start at character indices pa and
 * pb, reading 4-byte characters from `text`.
 *
 *   length - total number of characters in the text
 *   order  - if not NULL, receives -1, 0 or 1 when the suffix at pa sorts
 *            before, equal to, or after the suffix at pb.  When one suffix
 *            is a prefix of the other, the shorter one sorts first.
 *
 * Returns the number of leading characters the two suffixes share.  Only
 * positions inside the text are read.
 */
static long match_suffixes(FILE *text, long pa, long pb, long length, int *order) {
  long shared = length - ((pa > pb) ? pa : pb);
  long matched = 0;
  long a, b;

  if (order) {
    *order = 0;
  }
  while (matched < shared) {
    a = read_offset(text, pa + matched, 4);
    b = read_offset(text, pb + matched, 4);
    if (a != b) {
      if (order) {
        *order = (a > b) ? 1 : -1;
      }
      return matched;
    }
    matched++;
  }
  if (order && pa != pb) {
    /* The suffix that starts earlier is the longer one. */
    *order = (pa < pb) ? 1 : -1;
  }
  return matched;
}

/*
 * Usage: <program> length gram_min gram_max frequency_min
 *
 *   length        - number of characters (4-byte entries) in OFFSET_FILE
 *   gram_min/max  - range of n-gram lengths to report
 *   frequency_min - smallest frequency to report
 *
 * First stage: build a table of all text positions in POINTER_FILE, sort it
 * so that the suffixes starting at those positions are in ascending order,
 * then store in COINCIDENCE_FILE how many leading characters each sorted
 * suffix shares with its successor.
 * Second stage: for every n from gram_max down to gram_min, print one line
 * "frequency <TAB> n-gram <TAB> n" to stdout for each n-gram occurring at
 * least frequency_min times.  Progress messages go to stderr.
 *
 * Returns 0 on success and 1 on a usage or file error.
 */
int main (int argc, char *argv[]) {
  int offset = 4, sort_sw, order;
  long length_input_file, gram_min, gram_max, frequency_min;
  long i, j, pi, pj, sort_gap;
  long coincidence_num, max_coincidence_num = 0;
  long gram, frequency;

  if (argc < 5) {
    fprintf(stderr, "usage: %s length gram_min gram_max frequency_min\n",
            (argc > 0 && argv[0]) ? argv[0] : "morogram-sort");
    return 1;
  }

  /* Choose the entry width (in bytes) of the temporary tables. */
  length_input_file = atol(argv[1]);
  if (length_input_file <= 256) {
    offset = 1;
  } else if (length_input_file <= 65536) {
    offset = 2;
  }
  fprintf(stderr, "\toffset length: %d.\n", offset);

  /* Largest and smallest n, and the smallest frequency to report. */
  gram_min = atol(argv[2]);
  gram_max = atol(argv[3]);
  frequency_min = atol(argv[4]);

  /* Create the pointer table: entry i initially holds position i. */
  fprintf(stderr, "\tcreating pointer files...");
  if (!(fpp = fopen(POINTER_FILE, "wb"))) {
    fprintf(stderr, "\n\t*** can't create pointer table ***\n");
    return 1;
  }
  for (i = 0; i < length_input_file; i++) {
    write_offset(fpp, i, i, offset);
  }
  if (fclose(fpp) != 0) {
    fprintf(stderr, "\n\t*** can't create pointer table ***\n");
    return 1;
  }
  fprintf(stderr, "done.\n");


  /* First Stage (1) */

  if (!(fp = fopen(OFFSET_FILE, "rb"))) {
    fprintf(stderr, "\n\t*** can't open offset file ***\n");
    return 1;
  }
  if (!(fpp = fopen(POINTER_FILE, "r+b"))) {
    fprintf(stderr, "\n\t*** can't open pointer table ***\n");
    fclose(fp);
    return 1;
  }

  /* Comb Sort (reference: http://www.ffortune.net/comp/slib/sort/combsort.htm) */
  fprintf(stderr, "\tsorting pointer table...");
  if (length_input_file > 1) {
    sort_gap = length_input_file - 1;
    do {
      /* Shrink the gap by 10/13; gaps of 9 and 10 are bumped to 11. */
      sort_gap = sort_gap * 10 / 13;
      if (sort_gap == 0) {
        sort_gap = 1;
      } else if (sort_gap == 9 || sort_gap == 10) {
        sort_gap = 11;
      }
      sort_sw = 0;
      for (i = 0; i + sort_gap < length_input_file; i++) {
        j = i + sort_gap;
        pi = read_offset(fpp, i, offset);
        pj = read_offset(fpp, j, offset);
        /* Compare the whole overlap of the two suffixes.  The bound must
           not depend on how close the positions are to the start of the
           text, otherwise early positions are ordered by too few
           characters. */
        match_suffixes(fp, pi, pj, length_input_file, &order);
        if (order > 0) {
          write_offset(fpp, i, pj, offset);
          write_offset(fpp, j, pi, offset);
          sort_sw = 1;
        }
      }
      /* Repeat until a pass with gap 1 makes no swap. */
    } while ((sort_sw == 1) || (sort_gap > 1));
  }
  if (fclose(fpp) != 0) {
    fprintf(stderr, "\n\t*** can't write pointer table ***\n");
    fclose(fp);
    return 1;
  }
  fprintf(stderr, "done.\n");

  /* First Stage (2) */

  if (!(fpp = fopen(POINTER_FILE, "rb"))) {
    fprintf(stderr, "\n\t*** can't open pointer table ***\n");
    fclose(fp);
    return 1;
  }
  if (!(fpc = fopen(COINCIDENCE_FILE, "wb"))) {
    fprintf(stderr, "\n\t*** can't create coincidence table ***\n");
    fclose(fpp);
    fclose(fp);
    return 1;
  }

  fprintf(stderr, "\tcounting coincidence number of characters...");
  for (i = 0; i < length_input_file - 1; i++) {
    pi = read_offset(fpp, i    , offset);
    pj = read_offset(fpp, i + 1, offset);
    coincidence_num = match_suffixes(fp, pi, pj, length_input_file, NULL);
    write_offset(fpc, i, coincidence_num, offset);
    if (coincidence_num > max_coincidence_num)
      max_coincidence_num = coincidence_num;
  }

  if (fclose(fpc) != 0) {
    fprintf(stderr, "\n\t*** can't create coincidence table ***\n");
    fclose(fpp);
    fclose(fp);
    return 1;
  }
  fprintf(stderr, "done.\n\tlargest coincidence number: %ld.\n", max_coincidence_num);

  /* No n-gram longer than the largest coincidence number can occur twice. */
  if ((frequency_min > 1) && (gram_max > max_coincidence_num))
    gram_max = max_coincidence_num;

  /* Second Stage */

  fprintf(stderr, "------- Second Stage ------\n\tcalculating ");
  if (!(fpc = fopen(COINCIDENCE_FILE, "rb"))) {
    fprintf(stderr, "\n\t*** can't open coincidence table ***\n");
    fclose(fpp);
    fclose(fp);
    return 1;
  }

  for (gram = gram_max; gram >= gram_min; gram--) {
    fprintf(stderr, "% 10ld-gram frequency.", gram);
    i = 0;
    while (i < length_input_file) {
      /* A group is a run of sorted positions in which every neighbouring
         pair shares at least `gram` characters.  The coincidence table
         has one entry fewer than the pointer table, so the last position
         always closes its group. */
      frequency = 1;
      while ((i + frequency < length_input_file) &&
             (read_offset(fpc, i + frequency - 1, offset) >= gram)) {
        frequency++;
      }
      if (frequency >= frequency_min) {
        pi = read_offset(fpp, i, offset);
        /* Skip suffixes too short to contain a full n-gram. */
        if ((length_input_file - pi) >= gram) {
          fprintf(stdout, "%ld\t", frequency);
          ucs4(pi, gram);
          fprintf(stdout, "\t%ld\n", gram);
        }
      }
      i += frequency;
    }
    /* Back up over the progress message so the next one overwrites it. */
    fprintf(stderr, "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b");
  }
  fclose(fp);
  fclose(fpp);
  fclose(fpc);

  return 0;
}
