﻿using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

namespace SlothLib.NLP
{
    /// <summary>
    /// Filter that joins runs of consecutive nouns into a single Morpheme whose
    /// part of speech is "名詞,名詞句" (noun, noun phrase).
    /// </summary>
    public class NounPhraseFilter : IMorphemeFilter
    {
        /// <summary>
        /// Part of speech assigned to every joined noun phrase.
        /// </summary>
        private const string NounPhrasePos = "名詞,名詞句";

        /// <summary>
        /// Matches text made up only of ASCII letters, digits and spaces.
        /// Built once here instead of being looked up on every loop iteration.
        /// </summary>
        private static readonly Regex asciiWordsRegex = new Regex("^[a-zA-Z0-9 ]+$", RegexOptions.Compiled);

        private readonly Regex includeRegex;
        private readonly Regex excludeRegex;

        /// <summary>
        /// Creates a filter with explicit include/exclude patterns.
        /// </summary>
        /// <param name="includePattern">Regular expression pattern matching the parts of speech that should be joined.</param>
        /// <param name="excludePattern">Regular expression pattern matching the parts of speech that must not be joined.</param>
        public NounPhraseFilter(string includePattern, string excludePattern)
        {
            this.includeRegex = new Regex(includePattern, RegexOptions.Compiled);
            this.excludeRegex = new Regex(excludePattern, RegexOptions.Compiled);
        }

        /// <summary>
        /// Creates a filter with the default patterns: parts of speech that start with
        /// "名詞" (noun) are joined, except those that contain "サ変接続" or match "接尾.人名".
        /// </summary>
        public NounPhraseFilter()
            : this("^名詞", "サ変接続|接尾.人名")
        { }

        #region IMorphemeFilter members

        /// <summary>
        /// Does nothing and returns the morpheme as it is.
        /// </summary>
        /// <param name="morpheme">The morpheme to convert.</param>
        /// <returns>The converted morpheme; the very same instance that was passed in.</returns>
        public IMorpheme DoFilter(IMorpheme morpheme)
        {
            return morpheme;
        }

        /// <summary>
        /// Joins each run of consecutive joinable morphemes into a single Morpheme whose
        /// part of speech is "名詞,名詞句". A run of length one is passed through unchanged,
        /// as is every morpheme that is not joinable.
        /// </summary>
        /// <param name="morphemes">The morpheme sequence before conversion.</param>
        /// <returns>The morpheme sequence after conversion.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="morphemes"/> is null.</exception>
        public IMorpheme[] DoFilter(IEnumerable<IMorpheme> morphemes)
        {
            if (morphemes == null)
            {
                throw new ArgumentNullException("morphemes");
            }

            List<IMorpheme> resultList = new List<IMorpheme>();
            bool inRun = false;          // true while a run of joinable morphemes is being collected
            string raw = "";             // concatenated Raw text of the current run
            string original = "";        // concatenated Original text of the current run
            IMorpheme singleNoun = null; // the only member of the run while its length is one, otherwise null

            foreach (IMorpheme morpheme in morphemes)
            {
                if (IsJoinable(morpheme))
                {
                    if (inRun)
                    {
                        // When the joined surface text would be purely ASCII letters, digits
                        // and spaces, keep the parts separated by a space; otherwise
                        // concatenate them directly. The decision is made on the Raw text
                        // and applied to both Raw and Original.
                        string separator = asciiWordsRegex.IsMatch(raw + morpheme.Raw) ? " " : "";
                        raw += separator + morpheme.Raw;
                        original += separator + morpheme.Original;
                        singleNoun = null;
                    }
                    else
                    {
                        inRun = true;
                        raw = morpheme.Raw;
                        original = morpheme.Original;
                        singleNoun = morpheme;
                    }
                }
                else
                {
                    if (inRun)
                    {
                        resultList.Add(CloseRun(singleNoun, raw, original));
                        inRun = false;
                    }
                    resultList.Add(morpheme);
                }
            }

            // Emit a run that reaches the end of the input.
            if (inRun)
            {
                resultList.Add(CloseRun(singleNoun, raw, original));
            }
            return resultList.ToArray();
        }

        #endregion

        /// <summary>
        /// Tells whether the morpheme's part of speech matches the include pattern
        /// and does not match the exclude pattern.
        /// </summary>
        private bool IsJoinable(IMorpheme morpheme)
        {
            return includeRegex.IsMatch(morpheme.POS) && !excludeRegex.IsMatch(morpheme.POS);
        }

        /// <summary>
        /// Produces the output morpheme for a finished run: the original morpheme when the
        /// run had a single member, otherwise a new noun-phrase Morpheme built from the joined text.
        /// </summary>
        private static IMorpheme CloseRun(IMorpheme singleNoun, string raw, string original)
        {
            if (singleNoun != null)
            {
                return singleNoun;
            }
            return new Morpheme(NounPhrasePos, raw, original);
        }
    }
}
