/*
 * Copyright (C) 2009 by Aiwota Programmer
 * aiwotaprog@tetteke.tk
 *
 * This file is part of Dialektos.
 *
 * Dialektos is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Dialektos is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Dialektos.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef HTML_PARSER_HXX
#define HTML_PARSER_HXX

#include <algorithm>
#include <map>
#include <string>
#include <boost/range.hpp>
#include <boost/utility.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/foreach.hpp>
#include <boost/lambda/lambda.hpp>


namespace dialektos {


/*! @brief HTML parser driver with auto anchor(http:// and ttp://) for DAT

ParseHtmlFuncT must implement function sets for parsing HTML.
@code
class ParseHtmlFuncT {
public:
  void parse_text(range);
  iterator parse_tag(range);
  iterator parse_entity(range);
  iterator parse_http(range);
  iterator parse_ttp(range);
};
@endcode
@warning returned value 'iterator' must be the position
after the next of range.begin
*/
template <typename ParseHtmlFuncT>
class DatHtmlParserDriver: public ParseHtmlFuncT {
public:

  /*! @brief parse HTML
   *
   * @param range is HTML represented by Boost.Range concepts.
   */
  template <typename RangeT> void parse_html(const RangeT& range) {
    typedef typename boost::range_const_iterator<RangeT>::type Iterator;

    Iterator start = boost::begin(range);

    for (Iterator it = boost::begin(range); it != boost::end(range);) {

      if (*it == '<') {
        if (start != it) parse_text(std::make_pair(start, it));
        it = parse_tag(std::make_pair(it, boost::end(range)));
        start = it;
      } else if (*it == '&') {
        if (start != it) parse_text(std::make_pair(start, it));
        it = parse_entity(std::make_pair(it, boost::end(range)));
        start = it;
      } else if (boost::starts_with(
          std::make_pair(it, boost::end(range)), "http://")) {
        if (start != it) parse_text(std::make_pair(start, it));
        it = parse_http(std::make_pair(it, boost::end(range)));
        start = it;
      } else if (boost::starts_with(
          std::make_pair(it, boost::end(range)), "https://")) {
        if (start != it) parse_text(std::make_pair(start, it));
        it = parse_http(std::make_pair(it, boost::end(range)));
        start = it;
      } else if (boost::starts_with(
          std::make_pair(it, boost::end(range)), "ttp://")) {
        if (start != it) parse_text(std::make_pair(start, it));
        it = parse_ttp(std::make_pair(it, boost::end(range)));
        start = it;
      } else
        ++it;
    }

    if (start != boost::end(range)) {
      parse_text(std::make_pair(start, boost::end(range)));
    }
  }
};


/*! @brief HTML parser driver

ParseHtmlFuncT is a class, which implements function sets for parsing HTML.
@code
class ParseHtmlFuncT {
public:
  void parse_text(range);
  iterator parse_tag(range);
  iterator parse_entity(range);
};
@endcode
@warning returned value 'iterator' must be the position
after the next of range.begin
*/
template <typename ParseHtmlFuncT>
class HtmlParserDriver: public ParseHtmlFuncT {
public:

  /*! @brief parse HTML
   *
   * @param range is HTML represented by Boost.Range concepts.
   */
  template <typename RangeT> void parse_html(const RangeT& range) {
    typedef typename boost::range_const_iterator<RangeT>::type Iterator;

    Iterator start = boost::begin(range);

    for (Iterator it = boost::begin(range); it != boost::end(range);) {

      if (*it == '<') {
        if (start != it) parse_text(std::make_pair(start, it));
        it = parse_tag(std::make_pair(it, boost::end(range)));
        start = it;
      } else if (*it == '&') {
        if (start != it) parse_text(std::make_pair(start, it));
        it = parse_entity(std::make_pair(it, boost::end(range)));
        start = it;
      } else
        ++it;
    }

    if (start != boost::end(range)) {
      parse_text(std::make_pair(start, boost::end(range)));
    }
  }
};


/*! @brief HTML parser function sets with auto anchor(http:// and ttp://).

DrivedT must be implemented like this,
@code
class DrivedT {
public:
  on_data(rage);
  on_start_tag(range, attributes);
  on_end_tag(range);
};
@endcode
here 'range' is Boost.Range concepts and 'attributes' is a map.
*/
template <typename DerivedT>
class HtmlParserFunctions {
private:
  typedef std::string StringType;
  typedef std::string AttrKeyType;
  typedef std::string AttrDataType;
  typedef std::map<AttrKeyType, AttrDataType> AttributesType;

  DerivedT* pThis() { return static_cast<DerivedT*>(this); }

  template <typename RangeT>
  void on_data(const RangeT& range) {
    pThis()->on_data(range);
  }
  template <typename RangeT, typename MapT>
  void on_start_tag(const RangeT& range, const MapT& attrs) {
    pThis()->on_start_tag(range, attrs);
  }
  template <typename RangeT>
  void on_end_tag(const RangeT& range) {
    pThis()->on_end_tag(range);
  }

protected:

  /*! @brief parse a normal text */
  template <typename RangeT>
  void parse_text(const RangeT& range) {
    on_data(range);
  }

  /*! @brief skip white spaces */
  template <typename RangeT>
  typename boost::range_const_iterator<RangeT>::type
  skip_white_space(const RangeT& range) const {
    typedef typename boost::range_const_iterator<RangeT>::type Iterator;
    using boost::begin;
    using boost::end;

    Iterator it = begin(range);
    for (; it != end(range); ++it) if (*it != ' ') break;
    return it;
  }

  /*! @brief parse a HTML tag*/
  template <typename RangeT>
  typename boost::range_const_iterator<RangeT>::type
  parse_tag(const RangeT& range) {
    typedef typename boost::range_const_iterator<RangeT>::type Iterator;

    Iterator it = boost::begin(range);
    const Iterator start = boost::begin(range);
    const Iterator end = boost::end(range);
    ++it; // skip '<'

    Iterator ret = skip_white_space(std::make_pair(it, end));
    if (ret == end) {
      on_data(std::make_pair(start, ret));
      return ret;
    }
    it = ret;

    enum start_or_end {
      StartTag, EndTag
    };

    start_or_end s_o_e = StartTag;

    // check if <tag> or </tag>
    if (*it == '/') {
      s_o_e = EndTag;
      ++it;
    }

    ret = skip_white_space(std::make_pair(it, end));
    if (ret == end) {
      on_data(std::make_pair(start, ret));
      return ret;
    }
    it = ret;

    Iterator tag_end = std::find(it, end, '>');
    if (tag_end == end) {
      // cannot find '>'
      on_data(std::make_pair(boost::begin(range), it));
      return it;
    }

    // check <tag/>
    Iterator tag_with_slash = tag_end;
    std::advance(tag_with_slash, -1);
    if (*tag_with_slash == '/') tag_end = tag_with_slash;

    Iterator tag_name_start = skip_white_space(std::make_pair(it, tag_end));
    if (tag_name_start == tag_end) {
      // ignore space only tag
      tag_end = std::find(tag_end, end, '>');
      ++tag_end;
      return tag_end;
    }

    AttributesType attrs;
    Iterator tag_name_end = std::find(it, tag_end, ' ');
    if (tag_name_end != tag_end) {
      attrs = parse_attributes(std::make_pair(tag_name_end, tag_end));
    }

    if (s_o_e == StartTag)
      on_start_tag(std::make_pair(tag_name_start, tag_name_end), attrs);
    else
      on_end_tag(std::make_pair(tag_name_start, tag_name_end));

    tag_end = std::find(tag_end, end, '>');
    ++tag_end;
    return tag_end;
  }

  /*! @brief parse a HTML entity*/
  template <typename RangeT>
  typename boost::range_const_iterator<RangeT>::type
  parse_entity(const RangeT& range) {
    typedef typename boost::range_const_iterator<RangeT>::type Iterator;
    typedef typename boost::iterator_range<Iterator> IterRange;

    Iterator semicolon = boost::begin(range);
    for (int i = 0;; ++semicolon, ++i) {
      if (i == 10 || semicolon == boost::end(range)) {
        on_data(StringType("&"));
        return boost::next(boost::begin(range));
      }
      if (*semicolon == ';') break;
    }
    IterRange entity(boost::next(boost::begin(range)), semicolon);

    if (boost::starts_with(entity, "gt")) on_data(StringType(">"));
    else if (boost::starts_with(entity, "lt")) on_data(StringType("<"));
    else if (boost::starts_with(entity, "amp")) on_data(StringType("&"));
    else if (boost::starts_with(entity, "quot")) on_data(StringType("\""));
    else if (boost::starts_with(entity, "apos")) on_data(StringType("'"));
    else {
      on_data(StringType("&"));
      return boost::next(boost::begin(range));
    }
    return boost::next(semicolon);
  }

  /*! @brief parse http. automatically make an anchor. */
  template <typename RangeT>
  typename boost::range_const_iterator<RangeT>::type
  parse_http(const RangeT& range) {
    typedef typename boost::range_const_iterator<RangeT>::type Iterator;

    Iterator end = std::find(boost::begin(range), boost::end(range), ' ');

    AttributesType attributes;

    attributes[AttrKeyType("href")] = AttrDataType(boost::begin(range), end);
    on_start_tag(StringType("a"), attributes);
    on_data(std::make_pair(boost::begin(range), end));
    on_end_tag(StringType("a"));

    return end;
  }

  /*! @brief parse ttp. automatically make an anchor. */
  template <typename RangeT>
  typename boost::range_const_iterator<RangeT>::type
  parse_ttp(const RangeT& range) {
    typedef typename boost::range_const_iterator<RangeT>::type Iterator;

    Iterator end = std::find(boost::begin(range), boost::end(range), ' ');

    AttributesType attributes;
    StringType href = "h";
    href += StringType(boost::begin(range), end);
    attributes["href"] = href;
    on_start_tag(StringType("a"), attributes);
    on_data(std::make_pair(boost::begin(range), end));
    on_end_tag(StringType("a"));

    return end;
  }

private:
  template <typename RangeT>
  AttributesType parse_attributes(const RangeT& range) const {
    typedef typename boost::range_const_iterator<RangeT>::type Iterator;
    typedef typename boost::range_value<RangeT>::type ValueType;
    typedef typename boost::iterator_range<Iterator> IterRange;
    using namespace boost::lambda;
    using boost::lambda::_1;

    AttributesType attrs;

    const Iterator end = boost::end(range);
    for (Iterator start = boost::begin(range); start != end;) {
      start = skip_white_space(std::make_pair(start, end));
      if (start == end) break;

      Iterator equal = std::find_if(start, end, _1 == '=' || _1 == ' ');
      if (equal == end) break;
      if (*equal == ' ') {
        // not a 'name=value' token, just ignore
        start = equal;
        ++start;
        continue;
      }

      const StringType name(start, equal);
      Iterator quat = equal;
      ++quat;
      if (quat == end) break;

      if (*quat == '"' || *quat == '\'') {
        Iterator attr_end = std::find(boost::next(quat), end, *quat);
        StringType value(quat, attr_end);
        if (*quat == '"') boost::trim_if(value, _1 == '"');
        else boost::trim_if(value, _1 == '\'');
        attrs[name] = value;
        start = attr_end;
        if (start != end) ++start;
      } else {
        // not quoted
        Iterator attr_end = std::find(quat, end, ' ');
        StringType value(quat, attr_end);
        attrs[name] = value;
        start = attr_end;
      }
    }
    return attrs;
  }
};

}  // namespace dialektos

#endif
