#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""convert html to sfjpwiki-style text."""

import urllib
import urlparse
import HTMLParser
import os.path
import re
import sys
import urlparse

class Html2SfjpWiki(HTMLParser.HTMLParser):
    """HTML to sfjpwiki-style text converter.

    Feed a document through parse(); the converter walks the tags and
    appends wiki markup to an internal buffer.  Tag handling is table
    driven: ``_start_handlers`` / ``_end_handlers`` map a tag name to a
    callable invoked as ``f(converter, tag, attrs)`` / ``f(converter, tag)``
    whose return value (a string or None) is appended to the output.

    The parsing state (output buffer, tag counters) is never reset, so an
    instance is meant to convert a single document.
    """

    # Words shaped like "FooBar".  They get a "!" prefix in paragraphs and
    # headers (presumably to stop the wiki from auto-linking them -- confirm).
    _REX_WIKIWORD = re.compile(r"([A-Z][a-z0-9]+[A-Z][a-z0-9]+)")
    # A link target that points directly at an image file.
    _REX_IMAGE_URL = re.compile(r"\.(png|PNG|gif|GIF|jpg|JPG)$")
    # Article path with an optional trailing page number.
    _REX_MAGAZINE = re.compile(r"^(/magazine/\d\d/\d\d/\d\d/\d+)(?:/(\d+))?")
    _REX_CAPTION = re.compile(r"^'''(.+?)'''", re.M)
    _REX_HEADER = re.compile(r"^(==+ )(.+?)( ==+)", re.M)
    _REX_SFWIKI = re.compile(r"\?sf[0-9][0-9][0-9][0-9] ")
    # Row/cell tags that are passed through verbatim inside a raw-html table.
    _TABLE_TAGS = ("tr", "td", "th")

    def __init__(self):
        """Create a converter with empty output and default tag handlers."""
        HTMLParser.HTMLParser.__init__(self)
        self._target_id = ""
        self._id = ""              # id of the element to convert ("" = all)
        self._anchors = []
        self._imgs = []
        self._buf = []             # output fragments, joined by parse()
        self._thru = True          # True while skipping input
        self._rex_empty = re.compile(r"^\s*$")
        self._div_counter = 0      # <div> nesting depth inside the target
        self._prevtag = ""
        self._currenttag = ""
        self._stack = []           # text chunks of the <a> being read
        self._href = ""            # href of the <a> being read, or ""
        self.title = ""
        self._start_handlers = {}
        self._end_handlers = {}

        self._stacking = False
        self._add_handlers()
        self._block = False        # True suppresses non-forced output
        self._list_mode = ""       # "ul" or "ol"
        self._in_div = 0
        self._inner = {}           # tag name -> number of currently open tags
        self._url_r_map = {}       # article path -> replacement link target
        self._pre_data = ""        # text collected inside <pre>
        self._in_column = False
        # Must exist before the first </table>; _h_end_table reads it.
        self._in_table = False

    def set_url_replace_list(self, fname):
        """Load URL replacement rules from the file *fname*.

        Every non-blank line holds a URL path and its replacement, separated
        by whitespace.  The rules are added to the map used by _replace_url.
        Exits the program (SystemExit) on a line without a replacement.
        """
        f = open(fname, "r")
        try:
            for item in f:
                entry = item.strip()
                if not entry:
                    continue
                try:
                    (url, repl) = entry.split(None, 1)
                except ValueError:
                    # split(None, 1) never yields more than two fields, so
                    # the only possible failure is a missing replacement.
                    sys.exit("invalid url replace entry "
                             "(expected '<url> <replacement>'): %s" % (item))
                self._url_r_map[url] = repl
        finally:
            f.close()

    def _post_proc(self, str):
        """Apply whole-text fix-ups to the converted text and return it.

        * a line starting with '''text''' becomes a level-6 header
        * wiki words inside header titles get a "!" prefix
        * "?sfNNNN " is rewritten to "?sfwiki "
        """
        def escape_title(m):
            title = self._REX_WIKIWORD.sub(r"!\1", m.group(2))
            return m.group(1) + title + m.group(3)

        t = self._REX_CAPTION.sub(r"====== \1 ======\n", str)
        t = self._REX_HEADER.sub(escape_title, t)
        t = self._REX_SFWIKI.sub(r"?sfwiki ", t)
        return t

    def parse(self, html_string, target_id):
        """Convert html_string and return the resulting wiki text.

        target_id is the ``id`` attribute of the element to convert;
        everything outside of it is skipped.  Conversion stops again when
        the <div> nesting opened inside the target returns to zero, so the
        target is expected to be a <div>.  Pass "" to convert the whole
        document.
        """
        self._anchors = []
        self._imgs = []
        self._id = target_id
        # Skip input until the target element shows up; with no target
        # there is nothing to wait for.
        self._thru = (target_id != "")
        # remove script tag
        rex = re.compile(r"<\s*script[^>]*?>.*?</script>", re.S)
        rex2 = re.compile(r"<\s*noscript[^>]*?>.*?</noscript>", re.S)
        html = rex2.sub("", rex.sub("", html_string))
        # remove some invalid element
        # example: <! -- this is comment. -->
        rex = re.compile(r"<!\s.*?>", re.S)
        html_string = rex.sub("", html)

        self.feed(html_string)
        ret = "".join(self._buf)
        return self._post_proc(ret)

    def handle_starttag(self, tag, attrs):
        """Track the open tag and emit the markup of its start handler."""
        self._prevtag = self._currenttag
        self._currenttag = tag
        self._inner[tag] = self._inner.get(tag, 0) + 1

        if self._thru:
            # Still skipping: only the element with the target id ends it.
            if dict(attrs).get("id") != self._id:
                return
            self._thru = False

        if tag == "div":
            self._div_counter += 1

        if self._inner.get("pre", 0) > 0:
            self._pre_start_handler(tag, attrs)
            return

        if tag in self._start_handlers:
            f = self._start_handlers[tag]
            self._put(f(self, tag, attrs))

    def handle_endtag(self, tag):
        """Track the closed tag and emit the markup of its end handler."""
        self._prevtag = self._currenttag
        self._currenttag = ""
        # An end tag without a start tag must neither raise nor push the
        # counter below zero.
        self._inner[tag] = max(self._inner.get(tag, 0) - 1, 0)

        if self._thru:
            return

        if tag == "div":
            self._div_counter -= 1
            # Leaving the outermost <div> ends the target element.  Without
            # a target id the whole document is wanted, so keep converting.
            if self._div_counter == 0 and self._id:
                self._thru = True
                return

        if self._inner.get("pre", 0) > 0:
            self._pre_end_handler(tag)
            return

        if tag in self._end_handlers:
            f = self._end_handlers[tag]
            self._put(f(self, tag))

    def handle_data(self, data):
        """Emit text, or hold it back while inside <pre> or a link."""
        if self._currenttag == "title":
            self.title = data.strip()

        if self._thru:
            return
        if self._inner.get("pre", 0) > 0:
            self._pre_data_handler(data)
            return

        if self._rex_empty.search(data):
            return

        output = self.wiki_escape(data)

        if self._href:
            # Link text is written by _h_end_a together with the URL.
            self._stack.append(output)
        else:
            self._put(output.rstrip())

    def wiki_escape(self, data):
        """Return data with wiki markup characters escaped.

        "__" is escaped everywhere except directly inside <td>; wiki words
        are escaped inside <p> unless the text belongs to a link.
        """
        if self._currenttag not in ("td",):
            data = data.replace("__", "!__")
        # .get(): no <a> may have been seen yet.
        if self._inner.get("p", 0) > 0 and self._inner.get("a", 0) == 0:
            data = self._REX_WIKIWORD.sub(r"!\1", data)
        return data

    def handle_charref(self, ref):
        """Discard numeric character references."""
        pass

    def handle_entityref(self, name):
        """Discard named entity references."""
        pass

    def _put(self, str, force=False):
        """Append str to the output.

        None is ignored.  While output is blocked (self._block) only calls
        with force=True are written.
        """
        if str is None:
            return
        if self._block and not force:
            return
        self._buf.append(str)

    def _add_handlers(self):
        """add start/end handlers for each tag."""
        # generate simple replace rule.
        # prepare dictionary. key is tag. value is replaced string.
        r_starttag = dict(
            p="\n\n",
            i="''",
            tt="`",
            b="'''",
            strong="'''",
            big="'''",
            small="__",
            hr="----\n",
            h3="\n=== ",
            h4="\n==== ",
            br="\n")
        # generate function to replace tag to string.
        for key in r_starttag:
            self._start_handlers[key] = lambda s, t, attr: r_starttag[t]

        # for end tag, do same process.
        r_endtag = dict(
            p="\n",
            i="''",
            tt="`",
            b="'''",
            strong="'''",
            big="'''",
            small="__",
            h3=" ===\n",
            h4=" ====\n")
        for key in r_endtag:
            self._end_handlers[key] = lambda s, t: r_endtag[t]

        # add the class's "_h_start_<tagname>" function to
        # _start_handlers[tagname] and "_h_end_<tagname>" function to
        # _end_handlers[tagname].  dir() also lists inherited names, so
        # handlers keep working in subclasses.  self.__class__ is used
        # because type(self) is not the class for old-style instances.
        cls = self.__class__
        for name in dir(cls):
            if name.startswith("_h_start_"):
                # for example, "_h_start_img" goes to _start_handlers["img"].
                tagname = name[len("_h_start_"):]
                self._start_handlers[tagname] = getattr(cls, name)
            elif name.startswith("_h_end_"):
                # for example, "_h_end_img" goes to _end_handlers["img"].
                tagname = name[len("_h_end_"):]
                self._end_handlers[tagname] = getattr(cls, name)

    # tag specific handlers

    def _expand_attrs(self, tag, attrs):
        """Rebuild the HTML start tag for tag from its attrs pairs.

        An attribute without a value (reported by the parser as None) is
        written as the bare name.
        """
        parts = [tag]
        for (key, val) in attrs:
            if val is None:
                parts.append(key)
            else:
                parts.append('%s="%s"' % (key, val))
        return "<" + " ".join(parts) + ">"

    def _h_start_table(self, tag, attrs):
        """Start a raw-html block for class="table"; block other tables."""
        # if tag has "class" attribute, and those value is "table":
        if ("class", "table") in attrs:
            # Rows and cells are copied through as html until </table>.
            for name in self._TABLE_TAGS:
                self._start_handlers[name] = \
                    lambda s, t, a: s._expand_attrs(t, a)
            self._end_handlers["tr"] = lambda s, t: "</" + t + ">\n"
            self._end_handlers["td"] = lambda s, t: "</" + t + ">"
            self._end_handlers["th"] = lambda s, t: "</" + t + ">"
            self._block = False
            self._in_table = True
            return "{{{ html\n"

        # Any other table is dropped: suppress output until </table>.
        self._block = True
        self._in_table = False

    def _h_end_table(self, tag):
        """Close the raw-html block, or re-enable output after a dropped table."""
        was_converted = self._in_table
        self._in_table = False
        self._block = False
        if not was_converted:
            return None
        # pop(): a nested table may already have removed the handlers.
        for name in self._TABLE_TAGS:
            self._start_handlers.pop(name, None)
            self._end_handlers.pop(name, None)
        return "\n</table>\n}}}\n"

    def _h_start_ul(self, tag, attrs):
        """Following <li> items are bullets."""
        self._list_mode = "ul"

    def _h_start_ol(self, tag, attrs):
        """Following <li> items are numbered."""
        self._list_mode = "ol"

    def _h_start_li(self, tag, attrs):
        """Return the item prefix for the current list mode."""
        if self._list_mode == "ul":
            return " * "
        elif self._list_mode == "ol":
            return " 1. "

    def _h_end_li(self, tag):
        return "\n"

    def _h_end_ol(self, tag):
        return "\n"

    def _h_end_ul(self, tag):
        return "\n"

    def _h_start_caption(self, tag, attrs):
        """Open the caption as <h6>; <b> emits nothing until it is closed."""
        self._start_handlers.pop("b", None)
        self._end_handlers.pop("b", None)
        return "<h6>"

    def _h_end_caption(self, tagd):
        """Close the caption, restore <b> and open the html table."""
        self._start_handlers["b"] = lambda s, t, a: "'''"
        self._end_handlers["b"] = lambda s, t: "'''"
        return """</h6>\n<table class="wikitable" border="1">\n\n"""

    def _h_start_img(self, tag, attrs):
        """Write a Thumb macro for the image, even while output is blocked.

        When the image directly follows an <a> whose href is itself an
        image file, the macro names the linked file instead of ``src``.
        The pending link is cancelled in either case.
        """
        src = ""
        title = ""
        for (attr, val) in attrs:
            if attr == "src":
                src = val or ""
            elif attr == "alt":
                title = val

        if (self._prevtag == "a" and self._href
                and self._REX_IMAGE_URL.search(self._href)):
            filename = self._href.split("/")[-1]
        else:
            filename = src.split("/")[-1]
        self._href = ""

        if title:
            self._put("[[Thumb(%s, caption=%s)]]\n\n" % (filename, title), True)
        else:
            self._put("[[Thumb(%s)]]\n\n" % (filename,), True)

    def _h_start_a(self, tag, attrs):
        """Remember the href; the link is written when </a> arrives."""
        href = ""
        for (attr, val) in attrs:
            if attr == "href":
                href = val
                break
        if href:
            self._href = href
            # Forget text left over from a link that was never written.
            self._stack = []

    def _replace_url(self, url):
        """Return the configured replacement for url, or url itself.

        Only sourceforge.jp URLs whose /magazine/YY/MM/DD/N path is in the
        replacement map are rewritten.  If the path continues with a page
        number P, "_p1" is removed from the replacement and "_pP" appended.
        """
        t = urlparse.urlparse(url)
        # t[1] is the host part, t[2] the path.
        if t[1] != "sourceforge.jp":
            return url
        m = self._REX_MAGAZINE.search(t[2])
        if not m or m.group(1) not in self._url_r_map:
            return url
        repl = self._url_r_map[m.group(1)]
        if m.group(2):
            return repl.replace("_p1", "") + "_p" + m.group(2)
        return repl

    def _h_end_a(self, tag):
        """Write the pending link: html inside tables, wiki syntax elsewhere."""
        if not self._href:
            return
        # The link text may arrive in several chunks when it contains
        # inline tags; use all of them.
        content = "".join(self._stack)
        self._stack = []

        rurl = self._replace_url(self._href)
        self._href = ""
        if self._inner.get("table", 0) > 0:
            self._put('<a href="' + rurl + '">' + content + "</a>")
        else:
            self._put("[" + rurl + " " + content + "]")

    def _h_start_div(self, tag, attrs):
        """Block class="navigation" divs; pass class="column" divs as html."""
        if ("class", "navigation") in attrs:
            self._in_div = 1
            self._block = True
        elif ("class", "column") in attrs:
            self._in_div = 1
            self._in_column = True
            self._put("""{{{ html
<div class="column">
}}}
""")
        elif self._in_div > 0:
            # Nested inside a tracked div: count it so the matching </div>
            # is not mistaken for the end of the tracked one.
            self._in_div += 1

    def _h_end_div(self, tag):
        """Undo what _h_start_div set up for the closing div."""
        if self._in_div > 0:
            self._in_div -= 1

        if self._in_div == 0:
            self._block = False

        if self._in_column:
            self._in_column = False
            self._put("""{{{ html
</div>
}}}
""")

    def _h_start_pre(self, tag, attrs):
        pass

    def _h_end_pre(self, tag):
        """Return the collected <pre> text as a {{{ }}} block."""
        t = """{{{%s}}}
""" % self._pre_data
        self._pre_data = ""
        return t

    def _pre_data_handler(self, data):
        """Collect text found inside <pre>."""
        self._pre_data = self._pre_data + data

    def _pre_start_handler(self, tag, attrs):
        """Handle a start tag seen while inside <pre>: tags are dropped."""
        if tag == "pre":
            self._h_start_pre(tag, attrs)

    def _pre_end_handler(self, tag):
        """Handle an end tag seen while a <pre> is still open.

        The tag is dropped.  The collected text is kept: it is written by
        _h_end_pre once the outermost </pre> arrives.
        """
        pass
