#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import os
import os.path
import dircache
import re
import shutil
import urlparse

import html2wiki.h2wconverter as h2wconverter

# Command-line synopsis; printed on exit when the required <input> is missing.
usage = "%s <input> <output>" % sys.argv[0]

# <input> is mandatory: leave with the synopsis when it was not supplied.
if len(sys.argv) < 2:
    sys.exit(usage)
input = sys.argv[1]

# <output> is optional and defaults to None when it was not supplied.
output = sys.argv[2] if len(sys.argv) > 2 else None

def main(input, output):
    """Convert a single file or a whole directory tree.

    Args:
        input: Path of an existing file or directory.  A file is handed
            to conv(); a directory is walked with r_scan().
        output: Destination path.  May be None (or empty) for a single
            file; it is required when ``input`` is a directory.

    Raises:
        SystemExit: With an explanatory message when ``input`` does not
            exist, or when ``input`` is a directory and no output was
            given.
    """
    if os.path.isfile(input):
        conv(input, output)
    elif os.path.isdir(input):
        # A tree conversion needs somewhere to write to; fail early with
        # a readable message instead of crashing deep inside the walk.
        if not output:
            sys.exit("%s: an output directory is required when <input> "
                     "is a directory" % input)
        r_scan(input, input, output)
    else:
        # Neither a file nor a directory: report it rather than silently
        # doing nothing.
        sys.exit("%s: no such file or directory" % input)

def src_conv(url, input, basedir):
    """Rewrite the URL of an embedded resource for the wiki output.

    Args:
        url: URL as written in the HTML document.
        input: Path of the HTML file that contains the URL.
        basedir: Root directory that ``input`` is taken relative to.

    Returns:
        ``url`` unchanged when it carries a scheme or a network location
        (external link, ``data:`` URI, ...) or has no path component.
        For a site-absolute path, the path without its leading "/".
        For a relative path, the path resolved against the directory of
        ``input`` (relative to ``basedir``) with the last separator
        replaced by ":", e.g. ``../foo/bar.png`` => ``hoge/foo:bar.png``.
        Parameters, query and fragment of a local URL are discarded.
    """
    p = urlparse.urlparse(url)
    # p => <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    # A scheme without a netloc (data:, mailto:, ...) is not a local path
    # either, so it must be passed through untouched as well.
    if p[0] != "" or p[1] != "":
        return url
    if p[2] == "":  # when path is not given:
        return url

    path = p[2]
    if path.startswith("/"):
        # absolute path
        # NOTE(review): this branch does not apply the "dir:file"
        # conversion used for relative paths -- confirm that is intended.
        return path[1:]

    # relative path
    # convert: ../foo/bar.png => hoge/foo/bar.png
    page_dir = os.path.dirname(os.path.relpath(input, basedir))
    path = os.path.normpath(os.path.join(page_dir, path))

    # convert: hoge/foo/bar.png => hoge/foo:bar.png
    namespace, name = os.path.split(path)
    return ":".join((namespace, name))

def link_conv(url, input, basedir):
    """Rewrite the target of a hyperlink for the wiki output.

    Args:
        url: Link target as written in the HTML document.
        input: Path of the HTML file that contains the link.
        basedir: Root directory that ``input`` is taken relative to.

    Returns:
        ``url`` unchanged when it carries a scheme or a network location
        (external link, ``mailto:``, ``javascript:``, ...) or has no path
        component (e.g. a pure ``#fragment``).  Otherwise the path with a
        trailing ``.html``/``.htm``/``.txt`` removed: stripped of its
        leading "/" when site-absolute, or resolved against the directory
        of ``input`` (relative to ``basedir``) when relative.  Parameters,
        query and fragment of a local URL are discarded.
    """
    p = urlparse.urlparse(url)
    # p => <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    # A scheme without a netloc (mailto:, javascript:, ...) is not a local
    # page either, so it must be passed through untouched as well.
    if p[0] != "" or p[1] != "":
        return url
    if p[2] == "":  # when path is not given:
        return url

    path = re.sub(r"\.(html|htm|txt)$", "", p[2])
    # startswith() instead of path[0]: stripping the extension can leave
    # an empty string, which must not raise IndexError.
    if path.startswith("/"):
        # absolute path
        return path[1:]

    # relative path: resolve against the directory of the current page
    page_dir = os.path.dirname(os.path.relpath(input, basedir))
    return os.path.normpath(os.path.join(page_dir, path))
    

def conv(input, output, basedir=""):
    """Convert one HTML file to wiki text.

    Args:
        input: Path of the HTML file to read.
        output: Path of the file to write the converted text to.  When
            it is None or empty the text is printed to standard output.
        basedir: Root directory handed to link_conv()/src_conv() so that
            relative URLs are resolved against the position of ``input``
            below it.

    Raises:
        IOError: If ``input`` cannot be read or ``output`` cannot be
            written.
    """
    # A single parenthesised argument prints the same text whether
    # ``print`` is a statement (Python 2) or a function.
    print("processing %s ..." % input)

    # ``with`` closes the file even when reading raises.
    with open(input, "r") as fi:
        html_str = fi.read()

    c = h2wconverter.Html2WikiConverter()

    # Bind the current file and base directory into the URL callbacks.
    # NOTE(review): presumably called by the converter for link targets
    # and resource sources respectively -- verify against html2wiki.
    c.set_link_converter(lambda x: link_conv(x, input, basedir))
    c.set_src_converter(lambda x: src_conv(x, input, basedir))

    text = c.convert(html_str)

    if output:
        with open(output, "w") as fo:
            fo.write(text)
    else:
        print(text)

def r_scan(dir, input, output):
    """Recursively mirror ``dir`` into the output tree, converting HTML.

    Files whose name ends in ``.html`` are passed to conv(); every other
    file is copied unchanged.  Output directories are created only when
    a file is about to be written into them.

    Args:
        dir: Directory to scan on this call (``input`` itself or one of
            its descendants).
        input: Root of the source tree.
        output: Root of the destination tree.
    """
    # Map this directory into the output tree by its position relative to
    # the input root.  A textual str.replace() would also rewrite later
    # occurrences of the input string and would miss completely when
    # ``input`` is not normalised (trailing slash, "./" prefix), making
    # the destination identical to the source.
    rel_dir = os.path.relpath(dir, input)
    o_dir = os.path.normpath(os.path.join(output, rel_dir))

    # sorted(os.listdir()) yields the same sorted listing as the
    # deprecated dircache module.
    for name in sorted(os.listdir(dir)):
        src_path = os.path.normpath(os.path.join(dir, name))
        if os.path.isdir(src_path):
            r_scan(src_path, input, output)
            continue

        if not os.path.exists(o_dir):
            os.makedirs(o_dir)
        o_pathname = os.path.join(o_dir, name)
        if re.search(r"\.html$", name):
            conv(src_path, o_pathname, input)
        else:
            shutil.copy(src_path, o_pathname)
    
# Script entry point: run the conversion with the <input>/<output> values
# taken from the command line above.
main(input, output)

