"""Utilities"""

import os
import htmlentitydefs
import re
import config


def j(string, encoding="japanese.ms932", error="replace"):
    """Decode a byte string into a unicode object.

    Behaves like ``unicode(string, encoding, error)``, except that the
    encoding defaults to "japanese.ms932" and the error handling scheme
    defaults to "replace".  Any argument that is not a ``str`` instance
    is returned untouched.
    """
    if not isinstance(string, str):
        # Nothing to decode; hand the object back as it came in.
        return string
    return unicode(string, encoding, error)

def u(string):
    """Convert UTF-8 encoded string into Unicode"""
    return unicode(string, "utf8")


def extract_html_entities(text):
    """Return *text* with its HTML entity references replaced.

    Every substring that looks like an entity reference is passed to
    _replace_entref(), whose return value is substituted in its place.
    """
    # The pattern is deliberately looser than the exact grammar of an
    # HTML entity reference.  _replace_entref() consults the entity
    # dictionary, so an undefined name such as "&foo;" is put back as is.
    # Originally written by 162.
    entity_pattern = u"&#?[A-Za-z0-9]+;"
    return re.sub(entity_pattern, _replace_entref, text)

_entity_dict = {}
for k, v in htmlentitydefs.entitydefs.items():
    if v[:2] == "&#" and v[-1] == ";":
        v = unichr(int(v[2:-1]))
    else:
        v = unicode(v, "iso-8859-1")
    _entity_dict[u"&%s;" % k] = v

def _replace_entref(match):
    """Given match object, replace HTML entity with Unicode character 
    and return it.
    """
    key = match.group()
    if key[1] == u"#" and key[2:-1].isdigit():
        new = unichr(int(key[2:-1]))
    else:
        new = _entity_dict.get(key, key)
    return new


