#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os.path
import sys
import re
import os
import urllib
import HTMLParser

# Command-line handling: the single required argument is the path of a
# text file listing one URL per line.
usage = "%s <urllist>" % sys.argv[0]

# sys.exit() with a string prints it to stderr and exits with status 1.
if len(sys.argv) < 2:
    sys.exit(usage)
fname = sys.argv[1]

class TitleParser(HTMLParser.HTMLParser):
    """Extract the text of the first <title> element of an HTML document.

    Only the first <title> is captured; later ones (for example the
    <title> children of inline SVG images) are ignored so they cannot
    pollute the page title.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._clear()

    def _clear(self):
        """Forget any title collected so far."""
        self._capt = False   # True while inside the first <title> element
        self._done = False   # True once the first </title> has been seen
        self._title = ""

    def parse(self, html_string):
        """Parse html_string and return the text of its first <title>.

        The parser state is reset on every call, so one instance can be
        reused for several documents without titles running together.
        Returns an empty string when the document has no <title>.
        """
        self.reset()
        self._clear()
        self.feed(html_string)
        return self._title

    def handle_starttag(self, tag, attrs):
        """Start-tag handler: begin capturing at the first <title>."""
        if tag == "title" and not self._done:
            self._capt = True

    def handle_endtag(self, tag):
        """End-tag handler: stop capturing at the matching </title>."""
        if tag == "title" and self._capt:
            self._capt = False
            self._done = True

    def handle_data(self, data):
        """Text handler: append text found inside the captured <title>."""
        if self._capt:
            # Each chunk is stripped on its own before being appended.
            self._title += data.strip()

# <script> and <noscript> elements are removed before the page is handed to
# the title parser (presumably so their contents cannot confuse it).
# Matching is case-insensitive because HTML tag names are.
_SCRIPT_RE = re.compile(r"<\s*script[^>]*?>.*?</script>", re.S | re.I)
_NOSCRIPT_RE = re.compile(r"<\s*noscript[^>]*?>.*?</noscript>", re.S | re.I)
# Site name that trails the page title and is dropped from the label.
_SITE_SUFFIX_RE = re.compile(r"\s*- SourceForge.JP Magazine\s*$")
# Characters replaced by "_" in a label: any whitespace, "/" and "+".
_UNSAFE_CHAR_RE = re.compile(r"[\s/+]")


def strip_scripts(html):
    """Return html with all <script> and <noscript> elements removed."""
    return _NOSCRIPT_RE.sub("", _SCRIPT_RE.sub("", html))


def make_label(title):
    """Turn a page title into the label written next to its URL.

    The trailing site name is removed, every whitespace character, "/" and
    "+" becomes "_", and "_p1" is appended. Replacing all whitespace (not
    only spaces) keeps a tab or newline inside a title from breaking the
    tab-separated, one-record-per-line output.
    """
    title = _SITE_SUFFIX_RE.sub("", title)
    return _UNSAFE_CHAR_RE.sub("_", title) + "_p1"


def main(list_path):
    """Fetch every URL listed in list_path and print "<url>\\t<label>" lines.

    list_path names a text file with one URL per line; blank lines are
    skipped. Errors from open() or urlopen() are not caught, so a failed
    fetch aborts the run.
    """
    with open(list_path, "r") as url_list:
        for line in url_list:
            url = line.strip()
            if not url:
                continue

            response = urllib.urlopen(url)
            try:
                html = response.read()
            finally:
                # Release the connection even if read() fails.
                response.close()

            title = TitleParser().parse(strip_scripts(html))
            sys.stdout.write(url + "\t" + make_label(title) + "\n")


if __name__ == "__main__":
    main(fname)


