#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import sys
import codecs
import re

from BeautifulSoup import BeautifulSoup


# Extract story metadata from the rows of the first <table> in an HTML file
# and print one space-separated line per story:
#
#     story_url story_title editor page_views comments date_time
#
# Sample of one input row (the story title was Japanese text in the original
# sample and is translated here):
#
# <tr class="story_nd ps_7">
# <td align="right">[<a href="//magazine.sourceforge.jp/admin.pl?op=edit&amp;sid=09/02/26/1113251">1</a>]</td>
# <td class="admin_title"><a href="//magazine.sourceforge.jp/article.pl?sid=09/02/26/1113251">Compiling Firefox with the Intel compiler&nbsp;</a></td>
# <td><b>hiromichi-m</b></td>
# <td>c</td>
# <td><a href="/admin.pl?section=opensource">	opens</a></td>
# <td>13</td>
# <td>0</td>
# <td>03/08&nbsp; 15:00</td>
# </tr>

# "&nbsp;" plus any whitespace that follows it; collapsed to a single space.
NBSP_RE = re.compile(r"&nbsp;\s*")

# Patterns applied to the prettified markup of a single <td>.  re.S lets "."
# span the newlines that prettify() inserts around every tag.
LINK_RE = re.compile(r'<a href="(.*)">(.*)</a>', re.S)
BOLD_RE = re.compile(r'<b>(.*)</b>', re.S)
PLAIN_CELL_RE = re.compile(r'<td>\s*(.*)\s*</td>', re.S)

# (cell index, pattern) pairs in output order.  LINK_RE contributes two
# fields (URL and title); every other pattern contributes one.
FIELD_LAYOUT = (
    (1, LINK_RE),        # story URL and title
    (2, BOLD_RE),        # editor
    (5, PLAIN_CELL_RE),  # page views
    (6, PLAIN_CELL_RE),  # comments
    (7, PLAIN_CELL_RE),  # date/time
)

# A row needs at least this many cells for every index above to exist.
MIN_CELLS = 1 + max(index for index, _ in FIELD_LAYOUT)


def strip_nbsp(markup):
    """Return *markup* with every "&nbsp;" (and trailing whitespace) replaced by one space.

    The previous code called ``re.sub(pattern, " ", text, re.S)``, which
    passes ``re.S`` (16) as the *count* argument and therefore stopped after
    16 replacements.  The pattern contains no ".", so no flag is needed.
    """
    return NBSP_RE.sub(" ", markup)


def parse_cells(cells):
    """Extract the story fields from the prettified markup of a row's cells.

    cells -- sequence of strings, one per <td> of the row, in document order.

    Returns a tuple ``(story_url, story_title, editor, page_views, comments,
    date_time)`` of whitespace-stripped strings, or ``None`` when the row has
    fewer than MIN_CELLS cells or any cell does not match its pattern.
    """
    if len(cells) < MIN_CELLS:
        return None

    fields = []
    for index, pattern in FIELD_LAYOUT:
        match = pattern.search(strip_nbsp(cells[index]))
        if match is None:
            return None
        fields.extend(group.strip() for group in match.groups())
    return tuple(fields)


def main(argv):
    """Parse the HTML file named by ``argv[1]`` and print one line per story row.

    Exits with a usage message when no path is given, and with an error
    message when the document contains no <table>.  Rows that do not fit the
    expected layout are reported on stderr and skipped.
    """
    try:
        path_to_html = argv[1]
    except IndexError:
        sys.exit(argv[0] + " html")

    # Encode everything written to stdout as UTF-8 regardless of the locale.
    out = codecs.getwriter('utf_8')(sys.stdout)

    with codecs.open(path_to_html, "r", "utf_8") as html_file:
        html_content = html_file.read()

    bsp = BeautifulSoup(html_content, fromEncoding="utf_8")
    bsp_table = bsp.table
    if bsp_table is None:
        sys.exit(path_to_html + ": no <table> element found")

    for row_number, row in enumerate(bsp_table.findAll('tr')):
        # Rows whose markup mentions "lt_tb_col" carry no story data
        # (presumably header rows -- confirm against the real pages).
        if u"lt_tb_col" in unicode(row.prettify(), "utf_8"):
            continue

        cells = [unicode(td.prettify(), "utf_8") for td in row.findAll('td')]
        fields = parse_cells(cells)
        if fields is None:
            sys.stderr.write(
                "skipping row %d: unexpected cell layout\n" % row_number)
            continue

        out.write(u" ".join(fields) + u"\n")


if __name__ == "__main__":
    main(sys.argv)
