package asanhtmlparser;

/*
 * Created: 2008/05/25
 */

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.ArrayList;

import asanhtmlparser.util.Logger;
import asanhtmlparser.util.NetUtil;

/**
 * Performs syntactic analysis of HTML: turns an HTML document into a flat
 * list of {@link IHtmlNode} tokens produced by an {@link HtmlScanner}.
 * <p>
 * A single scanner instance is shared by every method of a parser, and each
 * call re-targets that scanner at a new source.
 *
 * @author a-san
 */
public class HtmlParser {
	/** Log output. */
	private static final Logger logger = Logger.getLogger(HtmlParser.class);
	/** Charset used for decoding when the document does not declare a usable one. */
	private static final String DEFAULT_CHARSET = "JISAutoDetect";
	/** Lexical analyzer (tokenizer) for HTML. */
	HtmlScanner scanner = new HtmlScanner();

	/**
	 * Downloads the HTML at the given URL, parses it and returns the list of
	 * nodes.
	 *
	 * @param url
	 *            URL of the HTML document. Must not be null.
	 * @return the list of {@link IHtmlNode} objects read from the document.
	 * @throws IOException
	 *             if the HTML could not be retrieved, or if even the fallback
	 *             charset is not available on this JVM.
	 * @throws HtmlParserException
	 *             if parsing the HTML failed.
	 */
	public ArrayList parse(URL url) throws IOException, HtmlParserException {
		assert url != null;
		byte[] bytes = NetUtil.downloadContents(url);
		// Detect the character encoding.
		// TODO: Does the charset named in the HTML META tag always match a
		// Java charset name exactly? If not, a conversion table is needed.
		String charset = parseCharset(bytes);
		// If the encoding could not be determined, decode with auto-detection.
		if (charset == null) {
			charset = DEFAULT_CHARSET;
		}
		String source;
		try {
			source = new String(bytes, charset);
		} catch (UnsupportedEncodingException e) {
			// The page declared a charset this JVM does not know. Treat it the
			// same as "unknown" rather than failing the whole parse.
			logger.debug("unsupported charset [" + charset + "], falling back to "
					+ DEFAULT_CHARSET);
			source = new String(bytes, DEFAULT_CHARSET);
		}
		return parse(source, url);
	}

	/**
	 * Parses the given HTML source text. The URL is handed to the scanner
	 * together with the source.
	 *
	 * @param source
	 *            the HTML source text. Must not be null.
	 * @param url
	 *            URL the source came from; passed through to the scanner
	 *            unchanged.
	 * @return the list of {@link IHtmlNode} objects, in the order the scanner
	 *         produced them.
	 * @throws HtmlParserException
	 *             if parsing the HTML failed.
	 */
	public ArrayList parse(String source, URL url) throws HtmlParserException {
		assert source != null;
		scanner.setSource(source, url);
		logger.debug(source);
		ArrayList taglist = new ArrayList();
		// The scanner signals end of input by returning null.
		for (IHtmlNode t = scanner.getToken(); t != null; t = scanner.getToken()) {
			logger.debug(t);
			taglist.add(t);
		}
		return taglist;
	}

	/**
	 * Determines the character set declared inside the given HTML content.
	 * <p>
	 * The bytes are provisionally decoded as UTF-8 and tokenized until the
	 * first {@code meta} element that declares a charset, either through a
	 * {@code charset} attribute or through a {@code content} attribute
	 * containing {@code charset=...}. Surrounding quotes and anything after a
	 * {@code ';'} are removed from the value.
	 * <p>
	 * This method re-targets the parser's shared scanner at the given content.
	 *
	 * @param bytes
	 *            the raw HTML content.
	 * @return the declared charset name, e.g. "utf-8", or null if none was
	 *         found.
	 * @throws UnsupportedEncodingException
	 *             if the UTF-8 charset is not available.
	 */
	public String parseCharset(byte[] bytes)
			throws UnsupportedEncodingException {
		// Decode as UTF-8 for now; the meta tag is expected early in the
		// document, written in characters that survive this decoding.
		String source = new String(bytes, "utf-8");
		scanner.setSource(source, null);
		logger.debug(source);
		while (true) {
			IHtmlNode t = scanner.getToken();
			if (t == null) {
				break;
			}
			logger.debug(t);
			if (!(t instanceof HtmlElement)) {
				continue;
			}
			HtmlElement elem = (HtmlElement) t;
			if (!elem.tagname.equalsIgnoreCase("meta")) {
				continue;
			}
			// Short form: <meta charset="...">
			String charset = cleanCharsetName(elem.getAttributeValue("charset"));
			if (charset == null) {
				// Long form: <meta http-equiv="Content-Type" content="...; charset=...">
				String content = elem.getAttributeValue("content");
				if (content != null) {
					charset = extractCharset(content);
				}
			}
			if (charset == null) {
				continue; // this meta element does not declare a charset
			}
			logger.debug("charset=[" + charset + "]");
			return charset;
		}
		return null; // Read to the end without finding an encoding.
	}

	/**
	 * Extracts the charset name from a meta {@code content} attribute value.
	 *
	 * @param content
	 *            the attribute value, e.g. "text/html; charset=utf-8".
	 * @return the charset name, or null if the value has no usable
	 *         {@code charset=} parameter.
	 */
	private static String extractCharset(String content) {
		int pos = indexOfIgnoreCase(content, "charset");
		if (pos == -1) {
			return null; // no charset in content
		}
		pos = content.indexOf('=', pos);
		if (pos == -1) {
			return null; // "charset" is not followed by '='
		}
		String value = content.substring(pos + 1);
		// A further parameter may follow the charset; cut it off.
		int end = value.indexOf(';');
		if (end != -1) {
			value = value.substring(0, end);
		}
		return cleanCharsetName(value);
	}

	/**
	 * Trims whitespace and surrounding quote characters from a charset name.
	 *
	 * @param raw
	 *            the raw value; may be null.
	 * @return the cleaned name, or null if nothing remains.
	 */
	private static String cleanCharsetName(String raw) {
		if (raw == null) {
			return null;
		}
		String name = raw.trim();
		int start = 0;
		int end = name.length();
		while (start < end && isQuote(name.charAt(start))) {
			start++;
		}
		while (end > start && isQuote(name.charAt(end - 1))) {
			end--;
		}
		name = name.substring(start, end).trim();
		return name.length() == 0 ? null : name;
	}

	/** Returns true for the single and double quote characters. */
	private static boolean isQuote(char c) {
		return c == '"' || c == '\'';
	}

	/**
	 * Finds {@code needle} in {@code text} ignoring case. Unlike searching a
	 * lower-cased copy, the returned index is always valid for {@code text}
	 * itself and does not depend on the default locale.
	 *
	 * @return the index of the first match, or -1 if there is none.
	 */
	private static int indexOfIgnoreCase(String text, String needle) {
		int last = text.length() - needle.length();
		for (int i = 0; i <= last; i++) {
			if (text.regionMatches(true, i, needle, 0, needle.length())) {
				return i;
			}
		}
		return -1;
	}

	/**
	 * Test program: parses the URL given as the first argument, prints every
	 * node and finally the elapsed time.
	 */
	public static void main(String[] args) throws Exception {
		System.out.println("HtmlParser.main()");
		// logger.threshold = Logger.OFF;
		long start = System.currentTimeMillis();
		// No argument given: print usage and stop.
		if (args.length == 0) {
			System.out.println("usage:java HtmlParser <url>");
			return;
		}
		URL url = new URL(args[0]);
		System.out.println("url=" + url);
		HtmlParser parser = new HtmlParser();
		ArrayList taglist = parser.parse(url);
		for (int i = 0; i < taglist.size(); i++) {
			IHtmlNode node = (IHtmlNode) taglist.get(i);
			System.out.println(node);
		}
		// Report the elapsed time.
		System.out.println("complete. " + (System.currentTimeMillis() - start)
				+ "msec");
	}
}
