/*
 * Copyright 2013 Yuichiro Moriguchi
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.morilib.parser.html;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.math.BigInteger;
import java.sql.SQLException;
import java.util.Properties;

/**
 * A small, lenient, event-driven scanner for HTML-like markup.
 * <p>
 * {@link #parse(HTMLHandler, Reader)} reads characters one at a time and
 * reports text runs, start tags, attributes and end tags to an
 * {@link HTMLHandler}.  Named and numeric character references found in
 * text are decoded before the text is reported.  {@link #escape(String)}
 * performs the reverse mapping for characters that have a named entity.
 * <p>
 * The entity tables are loaded once, when the class is initialized, from
 * two properties resources: one maps an entity name to a decimal
 * character code, the other maps a decimal character code to an entity
 * name.
 * <p>
 * There is no special handling for comments or {@code <!...>}
 * declarations; they pass through the ordinary tag states.
 */
public class HTMLParser {

	/** States of the scanner in {@link #parse(HTMLHandler, Reader)}. */
	static enum S {
		/** Plain text between tags. */
		INIT,
		/**
		 * TAG1: just after '&lt;'; TAG2: reading the tag name;
		 * TAG3: reading an attribute name; TAG4: reading an unquoted
		 * attribute value; TAG5/TAG6: inside a single/double quoted
		 * value; TAG7: after '/' inside a start tag;
		 * TGE1: reading the name of an end tag.
		 */
		TAG1, TAG2, TAG3, TAG4, TAG5, TAG6, TAG7, TGE1,
		/** ENT1: after '&amp;'; ENT2: after '&amp;#'. */
		ENT1, ENT2,
		/** MET1: inside '&lt;? ...'; MET2: just after a '?' in it. */
		MET1, MET2
	}

	/** Exclusive upper bound for a numeric reference (U+10FFFF + 1). */
	private static final BigInteger LIM =
			BigInteger.valueOf(Character.MAX_CODE_POINT + 1L);

	/** Entity name to decimal character code. */
	private static final Properties prop = new Properties();

	/** Decimal character code to entity name. */
	private static final Properties inv = new Properties();

	static {
		_load(prop, "/net/morilib/parser/html/entities.properties");
		_load(inv, "/net/morilib/parser/html/entitiesinv.properties");
	}

	/**
	 * Replaces every character that has a named entity with the
	 * corresponding {@code &name;} reference.
	 * <p>
	 * The lookup is done per UTF-16 code unit; characters without an
	 * entry in the inverse entity table are copied unchanged.
	 * 
	 * @param s the text to escape; must not be null
	 * @return the escaped text
	 */
	public static String escape(String s) {
		StringBuilder b = new StringBuilder(s.length());

		for(int i = 0; i < s.length(); i++) {
			char c = s.charAt(i);
			String t = inv.getProperty(Integer.toString(c));

			if(t != null) {
				b.append('&').append(t).append(';');
			} else {
				b.append(c);
			}
		}
		return b.toString();
	}

	/**
	 * Loads a properties resource from the class path into {@code v}.
	 * 
	 * @param v the table to fill
	 * @param s the absolute resource name
	 * @throws IllegalStateException if the resource does not exist
	 * @throws RuntimeException if reading or closing the resource fails
	 */
	private static void _load(Properties v, String s) {
		InputStream ins = HTMLParser.class.getResourceAsStream(s);

		// getResourceAsStream reports a missing resource as null;
		// fail with a message that names it instead of a bare NPE.
		if(ins == null) {
			throw new IllegalStateException(
					"resource not found: " + s);
		}

		try {
			v.load(ins);
		} catch(IOException e) {
			throw new RuntimeException(e);
		} finally {
			try {
				ins.close();
			} catch(IOException e) {
				throw new RuntimeException(e);
			}
		}
	}

	/** Returns true for the characters the scanner treats as blank. */
	private static boolean isSpace(int c) {
		return c == ' ' || c == '\t' || c == '\n' || c == '\r';
	}

	/** Returns true for an ASCII decimal digit. */
	private static boolean isDigit(int c) {
		return c >= '0' && c <= '9';
	}

	/** Returns true for an ASCII letter or decimal digit. */
	private static boolean isAlnum(int c) {
		return (c >= 'a' && c <= 'z') ||
				(c >= 'A' && c <= 'Z') ||
				isDigit(c);
	}

	/** Returns true for an ASCII hexadecimal digit. */
	private static boolean isHexDigit(int c) {
		return isDigit(c) ||
				(c >= 'a' && c <= 'f') ||
				(c >= 'A' && c <= 'F');
	}

	/** Returns true if a numeric reference body starts with x or X. */
	private static boolean isHexRef(CharSequence s) {
		return s.length() > 0 &&
				(s.charAt(0) == 'x' || s.charAt(0) == 'X');
	}

	/**
	 * Appends the character for the named entity {@code s}, or the
	 * reference text itself when the name is unknown.
	 * 
	 * @param s the entity name without '&amp;' and ';'
	 * @param d true if the reference was terminated by ';'
	 * @param b the buffer to append to
	 */
	private static void addent1(String s, boolean d, StringBuilder b) {
		String t = prop.getProperty(s);

		if(t != null) {
			int cp = Integer.parseInt(t);

			if(Character.isValidCodePoint(cp)) {
				b.appendCodePoint(cp);
				return;
			}
		}
		b.append('&').append(s);
		if(d)  b.append(';');
	}

	/**
	 * Appends the character for the numeric reference {@code s}, or
	 * the reference text itself when it cannot be decoded.
	 * <p>
	 * {@code s} is either a run of decimal digits or 'x'/'X' followed
	 * by hexadecimal digits.  Values up to U+10FFFF are decoded.
	 * 
	 * @param s the reference body without '&amp;#' and ';'
	 * @param d true if the reference was terminated by ';'
	 * @param b the buffer to append to
	 */
	private static void addent2(String s, boolean d, StringBuilder b) {
		try {
			BigInteger c = isHexRef(s) ?
					new BigInteger(s.substring(1), 16) :
						new BigInteger(s);

			if(c.signum() >= 0 && c.compareTo(LIM) < 0) {
				b.appendCodePoint(c.intValue());
				return;
			}
		} catch(NumberFormatException e) {
			// empty or malformed body: emit it literally below
		}
		b.append("&#").append(s);
		if(d)  b.append(';');
	}

	/**
	 * Scans the input and reports its structure to the handler.
	 * <p>
	 * Text is reported with {@code string}, a tag name with
	 * {@code startTag}, each attribute with {@code tagAttribute}
	 * (an attribute without a value gets the empty string), and an
	 * end tag or the '/' of a self-closing tag with {@code endTag}.
	 * A {@code <? ... ?>} section is kept verbatim and reported as
	 * part of the surrounding text.  If the input ends inside a tag,
	 * scanning stops without reporting the unfinished tag.
	 * 
	 * @param h the handler receiving the events
	 * @param rd the input; it is read to its end but not closed
	 * @throws SQLException declared for the handler callbacks
	 * @throws IOException if reading from {@code rd} fails
	 */
	public static void parse(HTMLHandler h,
			Reader rd) throws SQLException, IOException {
		StringBuilder b1 = new StringBuilder(), b2 = null;
		String k = null, l = null;
		S stat = S.INIT;
		// true: handle the current character again in the new state
		boolean again = false;
		// true: blanks were seen after an attribute name in TAG3
		boolean named = false;
		int c = -1;

		while(true) {
			if(again) {
				again = false;
			} else {
				c = rd.read();
			}

			switch(stat) {
			case INIT:
				if(c < 0) {
					if(b1.length() > 0)  h.string(b1.toString());
					return;
				} else if(c == '<') {
					if(b1.length() > 0)  h.string(b1.toString());
					b1 = new StringBuilder();
					stat = S.TAG1;
				} else if(c == '&') {
					b2 = new StringBuilder();
					stat = S.ENT1;
				} else {
					b1.append((char)c);
				}
				break;
			case TAG1:
				if(c < 0) {
					return;
				} else if(c == '/') {
					stat = S.TGE1;
				} else if(c == '>') {
					h.startTag(b1.toString());
					b1 = new StringBuilder();
					stat = S.INIT;
				} else if(c == '?') {
					b1 = new StringBuilder();
					b1.append('<').append('?');
					stat = S.MET1;
				} else if(!isSpace(c)) {
					b1.append((char)c);
					stat = S.TAG2;
				}
				break;
			case TAG2:
				if(c < 0) {
					return;
				} else if(c == '/') {
					h.startTag(l = b1.toString());
					b1 = new StringBuilder();
					stat = S.TAG7;
				} else if(c == '>') {
					h.startTag(b1.toString());
					b1 = new StringBuilder();
					stat = S.INIT;
				} else if(c == ' ' || c == '\t' || c == '\n') {
					h.startTag(l = b1.toString());
					b1 = new StringBuilder();
					stat = S.TAG3;
				} else if(c != '\r') {
					b1.append((char)c);
				}
				break;
			case TAG3:
				if(c < 0) {
					return;
				} else if(c == '=') {
					k = b1.toString();
					b1 = new StringBuilder();
					named = false;
					stat = S.TAG4;
				} else if(c == '/') {
					if(b1.length() > 0) {
						h.tagAttribute(b1.toString(), "");
					}
					named = false;
					stat = S.TAG7;
				} else if(c == '>') {
					if(b1.length() > 0) {
						h.tagAttribute(b1.toString(), "");
					}
					b1 = new StringBuilder();
					named = false;
					stat = S.INIT;
				} else if(isSpace(c)) {
					// remember that the pending name is complete; a
					// following '=' still gives it a value
					named = b1.length() > 0;
				} else {
					if(named) {
						// a new name starts: the pending one was a
						// value-less attribute
						h.tagAttribute(b1.toString(), "");
						b1 = new StringBuilder();
						named = false;
					}
					b1.append((char)c);
				}
				break;
			case TAG4:
				if(c < 0) {
					return;
				} else if(c == '\'') {
					stat = S.TAG5;
				} else if(c == '\"') {
					stat = S.TAG6;
				} else if(c == '/') {
					h.tagAttribute(k, b1.toString());
					stat = S.TAG7;
				} else if(c == '>') {
					h.tagAttribute(k, b1.toString());
					b1 = new StringBuilder();
					stat = S.INIT;
				} else if(c == ' ' || c == '\t' || c == '\n') {
					h.tagAttribute(k, b1.toString());
					b1 = new StringBuilder();
					stat = S.TAG3;
				} else if(c != '\r') {
					b1.append((char)c);
				}
				break;
			case TAG5:
				if(c < 0) {
					return;
				} else if(c == '\'') {
					stat = S.TAG4;
				} else if(c != '\r') {
					b1.append((char)c);
				}
				break;
			case TAG6:
				if(c < 0) {
					return;
				} else if(c == '\"') {
					stat = S.TAG4;
				} else if(c != '\r') {
					b1.append((char)c);
				}
				break;
			case TAG7:
				if(c < 0) {
					return;
				} else if(c == '>') {
					h.endTag(l);
					b1 = new StringBuilder();
					stat = S.INIT;
				} else if(!isSpace(c)) {
					b1 = new StringBuilder().append((char)c);
					stat = S.TAG3;
				}
				break;
			case TGE1:
				if(c < 0) {
					return;
				} else if(c == '>') {
					h.endTag(b1.toString().trim());
					b1 = new StringBuilder();
					stat = S.INIT;
				} else if(c != '\r') {
					b1.append((char)c);
				}
				break;
			case ENT1:
				if(c == '#' && b2.length() == 0) {
					// only "&#" starts a numeric reference
					stat = S.ENT2;
				} else if(c == ';') {
					addent1(b2.toString(), true, b1);
					stat = S.INIT;
				} else if(isAlnum(c)) {
					b2.append((char)c);
				} else {
					// end of input or any other character ends the
					// reference; let INIT handle that character so
					// that '<' and '&' keep their meaning
					addent1(b2.toString(), false, b1);
					stat = S.INIT;
					again = true;
				}
				break;
			case ENT2:
				if(c == ';') {
					addent2(b2.toString(), true, b1);
					stat = S.INIT;
				} else if((c == 'x' || c == 'X') &&
						b2.length() == 0) {
					b2.append((char)c);
				} else if(isDigit(c) ||
						(isHexRef(b2) && isHexDigit(c))) {
					b2.append((char)c);
				} else {
					// same as ENT1: re-scan the terminator in INIT
					addent2(b2.toString(), false, b1);
					stat = S.INIT;
					again = true;
				}
				break;
			case MET1:
				if(c < 0) {
					// unterminated section: flush it as text
					stat = S.INIT;
					again = true;
				} else {
					b1.append((char)c);
					if(c == '?')  stat = S.MET2;
				}
				break;
			case MET2:
				if(c < 0) {
					stat = S.INIT;
					again = true;
				} else {
					b1.append((char)c);
					if(c == '>') {
						stat = S.INIT;
					} else if(c != '?') {
						// a further '?' may still be followed by '>'
						stat = S.MET1;
					}
				}
				break;
			}
		}
	}

	/**
	 * Scans a string; see {@link #parse(HTMLHandler, Reader)}.
	 * 
	 * @param h the handler receiving the events
	 * @param s the markup to scan
	 * @throws SQLException declared for the handler callbacks
	 * @throws IOException if reading fails
	 */
	public static void parse(HTMLHandler h,
			String s) throws SQLException, IOException {
		parse(h, new StringReader(s));
	}

}
