/**
 * Copyright (C) 2008-2009 RobotBrain. All Rights Reserved.
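 * This program is free software. You can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License version 3 (LGPLv3)
 * as published by the Free Software Foundation.
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * version 3 (LGPLv3) for more details.
 * You should have received a copy of the GNU Lesser General Public License
 * version 3 (LGPLv3) along with this program.
 * If not, see <http://www.gnu.org/licenses/>.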
 */
package jp.robotbrain.html;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regular-expression based HTML analysis utility.
 * <p>
 * This is not a full HTML parser: every method works by pattern matching on
 * the raw markup, after all CR and LF characters have been removed from it
 * (they are deleted, not replaced by a space). All matching is
 * case-insensitive.
 *
 * @since 1.00
 * @author Copyright (C) 2008-2009 <a href="http://robotbrain.jp">
 * RobotBrain.</a> All Rights Reserved.
 */
public class HtmlParser {

	/** Flags shared by every pattern compiled in this class. */
	private static final int FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

	/**
	 * Attribute value in one of the three HTML notations: unquoted,
	 * single-quoted or double-quoted. Contributes three consecutive capturing
	 * groups, exactly one of which participates in a successful match.
	 */
	private static final String ATTR_VALUE = "(?:([^<>'\"\\s]+)|'([^']+)'|\"([^\"]+)\")";

	/**
	 * Opening anchor tag up to and including the whitespace that follows its
	 * closing "&gt;". The run before "href" is limited to "[^&gt;]" so that a
	 * match can never start in one tag and take the href of a later one.
	 */
	private static final String ANCHOR_PREFIX =
			"<a\\s[^>]*?\\bhref\\s*?=\\s*?" + ATTR_VALUE + "[^>]*?>\\s*?";

	private static final Pattern LINE_BREAKS = Pattern.compile("[\r\n]");
	private static final Pattern FORM = Pattern.compile("<form(.+?)</form>", FLAGS);
	private static final Pattern FORM_OPEN_TAG = Pattern.compile("<form([^>]+?)>", FLAGS);
	private static final Pattern INPUT_TAG = Pattern.compile("<(input|button)([^>]+?)>", FLAGS);
	private static final Pattern SELECT = Pattern.compile("<select(.+?)</select>", FLAGS);
	private static final Pattern SELECT_OPEN_TAG = Pattern.compile("<select([^>]+?)>", FLAGS);
	private static final Pattern OPTION = Pattern.compile("<option[^<]+", FLAGS);
	private static final Pattern OPTION_TITLE = Pattern.compile(">(.+)", FLAGS);

	/** Anchor with its link text in group 4 (groups 1-3 hold the href value). */
	private static final Pattern ANCHOR = Pattern.compile(ANCHOR_PREFIX + "(.*?)\\s*?<", FLAGS);

	/**
	 * frame / iframe element whose opening tag has a src attribute followed
	 * by a name attribute, and which has a closing tag. Groups 1-3 hold the
	 * src value, groups 4-6 the name value. Both attributes must be found
	 * inside the same opening tag.
	 */
	private static final Pattern FRAME = Pattern.compile(
			"<i?frame\\s[^>]*?\\bsrc\\s*?=\\s*?" + ATTR_VALUE
			+ "[^>]*?\\bname\\s*?=\\s*?" + ATTR_VALUE
			+ ".+?</i?frame>", FLAGS);

	/**
	 * Not instantiable: this is a utility class with static methods only.
	 *
	 * @since 1.00
	 */
	private HtmlParser() {
	}

	/**
	 * Analyzes the given HTML and builds the list of forms it contains.
	 * For each form the name, method and action attributes are read from the
	 * opening tag, and the input/button and select elements inside the form
	 * are collected. Attributes that are absent are reported as "".
	 *
	 * @since 1.00
	 * @param p_html HTML text to analyze
	 * @return list of forms, empty when the text contains no form
	 */
	public static HtmlFormList parseHtmlForm(String p_html) {
		String html = stripLineBreaks(p_html);
		HtmlFormList forms = new HtmlFormList();
		for (String formHtml : extractStringList(FORM, html, 0)) {
			String openTag = extractString(FORM_OPEN_TAG, formHtml, 0);
			HtmlForm form = new HtmlForm();
			form.setName(getAttr("name", openTag));
			form.setMethod(getAttr("method", openTag));
			form.setAction(getAttr("action", openTag));
			form.setHtmlInputList(parseHtmlInput(formHtml));
			form.setHtmlSelectList(parseHtmlSelect(formHtml));
			forms.add(form);
		}
		return forms;
	}

	/**
	 * Builds the list of input and button tags found in the given HTML.
	 * Tags without any attribute text are not matched.
	 *
	 * @since 1.00
	 * @param p_html HTML text to analyze
	 * @return list of input tags
	 */
	private static HtmlInputList parseHtmlInput(String p_html) {
		HtmlInputList inputs = new HtmlInputList();
		for (String tag : extractStringList(INPUT_TAG, p_html, 0)) {
			HtmlInput input = new HtmlInput();
			input.setId(getAttr("id", tag));
			input.setType(getAttr("type", tag));
			input.setName(getAttr("name", tag));
			input.setValue(getAttr("value", tag));
			input.setSelected(false);
			inputs.add(input);
		}
		return inputs;
	}

	/**
	 * Builds the list of select elements found in the given HTML, each with
	 * its option list.
	 *
	 * @since 1.00
	 * @param p_html HTML text to analyze
	 * @return list of select tags
	 */
	private static HtmlSelectList parseHtmlSelect(String p_html) {
		HtmlSelectList selects = new HtmlSelectList();
		for (String selectHtml : extractStringList(SELECT, p_html, 0)) {
			String openTag = extractString(SELECT_OPEN_TAG, selectHtml, 0);
			HtmlSelect select = new HtmlSelect();
			select.setId(getAttr("id", openTag));
			select.setName(getAttr("name", openTag));
			select.setHtmlOptionList(parseHtmlOption(selectHtml));
			selects.add(select);
		}
		return selects;
	}

	/**
	 * Builds the list of option tags found in the given HTML. The title of an
	 * option is the text between the end of its opening tag and the next "&lt;".
	 *
	 * @since 1.00
	 * @param p_html HTML text to analyze
	 * @return list of option tags
	 */
	private static HtmlOptionList parseHtmlOption(String p_html) {
		HtmlOptionList options = new HtmlOptionList();
		// Each element is the opening tag followed by the option text.
		for (String optionHtml : extractStringList(OPTION, p_html, 0)) {
			HtmlOption option = new HtmlOption();
			option.setId(getAttr("id", optionHtml));
			option.setValue(getAttr("value", optionHtml));
			option.setTitle(extractString(OPTION_TITLE, optionHtml, 1));
			option.setSelected(false);
			options.add(option);
		}
		return options;
	}

	/**
	 * Extracts the value of an attribute from a piece of HTML. The attribute
	 * must be followed, somewhere later in the text, by a "&gt;".
	 *
	 * @since 1.00
	 * @param p_attrName attribute name such as "name" or "id"; it is inserted
	 * into a regular expression as-is
	 * @param p_html HTML text, normally a single tag
	 * @return the attribute value without its surrounding quotes, or "" when
	 * the attribute is absent or has an empty value
	 */
	private static String getAttr(String p_attrName, String p_html) {
		// The lookbehind keeps "id" from matching the tail of "sid" or
		// "data-id"; the trailing \b keeps it from matching the head of "ids".
		String regex = "(?<![\\w:-])" + p_attrName + "\\b\\s*=\\s*" + ATTR_VALUE + "[^>]*>";
		Matcher m = Pattern.compile(regex, FLAGS).matcher(p_html);
		if (m.find()) {
			return firstValue(m, 1);
		}
		return "";
	}

	/**
	 * Returns the value captured by an {@link #ATTR_VALUE} sub-pattern.
	 * The value is returned exactly as captured; quote characters that are
	 * part of the value itself are preserved.
	 *
	 * @param p_matcher matcher positioned on a successful match
	 * @param p_firstGroup index of the first of the three value groups
	 * @return the captured value, or "" when none of the groups participated
	 */
	private static String firstValue(Matcher p_matcher, int p_firstGroup) {
		for (int group = p_firstGroup; group < p_firstGroup + 3; group++) {
			String value = p_matcher.group(group);
			if (value != null) {
				return value;
			}
		}
		return "";
	}

	/**
	 * Removes every CR and LF character from the given text.
	 *
	 * @param p_html text to normalize
	 * @return the text without line break characters
	 */
	private static String stripLineBreaks(String p_html) {
		return LINE_BREAKS.matcher(p_html).replaceAll("");
	}

	/**
	 * Converts each "&amp;amp;" in a URL taken from markup into "&amp;".
	 *
	 * @param p_url URL as written in the HTML source
	 * @return the URL with "&amp;amp;" replaced by "&amp;"
	 */
	private static String decodeAmpersand(String p_url) {
		return p_url.replace("&amp;", "&");
	}

	/**
	 * Extracts text from HTML with a regular expression and returns the first
	 * match.
	 *
	 * @since 1.00
	 * @param p_pattern compiled regular expression
	 * @param p_html HTML text
	 * @param p_groupIndex capturing group to return (0 for the whole match)
	 * @return the requested group of the first match, or "" when nothing
	 * matches
	 */
	private static String extractString(Pattern p_pattern, String p_html, int p_groupIndex) {
		Matcher m = p_pattern.matcher(p_html);
		return m.find() ? m.group(p_groupIndex) : "";
	}

	/**
	 * Extracts text from HTML with a regular expression and returns every
	 * match, in document order.
	 *
	 * @since 1.00
	 * @param p_pattern compiled regular expression
	 * @param p_html HTML text
	 * @param p_groupIndex capturing group to return (0 for the whole match)
	 * @return the requested group of every match, empty when nothing matches
	 */
	private static ArrayList<String> extractStringList(Pattern p_pattern, String p_html, int p_groupIndex) {
		ArrayList<String> matches = new ArrayList<String>();
		Matcher m = p_pattern.matcher(p_html);
		while (m.find()) {
			matches.add(m.group(p_groupIndex));
		}
		return matches;
	}

	/**
	 * Analyzes the given HTML and returns the URL the given link text points
	 * to. Whitespace around the link text inside the anchor is ignored, and
	 * "&amp;amp;" in the URL is converted to "&amp;".
	 *
	 * @since 1.00
	 * @param p_linkString link text to look for. It is inserted into a regular
	 * expression as-is, so regex metacharacters in the text must be escaped
	 * by the caller, for example:<br>
	 * "*" as "\\*"<br>
	 * "+" as "\\+"<br>
	 * "." as "\\."<br>
	 * "?" as "\\?"<br>
	 * curly braces as "\\{" and "\\}"<br>
	 * "(" and ")" as "\\(" and "\\)"<br>
	 * "[" and "]" as "\\[" and "\\]"<br>
	 * "^" as "\\^"<br>
	 * "|" as "\\|"<br>
	 * @param p_html HTML text to analyze
	 * @return URL of the link
	 * @throws HtmlHrefException when no link with the given text exists, or
	 * when several links with that text point to different URLs (it cannot be
	 * decided which one is meant)
	 */
	public static String parseHtmlHref(String p_linkString, String p_html) throws HtmlHrefException {
		String html = stripLineBreaks(p_html);
		Pattern p = Pattern.compile(ANCHOR_PREFIX + p_linkString + "\\s*?<", FLAGS);
		Matcher m = p.matcher(html);
		String url = null;
		while (m.find()) {
			String candidate = firstValue(m, 1);
			if (url == null) {
				url = candidate;
			} else if (!url.equals(candidate)) {
				// The same link text appears more than once: every occurrence
				// has to lead to the same URL.
				throw new HtmlHrefException("method:HtmlParser.parseHtmlHref(2) p_linkString=" + p_linkString, html);
			}
		}
		if (url == null) {
			throw new HtmlHrefException("method:HtmlParser.parseHtmlHref(1) p_linkString=" + p_linkString, html);
		}
		return decodeAmpersand(url);
	}

	/**
	 * Analyzes the given HTML and returns the list of hyperlinks it contains.
	 * For each link the URL (with "&amp;amp;" converted to "&amp;") and the text
	 * between the opening tag and the next "&lt;" are recorded; trailing
	 * whitespace of that text is dropped.
	 *
	 * @since 2.70
	 * @param p_html HTML text to analyze
	 * @return list of hyperlinks, empty when the text contains none
	 */
	public static ArrayList<HtmlHref> parseHtmlHref(String p_html) {
		Matcher m = ANCHOR.matcher(stripLineBreaks(p_html));
		ArrayList<HtmlHref> links = new ArrayList<HtmlHref>();
		while (m.find()) {
			HtmlHref link = new HtmlHref();
			link.setURL(decodeAmpersand(firstValue(m, 1)));
			link.setLinkString(m.group(4));
			links.add(link);
		}
		return links;
	}

	/**
	 * Analyzes the given HTML and returns the src URL of the frame or iframe
	 * with the given name, with "&amp;amp;" converted to "&amp;".
	 * <p>
	 * Only elements that have a closing tag and whose src attribute is written
	 * before the name attribute are recognized.
	 *
	 * @since 1.12
	 * @param p_name value of the name attribute to look for
	 * @param p_html HTML text to analyze
	 * @return URL of the frame
	 * @throws HtmlFrameException when the text contains frames that share a
	 * name but have different URLs (it cannot be decided which one is meant),
	 * or when no frame with the given name exists
	 */
	public static String parseHtmlFrame(String p_name, String p_html) throws HtmlFrameException {
		Matcher m = FRAME.matcher(stripLineBreaks(p_html));
		HashMap<String,String> srcByName = new HashMap<String,String>();
		while (m.find()) {
			String src = firstValue(m, 1);
			String name = firstValue(m, 4);
			String knownSrc = srcByName.get(name);
			if (knownSrc == null) {
				srcByName.put(name, src);
			} else if (!knownSrc.equals(src)) {
				// Two frames with the same name and different URLs: ambiguous.
				throw new HtmlFrameException("method:HtmlParser.parseHtmlFrame(1) p_name=" + p_name);
			}
		}
		String url = srcByName.get(p_name);
		if (url == null) {
			// Report a missing frame through the declared exception instead of
			// failing with a NullPointerException.
			throw new HtmlFrameException("method:HtmlParser.parseHtmlFrame(2) p_name=" + p_name);
		}
		return decodeAmpersand(url);
	}

}
