/*
 * Created: 2008/05/18
 */
package asanhtmlparser;

import java.net.URL;
import java.util.Locale;

import asanhtmlparser.util.Logger;
import asanhtmlparser.util.NetUtil;



/**
 * Lexical scanner (tokenizer) for HTML source text.
 *
 * <p>Usage: call {@link #setSource(String, URL)} once, then call
 * {@link #getToken()} repeatedly until it returns {@code null}.
 *
 * <p>Position convention used throughout this class (established by how the
 * code indexes {@code reader.source}): {@link #ch} holds the character most
 * recently returned by {@code reader.getChar()}, and
 * {@code reader.getScannerPosition().pos} is the index of the NEXT character
 * to be read, i.e. {@code ch} sits at index {@code pos - 1}.
 */
public class HtmlScanner {
    /** Logger for this class. */
    private static final Logger logger = Logger.getLogger(HtmlScanner.class);
    /** Character source the scanner reads from. */
    CharReader reader = new CharReader();
    /** Current look-ahead character, or {@code CharReader.EOF}. */
    int ch = -1;
    // 2008/11/25 a-san: support for <script> elements whose body is not wrapped in an HTML comment.
    /**
     * Set after an opening {@code <script>} tag whose body is not protected by
     * an HTML comment; the next token is then read verbatim up to {@code </script>}.
     */
    private boolean badScriptTag = false;

    /**
     * Loads the text to scan and primes the one-character look-ahead.
     *
     * @param source HTML text to tokenize; must not be {@code null}
     * @param url    passed through to the underlying reader
     */
    public void setSource(String source, URL url) {
        assert source != null;
        reader.setSource(source, url);
        logger.debug(source);
        ch = reader.getChar();
        badScriptTag = false;
    }

    /**
     * Returns the next token, converting parse failures into {@link HtmlError} nodes.
     *
     * <p>On a parse failure the reader is rewound to where the token started and
     * everything up to (not including) the next {@code '<'} or end of input is
     * returned as the error text, so scanning always makes progress.
     *
     * @return the next node, or {@code null} once the end of input has been reached
     */
    public IHtmlNode getToken() {
        ScannerPosition srcpos = reader.getScannerPosition();
        int ch0 = ch;
        try {
            return getToken(srcpos);
        } catch (HtmlParserException ex) {
            // Rewind to just after the token's first character (ch0), then
            // resynchronise on the next '<'.
            reader.setScannerPosition(srcpos);
            ch = reader.getChar();
            String skipped = skip();
            return new HtmlError(ex, "" + (char) ch0 + skipped, srcpos);
        }
    }

    /**
     * Reads the next token.
     *
     * @param srcpos scanner position taken before this call, while {@link #ch}
     *               was the first character of the token
     * @return an {@code HtmlComment}, {@code HtmlElement} or {@code HtmlText};
     *         {@code null} if the end of input has already been reached
     * @throws HtmlParserException if the markup cannot be tokenized
     */
    public IHtmlNode getToken(ScannerPosition srcpos) throws HtmlParserException {
        if (ch == CharReader.EOF) return null;

        if (badScriptTag) {
            badScriptTag = false;
            return readScriptBody(srcpos);
        }
        if (ch != '<') {
            return readText(srcpos);
        }

        String type = HtmlElement.OPEN_TAG;
        ch = reader.getChar();
        if (ch == '!') {
            ch = reader.getChar();
            if (ch != '-') {
                // "<!DOCTYPE ...>" and similar declarations.
                return readDeclaration(srcpos);
            }
            ch = reader.getChar();
            if (ch != '-') reader.throwException("'-' ܂:"+(char)ch);
            // getHttpComment() advances the reader past the closing "-->" itself.
            return new HtmlComment(getHttpComment(srcpos), srcpos);
        }
        if (ch == '/') {
            type = HtmlElement.CLOSE_TAG;
            ch = reader.getChar();
            // 2008/11/22 A-san: tolerate the empty close tag "</>".
            ch = skipWhiteSpace();
            if (ch == '>') {
                ch = reader.getChar();
                return new HtmlElement(HtmlElement.SINGLE_TAG, "", srcpos);
            }
        }
        ch = skipWhiteSpace();
        String tagname = getIdentifier();
        ch = skipWhiteSpace();
        HtmlElement tag = new HtmlElement(type, tagname, srcpos);
        readAttributes(tag);

        if (ch == '/') {
            tag.type = HtmlElement.SINGLE_TAG;
            ch = reader.getChar();
        }
        if (ch != '>') reader.throwException("'>' ܂ ch="+(char)ch);
        ch = reader.getChar();

        // 2008/11/25 a-san: a <script> whose "</script>" comes before any "<!--"
        // (or that has no "<!--" at all) has an unprotected body; read it raw next time.
        if (tag.tagname.equalsIgnoreCase("script") && HtmlElement.OPEN_TAG.equals(tag.type)) {
            int commentPos = reader.source.indexOf("<!--", srcpos.pos);
            int endPos = indexOfIgnoreCase(reader.source, "</script>", srcpos.pos);
            if (commentPos == -1 || endPos < commentPos) {
                badScriptTag = true;
            }
        }
        return tag;
    }

    /** Reads raw script text up to (not including) the next {@code </script>}. */
    private IHtmlNode readScriptBody(ScannerPosition srcpos) throws HtmlParserException {
        // srcpos.pos - 1 is the index of the current character.
        int endPos = indexOfIgnoreCase(reader.source, "</script>", srcpos.pos - 1);
        StringBuilder text = new StringBuilder();
        // When no "</script>" exists endPos is -1 and an empty text node results.
        while (reader.getScannerPosition().pos <= endPos) {
            text.append((char) ch);
            ch = reader.getChar();
        }
        return new HtmlText(text.toString(), srcpos);
    }

    /** Reads plain text up to (not including) the next {@code '<'} or end of input. */
    private IHtmlNode readText(ScannerPosition srcpos) throws HtmlParserException {
        StringBuilder text = new StringBuilder();
        do {
            text.append((char) ch);
            ch = reader.getChar();
        } while (ch != '<' && ch != CharReader.EOF);
        return new HtmlText(text.toString(), srcpos);
    }

    /**
     * Reads the remainder of a {@code <!...>} declaration; on entry {@link #ch}
     * is the first character after {@code "<!"}.
     */
    private IHtmlNode readDeclaration(ScannerPosition srcpos) throws HtmlParserException {
        StringBuilder decl = new StringBuilder();
        while (ch != '>') {
            if (ch == CharReader.EOF) {
                reader.throwException("unexpected end of input inside '<!...>' declaration");
            }
            decl.append((char) ch);
            ch = reader.getChar();
        }
        ch = reader.getChar();
        return new HtmlElement(HtmlElement.SINGLE_TAG, decl.toString(), srcpos);
    }

    /**
     * Reads the attributes of a tag into {@code tag}; stops with {@link #ch} on
     * {@code '>'}, {@code '/'} or end of input. Leading whitespace must already
     * have been skipped by the caller.
     */
    private void readAttributes(HtmlElement tag) throws HtmlParserException {
        while (ch != '>' && ch != '/' && ch != CharReader.EOF) {
            String attrname = getAttributeName();
            ch = skipWhiteSpace();
            if (ch == '=') {
                // name=value form, e.g. href="http://www.google.co.jp/"
                ch = reader.getChar();
                ch = skipWhiteSpace();
                String attrvalue = getAttributeValue();
                logger.debug("name="+attrname+" value="+attrvalue+" ch="+(char)ch);
                // Strip surrounding double quotes.
                // NOTE(review): single quotes returned by getQuotedString('\'') are
                // kept in the value, as before - confirm whether callers rely on that.
                if (attrvalue.startsWith("\"")) attrvalue = attrvalue.substring(1);
                if (attrvalue.endsWith("\"")) attrvalue = attrvalue.substring(0, attrvalue.length()-1);
                tag.addAttribute(attrname.toLowerCase(Locale.ROOT), attrvalue);
            } else {
                // Value-less form, e.g. nowrap
                logger.debug("name=null value="+attrname+" ch="+(char)ch);
                tag.addAttribute(attrname);
            }
            // Skip trailing whitespace here so that "<a b >" or "<img src=x />"
            // does not loop once more and register an empty attribute name.
            ch = skipWhiteSpace();
        }
    }

    /**
     * Returns the index of the first occurrence of {@code needle} in {@code text}
     * at or after {@code fromIndex}, ignoring case, or -1 if there is none.
     * Unlike {@code text.toLowerCase().indexOf(...)} this does not depend on the
     * default locale and does not copy the text.
     */
    private static int indexOfIgnoreCase(String text, String needle, int fromIndex) {
        int last = text.length() - needle.length();
        for (int i = Math.max(fromIndex, 0); i <= last; i++) {
            if (text.regionMatches(true, i, needle, 0, needle.length())) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Skips whitespace characters.
     *
     * @return the first non-whitespace character (also stored in {@link #ch})
     */
    int skipWhiteSpace() throws HtmlParserException {
        while (Character.isWhitespace((char) ch)) {
            ch = reader.getChar();
        }
        return ch;
    }

    /**
     * Reads an identifier (used for tag names): the current character followed
     * by every subsequent Java-identifier-part character.
     *
     * <p>The current character is always consumed, whatever it is; the input is
     * untrusted HTML, so no assertion is made about it.
     */
    String getIdentifier() throws HtmlParserException {
        StringBuilder ident = new StringBuilder();
        do {
            ident.append((char) ch);
            ch = reader.getChar();
        } while (Character.isJavaIdentifierPart((char) ch));
        return ident.toString();
    }

    /**
     * Reads an attribute name: everything up to whitespace, {@code '='},
     * {@code '/'}, {@code '>'} or end of input. May return an empty string.
     */
    String getAttributeName() throws HtmlParserException {
        StringBuilder name = new StringBuilder();
        while (ch != CharReader.EOF
                && ch != '=' && ch != '/' && ch != '>'
                && !Character.isWhitespace((char) ch)) {
            name.append((char) ch);
            ch = reader.getChar();
        }
        logger.debug("getAttributeName()=["+name+"] ch="+(char)ch);
        return name.toString();
    }

    /**
     * Reads an attribute value.
     *
     * <p>A value starting with {@code "} or {@code '} is read up to the matching
     * quote and returned INCLUDING both quotes. Otherwise the value runs up to
     * whitespace, {@code '>'} or end of input; a {@code '/'} is part of an
     * unquoted value (so {@code href=foo/>} yields {@code "foo/"}), which is also
     * how the HTML Standard tokenizes unquoted attribute values.
     */
    String getAttributeValue() throws HtmlParserException {
        if (ch == '"' || ch == '\'') {
            return getQuotedString((char) ch);
        }
        StringBuilder value = new StringBuilder();
        while (ch != '>' && ch != CharReader.EOF && !Character.isWhitespace((char) ch)) {
            value.append((char) ch);
            ch = reader.getChar();
        }
        logger.debug("getAttributeValue()=["+value+"] ch="+(char)ch);
        return value.toString();
    }

    /**
     * Reads a string enclosed in quotes; on entry {@link #ch} must be the opening quote.
     *
     * @param quote the quote character ({@code '} or {@code "})
     * @return the string including the opening and closing quote
     * @throws HtmlParserException if the input ends before the closing quote
     */
    String getQuotedString(char quote) throws HtmlParserException {
        assert ch == quote: ch;
        StringBuilder str = new StringBuilder();
        str.append(quote);
        ch = reader.getChar();
        while (ch != quote) {
            if (ch == CharReader.EOF) {
                reader.throwException("unterminated quoted string, expected closing " + quote);
            }
            str.append((char) ch);
            ch = reader.getChar();
        }
        ch = reader.getChar();
        return str.append(quote).toString();
    }

    /**
     * Reads an HTML comment (&lt;!-- ... -->) and leaves {@link #ch} on the first
     * character after the closing {@code "-->"}.
     *
     * <p>The closing marker is searched from just after {@code "<!"}, so short
     * comments such as {@code <!---->}, {@code <!-- -->} and the degenerate
     * {@code <!-->} are recognised.
     *
     * @param start position taken while {@link #ch} was the comment's {@code '<'}
     * @return the complete comment text, from {@code "<!--"} through {@code "-->"}
     * @throws HtmlParserException if no closing {@code "-->"} exists
     */
    String getHttpComment(ScannerPosition start) throws HtmlParserException {
        String source = reader.source;
        int open = start.pos - 1;   // index of the '<' that opens the comment
        int close = source.indexOf("-->", open + 2);
        if (close == -1) {
            // Consume the rest of the input before reporting, as before.
            while (ch != CharReader.EOF) {
                ch = reader.getChar();
            }
            reader.throwException("Rg̖ '-->' ܂");
        }
        int end = close + 3;        // index of the first character after "-->"
        // Advance through the reader until ch == source[end].
        while (reader.getScannerPosition().pos <= end) {
            ch = reader.getChar();
            if (ch == CharReader.EOF) break;
        }
        return source.substring(open, end);
    }

    /**
     * Skips forward to the next {@code '<'} or end of input.
     *
     * @return the skipped characters, starting with the current {@link #ch};
     *         empty if already at {@code '<'} or at end of input
     */
    public String skip() {
        StringBuilder result = new StringBuilder();
        while (ch != '<' && ch != CharReader.EOF) {
            result.append((char) ch);
            ch = reader.getChar();
        }
        return result.toString();
    }

    /**
     * Command-line driver: downloads the URL given as the first argument and
     * prints every token, followed by the terminating {@code null}.
     */
    public static void main(String[] args) throws Exception {
        System.out.println("HtmlScanner.main()");
        //logger.objThreshold = Logger.ALL;
        if (args.length==0) {
            System.out.println("usage:java HtmlScanner <url>");
            return;
        }
        URL url = new URL(args[0]);
        System.out.println("url="+url);
        byte[] bytes = NetUtil.downloadContents(url);
        String charset = "JISAutoDetect";
        String source = new String(bytes, charset);
        HtmlScanner scanner = new HtmlScanner();
        scanner.setSource(source, url);
        while (true) {
            IHtmlNode node = scanner.getToken();
            System.out.println(node);
            if (node == null) break;
        }
    }
}
