﻿/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
  <file> Html5Lex.cs </file>
  <brief>
    Tokenize and dump the html5 file.
  </brief>
  <author>
    Stanley Hong <link2next@gmail.com>, Dec. 2013.
  </author>
*/

using System;
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

using uls.collection;
//
// To generate the file 'Html5LexBasis.cs' from 'html5.ulc', use Ulc2Class.exe
//    Ulc2Class.exe -lcs -o Html5LexBasis.cs -n uls.nemesis.Html5LexBasis html5.ulc
//
namespace uls
{
    namespace tests
    {
        namespace Html5Toks
        {
            public class Html5Lex : Html5LexBasis
            {
                // Scratch buffer in which concat_lexeme() assembles a multi-part lexeme
                // (a prefix string followed by the next base-lexer token text).
                StringBuilder tokbuf;

                // Buffer for raw characters read directly from 'fin' (text between tags,
                // or the text of a whole tag up to '>').
                StringBuilder txt_buf;
                // State flag driving getTok():
                //   != 0 : the next step is to read raw text from 'fin' up to the next '<'
                //    < 0 at entry of getTok(): no more input, EOI is returned
                // setFile() sets it to 1; getTok() uses -1 (EOF reached), -2 (malformed tag)
                // and -3 (re-enter text mode within the same call).
                int prepare_html_text;

                // Reader over the current input file; assigned by setFile().
                StreamReader fin;

                // Text and id of the current token, as exposed through TokStr / TokNum.
                String tok_str;
                int tok_id;
                // Set by ungetTok(); makes the next getTok() return tok_id again.
                bool tok_ungot;

                /// <summary>
                /// Tells whether the given character code is an ASCII decimal digit.
                /// </summary>
                /// <param name="ch">Character code to test.</param>
                /// <returns>true when <paramref name="ch"/> lies in '0'..'9'; false otherwise.</returns>
                private bool is_digit(uint ch)
                {
                    return '0' <= ch && ch <= '9';
                }

                /// <summary>
                /// Advances the base lexer by one token and appends <paramref name="str"/>
                /// followed by that token's text to <c>tokbuf</c>.
                /// </summary>
                /// <param name="str">Prefix to place in front of the next token's text.</param>
                /// <param name="len">Not used by this implementation.</param>
                /// <returns>The length of <c>tokbuf</c> after both parts were appended.</returns>
                private int concat_lexeme(String str, int len)
                {
                    // The token must be fetched before its text is read from the base lexer.
                    base.getTok();

                    tokbuf.Append(str).Append(base.TokStr);
                    return tokbuf.Length;
                }

                /// <summary>
                /// Reads characters from <paramref name="fp"/> into <paramref name="txt_buf"/>
                /// until an unescaped '&lt;' or the end of the stream is reached.
                /// </summary>
                /// <remarks>
                /// A backslash escapes the following character: both the backslash and the
                /// escaped character are stored exactly once, so an escaped '&lt;' does not end
                /// the text. Unescaped newlines are dropped. The terminating '&lt;' is consumed
                /// from the reader but not stored.
                /// </remarks>
                /// <param name="fp">Reader positioned at the start of the text.</param>
                /// <param name="txt_buf">Buffer that receives the text (appended to).</param>
                /// <param name="is_trivial">
                /// Set to true when nothing but spaces, tabs and (dropped) newlines were read.
                /// </param>
                /// <returns>'&lt;' when a tag start was found, or -1 at end of stream.</returns>
                private int run_to_tagbegin(StreamReader fp, StringBuilder txt_buf, out bool is_trivial)
                {
                    bool escape = false, bTrivial = true;
                    int ich;

                    // Read() reports end of stream with -1 only; a NUL character (0) is
                    // ordinary data and must not be mistaken for the end of input.
                    while ((ich = fp.Read()) >= 0)
                    {
                        char ch = (char) ich;

                        if (escape)
                        {
                            // Escaped character: keep it verbatim (even '<' or a newline).
                            txt_buf.Append(ch);
                            escape = false;
                        }
                        else if (ch == '<')
                        {
                            break;
                        }
                        else if (ch == '\\')
                        {
                            escape = true;
                            txt_buf.Append(ch);
                            bTrivial = false;
                        }
                        else if (ch != '\n')
                        {
                            // Must stay in the else-chain: otherwise the backslash and the
                            // escaped character would be stored a second time.
                            txt_buf.Append(ch);
                            if (ch != ' ' && ch != '\t') bTrivial = false;
                        }
                    }

                    is_trivial = bTrivial;
                    return ich;
                }

                /// <summary>
                /// Copies the remainder of a quoted string from <paramref name="fp"/> into
                /// <paramref name="txt_buf"/>, up to and including the closing quote.
                /// </summary>
                /// <remarks>
                /// The opening quote is expected to have been consumed by the caller.
                /// A backslash makes the following character part of the string, so an
                /// escaped quote does not close it. Every character read is stored.
                /// </remarks>
                /// <param name="fp">Reader positioned just after the opening quote.</param>
                /// <param name="txt_buf">Buffer that receives the characters (appended to).</param>
                /// <param name="quote_ch">The quote character that closes the string.</param>
                /// <returns>0 when the closing quote was found; -1 when the stream ended first.</returns>
                private int pass_html_quote(StreamReader fp, StringBuilder txt_buf, char quote_ch)
                {
                    bool escaped = false;

                    while (true)
                    {
                        int code = fp.Read();
                        if (code < 0)
                        {
                            Console.WriteLine("Unexpected EOF");
                            return -1;
                        }

                        char c = (char) code;

                        // Every character is kept; only the state transition differs.
                        txt_buf.Append(c);

                        if (escaped)
                        {
                            escaped = false;
                            continue;
                        }

                        if (c == quote_ch)
                        {
                            return 0;
                        }

                        if (c == '\\')
                        {
                            escaped = true;
                        }
                    }
                }

                /// <summary>
                /// Copies the rest of a tag from <paramref name="fp"/> into
                /// <paramref name="txt_buf"/>, up to and including the closing '&gt;'.
                /// </summary>
                /// <remarks>
                /// Single- and double-quoted strings are copied as a whole through
                /// pass_html_quote(), so a '&gt;' inside a quoted value does not end the tag.
                /// </remarks>
                /// <param name="fp">Reader positioned inside a tag.</param>
                /// <param name="txt_buf">Buffer that receives the characters (appended to).</param>
                /// <returns>0 when '&gt;' was found; -1 when the stream ended first.</returns>
                private int run_to_tagend(StreamReader fp, StringBuilder txt_buf)
                {
                    while (true)
                    {
                        int code = fp.Read();
                        if (code < 0)
                        {
                            Console.WriteLine("unexpected terminating file!");
                            return -1;
                        }

                        char c = (char) code;
                        txt_buf.Append(c);

                        switch (c)
                        {
                            case '>':
                                return 0;

                            case '\'':
                            case '"':
                                // Swallow the quoted value including its closing quote.
                                if (pass_html_quote(fp, txt_buf, c) < 0)
                                {
                                    Console.WriteLine("unexpected html element!");
                                    return -1;
                                }
                                break;
                        }
                    }
                }

                /// <summary>
                /// Creates the lexer and resets the current-token state.
                /// No input file is attached yet; call setFile() for that.
                /// </summary>
                /// <param name="config_name">Passed through unchanged to the base-class constructor.</param>
                public Html5Lex(String config_name)
                    : base(config_name)
                {
                    // Current-token state: nothing read, nothing pushed back.
                    tok_ungot = false;
                    tok_str = "";
                    tok_id = NONE;

                    // Working buffers.
                    txt_buf = new StringBuilder();
                    tokbuf = new StringBuilder();
                }

                /// <summary>
                /// Returns the id of the next token of the HTML input; its text is
                /// available through TokStr afterwards.
                /// </summary>
                /// <remarks>
                /// The method alternates between two modes:
                /// <list type="bullet">
                /// <item>text mode (prepare_html_text != 0): raw characters are read from
                /// the file up to the next '&lt;'; non-blank text is returned as TEXT;</item>
                /// <item>tag mode: the whole tag text up to '&gt;' is read from the file and
                /// handed to pushInput(), then tokenized via the base-class getTok().</item>
                /// </list>
                /// A token pushed back with ungetTok() is returned again without reading.
                /// </remarks>
                /// <returns>
                /// TEXT, TAGBEGIN, TAGEND, NUM, EOI, ERR, or whatever id the base lexer
                /// produced for a token inside a tag.
                /// </returns>
                public override int getTok()
                {
                    int ich;
                    int tok;
                    bool is_trivial;
                    uint ch;

                    if (tok_ungot == true)
                    {
                        tok_ungot = false;
                        return tok_id;
                    }

                    txt_buf.Length = 0;
                    tokbuf.Length = 0;

                    // A negative state left by an earlier call (-1: EOF, -2: malformed tag)
                    // means there is nothing more to deliver.
                    if (prepare_html_text < 0) {
                        tok_id = EOI;
                        tok_str = "";
                        return tok_id;
                    }

                again_1:
                    if (prepare_html_text != 0) {
                        txt_buf.Length = 0;

                        // From here, to '<' may be a html text.
                        if ((ich = run_to_tagbegin(fin, txt_buf, out is_trivial)) == -1) {
                            // The text from here to EOF is just space.
                            prepare_html_text = -1;
                        } else {
                            prepare_html_text = 0;
                        }

                        // tok_id is set to TEXT even for blank text so that the tag
                        // branch below is entered right away in that case.
                        tok_id = TEXT;
                        if (txt_buf.Length > 0 && !is_trivial) {
                            // Non-trivial text exists.
                            tok_str = txt_buf.ToString();
                            return TEXT;
                        }

                        if (ich < 0) {
                            tok_id = EOI;
                            tok_str = "";
                            return EOI;
                        }
                    }

                    if (tok_id == TEXT)
                    {
                        // If the current token was HTML-TEXT, the '<' has already been
                        // consumed from the file: rebuild the tag text starting with it.
                        txt_buf.Length = 0;
                        txt_buf.Append('<');

                        if (run_to_tagend(fin, txt_buf) < 0) {
                            tok_id = ERR;
                            tok_str = "";
                            return ERR;
                        }

                        // Hand the complete tag text to the base lexer.
                        pushInput(txt_buf.ToString());

                        if ((tok = base.getTok()) != '<') {
                            prepare_html_text = -3;
                            goto again_1;
                        }

                        tok = base.getTok();

                        if (tok == '/') {
                            // A closing tag '</name' detected; the name follows the '/'.
                            tok_id = TAGEND;
                            tok = base.getTok();
                            tok_str = base.TokStr;

                        } else {
                            tok_id = TAGBEGIN;

                            if (tok == '!') { // '<!'
                                if ((ch = base.peekCh()) == '-')
                                {
                                    ch = base.getCh();
                                    if ((ch = base.peekCh()) == '-') {
                                        // '<!--'
                                        // skip html comment
                                        prepare_html_text = -3;
                                        goto again_1;
                                    } else {
                                        tok_id = ERR;
                                        tok_str = "";
                                        return tok_id;
                                    }
                                } else {
                                    // e.g. '<!DOCTYPE': the token text becomes "!" + name.
                                    concat_lexeme("!", 1);
                                    tok = base.TokNum;
                                    tok_str = tokbuf.ToString();
                                }
                            } else {
                                tok_str = base.TokStr;
                            }
                        }

                        // A tag name must be an identifier.
                        if (tok != ID) {
                            tok_id = ERR;
                            tok_str = "";
                            prepare_html_text = -2;
                            return tok_id;
                        }

                        return tok_id;
                    }

                    // Inside a tag: tokens come from the base lexer.
                    tok = base.getTok();
                    tok_str = base.TokStr;

                    if (tok == '-') {
                        if ((ch = base.peekCh()) == '.' || is_digit(ch)) {
                            // Negative number: glue the sign to the following number token.
                            tok = NUM;
                            concat_lexeme("-", 1);
                            tok_id = tok;
                            // concat_lexeme() builds the lexeme in tokbuf; txt_buf is
                            // empty at this point.
                            tok_str = tokbuf.ToString();
                            return tok;
                        }
                    } else if (tok == '/' && base.peekCh() == '>') {
                        // Self-closing '/>': the '>' itself is handled by the next call.
                        tok = TAGEND;
                        tok_str = "";
                    } else if (tok == '>') {
                        // End of the tag: switch back to text mode.
                        prepare_html_text = -3;
                        goto again_1;
                    }

                    tok_id = tok;
                    return tok_id;
                }

                /// <summary>
                /// Pushes the current token back: the next getTok() call returns the
                /// same token id again without reading any input.
                /// </summary>
                public override void ungetTok()
                {
                    this.tok_ungot = true;
                }

                /// <summary>
                /// Id of the current token, as stored by the last getTok() call.
                /// </summary>
                public override int TokNum
                {
                    get { return this.tok_id; }
                }

                /// <summary>
                /// Text of the current token, as stored by the last getTok() call.
                /// </summary>
                public override String TokStr
                {
                    get { return this.tok_str; }
                }
 
                /// <summary>
                /// Attaches the file at <paramref name="fpath"/> as the lexer input and
                /// resets the token state so that getTok() starts in text mode.
                /// </summary>
                /// <remarks>
                /// A reader opened by an earlier call is disposed. Exceptions thrown by
                /// the <see cref="StreamReader"/> constructor (e.g. file not found)
                /// propagate to the caller; in that case the current input is left
                /// untouched.
                /// </remarks>
                /// <param name="fpath">Path of the HTML file to tokenize.</param>
                /// <returns>Always 0.</returns>
                public int setFile(String fpath)
                {
                    // Open the new file first so that a failure keeps the old input usable.
                    StreamReader fp = new StreamReader(fpath);

                    // Release the handle of the previously attached file, if any.
                    if (fin != null)
                    {
                        fin.Dispose();
                    }

                    fin = fp;
                    prepare_html_text = 1;

                    tok_id = NONE;
                    tok_str = "";
                    // A token pushed back on the previous input must not leak into this one.
                    tok_ungot = false;

                    return 0;
                }
            }
        }
    }
}
