/*
 *	Qizx/Open version 0.4p2
 *
 *	Copyright (c) 2003-2004 Xavier C. FRANC -- All rights reserved.
 *
 *	This program is free software; you can redistribute it  and/or
 *	modify it under the terms of the GNU General Public License as
 *	published by the Free Software Foundation (see LICENSE.txt).
 */

package net.xfra.qizxopen.dm;

import net.xfra.qizxopen.util.*;
import java.util.ArrayList;

/**
 *  Full-text queries: parsing, search in fall-back mode (without indexes),
 *  term highlighting.
 *  <p>A query is an AND of <em>required</em> and <em>excluded</em> clauses.
 *  A clause is a phrase or an alternative of terms.
 *  A term is a simple word, a word with wildcards (a la Unix) or an approximate
 *  word (uses a generic Soundex-like phonetic algorithm).
 *  <p>Syntax:<pre>
 *  	query -> clause [ ['AND' | '&']?  ['NOT' | '-']?  clause]*
 *  	clause -> ORterm | phrase
 *  	phrase -> '"' term+ '"' distance?  |  '\'' term+ '\'' distance?
 *  	ORterm -> term [ ['OR' | '|'] term]* 
 * 	term -> [ wordchar | '*' | '?' ]+ distance? 
 *  	distance -> '~' integer?
 *  </pre>
 */
public class FulltextQuery
{
    /** Clauses that must all match; appended by the parser, null while there is none. */
    public Clause[] required;
    /** Clauses that must not match; appended by the parser, null while there is none. */
    public Clause[] excluded;
    /** Sifter set by the parser; also used to extract the words of the nodes examined. */
    public WordSifter wordSifter;

    int termCount;	// number of terms over all clauses (required and excluded); sizes WordFlow.termPos

    /**
     *  Returns a debugging dump of the query: the required clauses followed by
     *  the excluded ones. A clause list that is null is left out.
     */
    public String toString() {
        StringBuffer dump = new StringBuffer(100);
        if(required != null) {
            dump.append("required: ");
            int i = 0;
            while(i < required.length) {
                dump.append(required[i].toString());
                i++;
            }
        }
        if(excluded != null) {
            dump.append(" excluded: ");
            int i = 0;
            while(i < excluded.length) {
                dump.append(excluded[i].toString());
                i++;
            }
        }
        return dump.toString();
    }
    /**
     *  Builds a FulltextQuery from a general query expression.
     *  @param query the query expression to parse.
     *  @param sifter used for extracting and normalizing terms; it becomes the
     *  word sifter of the returned query.
     *  @throws ParseException if the expression is empty or malformed.
     */
    public static FulltextQuery parseQuery( String query, WordSifter sifter )
        throws ParseException {
        Parser parser = new Parser(sifter);
        return parser.parse(query);
    }

    /**
     *  Builds a query made of a single required phrase: the expression is a
     *  plain sequence of terms, without quotes.
     *  @param query the sequence of terms to parse.
     *  @param distance maximum number of interspersed words (0 for exact phrase).
     *  @param sifter used for extracting and normalizing terms.
     *  @throws ParseException if no term is found or if unparsable data remains.
     */
    public static FulltextQuery parsePhrase( String query, int distance,
                                             WordSifter sifter )
        throws ParseException {
        Parser parser = new Parser(sifter);
        return parser.parsePhrase(query, distance);
    }

    /**
     *	Abstract clause: common base of Term, TermOr and Phrase.
     *	A clause owns one or several term patterns; each term is associated with
     *	an iterator slot in WordFlow.termPos, the slots of a clause being
     *	consecutive and starting at the 'termId' passed to nextMatch.
     */
    public static abstract class Clause {
	/** Patterns of the terms of this clause. */
	public StringPattern[] terms;
	// Is there a next match at or after word position 'start' in the flow?
	// On success the iterators of the terms (slots termId, termId+1, ...) are
	// set on the positions of that match.
	abstract boolean nextMatch( WordFlow flow, int termId, int start );
    }
    /**
     *  Alternative of terms: the clause matches where any of its terms matches.
     */
    public static class TermOr extends Clause {
        TermOr(StringPattern[] terms) {
            this.terms = terms;
        }

        // Plain matching stops at the first term that is found. When the flow is
        // used for highlighting, every term is searched so that all the
        // iterators of the clause are moved.
        boolean nextMatch( WordFlow flow, int termId, int start ) {
            boolean found = false;
            int index = 0;
            while(index < terms.length) {
                boolean hit = flow.nextMatchFrom(terms[index], start, termId + index) >= 0;
                if(hit) {
                    if(!flow.forHilite)
                        return true;
                    found = true;
                }
                index++;
            }
            return found;
        }

        public String toString() {
            StringBuffer out = new StringBuffer(" Or(");
            int index = 0;
            while(index < terms.length) {
                if(index != 0)
                    out.append(',');
                out.append(terms[index].toString());
                index++;
            }
            out.append(")");
            return out.toString();
        }
    }

    /**
     *  Sequence of terms, with an optional total spacing between terms.
     *  Can represent NEAR of two terms (spacing = distance-1)
     */
    public static class Phrase extends Clause {
        /** Number of extra words tolerated between the first and the last term. */
        public int spacing;

        Phrase(StringPattern[] terms, int spacing) {
            this.terms = terms;
            this.spacing = spacing;
        }

        /**
         *  Looks for the next occurrence of the phrase at or after 'start'.
         *  On success the iterators termId .. termId + terms.length - 1 hold the
         *  positions of the matched words. On failure they are all set to
         *  Integer.MAX_VALUE, so that a partial match (leading terms found,
         *  a later one missing) is never mistaken for a hit by code that reads
         *  the iterators afterwards.
         */
        boolean nextMatch( WordFlow flow, int termId, int start ) {
            if(flow.forHilite) {
                // dont move if 'start' within phrase
                int lastPos = flow.termPos[termId + terms.length - 1];
                if(start <= lastPos)
                    return true;
            }
            for(;;) {
                int phrStart = start, firstMatch = -1, lastMatch = -1;
                for(int t = 0; t < terms.length; t++) {
                    int mat = flow.nextMatchFrom(terms[t], phrStart, termId + t);
                    if(mat < 0) {
                        // no further occurrence: drop the positions already
                        // recorded for the leading terms of this attempt
                        for(int i = 0; i < terms.length; i++)
                            flow.termPos[termId + i] = Integer.MAX_VALUE;
                        return false;
                    }
                    if(firstMatch < 0)
                        firstMatch = mat;
                    lastMatch = mat;
                    // each term must be found after the previous one
                    phrStart = mat + 1;
                }
                // check spacing: the words must fit in terms.length + spacing positions
                if(lastMatch - firstMatch <= terms.length + spacing - 1)
                    return true;
                // too spread out: go on from match of first word
                start = firstMatch + 1;
            }
        }

        public String toString() {
            StringBuffer out = new StringBuffer(" Phrase(");
            for(int r = 0; r < terms.length; r++) {
                if(r > 0)
                    out.append(',');
                out.append(terms[r].toString());
            }
            return out.append("/"+spacing).append(")").toString();
        }
    }

    /**
     *  Clause made of a single term: a simple word or a pattern.
     */
    public static class Term extends Clause {
        Term( StringPattern form ) {
            terms = new StringPattern[1];
            terms[0] = form;
        }

        // Matches when the single pattern is found at or after 'start'.
        boolean nextMatch( WordFlow flow, int termId, int start ) {
            int position = flow.nextMatchFrom(terms[0], start, termId);
            return position >= 0;
        }

        public String toString() {
            return " Term "+terms[0];
        }
    }

    // --------------------------------------------------------------------------------

    /**
     *  Hand-written parser for the query syntax given in the class documentation.
     *  Characters are read through the WordSifter, which is (re)started on the
     *  query text by parse() and parsePhrase().
     */
    static class Parser
    {
        WordSifter sifter;

        Parser( WordSifter sifter ) {
            this.sifter = sifter;
        }

        /**
         *  Parses a general query: a first clause followed by any number of
         *  clauses, each optionally introduced by 'AND' or '&'.
         *  @throws ParseException if there is no clause at all, or if data
         *  remains after the last clause.
         */
        FulltextQuery parse( String query ) throws ParseException {
            sifter.start(query.toCharArray(), query.length());
            FulltextQuery q = new FulltextQuery();
            q.wordSifter = sifter;
            if(!parseClause(q))
                throw new ParseException("empty query");

            for(;;) {
                // optional conjunction: keyword AND or its shorthand '&'
                if(!pickWord("AND"))
                    pick('&');
                if(!parseClause(q))
                    break;
            }
            if(sifter.charAt(0) != 0)
                throw new ParseException("unrecognized data at end of query: '"+
                                         endOfQuery()+"'");
            return q;
        }

        /**
         *  Parses a sequence of terms without quotes and wraps it into a single
         *  required Phrase clause with the given spacing.
         *  @throws ParseException if no term is found, or if data remains after
         *  the last term.
         */
        FulltextQuery parsePhrase( String query, int distance ) throws ParseException {
            sifter.start(query.toCharArray(), query.length());
            FulltextQuery q = new FulltextQuery();
            q.wordSifter = sifter;
            StringPattern[] terms = null;
            for(StringPattern t = parseTerm(); t != null; t = parseTerm()) {
                terms = append(terms, t);
            }
            if(terms == null)
                throw new ParseException("empty phrase");
            if(sifter.charAt(0) != 0)
                throw new ParseException("unrecognized data at end of phrase: '"+
                                         endOfQuery()+"'");
            addClause( q, false, new Phrase(terms, distance) );
            return q;
        }

        // Collects the characters not yet consumed, for error messages.
        private StringBuffer endOfQuery() {
            StringBuffer s = new StringBuffer();
            for(char c = sifter.charAt(0); c != 0; c = sifter.nextChar())
                s.append(c);
            return s;
        }

        /**
         *  Parses one clause, optionally negated by 'NOT' or '-', and adds it to
         *  the query: a quoted phrase (single or double quotes, optional
         *  distance), an alternative of terms, or a single term.
         *  @return false if no clause starts at the current position.
         */
        boolean parseClause(FulltextQuery query) throws ParseException {
            boolean not = pickWord("NOT") || pick('-');
            char eophr;
            // eophr remembers which quote opened the phrase
            if(pick(eophr = '"') || pick(eophr = '\'')) {
                StringPattern[] terms = null;
                for(StringPattern t = parseTerm(); t != null; t = parseTerm()) {
                    terms = append(terms, t);
                }
                if(terms == null)
                    throw new ParseException("empty phrase");
                if(!pick(eophr))
                    throw new ParseException("end of phrase not found");
                addClause(query, not, new Phrase(terms, parseDistance(4)));
            }
            else {
                StringPattern t = parseTerm();
                if(t == null)
                    return false;
                Clause cl = null;
                if(pick('|') || pickWord("OR")) {
                    StringPattern[] terms = new StringPattern[] { t };
                    do {
                        t = parseTerm();
                        if(t == null)
                            throw new ParseException("expecting term after OR");
                        terms = append(terms, t);
                    } while(pick('|') || pickWord("OR"));
                    cl = new TermOr(terms);
                }
                else cl = new Term(t);
                addClause( query, not, cl );
            }
            return true;
        }

        /**
         *  Parses the next term. Terms reduced to a single character are
         *  skipped, except a lone multi-character wildcard which is an error.
         *  @return a GlobPattern if the term contains an unescaped wildcard or
         *  '[', a SoundsLikePattern if it is followed by a '~' distance,
         *  a plain StringPattern otherwise; null if no term starts here.
         */
        StringPattern parseTerm() throws ParseException {
            for(;;) {
                skip();
                char[] wordBuffer = new char[12];
                int wordLen = 0;
                char cc = sifter.charAt(0), lastc = ' ';
                char STAR = sifter.wildcardSeveral(), SINGLE = sifter.wildcardSingle();
                if( !sifter.isWordStart(cc) && cc != SINGLE && cc != STAR && cc != '[' )
                    break;
                boolean glob = false;
                do {
                    // a wildcard or '[' makes a glob pattern unless preceded by a backslash
                    if( (cc == SINGLE || cc == STAR || cc == '[') && lastc != '\\')
                        glob = true;
                    if(wordLen >= wordBuffer.length) {
                        char[] old = wordBuffer;
                        wordBuffer = new char[ old.length * 2 ];
                        System.arraycopy(old, 0, wordBuffer, 0, old.length);
                    }
                    // the sifter's wildcard characters are stored as '*' and '?'
                    if(cc == STAR) cc = '*';
                    else if(cc == SINGLE) cc = '?';
                    wordBuffer[wordLen++] = sifter.mapChar(cc);
                    lastc = cc;
                } while( sifter.isWordPart(cc = sifter.nextChar())
                         || cc == SINGLE || cc == STAR
                         || cc == '[' || (glob && cc == '^') || cc == ']');

                if(wordLen == 1) {
                    if(wordBuffer[0] == STAR)
                        throw new ParseException("invalid term: '"+STAR+"'");
                    // ignore words of length 1
                    continue;
                }
                if(glob)
                    return new GlobPattern(wordBuffer, wordLen);
                int fuzz = parseDistance(1);
                if(fuzz == 0)
                    return new StringPattern(wordBuffer, wordLen);
                return new SoundsLikePattern(wordBuffer, wordLen, fuzz);
            }
            return null;
        }

        /**
         *  Parses an optional distance: '~' optionally followed by an integer.
         *  @param defaultValue value returned when '~' is not followed by digits.
         *  @return 0 if there is no '~', else the integer or the default value.
         */
        int parseDistance( int defaultValue ) {
            if(!pick('~'))
                return 0;
            char cc = sifter.charAt(0);
            if(!Character.isDigit(cc))
                return defaultValue;
            int value = 0;
            // Character.digit gives the right value for any digit accepted by isDigit
            for( ; Character.isDigit(cc); cc = sifter.nextChar())
                value = 10 * value + Character.digit(cc, 10);
            return value;
        }

        // Consumes the keyword if it comes next (after whitespace) and is not
        // the beginning of a longer word. Nothing is consumed otherwise, apart
        // from the leading whitespace.
        boolean pickWord(String word) {
            skip();
            int L = word.length();
            for(int i = 0; i < L; i++)
                if(sifter.charAt(i) != word.charAt(i))
                    return false;
            if(sifter.isWordPart(sifter.charAt(L)))
                return false;
            for(; --L >= 0; )
                sifter.nextChar();
            return true;
        }

        // Consumes the character if it comes next (after whitespace).
        boolean pick(char token) {
            skip();
            if(sifter.charAt(0) != token)
                return false;
            sifter.nextChar();
            return true;
        }

        // Skips whitespace; character 0 marks the end of the input.
        void skip() {
            char c = sifter.charAt(0);
            for( ; c != 0 && Character.isWhitespace(c); )
                c = sifter.nextChar();
        }

        // Adds the clause to the excluded or required list and counts its terms.
        void addClause(FulltextQuery query, boolean not, Clause clause) {
            query.termCount += clause.terms.length;
            if(not)
                query.excluded = append(query.excluded, clause);
            else
                query.required = append(query.required, clause);
        }

        // Returns a new array with the clause added at the end (list may be null).
        Clause[] append(Clause[] list, Clause clause) {
            if(list == null)
                return new Clause[] { clause };
            Clause[] result = new Clause[list.length + 1];
            System.arraycopy(list, 0, result, 0, list.length);
            result[list.length] = clause;
            return result;
        }

        // Returns a new array with the term added at the end (list may be null).
        StringPattern[] append(StringPattern[] list, StringPattern term) {
            if(list == null)
                return new StringPattern[] { term };
            StringPattern[] result = new StringPattern[list.length + 1];
            System.arraycopy(list, 0, result, 0, list.length);
            result[list.length] = term;
            return result;
        }
    }

    /**
     *	Raised by the query parser on an empty or malformed query expression.
     *	The message describes the problem.
     */
    public static class ParseException extends Exception {
	// not public: instances are created by code of this package only
	ParseException(String reason) {
	    super(reason);
	}
    }

    // -------------- Matching -------------------------------------------------------
    // Note: this is a fallback implementation, dont expect hi-speed ... 
    // (actually around 1 million words/s on a 2.5 GHz P4)

    /**
     *  Tells whether the text contents of the node matches this query:
     *  every required clause must be found and no excluded clause may be found.
     *  @param node node whose words (including those of descendant text nodes)
     *  are searched.
     */
    public boolean matches( Node node ) {
        WordFlow flow = new WordFlow(32, termCount);
        parseWords(node, wordSifter, flow);
        // termId is the first iterator slot of the current clause: slots are
        // allotted to required clauses first, then to excluded ones.
        int termId = 0;
        if(required != null)
            for(int c = 0; c < required.length; c++) {
                if( ! required[c].nextMatch(flow, termId, 0) )
                    return false;
                termId += required[c].terms.length;
            }
        if(excluded != null)
            for(int c = 0; c < excluded.length; c++) {
                if( excluded[c].nextMatch(flow, termId, 0) )
                    return false;
                // advance by the size of the excluded clause itself
                termId += excluded[c].terms.length;
            }
        return true;
    }

    // Sequence of word occurrences: word, node, offset in node, source length.
    // Also holds one iterator (a position in the sequence) per query term.
    static class WordFlow {
        int size;          // number of words stored
        char[][] words;    // extracted normalized form
        Node[]   nodes;    // containing text node
        int[]    offsets;  // offset in chars of the word inside the text node
        int[]    lengths;  // original length of the word (most often == word.length)

        // term iterators:
        int[]     termPos;
        //boolean[] clauseMatched;   // indexed by first term id (like termPos)
        boolean forHilite;

        WordFlow(int asize, int termCount) {
            words = new char[asize][];
            nodes = new Node[asize];
            offsets = new int[asize];
            lengths = new int[asize];
            termPos = new int[termCount];
            //clauseMatched = new boolean[termCount];
        }

        // Appends a word occurrence, doubling the capacity when the arrays are full.
        void add(char[] word, Node node, int offset, int wlength) {
            if(size >= words.length) {
                // at least one slot, so that a flow created with capacity 0 can grow
                int nsize = Math.max(size * 2, 1);
                Node[] oldn = nodes;
                nodes = new Node[nsize];
                System.arraycopy(oldn, 0, nodes, 0, size);
                char[][] oldw = words;
                words = new char[nsize][];
                System.arraycopy(oldw, 0, words, 0, size);
                int[] old = offsets;
                offsets = new int[nsize];
                System.arraycopy(old, 0, offsets, 0, size);
                old = lengths;
                lengths = new int[nsize];
                System.arraycopy(old, 0, lengths, 0, size);
            }
            words[size] = word;
            nodes[size] = node;
            offsets[size] = offset;
            lengths[size] = wlength;
            ++ size;
        }

        // Finds the first word at or after 'start' that matches the pattern and
        // stores its position in the given iterator. Returns that position, or
        // -1 (iterator set to Integer.MAX_VALUE) if there is none.
        int nextMatchFrom( StringPattern pattern, int start, int iterator ) {
            for(int pos = start; pos < size; pos++) {
                if(words[pos] != null && pattern.matches(words[pos])) {
                    return termPos[iterator] = pos;
                }
            }
            termPos[iterator] = Integer.MAX_VALUE;
            return -1;
        }
    }

    // Appends to 'flow' the words of all text nodes found in 'node' (the node
    // itself if it is a text node, else its descendants through children()).
    // Words are extracted with the sifter passed as argument.
    private void parseWords(Node node, WordSifter sifter, WordFlow flow) {
        switch(node.getNature()) {
        case Node.DOCUMENT:
        case Node.ELEMENT:
            // recurse with the same sifter as the one received
            for(NodeSequence seq = node.children(); seq.nextNode(); )
                parseWords(seq.currentNode(), sifter, flow);
            break;
        case Node.TEXT:
            char[] text = node.getChars();
            sifter.start(text, text.length);
            for(char[] word = sifter.nextWord(); word != null; word = sifter.nextWord()) {
                flow.add(word, node, sifter.wordOffset(), sifter.wordLength());
            }
            break;
        // ignore the rest: PI, comment
        }
    }

    // ----------------- highlighting ------------------------------------------------
    /**
     *  Defines the way highlighted words are decorated: a highlighted word
     *  is surrounded by an element of the form <E A="patternN">word</E>, where N
     *  is the rank of the fulltext clause (phrase or alternative of terms),
     *  E is the 'element' field of this hiliter (a QName with default value "span"),
     *  A is the 'attribute' field (a QName with default value "class"), and
     *  pattern is a string prefix to the attribute value (by default "hi").
     *  <p>may also carry elements to highlight specifically (set by addMatchingArea),
     *  by default the whole root fragment may be highlighted.
     */
    public class Hiliter {
        public QName element = QName.get("span");
        public QName attribute = QName.get("class");
        public String pattern = "hi";

        ArrayList areas = new ArrayList();  // areas not yet reached by the traversal
        WordFlow flow;          // words of the area being highlighted
        Node nextLitNode;       // text node holding the next word to decorate, or null
        int  nextLitIndex;      // position of that word in the flow
        int  nextLitClause;     // rank of the required clause that word belongs to

        /**
         *   Defines a particular element to highlight inside the root fragment.
         *   Only the first pending area is compared with the traversed nodes,
         *   so areas are expected to be added in traversal order.
         */
        public void addMatchingArea( Node area ) {
            if(areas == null)
                areas = new ArrayList();
            areas.add(area);
        }

        /**
         *  Called for each traversed node. Starts highlighting if the node is
         *  the next matching area, then, if the node is the text node holding
         *  the next hit, writes its text with all its hits decorated.
         *  @return true if the text of the node has been written to 'result'.
         */
        boolean start(Node node, XMLEventReceiver result) throws DataModelException {
            if(areas != null && areas.size() > 0 && node.equals(areas.get(0))) {
                areas.remove(0);
                startHiliting(node);
            }
            // is this node the next to hilite ?
            if(nextLitNode == null || !node.equals(nextLitNode))
                return false;
            int textStart = 0;
            String text = node.getStringValue();
            do {
                // text from current pos to hit
                int offset = flow.offsets[nextLitIndex];
                int length = flow.lengths[nextLitIndex];

                result.text( text.substring(textStart, offset) );
                // generate hit:
                result.startElement(element);
                result.attribute(attribute, pattern + nextLitClause);
                textStart = offset + length;
                result.text( text.substring(offset, textStart) );
                result.endElement(element);
                // find next hit
                nextHilite();
            } while(nextLitNode != null && node.equals(nextLitNode));
            // end of text node:
            result.text( text.substring(textStart) );
            return true;
        }

        /**
         *  Extracts the words of the area and positions on its first hit.
         *  The word flow is reused from one area to the next.
         */
        void startHiliting(Node area) {
            if(flow == null)
                flow = new WordFlow(32, termCount);
            flow.size = 0;
            // Forget the positions left by a previous area. -1 is before any
            // word, so that no clause considers itself already positioned.
            for(int t = 0; t < flow.termPos.length; t++)
                flow.termPos[t] = -1;
            // Highlight mode from the very first search: all the iterators of
            // a clause are then moved, none is left on a meaningless position.
            flow.forHilite = true;
            parseWords(area, wordSifter, flow);
            nextLitIndex = -1;
            nextHilite();
        }

        /**
         *  Moves to the next hit: the smallest position, after the current one,
         *  among the iterators of the required clauses that still match.
         *  Sets nextLitNode to null when there is no further hit.
         */
        void nextHilite() {
            if(required == null)
                return;
            int termId = 0;
            int next = Integer.MAX_VALUE;
            for(int c = 0; c < required.length; c++) {
                int firstTerm = termId;
                int termCnt = required[c].terms.length;
                // always advance, so that each clause keeps its own iterator
                // slots even when a previous clause has no more match
                termId += termCnt;
                if(! required[c].nextMatch(flow, firstTerm, nextLitIndex + 1))
                    continue;
                for(int t = 0; t < termCnt; t++) {
                    int tix = flow.termPos[firstTerm + t];
                    if(tix < next && tix > nextLitIndex) {
                        next = tix;
                        nextLitClause = c;
                    }
                }
            }
            nextLitIndex = next;
            nextLitNode =
                (nextLitIndex == Integer.MAX_VALUE)? null : flow.nodes[nextLitIndex];
        }
    }

    /**
     *  Highlighting matching nodes inside a document fragment with hiliter patterns.
     *  If no matching area is pending in the hiliter, the whole fragment is
     *  highlighted.
     *  @param fragment source document fragment
     *  @param hiliter defines the way highlighted words are decorated
     *  (see {@link Hiliter}).
     *  @param result highlighted output.
     */
    public void highlight( Node fragment, Hiliter hiliter, XMLEventReceiver result )
        throws DataModelException {
        // 'areas' is created empty by the hiliter, so emptiness (not only null)
        // means that no specific area was requested.
        if(hiliter.areas == null || hiliter.areas.isEmpty())
            hiliter.startHiliting(fragment);	// all the fragment
        hiTraversal(fragment, hiliter, result );
    }

    // Copies the node and its descendants to 'result'. The hiliter is consulted
    // first for every node: it may start a matching area there, and it writes
    // by itself the text nodes that contain hits.
    private void hiTraversal( Node node, Hiliter hiliter, XMLEventReceiver result )
        throws DataModelException {
        boolean textDone = hiliter.start(node, result);
        switch(node.getNature()) {
        case Node.DOCUMENT: {
            result.startDocument();
            NodeSequence kids = node.children();
            while(kids.nextNode())
                hiTraversal(kids.currentNode(), hiliter, result);
            result.endDocument();
            break;
        }
        case Node.ELEMENT: {
            result.startElement( node.getNodeName() );
            NodeSequence attrs = node.attributes();
            while(attrs.nextNode()) {
                Node attr = attrs.currentNode();
                result.attribute( attr.getNodeName(), attr.getStringValue());
            }
            NodeSequence kids = node.children();
            while(kids.nextNode())
                hiTraversal(kids.currentNode(), hiliter, result);
            result.endElement( node.getNodeName() );
            break;
        }
        case Node.TEXT:
            // text without hit: copied as is
            if(textDone)
                break;
            result.text( node.getStringValue() );
            break;

        case Node.PROCESSING_INSTRUCTION:
            result.pi( node.getNodeName().toString(), node.getStringValue() );
            break;

        case Node.COMMENT:
            result.comment( node.getStringValue() );
            break;
        }
    }
} // end of class FulltextQuery
