/*
 Copyright (C) 1991-2012 the Free Software Foundation, Inc.
 Copyright (C) 2021 Momi-g

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

/*-*
@name	ureg
@_brief	portable regex with posix-ERE + binary 
@synopsis

 typedef struct ureg_tag {
 	void* data;		//opaque regobj
 	char* fastmap;	//fastmap[0]-[255]. holds 1st byte hit or not
 	char* lc;		//locale str
 	int rawflg;		//locale flg
 	const char* p;	//result buff
 	int sz;		//buff
 	const char* pbk[10];	//buff
 	int szbk[10];	//buff
 } ureg_t;

 ureg_t* ureg_new(const char* reg [,int reglen, int syn] )
 ureg_t* ureg_new_raw(const char* reg [,int reglen, int syn] )
 int ureg_search(ureg_t* obj, const char* s [,int slen] )
 int ureg_search_head(ureg_t* obj, const char* s [,int slen] )
 void ureg_free(ureg_t* obj)

 // helper/wrapper funcs
 ureg_t ureg_easy(const char* reg,const char* s [,int reglen,int slen,int syn])
 int ureg_dflsyn(void);
 void* ureg_iconv(const char* toenc, const char* fromenc, char** s [,int slen])

@_eg
 #include "ureg.h"

 int main() {
 //simple
 	ureg_t res = ureg_easy("a[bc]", "123abc"); //res.p="123abc"+3, res.sz=2
 	res = ureg_easy("a(b|c)", "123abc");  //res.pbk[1]="123abc"+4,res.szbk[1]=1
		printf("%.*s", res.szbk[0], res.pbk[0]);	//>> "ab"

 //complex
 	ureg_t* obj = ureg_new("a(b|c)");	//rtn NULL: err
 	const char* s = "123abc";
 	int rc = ureg_search(obj, s);	//rc== hit:>=0 nohit/err:<0
 	  //obj->p=s+3, obj->sz=2
 	  //obj->pbk[0]= s+3, obj->szbk[0] =2	//p==pbk[0]: all the match string
 	  //obj->pbk[1]= s+4, obj->szbk[1] =1	//backref: (b|c) hits 123a(b)c
 	  //obj->pbk[2]= NULL,obj->szbk[2] =-1	//set NULL/-1 if no result

 	rc = ureg_search_head(obj, s);	//search_head: regex hits only string top
 	  // obj->p= NULL, obj->sz= -1	//nohit
 	rc = ureg_search_head(obj, s+3);
 	  // obj->p= s+3, obj->sz= 2
 	ureg_free(obj);
 	
 	setlocale(LC_CTYPE, "UTF32");	// ~$ iconv -l
 	obj = ureg_new_raw("a(b|c)");
 	setlocale(LC_CTYPE, "C");	// reset
 	rc = ureg_search(obj, s);
 	ureg_free(obj);
 	
 //ureg_iconv(): not regex, but you maybe need. conv encode UTF-8,EBCDIC etc
 	const char* s0 = "hwd";		//"hwd"=="\150\167\144" in ascii(C) charset
 	const char* s1 = "\210\246\204";	//"hwd" in EBCDIC-BR
 	char* src = (char*)s0;	//ptr copy: ureg_iconv() moves it forward
 	int* p = ureg_iconv("EBCDIC-BR","ASCII",&src,2);	//(to,from,src_adrs,len)
 	int binsz = p[0];	//==2, conv result bytesz
 	char* pp = (char*)(p+1);
 	  // pp[0]=='\210', pp[1]=='\246', pp[2]=='\0'
 	  // srcptr src is changed to "hwd"+2, "(hw)d" pos, src[0]=='d'
      // | binsz |  c  |  c  | 0 |
      //    int    char  char  \0		case: binsz=2
	free(p);	// ureg_iconv() rtns sz+bin malloc()ed ptr.
	return 0;
 }
 // ~$ gcc src.c libureg.a

@_param -
@_return	-
@description
 --- regex expression
 default ureg() uses posix-ERE with the following extensions.
    - [] and .(dot) includes '\n' and '\0'
    - [z-a], reverse range causes err
    - [] allows multibyte(mb) character range in mb locale, [\u1000-\u2000] etc
    - allow binary expression except [] and .(dot)	eg) "\377z[0-9]" etc
    - [] and .(dot) ignores bad byte.
      eg)  [Σ\377]==[\u03a3]	: 1char, multibyte locale
      eg)  [Σ\377]==[\316\243\377]	:3 char, C locale

 --- easy mode
  - func
	ureg_t ureg_easy(const char* r,const char* s [,int rlen,int slen,int syn])

	ureg_easy() uses for oneshot search. use '-1' if rlen/slen/syn is omitted.
	search result is set to rtn.	if nohit, set res.p==NULL, res.sz<0.
		res = ureg_easy("a[bc]", "123abc");	//match, res.p="..."+3, res.sz=2
		res = ureg_easy("a[bc]", "123abc", -1, -1 ,-1);	//same result

  - params:
	r: regex str.
	s: search target string/binary.
	rlen: r size. use strlen(r) if set -1/noset.
	slen: s size.
	syn: change regex syntax. see below 'complex mode'.

  - result:
    res.p : regex matched ptr. res.p[0]=='a' in above sample. "123(a)bc".
    res.sz: match string byte size.
    other : see below 'complex mode'.

  - minimum code:
	#include "ureg.h"
	int main(int argc, char** argv){
		ureg_t res = ureg_easy("a[bc]", "123abc");
		printf("%.*s \n", res.sz, res.p);	//>> disp "ab"
		return 0;	//no memleak
	}
	// ~$ gcc src.c libureg.a

  - restriction
    - locale raw mode is unsupported. always run under the sys locale (UTF-8 etc)
    - malloc()/free() is executed every time


 --- complex mode: 1.compile regex, 2.search, 3.free ptn
 1. compile regex
  - func
	ureg_t* obj = ureg_new(const char* reg [,int reglen, int syn] )
	ureg_t* obj = ureg_new_raw(const char* reg [,int reglen, int syn] )

   C_lang uses "C" or "POSIX" locale in default setting
   
		eg) int main(){ puts( setlocale(LC_CTYPE, NULL) ); }	//>> "C"
   
   and regex semantics differ between locale "C" and "UTF-8".
	C:		"[Σ]" == "[\316\243]" == "(\316|\243)"	// binary \316 or \243
	UTF-8:  "[Σ]" == "\316\243"	// regard "\316\243" sequences as onechar.

   ureg_new() uses OS locale, setlocale(LC_CTYPE, "") (see ~$ echo $LANG) and
   use its locale when search.
   ureg_new_raw() uses current locale and never changes locale setting, so
   adjusting locale is user duty. if new/search locale isnt the same, you
   may catch strange results. 
   obj holds setlocale(LC_CTYPE,NULL) data when new() to obj->lc as string.

 	#include "ureg.h"		//~$ cc src.c  libureg.a
	int main() {
		ureg_t* obj = ureg_new("[Σ]");	//save env locale, setlocale(LC_CTYPE,"")
		puts(obj->lc);	// "XX.UTF-8" etc
		ureg_search(obj, "xΣyz");	// match len: obj->sz=2
		ureg_free(obj);

		obj = ureg_new_raw("[Σ]");	//== "(\316|\243)" at locale "C"
		puts(obj->lc);	// "C" etc
		ureg_search(obj, "xΣyz");	// obj->sz=1
		ureg_free(obj);

			setlocale(LC_CTYPE, "C.UTF-8");
		obj = ureg_new_raw("[Σ]");	//== "\316\243" at UTF-8
			setlocale(LC_CTYPE, "C");
		ureg_search(obj, "xΣyz");	// obj->sz=1, use current locale
			puts(obj->lc);	//>> "C.UTF-8"
		ureg_free(obj);

		return 0;
	}
   
   i recommend you to use ureg_new() except rare/special usage.
   len/syn uses strlen(reg)/(dfl) if set -1/noset. the follows work as the same.
	  obj = ureg_new(rstr);
	  obj = ureg_new(rstr, strlen(rstr) );
	  obj = ureg_new(rstr, -1, -1);

  - param: reg
   regex expression str. mb class/range works only under the mb locale env.
     - [Σ-Τ] == "(\u03a3|\u03a4)" 	:"XXX.UTF-8" == setlocale(LC_CTYPE,"")
     - [Σ-Τ] == [\316\243-\316\244]== [\243-\316]		:"ASCII" 
   if you use ureg_new() and system supports mb, you can use mb characters.

  - param: reglen
   if regstr has '\0', you need to set len. use strlen(reg) if set -1/noset
		const char* r = "[\t\0]";
		obj = ureg_new(r, 4);		//reg matches '\t' or '\0'

  - param: syn
   if you want to change regex syntax from default, set this syntax flag.
   you can use gnu-regex flag. posix-regcomp() flag is unsupported.
   ureg_dflsyn() rtns dfl flag.
		int syn = ureg_dflsyn();
		syn |= RE_ICASE;	 // ignorecase mode, "abc" == "[aA][bB][cC]"
		obj = ureg_new("abc", -1, syn);
		rc = ureg_search(obj, "00AbC11");	//hit "AbC"

   dfl syntax is:
 dflsyn = 0
 //	| RE_BACKSLASH_ESCAPE_IN_LISTS //"\" works as esc in [] on:yes off:no //posix:off, special chars is only ^-[] in class
 // | RE_ICASE			// ignore case, aa==Aa	 on:a==A off:a!=A
	| RE_INTERVALS		// use {} op	on:use off:normal chars
 //	| RE_LIMITED_OPS	// +*?| are normal chars	on:yes off:special chars
	| RE_CHAR_CLASSES	// use [:alnum:] etc	on:yes off:no

 	| RE_CONTEXT_INDEP_ANCHORS	//$^ are special except []/esc (a^c etc) yes/no
	| RE_CONTEXT_INDEP_OPS	// *+? raise err in bad pos, "+a" etc	on/off
	| RE_CONTEXT_INVALID_DUP	// "{2}a" raise err	on:yes off:[{][2][}]a
	| RE_CONTEXT_INVALID_OPS	// +*? raise err, "?a" etc	on:yes off:[+]a
 	| RE_UNMATCHED_RIGHT_PAREN_ORD //")a" DOESNT raise err	on:[)]a off:err
 //	| RE_INVALID_INTERVAL_ORD	// "}abc" DOESNT raise err	on:[}] off:err

	| RE_DOT_NEWLINE	// .(dot) includes '\n'	on:include off:exclude
 //	| RE_DOT_NOT_NULL	// .(dot) excludes '\0'	on:exclude off:include
 //	| RE_HAT_LISTS_NOT_NEWLINE	//[^a] excludes '\n' 	on:exclude off:include

 //	| RE_NEWLINE_ALT	// a(\n)b works as a|b, on:\n==| off:| only
	| RE_NO_BK_BRACES	// interval {}, on:{} off:\{\} //needs RE_INTERVALS:on
	| RE_NO_BK_PARENS	// grouping (),	on:() off:\(\)
	| RE_NO_BK_VBAR		// | or \|,  on:| off:\|	//needs RE_LIMITED_OPS:off
 //	| RE_NO_BK_REFS		// use \1,\2 etc  on:nouse off:use
 //	| RE_BK_PLUS_QM		// repeat "a+" or "a\+" 	on:\+ off:+

	| RE_DEBUG		// holds dbginfo	on:yes off:no	//emsg etc
	| RE_NO_EMPTY_RANGES	// [z-a] is err range	on:yes off:no, use as empty
	| RE_NO_GNU_OPS		// use out of standard op, \< etc 	on:nouse off:use
 //	| RE_NO_POSIX_BACKTRACKING	//use shortest match	on:yes off:no,longest
 //	| RE_NO_SUB		//drop backref data, (ab)c \1=ab etc.	on:yes off:hold
 ;
 
 // --gnu-predefined other syntax
 //	#define _RE_SYNTAX_POSIX_COMMON
 //	  (RE_CHAR_CLASSES | RE_DOT_NEWLINE      | RE_DOT_NOT_NULL
 //	   | RE_INTERVALS  | RE_NO_EMPTY_RANGES)
 //
 //	#define RE_SYNTAX_POSIX_EXTENDED
 //	  (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS
 //	   | RE_CONTEXT_INDEP_OPS  | RE_NO_BK_BRACES
 //	   | RE_NO_BK_PARENS       | RE_NO_BK_VBAR
 //	   | RE_UNMATCHED_RIGHT_PAREN_ORD)
 //
 //	   Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
 //	   replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added.
 //	#define RE_SYNTAX_POSIX_MINIMAL_EXTENDED
 //	  (_RE_SYNTAX_POSIX_COMMON  | RE_CONTEXT_INDEP_ANCHORS
 //	   | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES
 //	   | RE_NO_BK_PARENS        | RE_NO_BK_REFS
 //	   | RE_NO_BK_VBAR          | RE_UNMATCHED_RIGHT_PAREN_ORD)
 //
 // --posix-ERE
 //	--undefined
 //	/()/
 //	/+aa|*{/
 //	/|g/
 //	[z-a]
 //	[a-c-e]
 // --valid
 //	[]]
 //	[\]]	>>2char '\' or ']'
 //	[-a]	>>2char '-' or 'a'
 //	[ac-]
 //	/a^b/, /a$b/ >> valid, but never matched
 
   ...gnu-regex manual have inconsistent expression(DO/NOT DO/NO/LIMIT etc) and
   low readability. refer to the above and the mit manual.
   ( http://web.mit.edu/gnu/doc/html/regex_2.html#SEC3 )

  - return: ureg_t* obj
   this obj holds regex expression, result and others.

		typedef struct ureg_tag {
			void* data;		// compiled opaque data. holds dfa, term list etc
			char* fastmap;	// uses for prune if needs
			char* lc;		// locale str. "XXX.UTF-8", "C" etc
			int rawflg;		// ureg_new() / ureg_new_raw() == 0/1
			const char* p;	// result buff.
			int sz;			// buff
			const char* pbk[10];	// buff
			int szbk[10];	// buff
		} ureg_t;
	
	obj->data is ptr to 'struct re_pattern_buffer*', gnu-regex struct.
	fastmap is ptr to gnu-regex struct members. see gnu-regex manual.
	almost all users dont need to care about data/fastmap. see gnu-regex manual
	if you need. (-D_GNU_SOURCE + #include <regex.h>)

 2. search
  - func
	int rc = ureg_search (ureg_t* obj, const char* s [,int slen] )
	int rc = ureg_search_head (ureg_t* obj, const char* s [,int slen] )

   ureg_search() finds match data from s. use strlen(s) if slen= -1/noset.
   ureg_search_head() sees only the target string head. if you used ureg_new(),
   search with obj->lc locale setting.
   
   funcs rtns >=0 or <0 == hit/nohit. srcptr+rc will be hittop ptr.
		r = "a[bc]";
		obj = ureg_new(r);
		s = "123abc";
		rc = ureg_search(obj, s);		//rc=3, hit: "...(ab)c", s[rc]='a'
		rc = ureg_search_head(obj, s);	//rc<0, nohit
		rc = ureg_search_head(obj, s+3);	//rc=0, hit "(ab)c", s[rc]='a'

   search funcs dont support partial match. you need fulltext.
		r = "a[bc]";	s = "123abc";
		rc = ureg_search(obj, s, 4);	//"123a", reg hits 'a' but fail. rc<0
		// if supports rc= -5(partial/morestr) etc, you may use fgetc()

  - param: obj
   regex data. search result is set to obj member. set NULL / -1 if nohit.
		char* obj->p : full matching ptrpos
		int   obj->sz: byte of matching str
		char* obj->pbk[10]: back reference ptrpos. obj->p == obj->pbk[0]
		int  obj->szbk[10]: byte of back reference data. obj->sz==obj->szbk[0]

		sample:
		r = "a[bc]";	s = "123abc";
		rc = ureg_search(obj, s);
		  //obj->p=s+3, obj->sz=2,  pbk[0]=p,szbk[0]=sz, pbk[1]=NULL,szbk[1]=-1
 		r = "(a)(b|c)";	s = "123abc";
		rc = ureg_search(obj, s);	//obj->p, pbk is as below
		
		//	str: "123abc",	reg: "(a)(b|c)"
		//	p  :  ...oo.., sz=2, rc=3, p==s+rc
		//	p0 :   (p0==p) 
		//	p1 :  ...o..., sz=1
		//	p2 :  ....o.., sz=1
		//	p3 :  NULL,    sz= -1
		
   back reference result \1-\9 is set to pbk/szbk[1-9]. [0] has fullmatch data.
   index 1-9 applies to open paren '(' order.
	reg:	( a ( b | c ) )
	bk :	\1  \2

  - param: s
	search target string. treated as binary.
  - param: slen
    target byte size. use strlen(s) if set -1/noset.
  - return: rc
	return hit adrsidx if suc search (p==src+rc). suc/fail == rc>=0 / rc<0

 3. free
  - func
	void ureg_free(ureg_t* obj)
   obj is malloc()ed. free memory when you finish regex search.


 --- other func: ureg_iconv()
  - func
	void* p=ureg_iconv(const char* tenc,const char* fenc,char** s [,int slen])

   this func isnt directly related to regex, but is useful if you have
   different encoding data.

  - params:
	tenc: toenc type string. "UTF-8", "EBCDIC-BR" etc. see ~$ iconv -l
	fenc: fromenc. src encode type.
	s: srcptr-reference. this arg is changed by func, so pass the srcptr copy.
	slen: src byte size. use strlen(*s) if set -1/noset.

  - result:
    p: converted binary stream with bytesize. byte sequence is as follows.
			| binsz |  c  |  c  | 0 |
			   int    char  char  \0		case: binsz=2

	   use type punning (int), (char*) to get sz and binptr.
			int* p = ureg_iconv(...);	//p is malloc()ed as strdup()
			int binsz = p[0];	//==2, conv result bytesz
			char* bin = (char*)(p+1);
			printf("%.*s \n", binsz, bin);
			free(p);

	s: posbuff. if allsrc is converted to other enc, s is set as s == src+slen.
	  if detect invalid byte sequences, s is set to its address.
		all suc: "oooooooo"
		          ........s(maybe \0 or other)
		bad seq: "ooooox.."
		          .....s.. (s[0]=='\377' etc. between start and (s-1) is valid)


 --- benchmark:	loop(10*1000){ ureg_easy("4.6", "1234567890"); } etc
   - ureg
	real 465.921 ms	: ./ureg.ts.c 192: t_bm0_sub(): msg:easy:10*1000
	real 395.018 ms	: ./ureg.ts.c 202: t_bm1_sub(): msg:new-free: 10*1000
	real 13.612 ms	: ./ureg.ts.c 209: t_bm1_sub(): msg:search: 10*1000
	real 364.995 ms	: ./ureg.ts.c 222: t_bm2_sub(): msg:new-free@raw: 10*1000
	real 4.457 ms	: ./ureg.ts.c 229: t_bm2_sub(): msg:search@raw: 10*1000
   - oniguruma
	real 17.268 ms	: ./ureg.ts.c 264: t_bm4_sub(): msg:new-free@onig: 10*1000
	real 4.473 ms	: ./ureg.ts.c 276: t_bm4_sub(): msg:search@onig: 10*1000
   - strstr()
	real 0.050 ms	: ./ureg.ts.c 240: t_bm3_sub(): msg:strstr(): 10*1000

		FAST: strstr(1) >>> onig(100) >>> ureg(3000) :SLOW

 --- concept
 	- avoid complex api
 	- avoid non-standard regexp/operator like PCRE
 	- support binary input

@notes
 http://web.mit.edu/gnu/doc/html/regex_7.html
 http://www.kt.rim.or.jp/~kbk/regex/re_7.html
 https://regular-expressions.mobi/refunicode.html?wlr=1
 https://stackoverflow.com/questions/2359811/working-with-gnu-regex-functions-in-c-or-c
 https://stackoverflow.com/questions/8727795/does-posix-regex-h-provide-unicode-or-basically-non-ascii-characters
 https://stackoverflow.com/questions/48743106/whats-ansi-x3-4-1968-encoding
 https://www.iana.org/assignments/character-sets/character-sets.xhtml
@conforming_to posix-2001+
@version 2021-06-03 v3.0.1
@copyright Copyright 2021 momi-g, GPLv3+
-*/

#ifndef ureg_dfb770dd3ce8
#define ureg_dfb770dd3ce8

// Compile-time guard: refuse to build unless _POSIX_C_SOURCE >= 200112L.
//  - "+0" keeps the #if expression valid when the macro is defined but empty;
//    an undefined macro evaluates as 0 inside #if, so it also fails the test.
//  - there is no file with that name: the #include is a deliberate build
//    failure whose "file name" is the message shown to the user.
// NOTE(review): <stdio.h> is presumably included first so the libc can set its
// feature-test macros before the check - confirm.
#include <stdio.h>
#if ( _POSIX_C_SOURCE +0 < 200112L )
	# include "needs compiler posix-2001 or upper(c99+)"
#endif

#include <locale.h>
#include <langinfo.h>
#include <iconv.h>
// Regex handle: the compiled pattern plus the result of the latest search.
// Result members are set to NULL / -1 when a search does not hit.
typedef struct ureg_tag {
	void *data;            // opaque compiled regex object
	char *fastmap;         // fastmap[0]-[255]: 1st byte can hit or not
	char *lc;              // locale string ("XXX.UTF-8", "C" etc)
	int rawflg;            // ureg_new() / ureg_new_raw() == 0 / 1

	const char *p;         // result: top of the whole match
	int sz;                // result: byte size of the whole match
	const char *pbk[10];   // result: back reference ptrs, pbk[0] == p
	int szbk[10];          // result: back reference sizes, szbk[0] == sz
} ureg_t;

// Implementation entry points. User code normally reaches them through the
// ureg_new()/ureg_search()/... macros, which append -1 for every omitted
// optional argument; the trailing "..." absorbs the surplus -1 values.

// Compile regex rstr. rawflg is 0 for ureg_new(), 1 for ureg_new_raw().
// rstrsz/syn: -1 means strlen(rstr) / default syntax. Returns NULL on error.
ureg_t *ureg_new_impl(int rawflg, const char *rstr, int rstrsz, int syn, ...);

// Release an obj returned by ureg_new()/ureg_new_raw().
void ureg_free(ureg_t *obj);

// Search tgtstr. topflg is 0 for ureg_search(), 1 for ureg_search_head().
// slen: -1 means strlen(tgtstr). Returns the hit offset (>=0) or <0 on
// nohit/err; the match details are stored in obj.
int ureg_search_impl(int topflg, ureg_t *obj, const char *tgtstr, int slen,
                     ...);

// Returns the default syntax flag set.
int ureg_dflsyn(void);

// One-shot search of s with regex reg; the result is returned by value.
// rlen/slen/syn: -1 means omitted.
ureg_t ureg_easy_impl(const char *reg, const char *s, int rlen, int slen,
                      int syn, ...);

// Convert *str from fromenc to toenc. strsz: -1 means strlen(*str).
// *str is changed by the call; the returned buffer is malloc()ed and laid
// out as | int binsz | converted bytes | \0 |.
void *ureg_iconv_impl(const char *toenc, const char *fromenc, char **str,
                      int strsz, ...);

// Optional-argument front ends for the *_impl functions.
// Each wrapper appends one -1 per optional parameter, so an argument the
// caller omits arrives as -1; surplus -1 values land in the callee's "...".
// The leading 0/1 selects the variant: ureg_new / ureg_new_raw and
// ureg_search / ureg_search_head.
#define ureg_new(...)                   ureg_new_impl(0, __VA_ARGS__, -1, -1)
#define ureg_new_raw(...)               ureg_new_impl(1, __VA_ARGS__, -1, -1)
#define ureg_search(obj, ...)           ureg_search_impl(0, obj, __VA_ARGS__, -1)
#define ureg_search_head(obj, ...)      ureg_search_impl(1, obj, __VA_ARGS__, -1)
#define ureg_easy(reg, ...)             ureg_easy_impl(reg, __VA_ARGS__, -1, -1, -1)
#define ureg_iconv(toenc, fromenc, ...) ureg_iconv_impl(toenc, fromenc, __VA_ARGS__, -1)

#endif	//ureg_dfb770dd3ce8
