#include <emsg: see ... '~$ sh aaa.sh.c -h'   (other opt:no/-m/-w/)>	/*
C='^[/][/*]SH_'     ;O=${0##*[/]};R=`dirname $0`;R=${R%/}/;R0=$R$O;R=$R${O%%.*}
O=${0##*.};Rs=$R.$O;Rm=$R.tmp.$O;Rh=$R.h;R=$Rs$Rh$Rm;Rp='printf %s\n ';Rc=:;O="
";[ "${R##*$R0*}" = '' ]&&$Rp"$0:NGsuffix"&&exit 1;R='sed -ne ';Cm=$R'"/[E]ND/!d
:l;n;p;bl"<$R0>$Rm;$Rp"$Rm"';RB=$($R"s/${C}OP//p"<$R0|(F=mw;while read -r a b;do
B=${a%:};F=`$Rp"$F"|$R"s#$B:*##1;p"`${a%_};$Rp"C$B=\$(cat<<'E'$O$b${O}E$O)";done
$Rp"R1=$F"));Rw=$R'"/$C$R/!d;:l;n;/${C}ED/q;p;bl"<$R0';Cw="(R=LS;$Rw;$Rw>&3;R=HD
$Rw;R=SC;$Rw>&3)"'>$Rh 3>$Rs;$Rp"$Rh $Rs"';Re=eval\ ;$Re"$RB";while getopts $R1\
 R;do case $R in \?)exit 1;;*)$Re"O$R=\$OPTARG";Rc=$Rc$O`$Re'$Rp"$C'$R\"`;;esac
done;[ "$Rc" = : ]&&Rc=$Cm;shift $((OPTIND-1));$Re"$C_$O$Rc";exit   #END GPL3+*/

/*SH_LS*/
/*
 Copyright (C) 1991-2012 the Free Software Foundation, Inc.
 Copyright (C) 2021 Momi-g

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

/*SH_doc
title=ureg section=3 repnl=\040
@name	ureg
@_brief	portable BRE/ERE regex api
@_syno
 #include "ureg.h"
 
 //main func 
 ureg_t* ureg_new(const char* reg [,int reglen, int syn] );
 ureg_t* ureg_new_raw(const char* reg [,int reglen, int syn] );
 int ureg_search(ureg_t* robj, const char* s [,int slen] );
 int ureg_search_head(ureg_t* robj, const char* s [,int slen] );
 void ureg_free(ureg_t* robj);

 //main object info 
 typedef struct ureg_tag {
	void* data;		//opaque regobj
	char* fastmap;	//fastmap char[255]. holds 1st byte hit or not
	char* lc;		//save regex locale
	int rawflg;		//locale flag
	const char* p;		//result buff
	int sz;
	const char* pbk[10];
	int szbk[10];		//buff
 } ureg_t;
 
 //support func
 ureg_t ureg_easy(const char* reg,const char* s
		[,int reglen,int slen,int syn]);
 int ureg_ere_syn(void);
 int ureg_bre_syn(void);
 void* ureg_iconv(const char* fromenc,
		const char* toenc, char** s [,int slen]);

@tl;dr
	@(code)@
	#include "ureg.h"
	int main() {
		  ureg_t res = ureg_easy("a[bc]", "123abc");	//dfl:ERE
		  printf("%.*s \n", res.sz, res.p);	// == "ab"
		  return 0;
	}
	//~$ cc src.c libureg.a
	@()@

@_eg
	@(code)
 #include "ureg.h"
	
 int main() {
	//simple
	ureg_t res = ureg_easy("a[bc]", "123abc");
		//res.p="123abc"+3, res.sz=2
	res = ureg_easy("a(b|c)", "123abc");
		//res.pbk[1]="123abc"+4,res.szbk[1]=1
	printf("%.*s", res.szbk[0], res.pbk[0]);	//>> "ab"
	
	//complex
	ureg_t* obj = ureg_new("a(b|c)");	//rtn NULL if err
	const char* s = "123abc";
	int rc = ureg_search(obj, s);	//rc== hit:>=0 nohit/err:<0
	  //rc=3, obj->p=s+3(rc), obj->sz=2 
	  //obj->pbk[0]= s+3, obj->szbk[0]=2 //p=pbk[0]: all the match str
	  //obj->pbk[1]= s+4, obj->szbk[1]=1 //bkref: (b|c) hits 123a(b)c
	  //obj->pbk[2]= NULL,obj->szbk[2]=-1 //set NULL/-1 if fail
	
	rc = ureg_search_head(obj, s);	//regex hits only string top
		// rc= -1, obj->p= NULL, obj->sz= -1	//nohit
	rc = ureg_search_head(obj, s+3);  // rc=0, obj->p= s+3, obj->sz= 2
	ureg_free(obj);
	
	setlocale(LC_CTYPE, "UTF32");	// ~$ iconv -l
	obj = ureg_new_raw("a(b|c)");
	setlocale(LC_CTYPE, "C");	// reset
	rc = ureg_search(obj, s);
	ureg_free(obj);
	//--regex end
	
	//ureg_iconv(): not regex. conv encode UTF-8, EBCDIC etc
	const char* s0 = "hwd";  //"hwd"=="\150\167\144" in C charset
	const char* s1 = "\210\246\204";	//"hwd" in EBCDIC-BR
	char* s = s0;
	int* p = ureg_iconv("C","EBCDIC-BR",&s,2); //(from,to,s0_adrs,len)
	int binsz = p[0];	//==2, conv result(EBCDIC-BR) bytesz
	char* pp = (char*)(p+1);
		// pp[0]=='\210', pp[1]=='\246', pp[2]=='\0'
		// srcptr &s is changed to "hwd"+2, "(hw)d" pos, s[0]=='d'
		// | binsz |  c  |  c  | 0 |
		//    int    char  char  \0
	free(p);	// ureg_iconv() rtns sz+bin malloc()ed ptr.
	return 0;
  }
  //~$ cc src.c libureg.a
	@()@

@_desc
	ureg uses posix-ERE with the following extensions by default.
	some items are compromised for reasons --
	@(pre)@
	- [] and .(dot) includes \n and \0. posix doesnt allow byte \0.
	- [] and .(dot) ignores out of character byte. (should be err. gave up)
	    eg) [\100\377] >> [\100] (ascii locale)
	    eg) [Σ\377]==[\u03a3] @: 1char, multibyte locale
	    eg) [Σ\377]==[\316\243\377] @: 3 char, C locale
	- allow bytedata except [] and .(dot)   eg) "\377z[0-9]"  >> valid
	- normal char esc \@ treats as @ (should be err. gave up)
    @()@
    --

	@(raw).SS easy mode@()--
	`	`ureg_t res = ureg_easy(r, s [, rlen, slen, syn]); --
	--
	ureg_easy() uses for oneshot search. use '-1' if rlen/slen/syn isnt set.
	search result is set to rtn. if nohit, set res.p==NULL, res.sz<0. --
	`	`res = ureg_easy("a[bc]", "123abc");  //hit, res.p="..."+3, res.sz=2 --
	`	`res = ureg_easy("a[bc]", "123abc", -1,-1,-1);  //same --
	--
	`_param`
	@(pre)
	r     : regex str
	s     : search target string/binary
	[rlen]: r size. use strlen(r) if set -1/noset
	[slen]: s size
	[syn] : change regex syntax. see below 'complex mode'. (dfl:ERE)
	@()--

	`_return`
	@(pre)
    res.p : matched ptr. res.p[0]=='a' in above sample. "123(a)bc"
    res.sz: match string byte size
    others: see below (complex mode)
    @()--
	ureg_easy() has the following restrictions
	@(pre)
	- locale rawmode isnt support. always use the sys locale(UTF-8 etc)
	- execute malloc()/free() every time
	@()--
	
	sample:
		@(pre)
	#include "ureg.h"
	int main(){
		ureg_t res = ureg_easy("a[bc]", "123abc");
		printf("%.*s \n", res.sz, res.p);	//>> disp "ab"
		return 0;	//no memleak
	}
	// ~$ cc src.c libureg.a
		@()--
	@(raw).SS complex mode@()--
	divided to 3 parts, compile >> search >> free	--
	--
	1. compile regex
	@(pre)
	ureg_t* robj = ureg_new(r [,rlen, syn] );
	ureg_t* robj = ureg_new_raw(r [,rlen, syn] );
	int syn = ureg_ere_syn();
	int syn = ureg_bre_syn();
	@()--
	C-lang uses "C" locale in default and all 8bit char is valid. --
	(www.gnu.org/software/grep/manual/html_node/Character-Encoding.html) --
	`   `eg) int main(){ puts(setlocale(LC_CTYPE, NULL)); } //>> "C" --
	--
	and regex semantics differ between locale "C" and "UTF-8". --
	`   `C : "[Σ]" == "[\316\243]" == "(\316|\243)" //binary \316 or \243 --
	`   `u8: "[Σ]" == "\316\243" //binary sequence is 1 character --

	@(list)
	ureg_new(): use OS locale, setlocale(LC_CTYPE, "") (see ~$ echo $LANG) and
	use its locale when search.
	ureg_new_raw(): uses current locale (maybe "C") and never changes locale,
	so adjusting locale is user duty. if new/search locale isnt the	same,
	you may get the strange results. robj->lc saves setlocale(LC_CTYPE,NULL) str.
	@()--
	sample:
	@(code)
	#include "ureg.h"
	int main() {
		ureg_t* obj = ureg_new("[Σ]");
		puts(obj->lc);	// "XX.UTF-8" etc
		ureg_search(obj, "xΣyz");	// match len: obj->sz=2
		ureg_free(obj);

		obj = ureg_new_raw("[Σ]");  //== "(\316|\243)" at locale "C"
		puts(obj->lc);	// "C" etc
		ureg_search(obj, "xΣyz");  // obj->sz=1
		ureg_free(obj);

			setlocale(LC_CTYPE, "C.UTF-8");
		obj = ureg_new_raw("[Σ]");	//== "\316\243" at UTF-8
			setlocale(LC_CTYPE, "C");
		ureg_search(obj, "xΣyz"); //obj->sz=1, use current locale
			puts(obj->lc);	//>> "C.UTF-8"
		ureg_free(obj);
		return 0;
	}
	//~$ cc src.c  libureg.a
	@()--
	malloc()/free() isnt executed every time. you can avoid the overhead.
	but locale setting is very confusing so I recommend you to use
	ureg_new() except rare/special usage.	--
	--
	`_param`
	@(list)
	r   : regex str. mb class/range works only under the mb locale env.--
		[Σ-Τ] == "(\u03a3|\u03a4)"  //"XXX.UTF-8"	--
		[Σ-Τ] == [\316\243-\316\244]== [\243-\316]  //"C"
	[rlen]: r size. use strlen(r) if set -1/noset
	[syn] : use ERE if -1/noset. ureg_ere_syn()/ureg_bre_syn() returns
		ERE/BRE flag.	--
		`   `int syn = ureg_bre_syn(); // or ureg_ere_syn(); --
		`   `robj = ureg_new("a\\(b\\)c", -1, syn); --
		`   `rc = ureg_search(robj, "00abc");	//hit rc=2	--
	@()--
	
	`_return`--
	you may use only 'result buffer' in usual.
	@(list)
	robj->data : ptr to compiled regex data
	robj->fastmap : char arr[255]. if r="[ab]c", arr['a'], ['b'] is 1. others is 0.
	robj->lc : copy of setlocale(LC_CTYPE, NULL) str when regex compiled.
	robj->rawflg : 0/1 == ureg_new()/ureg_new_raw()
	other members : result buffer.
    @()--

	2. search
	@(pre)
	int rc = ureg_search(robj, s [, slen] );
	int rc = ureg_search_head(robj, s [, slen] );
	@()--
	@(list)
	ureg_search(): finds match data from s. use strlen(s) if slen= -1/noset.
		if you used ureg_new(), use robj->lc locale setting automatically.
	ureg_search_head(): check only BOS like lex scanner
	@()--
	
	`_param`
	@(list)
	robj : search result is set to this member. member value will be set
		NULL/-1 if nohit.
	char* robj->p : full matching ptrpos
	int robj->sz : byte size of matching str.
	char* robj->pbk[10]: back reference ptrpos. robj->p == robj->pbk[0]
	int robj->szbk[10]: byte size of back references. robj->sz==robj->szbk[0]. 
		posix-ERE doesnt allow backref syntax \1-\9, but to hold backref data
		is lawful. posix regcomp(), REG_NOSUB also holds its data.
	s		: search target string ptr
	[slen]	: use strlen(s) if -1/noset
	@()

	`_return`
	@(list)
	int rc: funcs rtns >=0 or <0 == hit/nohit. srcptr+rc will be hittop ptr.
	@()
		
	sample:
	@(code)
	r = "a[bc]";
	obj = ureg_new(r);
	s = "123abc";
	rc = ureg_search(obj, s);	//rc=3, hit: "...(ab)c", s[3]='a'
	rc = ureg_search_head(obj, s); //rc<0, nohit
	rc = ureg_search_head(obj, s+3); //rc=0, hit "(ab)c", (s+3)[rc]='a'
	@()--
	search funcs doesnt support partial match. you need fulltext.
	@(code)
	r = "a(bc)";
	s = "123abc";
	rc = ureg_search(obj, s, 4); //"123a", reg hits 'a' but fail. rc<0
	// if supports rc= -5(partial/morestr) etc, you may use fgetc()
	@()--
	back reference result \1-\9 is set to pbk/szbk[1-9]. [0] has fullmatch data.
	index 1-9 follows the order of open parens "(". the same as posix. --
	`   `reg: ( a ( b | c ) ) --
	`   `bk : \1  \2 --
	--
	back reference sample:
	@(code)
	r = "a[bc]";
	s = "123abc";
	robj = ureg_new(r);
	rc = ureg_search(robj, s);
	  //robj->p=s+3, robj->sz=2
	  //pbk[0]=p,szbk[0]=sz, pbk[1]=NULL, szbk[1]=-1
 	r = "\\(a\\)\\(bc\\)";
 	s = "123abc";
	robj = ureg_new(r, -1, ureg_bre_syn() );
	rc = ureg_search(robj, s);	//robj->p, pbk is as below
		//	str: "123abc",	reg: "(a)(bc)"
		//	p  :  ...ooo.,  robj->sz=3, rc=3, robj->p==s+rc
		//	p0 :  (p0==p) 
		//	p1 :  ...o...,  robj->szbk[1]== 1
		//	p2 :  ....oo.,  robj->szbk[2]== 2
		//	p3 :  NULL,     robj->szbk[3]== -1
	@()--

	3. free
	@(pre)
	void ureg_free(robj);
	@()--
	robj is malloc()ed. free memory when you finish regex search. --

	@(raw).SS other func@()--
	@(pre)
	void* p = ureg_iconv(fenc, tenc, &s [, slen]);
	@()--
	ureg_iconv() doesnt treat regex. convert different encoding data.
	posix iconv() api/manual is very awkward so make a wrapper. --
	--
	`_param`
		@(pre)
	fenc  : from enc type str. "EBCDIC-BR" etc. see ~$ iconv -l
	tenc  : to enc
	&s    : srcptr-ref. this arg is changed by func, so pass the copy.
	[slen]: src byte size. use strlen(*&s) if set -1/noset.
		@()--
	`_return`
	@(list)
	p: suc/fail == notNULL/NULL. converted binary stream with bytesize.
	use type punning (int), (char*) to get sz and binptr.
	byte sequence is as follows. --
	@()--
		@(code)
	| binsz |  c  |  c  | 0 |
	   int    char  char  \0	(if binsz=2)
	...
	char* s = (char*)"hw";
	int* p = ureg_iconv("ASCII", "UTF32", &s); //p is malloc()ed
	int binsz = p[0];	//==12, conv bytesz. BOM(4b) + h(4b) + w(4b)
	char* bin = (char*)(p+1);
	printf("%.*s \n", binsz, bin);
	free(p);
		@()--
	if allsrc is converted to other enc, s is set as s == src+slen.
	if detect invalid byte sequences, s is set to its address.
		@(code)
	all suc: "oooooooo"
	          ........s(maybe \0 or other)
	bad seq: "ooooox.."
	          .....s.. (s[0]=='\377' etc. from BOS to (s-1) is valid)
		@()--

	sample:
		@(code)
	#include <string.h>
	#include <stdlib.h>
	#include "ureg.h"
	
	int main(){
		const char* s = "ab\377c";	// "ab(NG)c", ascii
		char* sp = (char*)s;
		int* p = ureg_iconv("ASCII", "UTF32", &sp);	// ~$ iconv -l
			printf("%p %p\n", s, sp);	// sp-s == 2, sp[0]='\377'
		int sz = p[0];	// ==12, ascii:7bit, UTF32:32bit + BOM 32bit
		char* bin = (char*)&p[1];
			printf("%d\n", sz);	//12, bin[0]..bin[12-1] is UTF32 str
		if(s+strlen(s) != sp){ puts("bad ascii byte"); }
		free(p);
		return 0;
	}
	//~$ cc src.c libureg.a
		@()--
@notes
@(code)
 ureg_erebin_syn_ = 0
 //	| RE_BACKSLASH_ESCAPE_IN_LISTS //"\" works as esc in [] //posix:off
 //	| RE_ICASE			// ignore case, aa==Aa	 on:a==A off:a!=A
	| RE_INTERVALS		// use {} op	on:use off:normal chars
 //	| RE_LIMITED_OPS	// +*?| are normal chars	on:yes off:special
	| RE_CHAR_CLASSES	// use [:alnum:] etc	on:yes off:no

 	| RE_CONTEXT_INDEP_ANCHORS	//$^ works except []/esc (a^c etc)
	| RE_CONTEXT_INDEP_OPS	// *+? raise err in badpos, "+a" etc
	| RE_CONTEXT_INVALID_DUP  // "{2}a" raise err   off:[{][2][}]a
	| RE_CONTEXT_INVALID_OPS  // +*? raise err, "?a"   on:yes off:[+]a
 	| RE_UNMATCHED_RIGHT_PAREN_ORD //")a" ISNT err	on:[)]a off:err
 //	| RE_INVALID_INTERVAL_ORD	// "}abc" ISNT err	on:[}] off:err

	| RE_DOT_NEWLINE	// .(dot) includes '\n'	on:inc off:exc
 //	| RE_DOT_NOT_NULL	// .(dot) excludes '\0'	on:exc off:inc
 //	| RE_HAT_LISTS_NOT_NEWLINE	//[^a] excludes '\n'  on:exc off:inc

 //	| RE_NEWLINE_ALT	// a(\n)b works as a|b, on:\n==| off:| only
	| RE_NO_BK_BRACES	// on:{} off:\{\} //needs RE_INTERVALS
	| RE_NO_BK_PARENS	// on:() off:\(\)
	| RE_NO_BK_VBAR	// on:| off:\|	//needs RE_LIMITED_OPS=off
 //	| RE_NO_BK_REFS	// use \1,\2 etc  on:nouse off:use
 //	| RE_BK_PLUS_QM	// repeat "a+" or "a\+" 	on:\+ off:+

	| RE_DEBUG		// holds dbginfo	on:yes off:no	//emsg etc
	| RE_NO_EMPTY_RANGES	// [z-a] is err  on:yes off:use as empty
	| RE_NO_GNU_OPS	//out of standard op, \< etc  on:nouse off:use
 //	| RE_NO_POSIX_BACKTRACKING	//shortest match  on:yes off:longest
 //	| RE_NO_SUB  //drop backref data, (ab)c \1=ab etc. on:yes off:hold
 ;
// https://www.gnu.org/software/gnulib/manual/html_node/Syntax-Bits.html
 
 // --posix-ERE
 //	--undefined
 //	/()/
 //	/+aa|*{/
 //	/\@/
 //	/|g/
 //	[z-a]
 //	[a-c-e]
 // --valid
 //	)ab		>>unmatched paren is treated as an ordinary char
 //	[]]
 //	[\]]	>>2char '\' or ']'
 //	[-a]	>>2char '-' or 'a'
 //	[ac-]
 //	/a^b/, /a$b/ >> valid, but never matched

 ureg_brebin_syn_= 0
	| RE_CHAR_CLASSES
	| RE_DOT_NEWLINE
 //	| RE_DOT_NOT_NULL	//>>for support binary
	| RE_INTERVALS
	| RE_NO_EMPTY_RANGES
	;
@()
	..gnu-regex manual have inconsistent expression(DO/NOT DO/NO/LIMIT) and
	low readability. refer to the above and the mit manual. --
	( http://web.mit.edu/gnu/doc/html/regex_2.html#SEC3 )--
	--
	important difference between BRE and ERE syntax is 
	`backref`, `alternate op |`, `anchor work` and `repeat op`:
	@(code)@
	         BRE	  ERE		eg
	backref  \1-\9   nothing  B: \(a\)\1 >>aa,  E: (a)\1 >> undefined
	alter   nothing    |      B:      -         E: (aa|bb)
	anchor  BOS/EOS anywhere  B: \(^a^b\)>>a^b  E: (^a^b) >> never match
	repeat  context anywhere  B: *ab >> *ab     E: *ab >> undefined
	@()--
	I recommend you to use ERE in general. you should use BRE only when
	you need back-references or a command that does not support ERE (sed, grep etc).
	additionally, quote/esc the special characters to avoid 'context depends'
	regex.  eg) ERE: )abc >> [)]abc --
	--
	(https://stackoverflow.com/questions/40455975/why-regular-expressions-with-backreferences-are-not-regular-expressions) --
	(https://swtch.com/~rsc/regexp/regexp1.html) --
	--
	--
	BRE doesnt have '|' op and  equivalent expression seems impossible --
	`   `ERE: (ab|cd)* >> ( (ab){0,1} (cd){0,1} ){0,} --
	`   `ERE: (ab|cd)+ >>  ??? --
	--
	ERE doesnt have back-reference \1-\9. close expression is possible but
	not perfect. catch  _ab_, __ab__, ___ab___ ... is: --
	`   `BRE: \(_*\)[^_]\1	--
	`   `ERE: (_[^_]*_) | (__[^_]*__) | (___[^_]*___) ...	--
	--
	--
	@(code)@
	--- benchmark:	loop(10*1000){ ureg_easy("4.6", "1234567890"); } etc
	compile FAST:    --     >>> onig(1)   >>> ureg(50)  :SLOW
	search  FAST: strstr(1) >>> onig(100) >>> ureg(300) :SLOW

	- ureg
	real 948.292 ms: ./ureg.tmp.c 881: t_bm0_sub(): msg:easy:10*1000
	real 813.072 ms: ./ureg.tmp.c 891: t_bm1_sub(): msg:new-free:10*1000
	real 24.156 ms : ./ureg.tmp.c 898: t_bm1_sub(): msg:search:10*1000
	real 787.625 ms: ./ureg.tmp.c 911: t_bm2_sub(): msg:new-free@raw:10*1000
	real 10.868 ms : ./ureg.tmp.c 918: t_bm2_sub(): msg:search@raw:10*1000
	
	- oniguruma
	real 17.268 ms : ./ureg.ts.c 264: t_bm4_sub(): msg:new-free@onig:10*1000
	real 4.473 ms  : ./ureg.ts.c 276: t_bm4_sub(): msg:search@onig:10*1000

	- strstr()
	real 0.050 ms  : ./ureg.ts.c 240: t_bm3_sub(): msg:strstr(): 10*1000


	--- concept
	- avoid complex api
	- avoid non-standard regexp/operator like PCRE
	- support binary input
	@()@
@conforming_to posix-2001+
@_ver 2021-11-25 v3.0.5
@copyright Copyright 2021 momi-g, GPLv3+
@_see
@(code)@
	https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
	https://swtch.com/~rsc/regexp/regexp1.html
	http://web.mit.edu/gnu/doc/html/regex_7.html
	http://www.kt.rim.or.jp/~kbk/regex/re_7.html
	https://regular-expressions.mobi/refunicode.html?wlr=1
	https://stackoverflow.com/questions/2359811/working-with-gnu-regex-functions-in-c-or-c
	https://stackoverflow.com/questions/8727795/does-posix-regex-h-provide-unicode-or-basically-non-ascii-characters
	https://stackoverflow.com/questions/48743106/whats-ansi-x3-4-1968-encoding
	https://www.iana.org/assignments/character-sets/character-sets.xhtml
@()@
//SH_docE*/

/*SH_ED*/

/*SH_HD*/
#ifndef ureg_dfb770dd3ce8
#define ureg_dfb770dd3ce8

#include <stdio.h>
#if ( _POSIX_C_SOURCE +0 < 200112L )
	# include "needs compiler posix-2001 or upper(c99+)"
#endif

#include <locale.h>
#include <langinfo.h>
#include <iconv.h>
typedef struct ureg_tag {
	void* data;
	char* fastmap;
	char* lc;
	int rawflg;
	const char* p;
	int sz;
	const char* pbk[10];
	int szbk[10];
} ureg_t;

/* Compile regex `rstr`. Reached through ureg_new()/ureg_new_raw(), which pass rawflg 0/1. */
ureg_t* ureg_new_impl(int rawflg, const char* rstr, int rstrsz, int syn, ...);
/* Release an object obtained from ureg_new()/ureg_new_raw(). */
void ureg_free(ureg_t* obj);
/* Search `tgtstr`. Reached through ureg_search()/ureg_search_head(), which pass topflg 0/1. */
int ureg_search_impl(int topflg, ureg_t* obj, const char* tgtstr, int slen, ...);

/* Getters for the ERE / BRE syntax flag that the compile functions accept as `syn`. */
int ureg_ere_syn(void);
int ureg_bre_syn(void);

/* One-shot compile + search returning the result by value. Reached through ureg_easy(). */
ureg_t ureg_easy_impl(const char* reg, const char* s, int rlen, int slen, int syn, ...);
/* Encoding conversion wrapper around iconv(). Reached through ureg_iconv(). */
void* ureg_iconv_impl(const char* fromenc, const char* toenc, char** str, int strsz, ...);

/*
 Optional-argument front ends. Each macro appends -1 placeholders after the
 caller's arguments, so every trailing parameter the caller leaves out reaches
 the *_impl function as -1. Placeholders beyond the named parameters are
 absorbed by the `...` of the *_impl prototype.
 The leading 0/1 selects the variant: normal/raw compile, anywhere/head search.
*/
#define ureg_new(...)		  ureg_new_impl(0,__VA_ARGS__, -1,-1 )
#define ureg_new_raw(...)	  ureg_new_impl(1,__VA_ARGS__, -1,-1 )
#define ureg_search(obj, ...)	  ureg_search_impl(0,obj, __VA_ARGS__, -1 )
#define ureg_search_head(obj,...)	ureg_search_impl(1, obj,__VA_ARGS__, -1 )
#define ureg_easy(reg, ...)		ureg_easy_impl(reg, __VA_ARGS__,-1,-1,-1)
#define ureg_iconv(fromenc, toenc, ...)		ureg_iconv_impl(fromenc, toenc, __VA_ARGS__, -1)

#endif	//ureg_dfb770dd3ce8
/*SH_ED*/

/*SH_SC*/
// --cmtout ureg_ere/bre_syn() holds flag data. if user needs
// --customized syntax, use -D_GNU_SOURCE and #include <regex.h>
//	#ifndef _GNU_SOURCE
//		#define _GNU_SOURCE
//	#endif
//	#include <stdio.h>
//	#ifndef _DEFAULT_SOURCE
//		# include "this src needs -D_GNU_SOURCE / glibc"
//	#endif

#include "*SH_bn*.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* tool macros */
/*
 ERRact(xpr, msg, act): when `xpr` is true, print one diagnostic line to stderr
   (file, line, function, pid, the stringified expression, `msg` and
   strerror() of the errno value saved on entry) and then run `act`.
 STOP(xpr, msg): ERRact whose action prints "STOP" and calls exit(1).
 ERRactag supplies the function-name/pid pair: real values on C99 or later,
   fixed placeholders otherwise.
 The whole group is skipped when ERRact is already defined.
 Both macros expand to a bare `if(...){...}` statement.
*/
#ifndef ERRact
#include <stdio.h>
 #if (199901L <= __STDC_VERSION__ +0)	/* nearly 200112L, _POSIX_C_SOURCE	c99*/
	#include <sys/types.h>
	#include <unistd.h>
	#define ERRactag	__func__, getpid()
 #else
	#define ERRactag	"func:c99+", 0
 #endif
 #include <string.h>
 #include <errno.h>
 #define ERRact(xpr, msg, act)	if(xpr){ int en_=errno; fprintf(stderr, \
	"ERR: %s %d %s() pid:%d %s msg:%s sys:%s\n",__FILE__,__LINE__, ERRactag \
	, "hit(" #xpr ")", msg, strerror(en_) ); act; }
 #define STOP(xpr, msg)	ERRact(xpr, msg, fputs("STOP\n",stderr);exit(1) )
#endif
/* loop(a): run the following statement `a` times; the counter lpcnt goes 1..a */
#define loop(a)		for(int lpcnt=1;lpcnt<=a;lpcnt++)

#include "msgp.h"

#ifdef TEST
/* test-build only: unit-test / timing helpers and small macros used by the tests below */
#include "hcut.h"
#include "msgp.h"
#include "laptime.h"
#include "*SH_bn*.h"		/*SH_co* lib*SH_bn*.a	*/
#define loop(a)		for(int lpcnt=1;lpcnt<=a;lpcnt++)
/* qu(x): string literal of x after macro expansion (two-step stringify) */
#define qu(...)		Qsub(__VA_ARGS__)
#define Qsub(...)	#__VA_ARGS__

/* silence dbg(): every dbg(...) call below expands to nothing */
#undef dbg
#define dbg(...)

#endif

#ifdef TEST
/*
 Smoke test: compile "\\s" with ureg_new_raw(), print the 256 fastmap entries,
 run one search and release the object. Nothing is asserted.
*/
HCUT_ADD(t_ureg_0) {
	int rc;
	ureg_t* obj;
	const char* rg = "\\s";
//	dbg(setlocale(LC_CTYPE, "ja_JP.UTF-8") );
//	setlocale(LC_CTYPE, "");
	obj = ureg_new_raw(rg);
	/* dump the fastmap, one number per byte value */
	for(int i=0;i<256;i++){
		printf("%d ", obj->fastmap[i]);
//		printf("|[\\%03o]", i);
	}
	puts("");
	rc = ureg_search(obj, "\377\\aA");
dbg(rc, obj->p, obj->sz);
	ureg_free(obj);
	setlocale(LC_CTYPE, "C");
}
#endif

#ifdef TEST
/*
 Compile-only checks for ureg_new(): each pattern is compiled, the returned
 handle is compared against NULL and then released. Only "[z-a]" is expected
 to be rejected.
*/
HCUT_ADD(t_ureg_new) {
	int rc=0;	eq_i(rc, 0);
	ureg_t* obj;
	//ureg_t* ureg_new_impl(const char* restr, int slen, int syn, int rawflg);
	const char* rs = "[a-bあc]";
	obj = ureg_new(rs, -1);
	eq(obj!=NULL, "ascii");
	ureg_free(obj);
// reversed range must fail; the stderr markers bracket the compile call
	rs = "[z-a]";
	fputs("**bad range expr test**\n",stderr);
	obj = ureg_new(rs);
	fputs("**bad range expr test -end**\n",stderr);
	eq(obj==NULL, "inverse range==err");
	// NOTE(review): obj is expected to be NULL here, so this relies on ureg_free() accepting NULL - confirm
	ureg_free(obj);
// multibyte char and a raw byte in the same bracket list
	rs = "[a-zあ\377]";
	obj = ureg_new(rs);
	eq(obj!=NULL, "mix mb/bin");
	ureg_free(obj);
// groups
	rs = "123(12)(abc)zz";
	obj = ureg_new(rs);
	eq(obj!=NULL, "blk");
	ureg_free(obj);
// anchors
	rs = "^123$";
	obj = ureg_new(rs);
	eq(obj!=NULL, "anc");
	ureg_free(obj);
// interval {n}
	rs = "a{1}";
	obj = ureg_new(rs);
	eq(obj!=NULL, "rp1");
	ureg_free(obj);
// interval {n,}
	rs = "a{1,}";
	obj = ureg_new(rs);
	eq(obj!=NULL, "rp2");
	ureg_free(obj);
// interval {n,m}
	rs = "a{1,3}";
	obj = ureg_new(rs);
	eq(obj!=NULL, "rp3");
	ureg_free(obj);
// multibyte literal
	rs = "あいう";
	obj = ureg_new(rs);
	eq(obj!=NULL, "mb");
	ureg_free(obj);
// multibyte range
	rs = "[あ-う]";
	obj = ureg_new(rs);
	eq(obj!=NULL, "mblist");
	ureg_free(obj);
// pattern containing \1
	rs = "(a|b)c\\1";
	obj = ureg_new(rs);
	eq(obj!=NULL, "bkref");
	ureg_free(obj);

// all of the above combined
	rs = "^((ab|[あ-う])|(12{0,2}))\\1$";
	obj = ureg_new(rs, -1);
	eq(obj!=NULL, "fulltest");
	ureg_free(obj);
}
#endif

#ifdef TEST
/*
 Search checks: compile a pattern, search one target and verify the match
 pointer (obj->p) and byte size (obj->sz).
 Size checks on multibyte matches run only when the saved locale name
 (obj->lc) has a ".utf*" suffix, so the test also passes in non-UTF-8
 environments.
*/
HCUT_ADD(t_ureg_search) {
//puts(setlocale(LC_CTYPE, NULL) );
	int rc;
	ureg_t* obj;
	char* rs=NULL;
	char* s=NULL;
	//ureg_t* ureg_new_impl(const char* restr, int slen, int syn, int rawflg);
	rs = "[abc]";
	obj = ureg_new(rs);
	eq(obj!=NULL);
	//puts(setlocale(LC_CTYPE, NULL) );
	rc = ureg_search(obj, "1abc");
	eq_i(obj->p[0], 'a');
	eq_i(obj->sz, 1);
	ureg_free(obj);
// no hit: result fields must be NULL / -1
	rs = "a[あ-う]";
	obj = ureg_new(rs);
	rc = ureg_search(obj, "1abc", -1);
	eq_p(obj->p, NULL, "nohit test");
	eq_i(obj->sz, -1);
	ureg_free(obj);
// multibyte range hit
	rs = "[あ-う]";
	obj = ureg_new(rs);
	rc = ureg_search(obj, "1abいc");
//printf("%.*s \n", obj->sz, obj->p);
	eq_s(obj->p, "いc", "mbhit test");

	if( strrchr(obj->lc, '.') && strncasecmp( strrchr(obj->lc, '.')+1, "utf", 3)==0){
		eq_i(obj->sz, 3);
	}
	
	//dbg(rc, obj->p, obj->pbk[0], obj->sz, obj->szbk[0], setlocale(LC_CTYPE, NULL) );
	ureg_free(obj);
// explicit target length
	rs = "[abc]";
	obj = ureg_new(rs);
	rc = ureg_search(obj, "123a", 4);
	eq(obj->p!=NULL, "hit test");
	eq_i(obj->sz, 1);
	ureg_free(obj);
// multibyte literal located after an invalid byte
	rs = "Ω";
	s="1ab\377Ωxyz";
	obj = ureg_new(rs);
	rc = ureg_search(obj, s);
	eq(obj->p!=NULL, "hit test");
	eq_p(obj->p, s+4);

	if( strrchr(obj->lc, '.') && strncasecmp( strrchr(obj->lc, '.')+1, "utf", 3)==0){
		eq_i(obj->sz, 2);
	}

	ureg_free(obj);
// raw byte in a bracket list, compiled in the current locale
	rs = "[\377]";
	s="1ab\377Ωxyz";
//puts(setlocale(LC_CTYPE, NULL) );
	obj = ureg_new_raw(rs);
//puts(obj->lc);
	rc = ureg_search(obj, s);
	/* label fixed: this assertion expects a hit (it used to read "nohit test") */
	eq(obj->p!=NULL, "hit test");
	eq_p(obj->p, s+3);
	eq_i(obj->sz, 1);
	ureg_free(obj);
//
	rs = "[\377z]";		//invalid byte is ignored in reg-class(multi byte)
	s="\377z";
	obj = ureg_new(rs);
	rc = ureg_search(obj, s);
if( strrchr(obj->lc, '.') && strncasecmp( strrchr(obj->lc, '.')+1, "utf", 3)==0){
	eq_i(rc, 1);
	/* label fixed: 'z' at s+1 must be found (it used to read "nohit test") */
	eq(obj->p!=NULL, "hit test");
	eq_p(obj->p, s+1);
	eq_i(obj->sz, 1);
}
	ureg_free(obj);
}
#endif

#define ARRI10	{-1,-1,-1,-1,-1, -1,-1,-1,-1,-1}
/*
 One-shot search behind the ureg_easy() macro: compile `r`, search `s` once,
 release the compiled object and return a by-value copy of its fields.

 r, s  : regex and search target
 rlen  : byte size of r, strlen(r) when -1
 slen  : byte size of s, strlen(s) when -1
 syn   : syntax flag, forwarded unchanged to ureg_new()

 Returns the object contents after the search, with `data` set to NULL.
 When compilation fails, all pointers are NULL, sz is -1 and every szbk[]
 entry is -1.
 NOTE(review): fastmap and lc are copied from the object before it is
 released and are not cleared - confirm whether ureg_free() invalidates them.
*/
ureg_t ureg_easy_impl(const char* r, const char* s, int rlen, int slen, int syn, ...) {
	const int reglen = (rlen == -1) ? (int)strlen(r) : rlen;
	const int tgtlen = (slen == -1) ? (int)strlen(s) : slen;

	ureg_t* compiled = ureg_new(r, reglen, syn);
	if(compiled == NULL) {
		ureg_t nohit = {NULL, NULL, NULL, 0, NULL, -1, {0}, ARRI10};
		return nohit;
	}

	ureg_search(compiled, s, tgtlen);
	ureg_t snapshot = *compiled;	/* copy the result out before the object goes away */
	ureg_free(compiled);
	snapshot.data = NULL;
	return snapshot;
}
#ifdef TEST
HCUT_ADD(t_ureg_easy) {
//	puts(setlocale(LC_ALL, NULL) );
	const char* rs = "2";
	const char* s = "123a\377あbc";
	ureg_t res = ureg_easy(rs, s);
	ureg_t* obj = &res;
	eq_p(res.p, s+1);
	eq_i(res.sz, 1);
	
	rs="Ω";
	s = "1234abcΩあabc\377";
	res = ureg_easy(rs, s);
dbg(res.p, res.sz);
	eq_p(res.p, s+7);

if( strrchr(obj->lc, '.') && strncasecmp( strrchr(obj->lc, '.')+1, "utf", 3)==0){
		eq_i(res.sz, 2);
	}
	
	printf("%d %.*s \n", res.sz, res.sz, res.p);

	rs="b\0.";
	s = "ab\0xz";
	res = ureg_easy(rs, s, 3, 6);
dbg(res.p, res.sz);
	eq_p(res.p, s+1);
	eq_i(res.sz, 3);
}
#endif

/*
 Convert *str (strsz bytes) from encoding `fromenc` to `toenc` with iconv().
 Called through the ureg_iconv() macro, which supplies strsz = -1 when omitted.

 fromenc, toenc : encoding names as accepted by iconv_open()
 str            : in/out source pointer. On return *str points just past the
                  last consumed byte: src+strsz when everything was converted,
                  or the head of the first byte sequence iconv() rejected
                  (bytes before it are valid and already converted).
 strsz          : source byte size; strlen(*str) is used when negative.

 Returns a malloc()ed buffer laid out as | int size | size bytes | '\0' |,
 or NULL on failure (NULL argument, unknown encoding pair, out of memory,
 unexpected iconv() error). The caller free()s the buffer.
*/
void* ureg_iconv_impl(const char* fromenc, const char* toenc, char** str, int strsz, ...) {
	if(str==NULL || *str==NULL) { return NULL; }
	if(strsz<0){strsz=strlen(*str);}
	iconv_t fd_cv = iconv_open(toenc, fromenc);
	if(fd_cv==(iconv_t)-1) { return NULL; }

	size_t mmsz = 128;
	char* mm = (char*)malloc(mmsz);
	if(mm==NULL) { iconv_close(fd_cv); return NULL; }

	char* sp = *str;
	char* dp = mm+sizeof(int);	/* payload starts right after the size header */
	size_t inleft = strsz;
	size_t outleft = mmsz-sizeof(int)-1;	/* always keep 1 byte for the final '\0' */

	/* put the descriptor in its initial state once; resetting it again after
	   a partial conversion would discard the shift state built so far */
	iconv(fd_cv, NULL,NULL,NULL,NULL);
	for(;;) {
		errno=0;
		size_t rc = iconv(fd_cv, &sp, &inleft, &dp, &outleft);
		if(rc==(size_t)-1 && errno==E2BIG) {
			/* output full: everything converted so far is valid, so double
			   the buffer, re-base the write pointer and resume */
			size_t idx = dp-mm;
			size_t newsz = mmsz*2;
			char* grown = (char*)realloc(mm, newsz);
			if(grown==NULL) { free(mm); iconv_close(fd_cv); return NULL; }
			mm = grown;
			mmsz = newsz;
			dp = mm+idx;
			outleft = mmsz-idx-1;
			continue;
		}
		/* finished, or stopped at an invalid/incomplete input sequence */
		if(rc!=(size_t)-1 || errno==EILSEQ || errno==EINVAL) { break; }
		/* any other errno is unexpected: report failure instead of exiting */
		free(mm);
		iconv_close(fd_cv);
		return NULL;
	}
	*str = sp;	/* one past the last consumed source byte */
	int sz = (int)(dp - mm - sizeof(int));
	*dp = '\0';	/* the reserved byte guarantees this stays inside the buffer */
	((int*)mm)[0] = sz;
	iconv_close(fd_cv);
	return mm;
}

#ifdef TEST
/*
 ureg_iconv() check: convert a UTF-8 string to SHIFT_JIS and compare the
 payload with the expected byte sequence. The comparison is skipped when the
 conversion returns NULL.
*/
HCUT_ADD(t_ureg_iconv) {
	const char* s = "\343\201\202\343\201\204\343\201\206";	//utf8
	const char* ss = "\202\240\202\242\202\244";	//shift_jis
	char* p = (char*)s;	/* working copy of the pointer: ureg_iconv() advances it */
puts(p);
fprintf(stderr, "hw %p, %s\n", p, s);
	int* pp = ureg_iconv("UTF8", "SHIFT_JIS", &p, strlen(p));
fprintf(stderr, "hw %p, %s\n", pp, s);
if(pp){
	int sz = pp[0];	/* leading int: converted byte size */
	char* str = (char*)&pp[1];	/* converted bytes follow the int */
dbg(sz);
	puts(str);
	eq_i(strcmp(ss, (char*)(pp+1) ), 0);
}
	free(pp);
}
#endif

#ifdef TEST
/*
 Mixed checks of ureg_new()/ureg_new_raw() with ureg_search()/
 ureg_search_head() on targets that contain multibyte characters and a
 trailing invalid byte (\343).
 Size checks on multibyte matches run only when the saved locale name
 (obj->lc) has a ".utf*" suffix.
*/
HCUT_ADD(t_ureg) {
	int rc;
	ureg_t* obj;
	const char* s;
	obj = ureg_new_raw("(\277|\377)");
	rc = ureg_search(obj, "a\377");
	eq(rc>0);
	ureg_free(obj);
// multibyte bracket list
	obj = ureg_new("[あ]");
	eq(obj!=NULL);
	rc = ureg_search(obj, s="xあayz\343");
eq(obj->p -s == 1);
if( strrchr(obj->lc, '.') && strncasecmp( strrchr(obj->lc, '.')+1, "utf", 3)==0){
		eq_i(obj->sz, 3);
	}

	// dbg(obj);exit(1);
	// dbg(rc, obj->sidx, obj->eidx);
	ureg_free(obj);
// head search: hit only when the match starts at the first byte
	obj = ureg_new("[aあbc]");
	rc = ureg_search_head(obj, s="あxayz\343", -1);
	eq(rc==0);
	eq_p(obj->p, s);
//eq_i(obj->sz, 3);
if( strrchr(obj->lc, '.') && strncasecmp( strrchr(obj->lc, '.')+1, "utf", 3)==0){
		eq_i(obj->sz, 3);
	}

	rc = ureg_search_head(obj, s="xあayz\343");
	eq(rc<0);
	eq_p(obj->p, NULL);
	eq_i(obj->sz, -1);
	// dbg(rc, obj->sidx, obj->eidx);
	ureg_free(obj);
	obj = ureg_new("[abc]");
	rc = ureg_search(obj, s="xayz\343");
	eq(rc>0);
	eq(obj->p -s == 1);
	eq_i(obj->sz, 1);
	// dbg(rc, obj->sidx, obj->eidx);
	ureg_free(obj);
// raw compile in the current locale: the list matches a single byte
	obj = ureg_new_raw("[あ]");
	rc = ureg_search(obj, s="xaあyz\343", -1);
//dbg(obj->p, obj->sz, rc, obj->lc);
	eq(rc>0);
	eq(obj->p -s == 2);
	eq_i(obj->sz, 1);
	// dbg(rc, obj->sidx, obj->eidx);
	ureg_free(obj);
// raw compile after switching to the environment locale
	/* the string returned by setlocale() may be overwritten by a later
	   setlocale() call, so keep a private copy for the restore below */
	const char* cur = setlocale(LC_ALL, NULL);
	char* sv = NULL;
	if(cur){
		sv = (char*)malloc(strlen(cur)+1);
		if(sv){ strcpy(sv, cur); }
	}
	setlocale(LC_ALL, "");
	obj = ureg_new_raw("[あ]");
	rc = ureg_search(obj, s="xあyz\343", -1);
	eq(rc>0);
	eq(obj->p -s == 1);
//eq_i(obj->sz, 3);
if( strrchr(obj->lc, '.') && strncasecmp( strrchr(obj->lc, '.')+1, "utf", 3)==0){
		eq_i(obj->sz, 3);
	}

	if(sv){ setlocale(LC_ALL, sv); }	/* restore the locale saved above */
	free(sv);
	// dbg(rc, obj->sidx, obj->eidx);
	ureg_free(obj);
lb_RTN:;
}
#endif

#ifdef TEST
HCUT_ADD(t_ureg_syn) {
	int rc;
	ureg_t* obj;
	char* rs=NULL;
	char* s=NULL;
	rs = "a(bc)";
	obj = ureg_new(rs, -1, ureg_ere_syn() );
	eq(obj!=NULL);
	//puts(setlocale(LC_CTYPE, NULL) );
	s="1abc";
	rc = ureg_search(obj, s);
	eq_i(!rc, 0);
	eq_i(obj->p[0], 'a');
	eq_i(obj->sz, 3);
	eq_p(obj->pbk[1], s+2);
	eq_i(obj->szbk[1], 2);
dbg(obj->pbk[1], obj->szbk[1]);
	ureg_free(obj);
//
	rs = "\\(a\\)\\(bc\\)";
	obj = ureg_new(rs, -1, ureg_bre_syn() );
	eq(obj!=NULL);
	//puts(setlocale(LC_CTYPE, NULL) );
	s = "1abc";
	rc = ureg_search(obj, s);
	eq_i(obj->p[0], 'a');
	eq_i(obj->sz, 3);
	
	eq_p(obj->pbk[2], s+2);
	eq_i(obj->szbk[2], 2);
dbg(obj->pbk[1], obj->szbk[1]);
	ureg_free(obj);
//
}
#endif

#ifdef TEST
#define Lcnt	10*1000
#define RR	"4.6"
#define SS	"1234567890"
#endif

#ifdef TEST
/* Benchmark: Lcnt calls of ureg_easy(), i.e. compile + search + free on every call. */
HCUT_ADD(t_bm0) {
laptime(0);
	loop(Lcnt){ ureg_easy(RR,SS); }
laptime("easy:" qu(Lcnt));
}
#endif

#ifdef TEST
/* Benchmark: Lcnt compile/free pairs with ureg_new(), then Lcnt searches on one compiled object. */
HCUT_ADD(t_bm1) {
	int rc=0;	eq_i(rc, 0);
	ureg_t* obj;
laptime(0);
	loop(Lcnt) {
		obj = ureg_new(RR);
		ureg_free(obj);
	}
laptime("new-free: " qu(Lcnt));
// search only: the regex is compiled once, outside the timed loop
	obj = ureg_new(RR);
laptime(0);
	loop(Lcnt) { ureg_search(obj, SS); }
laptime("search: " qu(Lcnt));
	//	dbg(rc, obj->sidx, obj->eidx, obj->len);
	ureg_free(obj);
}
#endif

#ifdef TEST
/*
 Benchmark of the raw variant: the locale is switched to the environment
 locale first, then Lcnt compile/free pairs with ureg_new_raw() and Lcnt
 searches are timed. The locale is set back to "C" at the end.
*/
HCUT_ADD(t_bm2) {
	int rc=0;	eq_i(rc, 0);
	ureg_t* obj;
	setlocale(LC_CTYPE, "");
laptime(0);
	loop(Lcnt) {
		obj = ureg_new_raw(RR);
		ureg_free(obj);
	}
laptime("new-free@raw: " qu(Lcnt));
// NOTE(review): the label below says "@raw" but the object is made with ureg_new(), not ureg_new_raw() - confirm intent
	obj = ureg_new(RR);
laptime(0);
	loop(Lcnt) {
		rc = ureg_search(obj, SS);
	}
laptime("search@raw: " qu(Lcnt));
	setlocale(LC_CTYPE, "C");
	ureg_free(obj);
}
#endif

#ifdef TEST
/*
 Baseline benchmark: Lcnt plain strstr() scans of the search target SS.
 strstr() takes (haystack, needle); the arguments used to be swapped, so the
 10-byte target was being searched for inside the 3-byte pattern string.
*/
HCUT_ADD(t_bm3) {
//	int rc;
//	char* res=NULL;
laptime(0);
	loop(Lcnt) { strstr(SS, RR); }
laptime("strstr(): " qu(Lcnt));
}
#endif

#ifdef TEST_
/*
 Oniguruma comparison benchmark, built only when TEST_ is defined:
 Lcnt onig_new()/onig_free() pairs, then Lcnt onig_search() calls on one
 compiled regex.
*/
#include "oniguruma.h"	//*SH_co*	libonig.a	*/
HCUT_ADD(t_bm4) {
	int rc=0;
	eq_i(rc, 0);
	
	const char* reg = RR;
	const char* str = SS;

laptime(0);

	OnigEncoding encs[] = { ONIG_ENCODING_UTF8 };
	onig_initialize(encs, sizeof(encs)/sizeof(encs[0]));
	/* initialization takes the list of encodings and its length; it presumably allocates memory internally */
	regex_t* cmp;
	OnigErrorInfo emsg;
	/* start / end pointers of the pattern text */
	const unsigned char* sadd=(const unsigned char*)reg;
	const unsigned char* eadd=(const unsigned char*) (sadd+strlen(reg));
laptime(0);
	loop(Lcnt) {
		rc = onig_new(&cmp, sadd, eadd, ONIG_OPTION_FIND_LONGEST, ONIG_ENCODING_UTF8
		, ONIG_SYNTAX_POSIX_EXTENDED, &emsg);
		onig_free(cmp);
	}
laptime("new-free@onig: " qu(Lcnt));

	rc = onig_new(&cmp, sadd, eadd, ONIG_OPTION_FIND_LONGEST, ONIG_ENCODING_UTF8
		, ONIG_SYNTAX_POSIX_EXTENDED, &emsg);
	OnigRegion* posobj = onig_region_new();
	/* from here on sadd/eadd delimit the search target instead of the pattern */
	sadd=(const unsigned char*)str;
	eadd=(const unsigned char*)(sadd+strlen(str) );
laptime(0);
	loop(Lcnt) {
		rc = onig_search(cmp, (const unsigned char*)str, eadd, sadd, eadd, posobj, ONIG_OPTION_NONE);
//		puts(str+posobj->beg[0]);
	}
laptime("search@onig: " qu(Lcnt));
//	if(rc >= 0) {	/*hit*/
//		obj->s = posobj->beg[0];
//		obj->e = posobj->end[0] -1;
//	}
	onig_region_free(posobj, 1); /* 1:free self, 0:free contents only */
	onig_free(cmp);
	onig_end();
}
#endif

/*SH_SMP
#include <stdio.h>
#include <locale.h>
#include "*SH_bn*.h"
// sample: search the same multibyte list with ureg_new() and with
// ureg_new_raw() after switching to the environment locale
int main(){
	int rc;
	ureg_t* obj;

	obj = ureg_new("[あ]");
	rc = ureg_search(obj, "xあayz\343", 7);
	;	printf("%.*s \n", obj->sz, obj->p);
	ureg_free(obj);

	setlocale(LC_ALL, "");
	obj = ureg_new_raw( "[あ]");
	rc = ureg_search(obj, "xあyz\343", 7);
	;	printf("%.*s \n", obj->sz, obj->p);
	// back to the program-startup locale. this used to pass an
	// uninitialized pointer to setlocale()
	setlocale(LC_ALL, "C");
	ureg_free(obj);
	return 0;
}
//	~$ gcc smpl.c *SH_bn*.c libureg.a
//SH_SMPE*/

#ifdef TEST
/* test-runner invocation for the HCUT_ADD() tests above; the list line inside the call is filled in by the build script, so keep the line layout as it is */
HCUT_RUN("stderr", 3,	/* keep newline. use for SH sed edit, -t test.*/
//autofill by brp,	t_ureg_iconv, t_ureg, t_bm1, t_bm2, t_bm3
);
#endif

/*
 change log
 --
2022-09-25 Momi-g	<dmy@dmy.dmy>
	
	* *SH_bn*.c (tests): skip obj->sz tests if non-UTF8 env (strncasecmp)

2021-11-25  Momi-g	<dmy@dmy.dmy>

	* *SH_bn*.c (ver): v3.0.5
	* *SH_bn*.c (TEST): add build test brpcode
	* *SH_bn*.c (doc) : update doc

2021-11-16  Momi-g	<dmy@dmy.dmy>

	* *SH_bn*.c (ver): v3.0.4
	* *SH_bn*.c (ureg_search_impl): fix 64bit builderr, sarr >>  (void*)sarr

2021-08-08  Momi-g	<dmy@dmy.dmy>

	* *SH_bn*.c (doc): fix doc, add amn script v3.0.3

	* *SH_bn*.c (ureg_bre_syn): omit syntax customize system. BRE/ERE only.
	* (ureg_ere_syn): flag getter func renamed. erebin >> ere

2021-08-02  Momi-g	<dmy@dmy.dmy>

	* *SH_bn*.c (all): v3.0.2
	* *SH_bn*.c(ureg_dflsyntax): fix EREflg, nouse BKREF. ')' is ORD. (posix)
	* (ureg_brebin_syn_): add/change BRE/ERE syntax name
	* (ureg_brebin_syn()): add getter

2021-07-02  Momi-g	<dmy@dmy.dmy>

	* *SH_bn*.c(ureg_t): add struct member 'fastmap'
	(ureg_new_impl): fix fastmap init setting, re_comp_fastmap()
	(cmtdoc): fix doc 

2021-06-26  Momi-g	<dmy@dmy.dmy>

	* *SH_bn*.c(macro): make it simple
	(ureg_easy): fix no work bug
	(ureg_search_impl): fix lastidx=3; >> lastidx=slen-1; left of dbg.

2021-06-03  Momi-g	<dmy@dmy.dmy>

	* *SH_bn*.c(ureg_new_impl): init_localeinfo() changes errno. sv + rollback.

2021-05-30  Momi-g	<dmy@dmy.dmy>

	* *SH_bn*.h: add include guard

2021-04-16  Momi-g	<dmy@dmy.dmy>

	* *SH_bn*.c (all): change backend. use sed-regex api. v3.0.0

2021-03-16  Momi-g	<dmy@dmy.dmy>

	* *SH_bn*.c (all): rename u8rg >> ureg, change ureg_t obj, add svlc, v2.0.0.

	* *SH_bn*.c (new_impl): add inner func

2021-02-17  Momi-g	<dmy@dmy.dmy>

	* *SH_bn*.c (u8rg_match): omit -4byte request. dfa cant do partial match.
	* (u8rg_search): same.

2021-02-09  Momi-g	<dmy@dmy.dmy>

	* *SH_bn*.c (all): v1.0.0.

*/
/*SH_ED*/

/*SH_EXT*/
/*
 * Minimal error-report helpers, skipped when ERRact is already defined.
 *  ERRact(xpr, msg, act): when xpr is true, print file, line, function, pid,
 *    the text of xpr, msg and strerror(errno) to stderr, then run act.
 *    act may be several statements (goto, return ...).
 *  STOP(xpr, msg): ERRact whose action prints "STOP" and calls exit(1).
 *  ERRactag: the "function name, pid" argument pair; C99 and later use
 *    __func__ and getpid(), older compilers get the placeholders below.
 * Both macros expand to a bare `if(...){...}` statement, not do{}while(0).
 */
#ifndef ERRact
#include <stdio.h>
 #if (199901L <= __STDC_VERSION__ +0)	/* nearly 200112L, _POSIX_C_SOURCE	c99*/
	#include <sys/types.h>
	#include <unistd.h>
	#define ERRactag	__func__, getpid()
 #else
	#define ERRactag	"func:c99+", 0
 #endif
 #include <string.h>
 #include <errno.h>
 #define ERRact(xpr, msg, act)	if(xpr){ int en_=errno; fprintf(stderr, \
	"ERR: %s %d %s() pid:%d %s msg:%s sys:%s\n",__FILE__,__LINE__, ERRactag \
	, "hit(" #xpr ")", msg, strerror(en_) ); act; }
 #define STOP(xpr, msg)	ERRact(xpr, msg, fputs("STOP\n",stderr);exit(1) )
#endif
// loop(a): repeat the following statement/block `a` times. The counter is the
// loop-scoped int `lpcnt`. `a` is parenthesized so that arguments such as
// `x ? 3 : 0` keep their meaning; it is re-evaluated on every iteration.
#define loop(a)		for(int lpcnt=0;lpcnt<(a);lpcnt++)

// Regex object created by ureg_new_impl() and released by ureg_free().
// p/sz and pbk/szbk are reset and refilled by every ureg_search_impl() call.
typedef struct ureg_tag {
	void* data;		// compiled regex (struct regex*); released by ureg_free()
	char* fastmap;	// copy of the pattern's fastmap pointer; ureg_free() does not free it via this member
	char* lc;		// strdup()ed LC_CTYPE locale name taken when the object was built
	int rawflg;		// 0: ureg_search_impl() switches LC_CTYPE to `lc` while searching
	const char* p;	// start of the whole match inside the searched string, NULL if none
	int sz;			// byte length of the whole match, -1 if none
	const char* pbk[10];	// start of each recorded group ([0] is the whole match), NULL if unset
	int szbk[10];	// byte length of each recorded group, -1 if unset
} ureg_t;

// posix-ERE with \n \0 extension
// Default GNU-regex syntax bit set; ureg_new_impl() uses it when syn == -1.
// Lines starting with "//	|" are flags that are deliberately left off.
static int ureg_erebin_syn_ = 0
 //	| RE_BACKSLASH_ESCAPE_IN_LISTS //"\" works as esc in [] on:yes off:no //ERE:off, special chars is only ^-[] in class
 // | RE_ICASE			// ignore case, aa==Aa	 on:a==A off:a!=A
 | RE_INTERVALS		// use {} op	on:use off:normal chars
 //	| RE_LIMITED_OPS	// +*?| are normal chars	on:yes off:special chars
 | RE_CHAR_CLASSES	// use [:alnum:] etc	on:yes off:no

 | RE_CONTEXT_INDEP_ANCHORS	//$^ are special except []/esc (a^c etc) yes/no		//>>ERE a^b never matches.BRE a^b matches str "a^b" 
 | RE_CONTEXT_INDEP_OPS	// *+? raise err in bad pos, "+a" etc	on/off
 | RE_CONTEXT_INVALID_DUP	// "{2}a" raise err	on:yes off:[{][2][}]a
 | RE_CONTEXT_INVALID_OPS	// +*? raise err, "?a" etc	on:yes off:[+]a
 | RE_UNMATCHED_RIGHT_PAREN_ORD //")a" doesnt raise err	on:[)]a off:err	//>>ERE requires this. Reading the spec, "}" gets the same treatment
 //	| RE_INVALID_INTERVAL_ORD	// "{abc" uses as [{]abc	on:[}] off:err	//>>ERE. unbalanced {} is treated as invalid; a lone "}" is an ordinary char

 | RE_DOT_NEWLINE	// . includes '\n'	on:include off:exclude
 //	| RE_DOT_NOT_NULL	// . excludes '\0'	on:exclude off:include
 //	| RE_HAT_LISTS_NOT_NEWLINE	//[^a] excludes '\n' 	on:exclude off:include

 //	| RE_NEWLINE_ALT	// a(\n)b works as a|b, on:\n==| off:| only
 | RE_NO_BK_BRACES	// interval {}, on:{} off:\{\} //needs RE_INTERVALS:on
 | RE_NO_BK_PARENS	// grouping (),	on:() off:\(\)
 | RE_NO_BK_VBAR		// | or \|,  on:| off:\|	//needs RE_LIMITED_OPS:off
 | RE_NO_BK_REFS		// use \1,\2 etc  on:nouse off:use
 //	| RE_BK_PLUS_QM		// repeat "a+" or "a\+" 	on:\+ off:+		ERE:off

 | RE_DEBUG	// regcomp() holds dbginfo	on:yes off:no	//emsg etc
 | RE_NO_EMPTY_RANGES	// [z-a] is err range	on:yes off:no, use as empty
 | RE_NO_GNU_OPS		// use out of standard op, \< etc 	on:nouse off:use
 //	| RE_NO_POSIX_BACKTRACKING	//use shortest match	on:yes off:no,longest
 //	| RE_NO_SUB		//drop backref data, (ab)c \1=ab etc.	on:yes off:hold
;
// http://web.mit.edu/gnu/doc/html/regex_7.html#SEC46
// https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09

// BRE counterpart of the syntax bit set above; returned by ureg_bre_syn().
static int ureg_brebin_syn_ = 0
 | RE_INTERVALS
 | RE_CHAR_CLASSES
 | RE_DOT_NEWLINE
 | RE_NO_EMPTY_RANGES
 | RE_BK_PLUS_QM		// "\+" means one-or-more. Strictly "\+" is undefined in BRE; an error would be preferable but no flag exists for that, so follow the sed library
;

// Release a regex object built by ureg_new_impl(): the dfa, the compiled
// pattern, the sed regex struct, the saved locale name and the object itself.
// A NULL obj is ignored. An object whose `data` is still NULL (partly built)
// is released too, instead of being dereferenced.
void ureg_free(ureg_t* obj) {
	if(!obj) {return;}
	struct regex* sedobj = obj->data;
	if(sedobj) {
		if(sedobj->dfa) {
			dfafree(sedobj->dfa);	// frees the dfa contents ...
			free(sedobj->dfa);		// ... then the dfa struct from dfaalloc()
			sedobj->dfa = NULL;
		}
		regfree(&sedobj->pattern);
		free(sedobj);
		obj->data = NULL;
	}
	free(obj->lc);
	free(obj);
}

// getters for the built-in syntax bit sets: ERE (ureg_erebin_syn_) and BRE (ureg_brebin_syn_)
int ureg_ere_syn(void) { return ureg_erebin_syn_; }
int ureg_bre_syn(void) { return ureg_brebin_syn_; }
#include <locale.h>
int ureg_search_impl(int headflg, ureg_t* obj, const char *str, int slen, ...);
ureg_t* ureg_new_impl(int rawflg, const char* r, int rlen, int syn, ...) {
	if(rlen== -1){ rlen=strlen(r); }
	if(syn== -1){ syn=ureg_erebin_syn_; }
	; STOP(r==NULL||rlen<0, "ag1/ag2 is NULL/ sz<0");
	char svlc[128]= {0};
	strcpy(svlc, setlocale(LC_CTYPE, NULL));	//now locale
	struct regex* new_regex;
	const char *error;
//
	ureg_t* res = (ureg_t*)calloc(sizeof(ureg_t), 1);
	; ERRact(!res, "calloc() failed", goto lb_ERR);
	if(rawflg==0) { setlocale(LC_CTYPE, ""); }
	res->lc = strdup( setlocale(LC_CTYPE, NULL) );
	// sed malloc() func. abort()/exit(1) if err.
	new_regex = xzalloc(sizeof(struct regex) + rlen - 1);
	new_regex->flags = 0; //sed cmdopt. REG_NEWLINE etc. not gnuregex flg. always 0
	memcpy(new_regex->re, r, rlen);
	new_regex->sz = rlen;
	res->data = new_regex;	//for free() logic
//
	// prune map needs 256 byte
	if(!(syn&RE_ICASE)) { new_regex->pattern.fastmap = malloc(1 << (sizeof(char) * 8)); }
	re_set_syntax(syn);	//>> wapper of 're_syntax_options', gnureg glvar
	error = re_compile_pattern(new_regex->re, new_regex->sz, &new_regex->pattern);
	; ERRact(error, error, fprintf(stderr, "%s\n", r); goto lb_ERR);

//add: fastmap compile, init
	int	rc = re_compile_fastmap (&new_regex->pattern);	//suc/fail == 0/-2
//	new_regex->pattern.fastmap_accurate=1;
	; ERRact(rc, "compile regex-fastmap failed ", goto lb_ERR);
//printf("aft acc:%d fptr:%p \n", new_regex->pattern.fastmap_accurate, new_regex->pattern.fastmap);
//for(int i=0;i<256;i++){	printf("%d:%d ", i, res->fastmap[i]); }

	// omit sed flg, trouble makers. used in dfacomp() dfaparse() etc at dfa.c
	new_regex->pattern.newline_anchor = 0; //sedflg. use \n as ^$ work. DONT set
	new_regex->pattern.translate = NULL;	//works as 'tr' cmd
	int dfaopts = DFA_EOL_NUL; // DFA_EOL_NUL(use \n >> \0), DFA_ANCHOR 
	new_regex->dfa = dfaalloc();
	//locale setting: copied from sed.c main()
	struct localeinfo lc;
	//add_fix>>> sv errno
	int errno_sv=errno;
	init_localeinfo(&lc);	//important. get locale setting.
	errno=errno_sv;
	//corecode for locale. parses bytes to tokens using mbrtowc() etc. see dfa.c
	dfasyntax(new_regex->dfa, &lc, syn, dfaopts);
	dfacomp(new_regex->re, new_regex->sz, new_regex->dfa, 1);	//1, uses for search
	/* The patterns which consist of only ^ or $ often appear in
	   substitution, but regex and dfa are not good at them, as regex does
	   not build fastmap, and as all in buffer must be scanned for $.  So
	   we mark them to handle manually.  */
	if(new_regex->sz == 1) {
		if(new_regex->re[0] == '^')	{ new_regex->begline = true; }
		if(new_regex->re[0] == '$')	{ new_regex->endline = true; }
	}
	res->data = new_regex;
	res->fastmap = new_regex->pattern.fastmap;
	setlocale(LC_CTYPE, svlc);
	return res;
lb_ERR:;
	ureg_free(res);
	setlocale(LC_CTYPE, svlc);
	return NULL;
}

// Search str[0..slen-1] with the regex held by obj.
//  headflg: other than 0 => only a match that starts at str[0] is tried
//  slen   : byte length of str, negative => strlen(str)
// rtn: byte offset of the match start from str (>=0) on a hit,
//      -1 on no match, on error, or when obj/str is NULL.
// On a hit obj->p/obj->sz describe the whole match and obj->pbk[i]/obj->szbk[i]
// the recorded groups (i==0 is the whole match); unset entries are NULL / -1.
// When obj->rawflg==0, LC_CTYPE is switched to obj->lc for the search and
// restored afterwards.
#define ARRI10	{-1,-1,-1,-1,-1, -1,-1,-1,-1,-1}
int ureg_search_impl(int headflg, ureg_t* obj, const char *str, int slen, ...) {
	// struct re_registers
	// {
	//   unsigned num_regs;
	//   regoff_t *start;
	//   regoff_t *end;
	// };
	// from gnuregex
//init
	if(obj==NULL || obj->data==NULL){ return -1; }
	obj->p=NULL;
	obj->sz = -1;
	for(int i=0;i<10;i++){
		obj->pbk[i] = NULL;
		obj->szbk[i] = -1;
	}
	if(str==NULL){ return -1; }
	const char* s = str;
	if(slen<0){ slen=(int)strlen(s); }
	regoff_t sarr[10]=ARRI10;
	regoff_t earr[10]=ARRI10;
	//fix sarr >> (void*)sarr, 64bit builderr
	struct re_registers bkres= {10, (void*)sarr, (void*)earr };	// 10... posix supports \1-\9
	struct regex* sedobj = obj->data;

	// Save the current locale name without a fixed length limit: stack
	// buffer for the usual short names, heap copy for longer ones.
	char svbuf[128];
	char* svheap = NULL;
	const char* svlc = NULL;	// stays NULL while the locale is untouched
	if(obj->rawflg==0){
		const char* nowlc = setlocale(LC_CTYPE, NULL);	//now locale
		if(nowlc != NULL){
			size_t n = strlen(nowlc) + 1;
			if(n <= sizeof(svbuf)){
				memcpy(svbuf, nowlc, n);
				svlc = svbuf;
			}
			else if( (svheap = malloc(n)) != NULL){
				memcpy(svheap, nowlc, n);
				svlc = svheap;
			}
		}
		// no saved name => no way to restore; give up before switching
		if(svlc == NULL){ return -1; }
		setlocale(LC_CTYPE, obj->lc);
	}
	int lastidx=slen-1; //"abc" ...[2]==len-1
	// GNU's 5th argument is not a byte range but the last index at which a
	// match may START. Passing slen as it is would also try a start beyond
	// the last byte.
	if(headflg){ lastidx=0;}
	// The register arrays live on the stack with a fixed size of 10. Tell the
	// matcher so: otherwise it allocates (first hit) or reallocates the
	// arrays itself, which leaks or touches stack memory.
	sedobj->pattern.regs_allocated = REGS_FIXED;
	// sed-regexp() uses superset but needs src copy/malloc(). omit.
	int rc = re_search(&sedobj->pattern, s, slen, 0, lastidx, &bkres);
	// ag5 is not RANGE but laststart_IDX. manual uses inappropriate name.
	// in case search only top 3byte "...XYZ" len=6, set range=2. not 3.
	// because innercode uses as 'p <= srcptr+range'	(regexec.c)
	// so s[range] == s[3] == 'X' is searched. s[2] is correct.

	int res = -1;
	if(rc >= 0 && bkres.start[0] >= 0){
		// walk all 10 slots: a group that did not take part in the match
		// stays NULL/-1 and must not hide the groups after it
		for(int i=0;i<10;i++) {
			if(bkres.start[i] <0) {continue;}
			obj->pbk[i]  = s + bkres.start[i];
			obj->szbk[i] = bkres.end[i] - bkres.start[i];
		}
		obj->p  = obj->pbk[0];
		obj->sz = obj->szbk[0];
		res = (int)(obj->p - str);
	}
	if(svlc != NULL){ setlocale(LC_CTYPE, svlc);}
	free(svheap);
	return res;
}
/*SH_ED*/

/*SH_OP _ set -e;a=`sed -ne "/${C}DF/!d;:l;n;/${C}DE/q;p;bl"<$R0`;eval "$a";set +e	#*/
/*SH_OP	h $p"-tsbS:test/eg/.o/.so -LMP:leak,mem,prof -f:funcs -o:bldout		GPLv3+"	 #*/
/*SH_OP	f sed -ne "/${C}DF/q;/;/d;/^[a-z].*)/p"<$R0 #*/
/*SH_OP t $e"$CW";ftt "$@";$p'cc -O0 -static -Wall -pedantic -g -pg -ggdb3 $Rm `fOI $Rs $tf` `fg $Rs $tf` `fL`'|fv	#*/
/*SH_OP T $e"$CW";ftt "$@";$p'cc -O3 $Rm `fOI $Rs $tf ` `fg $Rs $tf ` `fL`'|fv	#*/
/*SH_OP s $e"$CB";fgr0 "${C}SMP" "${C}SMPE"<$Rs|fbn>eg.c;$p'cc eg.c `fg eg.c` `fOI eg.c`'|fv #*/

/*SH_OP L $p"valgrind --leak-check=full --show-leak-kinds=all --track-origins=yes --verbose ./a.out 2>&1|sed -e '/SUMMA/!d;n;n;n;n'"|fv #*/
/*SH_OP M $p"fM ./a.out"|fv	 #*/
/*SH_OP P $p'valgrind --tool=callgrind --callgrind-out-file=log.out ./a.out;kcachegrind log.out'|fv	 #*/

/*SH_OP b $e"$CW";$p'cc -c $Rs -pedantic -O2 -Wall -g `fg $Rs` `fI $Rs`'|fv;$p"$bn.o"	#*/
/*SH_OP B $e"$Cb";$p"ar -r lib$bn.a $bn.o `fO $Rs`"|fv;$p"lib$bn.a"	#*/
/*SH_OP A $e"$CB";$p'fA lib$bn.a `fg $Rh $Rs|fu|grep '[.]a$'|fU`'|fv;$p"lib$bn.a" #*/
/*SH_OP S $e"$Cb";$p"cc -shared -fPIC -o lib$bn.so $bn.o `fOI $Rs` `fg $Rs`"|fv;$p"lib$bn.so" #*/
/*SH_OP W $e"$Cm$O$Cw">/dev/null;fborn;$p"$Rs $Rh $tf";$e"$CJ"	#*/
/*SH_OP J fgr "${C}LS" "${C}ED"<$R0>sedimpl.code;fgr "${C}EXT" "${C}ED"<$R0|fbn>>sedimpl.code	#*/

/*SH_OP o $e"$CW";$p'fman $Rh 3'|fv		#*/

/*SH_DF
#-- noob
# fman SRC SECT: build the manual from the doc section of SRC.
# The command lines in the string below are handed to fv, which shows and
# runs them one by one: doc section -> amn -> $bn.SECT, then mandoc makes
# $bn.SECT.html and man+sed makes $bn.SECT.txt (sed removes "char, backspace" pairs).
fman()( $p"fgr0 '${C}doc' '${C}docE'<$1|amn >$bn.$2
 mandoc -Thtml <$bn.$2 >$bn.$2.html
 #fgr0 '${C}doc' '${C}docE'<$1|amn -Dzwsp=1 >$Rm
 man -Tutf8 /dev/stdin<$bn.$2|sed -e 's/.`printf \"\\b\"`//g'>$bn.$2.txt
 "|fv
)

#-- local

#-- vars
# bn: basename of $Rs without its last suffix.  tf: test source path, "<dir of Rs>/<bn>.ts.<suffix of Rs>".
# e: "eval " prefix.  p: copy of $Rp, the printf-one-arg-per-line command set at the top of the file.
bn=`basename ${Rs%.*}`; tf=${Rs%/*}/${bn}.ts.${Rs##*.}; e="eval "; p="$Rp"
#-- mod
# fv: verbose runner. Reads command lines from stdin; each line is first shown
# on stderr as "# <line>" (expanded through an eval'd here-document, with the
# run of -L options shortened to "-L(omit)"), then executed with eval.
# $O holds a newline (set at the top of the file).
fv()(while read -r a;do $e"cat<<E$O# $a${O}E"|sed -e 's@-L.*-L[^ ]*@-L(omit)@g'>/dev/stderr;$e"$a";done)

fbn()(sed -e "s@\*${C##*]}bn\*@$bn@g"|frf|frv|flit)
fsn()(tr -s ' \t' '\n')
fsl()(tr -s '\n' ' ')
fu()(fsn|sort -u)
fU()(fu|fsl;$p)

# fgr A B / fgr0 A B / fgR A B: section filters on stdin; A and B are sed
# regexes for the opening and the closing line of a section.
fgr()(sed -e "/$1/!d;:l;/$2/{p;d};n;bl")	# cut out the sections, marker lines included
fgr0()(sed -ne "/$1/!d;:l;n;/$2/d;p;bl")	# cut out the sections, marker lines excluded
fgR()(sed -ne "/$1/bl;p;d;:l;n;/$2/d;bl")	# drop the sections, print everything else
# fg FILE...: prints, on one line, the words that follow a "co" marker
# (star, marker prefix + "co", star) in the files, up to the next star.
# Duplicates are removed, first-seen order is kept.
fg()(sed -ne "s/.*${C##*]}co\*\([^*]*\).*$/\1/p" "$@"|fsn|awk '!a[$0]{a[$0]=1;print}'|fsl)

# fO src.o from inc"src.abc" etc. kick self
# fO FILE...: takes the .h/.hpp entries reported by fdp, swaps the suffix for
# .c and prints those that exist as files.
fO()(set -- `fdp "$@"|awk '$0~/[.](h|hpp)$/{print}'|sed -e 's/[.][^.]*$/.c/'|fU`
	buf="";for i;do test -f $i&&buf="$buf $i";done;$p"$buf"
)
# fI FILE...: prints the directories of the fdp entries as -I options (unique, one line).
fI()(fdp "$@"|sed -e 's/[^/]*$//g'|fu|sed -e '/./s/^/ -I/g'|fu|grep -v '^\-I$'|fU)
# fL: prints every directory found under the directory of $R0 as -L options (find follows symlinks).
fL()(find -L `dirname $R0` -type d|sed -e 's/^/-L/g'|fU)
# fOI FILE...: lists what quoted includes pull in (.h, .hpp and the matching
# source files) together with their paths. It does the work of fO and fI with
# a single fdp call, because calling both would run fdp twice. Several files are ok.
# Output: the existing .c files followed by "-I./" and the -I options.
fOI()(
set -- `fdp "$@"`
s="-I./ "`$p"$@"|sed -e 's/[^/]*$//g'|fu|sed -e 's/^/ -I/g'|fu|grep -v '^\-I$'|fU`
set -- `$p"$@"|awk '$0~/[.](h|hpp)$/{print}'|sed -e 's/[.][^.]*$/.c/'|fU`
buf="";for i;do test -f $i&&buf="$buf $i";done;
$p"$buf $s"
)

# fdp FILE...: collects the quoted-include dependencies of the files
# recursively, looking at everything under ./ ; entries whose basename stem
# appears in the argument list itself are dropped from the result.
fdp()( l="$*"; paths="$@"; all=""; used=""
 while :;do
	all=`$p$all $paths|fU`	# add the newly found paths, duplicates removed
	buf=`(cat $paths|sed -ne 's@^[ \t]*#inc[^"]*.\([a-zA-Z0-9._]*\)".*@\1@p')|sort -u`
	ch=`$p$used $buf|tr -s ' ' '\n'|sort|uniq -u`	# leave out names already handled
	used="$used $ch"	# update the handled list
	paths=`fsvy $ch|sort -u`	# look the names up on disk; these should be bare names
	buf=`$p"$all" "$paths"|fU`	# loop again while the list keeps growing
	[ ${#all} = ${#buf} ]&&break
 done
# exclude the initial arguments
 set -- $all
 for i;do a=${i##*[/]}; a=${a%%.*};[ "${l##*$a*}" = "$l" ]&&set -- "$@" $i;shift;done
 $p"$@"
)

# corecode:search + depthck + uniq
# fsvy NAME...: one find run under ./ (symlinks followed) for paths ending in
# any NAME; then, per NAME, the candidates are ordered by their number of
# slashes and the first one (fewest slashes) is printed.
fsvy()(c="find -L ./ -false"
	for i; do c="$c -o -path '*'$i";done; l=`$e"$c"`
	for i; do $p"$l"|grep -F "$i"|awk '{sv=$0;print gsub("[/]","") " " sv}'|
	sort -k 1.1,1n -k 2.2,2|awk '{print $2;exit}'; done
)

# fA LIB...: merges static libraries into lib$bn.a.
# Works in a scratch directory "tmpdir" next to $0: each archive is copied in
# and unpacked, its objects get a "p<n>_" prefix (n = position of the archive)
# so that equal object names from different archives do not clash, and they
# are added to lib$bn.aa. At the end that file is moved to ../lib$bn.a and
# the scratch directory is removed.
fA()(n=0;dir=`dirname $0`/tmpdir;mkdir $dir;cd $dir;
 for i;do
 	n=$((n+1))
 	cp ../$i $i
 	ar -x $i
 	for ii in *.o;do mv "$ii" "p${n}_$ii";done
 	ar -r lib$bn.aa *.o
 	rm *.o
 done
 $p'mv lib$bn.aa ../lib$bn.a'|fv
 cd ..;rm -r $dir
)

#-- yacc
# fy: lex/yacc build recipe; the quoted here-document is fed line by line to fv.
# NOTE(review): the two option lines below are commented out, and the recipe
# calls f0 and fo, which are not defined in this section - looks unused; confirm.
# /*SH_OP y $e"$CW";fy
# /*SH_OP Y $e"$Cy";fU $( ($p"lib$bn.a";fg $Rs $Rh)|$n|grep '[.]a$'|$U)
fy()(
cat<<'EEE'|fv
f0 "${C}YACC" "${C}YACCE"<$Rs>myyacc.y
f0 "${C}LEX" "${C}LEXE"<$Rs>mylex.l
lex mylex.l; yacc -p zz -dv myyacc.y
cat y.tab.c lex.yy.c > $Rs
gcc -c y.tab.c lex.yy.c -lfl `fA $Rs $Rh`
rm mylex.l myyacc.y lib$bn.a
ar r lib$bn.a `fo $Rs` y.tab.o lex.yy.o
$p"lib$bn.a"
EEE
)

#-- longcmd
# frf: filter; a line that contains the "rf" marker is replaced by the
# contents of the files named after the marker, all other lines pass through.
frf()(
 # "*sh_rf* 0 a.txt b.txt ..." cats the listed files into the output.
 # By default each file is wrapped in copyfrom/copyend banner comments;
 # a leading 0 switches to the variant without those banner lines.
 # Names that are not existing files are skipped.
 awk -v tg="${C##*]}rf" 'index($0,tg){
 s=substr($0, index($0,tg)+length(tg)+1);split(s, a)
 m="[ -f \"%s\" ]&&(echo \"/*--copyfrom %s*\"/;cat \"%s\";echo \"/*--copyend %s*\"/)"
 mm="[ -f \"%s\" ]&&(_=\"%s\"/;cat \"%s\";_=\"%s\")"
 for(i=1;i in a;i++){v=a[i];if(v==0){m=mm;continue};system(sprintf(m,v,v,v,v)) }
 next
 }
 {print}'
)
# frv: filter; replaces the version placeholder (star, marker prefix + "ver", star)
# with the 3rd field of the first line of $R0 whose 1st field is "@_ver".
frv()(buf=`awk '$1=="@_ver" {print $3;exit}'<$R0`;sed -e "s@\*${C##*]}ver\*@$buf@g")
# flit: filter; lines between a "lit" marker line and a "litE" marker line are
# turned into C string literals: backslashes and double quotes are escaped and
# each line becomes "<line>\n". The marker lines are dropped, other lines pass through.
flit()(sed -ne "/${C}lit/bl;p;d;:l;n;/${C}litE/d;"'s/[\]/&&/g;s/"/\\"/g;s@.*@"&\\n"@g;p;bl')

# fte [TEST...]: filter for a test source (stdin -> stdout) that fills in the
# argument list of the test-runner call.
# Without arguments the list is every name found on a line starting with
# "HCUT_ADD(" plus a final NULL; with arguments it is "TEST, ... NULL".
# sed prints the input up to the runner line, skips ahead to the next line
# holding ")" and puts the list with the closing ")" in its place.
# stdin is buffered in the scratch file "$Rm-", removed at the end.
fte(){
 cat > $Rm-
 a="`sed -ne 's@^HCUT_ADD(\([^)]*\).*@\1, @p' $Rm-|tr -d '\n'`NULL"
 if [ $# != 0 ];then	a=""; for i;do a="$a $i,";done; a="$a NULL"; fi
 sed -ne "p;/_RUN/bl;d;:l;/[)]/{c\\$O $a)$O p;d};n;bl" $Rm-
 rm $Rm-
}
# ftt [TEST...]: rewrites the test file $tf in place through fte, then writes $Rs followed by $tf into $Rm.
ftt()(fte "$@"<$tf>$Rm;mv $Rm $tf; cat $Rs $tf>$Rm)
# fborn: splits the test code from the generated sources.
#  1. the "#ifdef TEST" sections of $R0 go to the test file $tf (through fbn)
#  2. the same sections, minus the "#ifdef TEST_" ones, go to tests.code through fbn and fte
#  3. the "#ifdef TEST" sections are removed from $Rs; $Rs and $Rh are passed through fbn in place
fborn(){
 fgr0 "^#ifdef TEST" "^#endif"<$R0|fbn>$tf
 fgR "^#ifdef TEST_" "^#endif"<$R0 |fgr0 "^#ifdef TEST" "^#endif"|fbn|fte>tests.code
 fgR "^#ifdef TEST" "^#endif"<$Rs|fbn>$Rm;mv $Rm $Rs;fbn<$Rh>$Rm;mv $Rm $Rh
}
# fM CMD: runs CMD under valgrind massif (stacks included, children traced),
# prints the part of the ms_print report from the first line holding KB/MB/GB
# up to the line holding "snap", then removes the scratch file ./vmem.buf.
fM()(
 valgrind -q --tool=massif --massif-out-file=./vmem.buf --stacks=yes --trace-children=yes $1>/dev/null
 ms_print ./vmem.buf|sed -ne '/[KMG]B/bl;d;:l;/snap/q;p;n;bl';rm ./vmem.buf)

/*SH_DE*/
