"SfR Fresh" - the SfR Freeware/Shareware Archive

Member "less-424/regexp.c" of archive less-424.tar.gz:


As a special service "SfR Fresh" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file. That can be also achieved for any archive member file by clicking within an archive contents listing on the first character of the file(path) respectively on the according byte size field.
    1 /*
    2  * regcomp and regexec -- regsub and regerror are elsewhere
    3  *
    4  *	Copyright (c) 1986 by University of Toronto.
    5  *	Written by Henry Spencer.  Not derived from licensed software.
    6  *
    7  *	Permission is granted to anyone to use this software for any
    8  *	purpose on any computer system, and to redistribute it freely,
    9  *	subject to the following restrictions:
   10  *
   11  *	1. The author is not responsible for the consequences of use of
   12  *		this software, no matter how awful, even if they arise
   13  *		from defects in it.
   14  *
   15  *	2. The origin of this software must not be misrepresented, either
   16  *		by explicit claim or by omission.
   17  *
   18  *	3. Altered versions must be plainly marked as such, and must not
   19  *		be misrepresented as being the original software.
   20  *
   21  * Beware that some of this code is subtly aware of the way operator
   22  * precedence is structured in regular expressions.  Serious changes in
   23  * regular-expression syntax might require a total rethink.
   24  *
   25  * *** NOTE: this code has been altered slightly for use in Tcl. ***
   26  * Slightly modified by David MacKenzie to undo most of the changes for TCL.
   27  * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman
   28  */
   29 
   30 #include "less.h"
   31 #if HAVE_STDIO_H
   32 #include <stdio.h>
   33 #endif
   34 #if HAVE_STDLIB_H
   35 #include <stdlib.h>
   36 #endif
   37 #if HAVE_STRING_H
   38 #include <string.h>
   39 #endif
   40 #include "regexp.h"
   41 
   42 /*
   43  * The "internal use only" fields in regexp.h are present to pass info from
   44  * compile to execute that permits the execute phase to run lots faster on
   45  * simple cases.  They are:
   46  *
   47  * regstart	char that must begin a match; '\0' if none obvious
   48  * reganch	is the match anchored (at beginning-of-line only)?
   49  * regmust	string (pointer into program) that match must include, or NULL
   50  * regmlen	length of regmust string
   51  *
   52  * Regstart and reganch permit very fast decisions on suitable starting points
   53  * for a match, cutting down the work a lot.  Regmust permits fast rejection
   54  * of lines that cannot possibly match.  The regmust tests are costly enough
   55  * that regcomp() supplies a regmust only if the r.e. contains something
   56  * potentially expensive (at present, the only such thing detected is * or +
   57  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
   58  * supplied because the test in regexec() needs it and regcomp() is
   59  * computing it anyway.
   60  */
   61 
   62 /*
   63  * Structure for regexp "program".  This is essentially a linear encoding
   64  * of a nondeterministic finite-state machine (aka syntax charts or
   65  * "railroad normal form" in parsing technology).  Each node is an opcode
   66  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
   67  * all nodes except BRANCH implement concatenation; a "next" pointer with
   68  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
   69  * have one of the subtle syntax dependencies:  an individual BRANCH (as
   70  * opposed to a collection of them) is never concatenated with anything
   71  * because of operator precedence.)  The operand of some types of node is
   72  * a literal string; for others, it is a node leading into a sub-FSM.  In
   73  * particular, the operand of a BRANCH node is the first node of the branch.
   74  * (NB this is *not* a tree structure:  the tail of the branch connects
   75  * to the thing following the set of BRANCHes.)  The opcodes are:
   76  */
   77 
   78 /* definition	number	opnd?	meaning */
   79 #undef EOL
   80 #define	END	0	/* no	End of program. */
   81 #define	BOL	1	/* no	Match "" at beginning of line. */
   82 #define	EOL	2	/* no	Match "" at end of line. */
   83 #define	ANY	3	/* no	Match any one character. */
   84 #define	ANYOF	4	/* str	Match any character in this string. */
   85 #define	ANYBUT	5	/* str	Match any character not in this string. */
   86 #define	BRANCH	6	/* node	Match this alternative, or the next... */
   87 #define	BACK	7	/* no	Match "", "next" ptr points backward. */
   88 #define	EXACTLY	8	/* str	Match this string. */
   89 #define	NOTHING	9	/* no	Match empty string. */
   90 #define	STAR	10	/* node	Match this (simple) thing 0 or more times. */
   91 #define	PLUS	11	/* node	Match this (simple) thing 1 or more times. */
   92 #define	OPEN	20	/* no	Mark this point in input as start of #n. */
   93 			/*	OPEN+1 is number 1, etc. */
   94 #define	CLOSE	30	/* no	Analogous to OPEN. */
   95 
   96 /*
   97  * Opcode notes:
   98  *
   99  * BRANCH	The set of branches constituting a single choice are hooked
  100  *		together with their "next" pointers, since precedence prevents
  101  *		anything being concatenated to any individual branch.  The
  102  *		"next" pointer of the last BRANCH in a choice points to the
  103  *		thing following the whole choice.  This is also where the
  104  *		final "next" pointer of each individual branch points; each
  105  *		branch starts with the operand node of a BRANCH node.
  106  *
  107  * BACK		Normal "next" pointers all implicitly point forward; BACK
  108  *		exists to make loop structures possible.
  109  *
  110  * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
  111  *		BRANCH structures using BACK.  Simple cases (one character
  112  *		per match) are implemented with STAR and PLUS for speed
  113  *		and to minimize recursive plunges.
  114  *
  115  * OPEN,CLOSE	...are numbered at compile time.
  116  */
  117 
  118 /*
  119  * A node is one char of opcode followed by two chars of "next" pointer.
  120  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
  121  * value is a positive offset from the opcode of the node containing it.
  122  * An operand, if any, simply follows the node.  (Note that much of the
  123  * code generation knows about this implicit relationship.)
  124  *
  125  * Using two bytes for the "next" pointer is vast overkill for most things,
  126  * but allows patterns to get big without disasters.
  127  */
  128 #define	OP(p)	(*(p))
  129 #define	NEXT(p)	(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
  130 #define	OPERAND(p)	((p) + 3)
  131 
  132 /*
  133  * See regmagic.h for one further detail of program structure.
  134  */
  135 
  136 
  137 /*
  138  * Utility definitions.
  139  */
  140 #ifndef CHARBITS
  141 #define	UCHARAT(p)	((int)*(unsigned char *)(p))
  142 #else
  143 #define	UCHARAT(p)	((int)*(p)&CHARBITS)
  144 #endif
  145 
  146 #define	FAIL(m)	{ regerror(m); return(NULL); }
  147 #define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?')
  148 #define	META	"^$.[()|?+*\\"
  149 
  150 /*
  151  * Flags to be passed up and down.
  152  */
  153 #define	HASWIDTH	01	/* Known never to match null string. */
  154 #define	SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */
  155 #define	SPSTART		04	/* Starts with * or +. */
  156 #define	WORST		0	/* Worst case. */
  157 
  158 /*
  159  * Global work variables for regcomp().
  160  */
  161 static char *regparse;		/* Input-scan pointer. */
  162 static int regnpar;		/* () count. */
  163 static char regdummy;
  164 static char *regcode;		/* Code-emit pointer; &regdummy = don't. */
  165 static long regsize;		/* Code size. */
  166 
  167 /*
  168  * The first byte of the regexp internal "program" is actually this magic
  169  * number; the start node begins in the second byte.
  170  */
  171 #define	MAGIC	0234
  172 
  173 
  174 /*
  175  * Forward declarations for regcomp()'s friends.
  176  */
  177 #ifndef STATIC
  178 #define	STATIC	static
  179 #endif
  180 STATIC char *reg();
  181 STATIC char *regbranch();
  182 STATIC char *regpiece();
  183 STATIC char *regatom();
  184 STATIC char *regnode();
  185 STATIC char *regnext();
  186 STATIC void regc();
  187 STATIC void reginsert();
  188 STATIC void regtail();
  189 STATIC void regoptail();
  190 #ifdef STRCSPN
  191 STATIC int strcspn();
  192 #endif
  193 
  194 /*
  195  - regcomp - compile a regular expression into internal code
  196  *
  197  * We can't allocate space until we know how big the compiled form will be,
  198  * but we can't compile it (and thus know how big it is) until we've got a
  199  * place to put the code.  So we cheat:  we compile it twice, once with code
  200  * generation turned off and size counting turned on, and once "for real".
  201  * This also means that we don't allocate space until we are sure that the
  202  * thing really will compile successfully, and we never have to move the
  203  * code and thus invalidate pointers into it.  (Note that it has to be in
  204  * one piece because free() must be able to free it all.)
  205  *
  206  * Beware that the optimization-preparation code in here knows about some
  207  * of the structure of the compiled regexp.
  208  */
  209 regexp *
  210 regcomp(exp)
  211 char *exp;
  212 {
  213 	register regexp *r;
  214 	register char *scan;
  215 	register char *longest;
  216 	register int len;
  217 	int flags;
  218 
  219 	if (exp == NULL)
  220 		FAIL("NULL argument");
  221 
  222 	/* First pass: determine size, legality. */
  223 	regparse = exp;
  224 	regnpar = 1;
  225 	regsize = 0L;
  226 	regcode = &regdummy;
  227 	regc(MAGIC);
  228 	if (reg(0, &flags) == NULL)
  229 		return(NULL);
  230 
  231 	/* Small enough for pointer-storage convention? */
  232 	if (regsize >= 32767L)		/* Probably could be 65535L. */
  233 		FAIL("regexp too big");
  234 
  235 	/* Allocate space. */
  236 	r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
  237 	if (r == NULL)
  238 		FAIL("out of space");
  239 
  240 	/* Second pass: emit code. */
  241 	regparse = exp;
  242 	regnpar = 1;
  243 	regcode = r->program;
  244 	regc(MAGIC);
  245 	if (reg(0, &flags) == NULL)
  246 		return(NULL);
  247 
  248 	/* Dig out information for optimizations. */
  249 	r->regstart = '\0';	/* Worst-case defaults. */
  250 	r->reganch = 0;
  251 	r->regmust = NULL;
  252 	r->regmlen = 0;
  253 	scan = r->program+1;			/* First BRANCH. */
  254 	if (OP(regnext(scan)) == END) {		/* Only one top-level choice. */
  255 		scan = OPERAND(scan);
  256 
  257 		/* Starting-point info. */
  258 		if (OP(scan) == EXACTLY)
  259 			r->regstart = *OPERAND(scan);
  260 		else if (OP(scan) == BOL)
  261 			r->reganch++;
  262 
  263 		/*
  264 		 * If there's something expensive in the r.e., find the
  265 		 * longest literal string that must appear and make it the
  266 		 * regmust.  Resolve ties in favor of later strings, since
  267 		 * the regstart check works with the beginning of the r.e.
  268 		 * and avoiding duplication strengthens checking.  Not a
  269 		 * strong reason, but sufficient in the absence of others.
  270 		 */
  271 		if (flags&SPSTART) {
  272 			longest = NULL;
  273 			len = 0;
  274 			for (; scan != NULL; scan = regnext(scan))
  275 				if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
  276 					longest = OPERAND(scan);
  277 					len = strlen(OPERAND(scan));
  278 				}
  279 			r->regmust = longest;
  280 			r->regmlen = len;
  281 		}
  282 	}
  283 
  284 	return(r);
  285 }
  286 
  287 /*
  288  - reg - regular expression, i.e. main body or parenthesized thing
  289  *
  290  * Caller must absorb opening parenthesis.
  291  *
  292  * Combining parenthesis handling with the base level of regular expression
  293  * is a trifle forced, but the need to tie the tails of the branches to what
  294  * follows makes it hard to avoid.
  295  */
  296 static char *
  297 reg(paren, flagp)
  298 int paren;			/* Parenthesized? */
  299 int *flagp;
  300 {
  301 	register char *ret;
  302 	register char *br;
  303 	register char *ender;
  304 	register int parno = 0;
  305 	int flags;
  306 
  307 	*flagp = HASWIDTH;	/* Tentatively. */
  308 
  309 	/* Make an OPEN node, if parenthesized. */
  310 	if (paren) {
  311 		if (regnpar >= NSUBEXP)
  312 			FAIL("too many ()");
  313 		parno = regnpar;
  314 		regnpar++;
  315 		ret = regnode(OPEN+parno);
  316 	} else
  317 		ret = NULL;
  318 
  319 	/* Pick up the branches, linking them together. */
  320 	br = regbranch(&flags);
  321 	if (br == NULL)
  322 		return(NULL);
  323 	if (ret != NULL)
  324 		regtail(ret, br);	/* OPEN -> first. */
  325 	else
  326 		ret = br;
  327 	if (!(flags&HASWIDTH))
  328 		*flagp &= ~HASWIDTH;
  329 	*flagp |= flags&SPSTART;
  330 	while (*regparse == '|') {
  331 		regparse++;
  332 		br = regbranch(&flags);
  333 		if (br == NULL)
  334 			return(NULL);
  335 		regtail(ret, br);	/* BRANCH -> BRANCH. */
  336 		if (!(flags&HASWIDTH))
  337 			*flagp &= ~HASWIDTH;
  338 		*flagp |= flags&SPSTART;
  339 	}
  340 
  341 	/* Make a closing node, and hook it on the end. */
  342 	ender = regnode((paren) ? CLOSE+parno : END);
  343 	regtail(ret, ender);
  344 
  345 	/* Hook the tails of the branches to the closing node. */
  346 	for (br = ret; br != NULL; br = regnext(br))
  347 		regoptail(br, ender);
  348 
  349 	/* Check for proper termination. */
  350 	if (paren && *regparse++ != ')') {
  351 		FAIL("unmatched ()");
  352 	} else if (!paren && *regparse != '\0') {
  353 		if (*regparse == ')') {
  354 			FAIL("unmatched ()");
  355 		} else
  356 			FAIL("junk on end");	/* "Can't happen". */
  357 		/* NOTREACHED */
  358 	}
  359 
  360 	return(ret);
  361 }
  362 
  363 /*
  364  - regbranch - one alternative of an | operator
  365  *
  366  * Implements the concatenation operator.
  367  */
  368 static char *
  369 regbranch(flagp)
  370 int *flagp;
  371 {
  372 	register char *ret;
  373 	register char *chain;
  374 	register char *latest;
  375 	int flags;
  376 
  377 	*flagp = WORST;		/* Tentatively. */
  378 
  379 	ret = regnode(BRANCH);
  380 	chain = NULL;
  381 	while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
  382 		latest = regpiece(&flags);
  383 		if (latest == NULL)
  384 			return(NULL);
  385 		*flagp |= flags&HASWIDTH;
  386 		if (chain == NULL)	/* First piece. */
  387 			*flagp |= flags&SPSTART;
  388 		else
  389 			regtail(chain, latest);
  390 		chain = latest;
  391 	}
  392 	if (chain == NULL)	/* Loop ran zero times. */
  393 		(void) regnode(NOTHING);
  394 
  395 	return(ret);
  396 }
  397 
  398 /*
  399  - regpiece - something followed by possible [*+?]
  400  *
  401  * Note that the branching code sequences used for ? and the general cases
  402  * of * and + are somewhat optimized:  they use the same NOTHING node as
  403  * both the endmarker for their branch list and the body of the last branch.
  404  * It might seem that this node could be dispensed with entirely, but the
  405  * endmarker role is not redundant.
  406  */
  407 static char *
  408 regpiece(flagp)
  409 int *flagp;
  410 {
  411 	register char *ret;
  412 	register char op;
  413 	register char *next;
  414 	int flags;
  415 
  416 	ret = regatom(&flags);
  417 	if (ret == NULL)
  418 		return(NULL);
  419 
  420 	op = *regparse;
  421 	if (!ISMULT(op)) {
  422 		*flagp = flags;
  423 		return(ret);
  424 	}
  425 
  426 	if (!(flags&HASWIDTH) && op != '?')
  427 		FAIL("*+ operand could be empty");
  428 	*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
  429 
  430 	if (op == '*' && (flags&SIMPLE))
  431 		reginsert(STAR, ret);
  432 	else if (op == '*') {
  433 		/* Emit x* as (x&|), where & means "self". */
  434 		reginsert(BRANCH, ret);			/* Either x */
  435 		regoptail(ret, regnode(BACK));		/* and loop */
  436 		regoptail(ret, ret);			/* back */
  437 		regtail(ret, regnode(BRANCH));		/* or */
  438 		regtail(ret, regnode(NOTHING));		/* null. */
  439 	} else if (op == '+' && (flags&SIMPLE))
  440 		reginsert(PLUS, ret);
  441 	else if (op == '+') {
  442 		/* Emit x+ as x(&|), where & means "self". */
  443 		next = regnode(BRANCH);			/* Either */
  444 		regtail(ret, next);
  445 		regtail(regnode(BACK), ret);		/* loop back */
  446 		regtail(next, regnode(BRANCH));		/* or */
  447 		regtail(ret, regnode(NOTHING));		/* null. */
  448 	} else if (op == '?') {
  449 		/* Emit x? as (x|) */
  450 		reginsert(BRANCH, ret);			/* Either x */
  451 		regtail(ret, regnode(BRANCH));		/* or */
  452 		next = regnode(NOTHING);		/* null. */
  453 		regtail(ret, next);
  454 		regoptail(ret, next);
  455 	}
  456 	regparse++;
  457 	if (ISMULT(*regparse))
  458 		FAIL("nested *?+");
  459 
  460 	return(ret);
  461 }
  462 
  463 /*
  464  - regatom - the lowest level
  465  *
  466  * Optimization:  gobbles an entire sequence of ordinary characters so that
  467  * it can turn them into a single node, which is smaller to store and
  468  * faster to run.  Backslashed characters are exceptions, each becoming a
  469  * separate node; the code is simpler that way and it's not worth fixing.
  470  */
  471 static char *
  472 regatom(flagp)
  473 int *flagp;
  474 {
  475 	register char *ret;
  476 	int flags;
  477 
  478 	*flagp = WORST;		/* Tentatively. */
  479 
  480 	switch (*regparse++) {
  481 	case '^':
  482 		ret = regnode(BOL);
  483 		break;
  484 	case '$':
  485 		ret = regnode(EOL);
  486 		break;
  487 	case '.':
  488 		ret = regnode(ANY);
  489 		*flagp |= HASWIDTH|SIMPLE;
  490 		break;
  491 	case '[': {
  492 			register int clss;
  493 			register int classend;
  494 
  495 			if (*regparse == '^') {	/* Complement of range. */
  496 				ret = regnode(ANYBUT);
  497 				regparse++;
  498 			} else
  499 				ret = regnode(ANYOF);
  500 			if (*regparse == ']' || *regparse == '-')
  501 				regc(*regparse++);
  502 			while (*regparse != '\0' && *regparse != ']') {
  503 				if (*regparse == '-') {
  504 					regparse++;
  505 					if (*regparse == ']' || *regparse == '\0')
  506 						regc('-');
  507 					else {
  508 						clss = UCHARAT(regparse-2)+1;
  509 						classend = UCHARAT(regparse);
  510 						if (clss > classend+1)
  511 							FAIL("invalid [] range");
  512 						for (; clss <= classend; clss++)
  513 							regc(clss);
  514 						regparse++;
  515 					}
  516 				} else
  517 					regc(*regparse++);
  518 			}
  519 			regc('\0');
  520 			if (*regparse != ']')
  521 				FAIL("unmatched []");
  522 			regparse++;
  523 			*flagp |= HASWIDTH|SIMPLE;
  524 		}
  525 		break;
  526 	case '(':
  527 		ret = reg(1, &flags);
  528 		if (ret == NULL)
  529 			return(NULL);
  530 		*flagp |= flags&(HASWIDTH|SPSTART);
  531 		break;
  532 	case '\0':
  533 	case '|':
  534 	case ')':
  535 		FAIL("internal urp");	/* Supposed to be caught earlier. */
  536 		/* NOTREACHED */
  537 		break;
  538 	case '?':
  539 	case '+':
  540 	case '*':
  541 		FAIL("?+* follows nothing");
  542 		/* NOTREACHED */
  543 		break;
  544 	case '\\':
  545 		if (*regparse == '\0')
  546 			FAIL("trailing \\");
  547 		ret = regnode(EXACTLY);
  548 		regc(*regparse++);
  549 		regc('\0');
  550 		*flagp |= HASWIDTH|SIMPLE;
  551 		break;
  552 	default: {
  553 			register int len;
  554 			register char ender;
  555 
  556 			regparse--;
  557 			len = strcspn(regparse, META);
  558 			if (len <= 0)
  559 				FAIL("internal disaster");
  560 			ender = *(regparse+len);
  561 			if (len > 1 && ISMULT(ender))
  562 				len--;		/* Back off clear of ?+* operand. */
  563 			*flagp |= HASWIDTH;
  564 			if (len == 1)
  565 				*flagp |= SIMPLE;
  566 			ret = regnode(EXACTLY);
  567 			while (len > 0) {
  568 				regc(*regparse++);
  569 				len--;
  570 			}
  571 			regc('\0');
  572 		}
  573 		break;
  574 	}
  575 
  576 	return(ret);
  577 }
  578 
  579 /*
  580  - regnode - emit a node
  581  */
  582 static char *			/* Location. */
  583 regnode(op)
  584 char op;
  585 {
  586 	register char *ret;
  587 	register char *ptr;
  588 
  589 	ret = regcode;
  590 	if (ret == &regdummy) {
  591 		regsize += 3;
  592 		return(ret);
  593 	}
  594 
  595 	ptr = ret;
  596 	*ptr++ = op;
  597 	*ptr++ = '\0';		/* Null "next" pointer. */
  598 	*ptr++ = '\0';
  599 	regcode = ptr;
  600 
  601 	return(ret);
  602 }
  603 
  604 /*
  605  - regc - emit (if appropriate) a byte of code
  606  */
  607 static void
  608 regc(b)
  609 char b;
  610 {
  611 	if (regcode != &regdummy)
  612 		*regcode++ = b;
  613 	else
  614 		regsize++;
  615 }
  616 
  617 /*
  618  - reginsert - insert an operator in front of already-emitted operand
  619  *
  620  * Means relocating the operand.
  621  */
  622 static void
  623 reginsert(op, opnd)
  624 char op;
  625 char *opnd;
  626 {
  627 	register char *src;
  628 	register char *dst;
  629 	register char *place;
  630 
  631 	if (regcode == &regdummy) {
  632 		regsize += 3;
  633 		return;
  634 	}
  635 
  636 	src = regcode;
  637 	regcode += 3;
  638 	dst = regcode;
  639 	while (src > opnd)
  640 		*--dst = *--src;
  641 
  642 	place = opnd;		/* Op node, where operand used to be. */
  643 	*place++ = op;
  644 	*place++ = '\0';
  645 	*place++ = '\0';
  646 }
  647 
  648 /*
  649  - regtail - set the next-pointer at the end of a node chain
  650  */
  651 static void
  652 regtail(p, val)
  653 char *p;
  654 char *val;
  655 {
  656 	register char *scan;
  657 	register char *temp;
  658 	register int offset;
  659 
  660 	if (p == &regdummy)
  661 		return;
  662 
  663 	/* Find last node. */
  664 	scan = p;
  665 	for (;;) {
  666 		temp = regnext(scan);
  667 		if (temp == NULL)
  668 			break;
  669 		scan = temp;
  670 	}
  671 
  672 	if (OP(scan) == BACK)
  673 		offset = scan - val;
  674 	else
  675 		offset = val - scan;
  676 	*(scan+1) = (offset>>8)&0377;
  677 	*(scan+2) = offset&0377;
  678 }
  679 
  680 /*
  681  - regoptail - regtail on operand of first argument; nop if operandless
  682  */
  683 static void
  684 regoptail(p, val)
  685 char *p;
  686 char *val;
  687 {
  688 	/* "Operandless" and "op != BRANCH" are synonymous in practice. */
  689 	if (p == NULL || p == &regdummy || OP(p) != BRANCH)
  690 		return;
  691 	regtail(OPERAND(p), val);
  692 }
  693 
  694 /*
  695  * regexec and friends
  696  */
  697 
  698 /*
  699  * Global work variables for regexec().
  700  */
  701 static char *reginput;		/* String-input pointer. */
  702 static char *regbol;		/* Beginning of input, for ^ check. */
  703 static char **regstartp;	/* Pointer to startp array. */
  704 static char **regendp;		/* Ditto for endp. */
  705 
  706 /*
  707  * Forwards.
  708  */
  709 STATIC int regtry();
  710 STATIC int regmatch();
  711 STATIC int regrepeat();
  712 
  713 #ifdef DEBUG
  714 int regnarrate = 0;
  715 void regdump();
  716 STATIC char *regprop();
  717 #endif
  718 
  719 /*
  720  - regexec - match a regexp against a string
  721  */
  722 int
  723 regexec2(prog, string, notbol)
  724 register regexp *prog;
  725 register char *string;
  726 int notbol;
  727 {
  728 	register char *s;
  729 
  730 	/* Be paranoid... */
  731 	if (prog == NULL || string == NULL) {
  732 		regerror("NULL parameter");
  733 		return(0);
  734 	}
  735 
  736 	/* Check validity of program. */
  737 	if (UCHARAT(prog->program) != MAGIC) {
  738 		regerror("corrupted program");
  739 		return(0);
  740 	}
  741 
  742 	/* If there is a "must appear" string, look for it. */
  743 	if (prog->regmust != NULL) {
  744 		s = string;
  745 		while ((s = strchr(s, prog->regmust[0])) != NULL) {
  746 			if (strncmp(s, prog->regmust, prog->regmlen) == 0)
  747 				break;	/* Found it. */
  748 			s++;
  749 		}
  750 		if (s == NULL)	/* Not present. */
  751 			return(0);
  752 	}
  753 
  754 	/* Mark beginning of line for ^ . */
  755 	if (notbol)
  756 		regbol = NULL;
  757 	else
  758 		regbol = string;
  759 
  760 	/* Simplest case:  anchored match need be tried only once. */
  761 	if (prog->reganch)
  762 		return(regtry(prog, string));
  763 
  764 	/* Messy cases:  unanchored match. */
  765 	s = string;
  766 	if (prog->regstart != '\0')
  767 		/* We know what char it must start with. */
  768 		while ((s = strchr(s, prog->regstart)) != NULL) {
  769 			if (regtry(prog, s))
  770 				return(1);
  771 			s++;
  772 		}
  773 	else
  774 		/* We don't -- general case. */
  775 		do {
  776 			if (regtry(prog, s))
  777 				return(1);
  778 		} while (*s++ != '\0');
  779 
  780 	/* Failure. */
  781 	return(0);
  782 }
  783 
  784 int
  785 regexec(prog, string)
  786 register regexp *prog;
  787 register char *string;
  788 {
  789 	return regexec2(prog, string, 0);
  790 }
  791 
  792 /*
  793  - regtry - try match at specific point
  794  */
  795 static int			/* 0 failure, 1 success */
  796 regtry(prog, string)
  797 regexp *prog;
  798 char *string;
  799 {
  800 	register int i;
  801 	register char **sp;
  802 	register char **ep;
  803 
  804 	reginput = string;
  805 	regstartp = prog->startp;
  806 	regendp = prog->endp;
  807 
  808 	sp = prog->startp;
  809 	ep = prog->endp;
  810 	for (i = NSUBEXP; i > 0; i--) {
  811 		*sp++ = NULL;
  812 		*ep++ = NULL;
  813 	}
  814 	if (regmatch(prog->program + 1)) {
  815 		prog->startp[0] = string;
  816 		prog->endp[0] = reginput;
  817 		return(1);
  818 	} else
  819 		return(0);
  820 }
  821 
  822 /*
  823  - regmatch - main matching routine
  824  *
  825  * Conceptually the strategy is simple:  check to see whether the current
  826  * node matches, call self recursively to see whether the rest matches,
  827  * and then act accordingly.  In practice we make some effort to avoid
  828  * recursion, in particular by going through "ordinary" nodes (that don't
  829  * need to know whether the rest of the match failed) by a loop instead of
  830  * by recursion.
  831  */
  832 static int			/* 0 failure, 1 success */
  833 regmatch(prog)
  834 char *prog;
  835 {
  836 	register char *scan;	/* Current node. */
  837 	char *next;		/* Next node. */
  838 
  839 	scan = prog;
  840 #ifdef DEBUG
  841 	if (scan != NULL && regnarrate)
  842 		fprintf(stderr, "%s(\n", regprop(scan));
  843 #endif
  844 	while (scan != NULL) {
  845 #ifdef DEBUG
  846 		if (regnarrate)
  847 			fprintf(stderr, "%s...\n", regprop(scan));
  848 #endif
  849 		next = regnext(scan);
  850 
  851 		switch (OP(scan)) {
  852 		case BOL:
  853 			if (reginput != regbol)
  854 				return(0);
  855 			break;
  856 		case EOL:
  857 			if (*reginput != '\0')
  858 				return(0);
  859 			break;
  860 		case ANY:
  861 			if (*reginput == '\0')
  862 				return(0);
  863 			reginput++;
  864 			break;
  865 		case EXACTLY: {
  866 				register int len;
  867 				register char *opnd;
  868 
  869 				opnd = OPERAND(scan);
  870 				/* Inline the first character, for speed. */
  871 				if (*opnd != *reginput)
  872 					return(0);
  873 				len = strlen(opnd);
  874 				if (len > 1 && strncmp(opnd, reginput, len) != 0)
  875 					return(0);
  876 				reginput += len;
  877 			}
  878 			break;
  879 		case ANYOF:
  880  			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
  881 				return(0);
  882 			reginput++;
  883 			break;
  884 		case ANYBUT:
  885  			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
  886 				return(0);
  887 			reginput++;
  888 			break;
  889 		case NOTHING:
  890 			break;
  891 		case BACK:
  892 			break;
  893 		case OPEN+1:
  894 		case OPEN+2:
  895 		case OPEN+3:
  896 		case OPEN+4:
  897 		case OPEN+5:
  898 		case OPEN+6:
  899 		case OPEN+7:
  900 		case OPEN+8:
  901 		case OPEN+9: {
  902 				register int no;
  903 				register char *save;
  904 
  905 				no = OP(scan) - OPEN;
  906 				save = reginput;
  907 
  908 				if (regmatch(next)) {
  909 					/*
  910 					 * Don't set startp if some later
  911 					 * invocation of the same parentheses
  912 					 * already has.
  913 					 */
  914 					if (regstartp[no] == NULL)
  915 						regstartp[no] = save;
  916 					return(1);
  917 				} else
  918 					return(0);
  919 			}
  920 			/* NOTREACHED */
  921 			break;
  922 		case CLOSE+1:
  923 		case CLOSE+2:
  924 		case CLOSE+3:
  925 		case CLOSE+4:
  926 		case CLOSE+5:
  927 		case CLOSE+6:
  928 		case CLOSE+7:
  929 		case CLOSE+8:
  930 		case CLOSE+9: {
  931 				register int no;
  932 				register char *save;
  933 
  934 				no = OP(scan) - CLOSE;
  935 				save = reginput;
  936 
  937 				if (regmatch(next)) {
  938 					/*
  939 					 * Don't set endp if some later
  940 					 * invocation of the same parentheses
  941 					 * already has.
  942 					 */
  943 					if (regendp[no] == NULL)
  944 						regendp[no] = save;
  945 					return(1);
  946 				} else
  947 					return(0);
  948 			}
  949 			/* NOTREACHED */
  950 			break;
  951 		case BRANCH: {
  952 				register char *save;
  953 
  954 				if (OP(next) != BRANCH)		/* No choice. */
  955 					next = OPERAND(scan);	/* Avoid recursion. */
  956 				else {
  957 					do {
  958 						save = reginput;
  959 						if (regmatch(OPERAND(scan)))
  960 							return(1);
  961 						reginput = save;
  962 						scan = regnext(scan);
  963 					} while (scan != NULL && OP(scan) == BRANCH);
  964 					return(0);
  965 					/* NOTREACHED */
  966 				}
  967 			}
  968 			/* NOTREACHED */
  969 			break;
  970 		case STAR:
  971 		case PLUS: {
  972 				register char nextch;
  973 				register int no;
  974 				register char *save;
  975 				register int min;
  976 
  977 				/*
  978 				 * Lookahead to avoid useless match attempts
  979 				 * when we know what character comes next.
  980 				 */
  981 				nextch = '\0';
  982 				if (OP(next) == EXACTLY)
  983 					nextch = *OPERAND(next);
  984 				min = (OP(scan) == STAR) ? 0 : 1;
  985 				save = reginput;
  986 				no = regrepeat(OPERAND(scan));
  987 				while (no >= min) {
  988 					/* If it could work, try it. */
  989 					if (nextch == '\0' || *reginput == nextch)
  990 						if (regmatch(next))
  991 							return(1);
  992 					/* Couldn't or didn't -- back up. */
  993 					no--;
  994 					reginput = save + no;
  995 				}
  996 				return(0);
  997 			}
  998 			/* NOTREACHED */
  999 			break;
 1000 		case END:
 1001