"SfR Fresh" - the SfR Freeware/Shareware Archive 
Member "less-424/regexp.c" of archive less-424.tar.gz:
As a special service "SfR Fresh" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting with prefixed line numbers.
Alternatively you can here view or download the uninterpreted source code file.
That can be also achieved for any archive member file by clicking within an archive contents listing on the first character of the file(path) respectively on the according byte size field.
1 /*
2 * regcomp and regexec -- regsub and regerror are elsewhere
3 *
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
6 *
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
10 *
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
13 * from defects in it.
14 *
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
17 *
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *
21 * Beware that some of this code is subtly aware of the way operator
22 * precedence is structured in regular expressions. Serious changes in
23 * regular-expression syntax might require a total rethink.
24 *
25 * *** NOTE: this code has been altered slightly for use in Tcl. ***
26 * Slightly modified by David MacKenzie to undo most of the changes for TCL.
27 * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman
28 */
29
30 #include "less.h"
31 #if HAVE_STDIO_H
32 #include <stdio.h>
33 #endif
34 #if HAVE_STDLIB_H
35 #include <stdlib.h>
36 #endif
37 #if HAVE_STRING_H
38 #include <string.h>
39 #endif
40 #include "regexp.h"
41
42 /*
43 * The "internal use only" fields in regexp.h are present to pass info from
44 * compile to execute that permits the execute phase to run lots faster on
45 * simple cases. They are:
46 *
47 * regstart char that must begin a match; '\0' if none obvious
48 * reganch is the match anchored (at beginning-of-line only)?
49 * regmust string (pointer into program) that match must include, or NULL
50 * regmlen length of regmust string
51 *
52 * Regstart and reganch permit very fast decisions on suitable starting points
53 * for a match, cutting down the work a lot. Regmust permits fast rejection
54 * of lines that cannot possibly match. The regmust tests are costly enough
55 * that regcomp() supplies a regmust only if the r.e. contains something
56 * potentially expensive (at present, the only such thing detected is * or +
57 * at the start of the r.e., which can involve a lot of backup). Regmlen is
58 * supplied because the test in regexec() needs it and regcomp() is
59 * computing it anyway.
60 */
61
62 /*
63 * Structure for regexp "program". This is essentially a linear encoding
64 * of a nondeterministic finite-state machine (aka syntax charts or
65 * "railroad normal form" in parsing technology). Each node is an opcode
66 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
67 * all nodes except BRANCH implement concatenation; a "next" pointer with
68 * a BRANCH on both ends of it is connecting two alternatives. (Here we
69 * have one of the subtle syntax dependencies: an individual BRANCH (as
70 * opposed to a collection of them) is never concatenated with anything
71 * because of operator precedence.) The operand of some types of node is
72 * a literal string; for others, it is a node leading into a sub-FSM. In
73 * particular, the operand of a BRANCH node is the first node of the branch.
74 * (NB this is *not* a tree structure: the tail of the branch connects
75 * to the thing following the set of BRANCHes.) The opcodes are:
76 */
77
78 /* definition number opnd? meaning */
79 #undef EOL
80 #define END 0 /* no End of program. */
81 #define BOL 1 /* no Match "" at beginning of line. */
82 #define EOL 2 /* no Match "" at end of line. */
83 #define ANY 3 /* no Match any one character. */
84 #define ANYOF 4 /* str Match any character in this string. */
85 #define ANYBUT 5 /* str Match any character not in this string. */
86 #define BRANCH 6 /* node Match this alternative, or the next... */
87 #define BACK 7 /* no Match "", "next" ptr points backward. */
88 #define EXACTLY 8 /* str Match this string. */
89 #define NOTHING 9 /* no Match empty string. */
90 #define STAR 10 /* node Match this (simple) thing 0 or more times. */
91 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
92 #define OPEN 20 /* no Mark this point in input as start of #n. */
93 /* OPEN+1 is number 1, etc. */
94 #define CLOSE 30 /* no Analogous to OPEN. */
95
96 /*
97 * Opcode notes:
98 *
99 * BRANCH The set of branches constituting a single choice are hooked
100 * together with their "next" pointers, since precedence prevents
101 * anything being concatenated to any individual branch. The
102 * "next" pointer of the last BRANCH in a choice points to the
103 * thing following the whole choice. This is also where the
104 * final "next" pointer of each individual branch points; each
105 * branch starts with the operand node of a BRANCH node.
106 *
107 * BACK Normal "next" pointers all implicitly point forward; BACK
108 * exists to make loop structures possible.
109 *
110 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
111 * BRANCH structures using BACK. Simple cases (one character
112 * per match) are implemented with STAR and PLUS for speed
113 * and to minimize recursive plunges.
114 *
115 * OPEN,CLOSE ...are numbered at compile time.
116 */
117
118 /*
119 * A node is one char of opcode followed by two chars of "next" pointer.
120 * "Next" pointers are stored as two 8-bit pieces, high order first. The
121 * value is a positive offset from the opcode of the node containing it.
122 * An operand, if any, simply follows the node. (Note that much of the
123 * code generation knows about this implicit relationship.)
124 *
125 * Using two bytes for the "next" pointer is vast overkill for most things,
126 * but allows patterns to get big without disasters.
127 */
128 #define OP(p) (*(p))
129 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
130 #define OPERAND(p) ((p) + 3)
131
132 /*
133 * See regmagic.h for one further detail of program structure.
134 */
135
136
137 /*
138 * Utility definitions.
139 */
140 #ifndef CHARBITS
141 #define UCHARAT(p) ((int)*(unsigned char *)(p))
142 #else
143 #define UCHARAT(p) ((int)*(p)&CHARBITS)
144 #endif
145
146 #define FAIL(m) { regerror(m); return(NULL); }
147 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
148 #define META "^$.[()|?+*\\"
149
150 /*
151 * Flags to be passed up and down.
152 */
153 #define HASWIDTH 01 /* Known never to match null string. */
154 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
155 #define SPSTART 04 /* Starts with * or +. */
156 #define WORST 0 /* Worst case. */
157
158 /*
159 * Global work variables for regcomp().
160 */
161 static char *regparse; /* Input-scan pointer. */
162 static int regnpar; /* () count. */
163 static char regdummy;
164 static char *regcode; /* Code-emit pointer; ®dummy = don't. */
165 static long regsize; /* Code size. */
166
167 /*
168 * The first byte of the regexp internal "program" is actually this magic
169 * number; the start node begins in the second byte.
170 */
171 #define MAGIC 0234
172
173
174 /*
175 * Forward declarations for regcomp()'s friends.
176 */
177 #ifndef STATIC
178 #define STATIC static
179 #endif
180 STATIC char *reg();
181 STATIC char *regbranch();
182 STATIC char *regpiece();
183 STATIC char *regatom();
184 STATIC char *regnode();
185 STATIC char *regnext();
186 STATIC void regc();
187 STATIC void reginsert();
188 STATIC void regtail();
189 STATIC void regoptail();
190 #ifdef STRCSPN
191 STATIC int strcspn();
192 #endif
193
194 /*
195 - regcomp - compile a regular expression into internal code
196 *
197 * We can't allocate space until we know how big the compiled form will be,
198 * but we can't compile it (and thus know how big it is) until we've got a
199 * place to put the code. So we cheat: we compile it twice, once with code
200 * generation turned off and size counting turned on, and once "for real".
201 * This also means that we don't allocate space until we are sure that the
202 * thing really will compile successfully, and we never have to move the
203 * code and thus invalidate pointers into it. (Note that it has to be in
204 * one piece because free() must be able to free it all.)
205 *
206 * Beware that the optimization-preparation code in here knows about some
207 * of the structure of the compiled regexp.
208 */
209 regexp *
210 regcomp(exp)
211 char *exp;
212 {
213 register regexp *r;
214 register char *scan;
215 register char *longest;
216 register int len;
217 int flags;
218
219 if (exp == NULL)
220 FAIL("NULL argument");
221
222 /* First pass: determine size, legality. */
223 regparse = exp;
224 regnpar = 1;
225 regsize = 0L;
226 regcode = ®dummy;
227 regc(MAGIC);
228 if (reg(0, &flags) == NULL)
229 return(NULL);
230
231 /* Small enough for pointer-storage convention? */
232 if (regsize >= 32767L) /* Probably could be 65535L. */
233 FAIL("regexp too big");
234
235 /* Allocate space. */
236 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
237 if (r == NULL)
238 FAIL("out of space");
239
240 /* Second pass: emit code. */
241 regparse = exp;
242 regnpar = 1;
243 regcode = r->program;
244 regc(MAGIC);
245 if (reg(0, &flags) == NULL)
246 return(NULL);
247
248 /* Dig out information for optimizations. */
249 r->regstart = '\0'; /* Worst-case defaults. */
250 r->reganch = 0;
251 r->regmust = NULL;
252 r->regmlen = 0;
253 scan = r->program+1; /* First BRANCH. */
254 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
255 scan = OPERAND(scan);
256
257 /* Starting-point info. */
258 if (OP(scan) == EXACTLY)
259 r->regstart = *OPERAND(scan);
260 else if (OP(scan) == BOL)
261 r->reganch++;
262
263 /*
264 * If there's something expensive in the r.e., find the
265 * longest literal string that must appear and make it the
266 * regmust. Resolve ties in favor of later strings, since
267 * the regstart check works with the beginning of the r.e.
268 * and avoiding duplication strengthens checking. Not a
269 * strong reason, but sufficient in the absence of others.
270 */
271 if (flags&SPSTART) {
272 longest = NULL;
273 len = 0;
274 for (; scan != NULL; scan = regnext(scan))
275 if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
276 longest = OPERAND(scan);
277 len = strlen(OPERAND(scan));
278 }
279 r->regmust = longest;
280 r->regmlen = len;
281 }
282 }
283
284 return(r);
285 }
286
287 /*
288 - reg - regular expression, i.e. main body or parenthesized thing
289 *
290 * Caller must absorb opening parenthesis.
291 *
292 * Combining parenthesis handling with the base level of regular expression
293 * is a trifle forced, but the need to tie the tails of the branches to what
294 * follows makes it hard to avoid.
295 */
296 static char *
297 reg(paren, flagp)
298 int paren; /* Parenthesized? */
299 int *flagp;
300 {
301 register char *ret;
302 register char *br;
303 register char *ender;
304 register int parno = 0;
305 int flags;
306
307 *flagp = HASWIDTH; /* Tentatively. */
308
309 /* Make an OPEN node, if parenthesized. */
310 if (paren) {
311 if (regnpar >= NSUBEXP)
312 FAIL("too many ()");
313 parno = regnpar;
314 regnpar++;
315 ret = regnode(OPEN+parno);
316 } else
317 ret = NULL;
318
319 /* Pick up the branches, linking them together. */
320 br = regbranch(&flags);
321 if (br == NULL)
322 return(NULL);
323 if (ret != NULL)
324 regtail(ret, br); /* OPEN -> first. */
325 else
326 ret = br;
327 if (!(flags&HASWIDTH))
328 *flagp &= ~HASWIDTH;
329 *flagp |= flags&SPSTART;
330 while (*regparse == '|') {
331 regparse++;
332 br = regbranch(&flags);
333 if (br == NULL)
334 return(NULL);
335 regtail(ret, br); /* BRANCH -> BRANCH. */
336 if (!(flags&HASWIDTH))
337 *flagp &= ~HASWIDTH;
338 *flagp |= flags&SPSTART;
339 }
340
341 /* Make a closing node, and hook it on the end. */
342 ender = regnode((paren) ? CLOSE+parno : END);
343 regtail(ret, ender);
344
345 /* Hook the tails of the branches to the closing node. */
346 for (br = ret; br != NULL; br = regnext(br))
347 regoptail(br, ender);
348
349 /* Check for proper termination. */
350 if (paren && *regparse++ != ')') {
351 FAIL("unmatched ()");
352 } else if (!paren && *regparse != '\0') {
353 if (*regparse == ')') {
354 FAIL("unmatched ()");
355 } else
356 FAIL("junk on end"); /* "Can't happen". */
357 /* NOTREACHED */
358 }
359
360 return(ret);
361 }
362
363 /*
364 - regbranch - one alternative of an | operator
365 *
366 * Implements the concatenation operator.
367 */
368 static char *
369 regbranch(flagp)
370 int *flagp;
371 {
372 register char *ret;
373 register char *chain;
374 register char *latest;
375 int flags;
376
377 *flagp = WORST; /* Tentatively. */
378
379 ret = regnode(BRANCH);
380 chain = NULL;
381 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
382 latest = regpiece(&flags);
383 if (latest == NULL)
384 return(NULL);
385 *flagp |= flags&HASWIDTH;
386 if (chain == NULL) /* First piece. */
387 *flagp |= flags&SPSTART;
388 else
389 regtail(chain, latest);
390 chain = latest;
391 }
392 if (chain == NULL) /* Loop ran zero times. */
393 (void) regnode(NOTHING);
394
395 return(ret);
396 }
397
398 /*
399 - regpiece - something followed by possible [*+?]
400 *
401 * Note that the branching code sequences used for ? and the general cases
402 * of * and + are somewhat optimized: they use the same NOTHING node as
403 * both the endmarker for their branch list and the body of the last branch.
404 * It might seem that this node could be dispensed with entirely, but the
405 * endmarker role is not redundant.
406 */
407 static char *
408 regpiece(flagp)
409 int *flagp;
410 {
411 register char *ret;
412 register char op;
413 register char *next;
414 int flags;
415
416 ret = regatom(&flags);
417 if (ret == NULL)
418 return(NULL);
419
420 op = *regparse;
421 if (!ISMULT(op)) {
422 *flagp = flags;
423 return(ret);
424 }
425
426 if (!(flags&HASWIDTH) && op != '?')
427 FAIL("*+ operand could be empty");
428 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
429
430 if (op == '*' && (flags&SIMPLE))
431 reginsert(STAR, ret);
432 else if (op == '*') {
433 /* Emit x* as (x&|), where & means "self". */
434 reginsert(BRANCH, ret); /* Either x */
435 regoptail(ret, regnode(BACK)); /* and loop */
436 regoptail(ret, ret); /* back */
437 regtail(ret, regnode(BRANCH)); /* or */
438 regtail(ret, regnode(NOTHING)); /* null. */
439 } else if (op == '+' && (flags&SIMPLE))
440 reginsert(PLUS, ret);
441 else if (op == '+') {
442 /* Emit x+ as x(&|), where & means "self". */
443 next = regnode(BRANCH); /* Either */
444 regtail(ret, next);
445 regtail(regnode(BACK), ret); /* loop back */
446 regtail(next, regnode(BRANCH)); /* or */
447 regtail(ret, regnode(NOTHING)); /* null. */
448 } else if (op == '?') {
449 /* Emit x? as (x|) */
450 reginsert(BRANCH, ret); /* Either x */
451 regtail(ret, regnode(BRANCH)); /* or */
452 next = regnode(NOTHING); /* null. */
453 regtail(ret, next);
454 regoptail(ret, next);
455 }
456 regparse++;
457 if (ISMULT(*regparse))
458 FAIL("nested *?+");
459
460 return(ret);
461 }
462
463 /*
464 - regatom - the lowest level
465 *
466 * Optimization: gobbles an entire sequence of ordinary characters so that
467 * it can turn them into a single node, which is smaller to store and
468 * faster to run. Backslashed characters are exceptions, each becoming a
469 * separate node; the code is simpler that way and it's not worth fixing.
470 */
471 static char *
472 regatom(flagp)
473 int *flagp;
474 {
475 register char *ret;
476 int flags;
477
478 *flagp = WORST; /* Tentatively. */
479
480 switch (*regparse++) {
481 case '^':
482 ret = regnode(BOL);
483 break;
484 case '$':
485 ret = regnode(EOL);
486 break;
487 case '.':
488 ret = regnode(ANY);
489 *flagp |= HASWIDTH|SIMPLE;
490 break;
491 case '[': {
492 register int clss;
493 register int classend;
494
495 if (*regparse == '^') { /* Complement of range. */
496 ret = regnode(ANYBUT);
497 regparse++;
498 } else
499 ret = regnode(ANYOF);
500 if (*regparse == ']' || *regparse == '-')
501 regc(*regparse++);
502 while (*regparse != '\0' && *regparse != ']') {
503 if (*regparse == '-') {
504 regparse++;
505 if (*regparse == ']' || *regparse == '\0')
506 regc('-');
507 else {
508 clss = UCHARAT(regparse-2)+1;
509 classend = UCHARAT(regparse);
510 if (clss > classend+1)
511 FAIL("invalid [] range");
512 for (; clss <= classend; clss++)
513 regc(clss);
514 regparse++;
515 }
516 } else
517 regc(*regparse++);
518 }
519 regc('\0');
520 if (*regparse != ']')
521 FAIL("unmatched []");
522 regparse++;
523 *flagp |= HASWIDTH|SIMPLE;
524 }
525 break;
526 case '(':
527 ret = reg(1, &flags);
528 if (ret == NULL)
529 return(NULL);
530 *flagp |= flags&(HASWIDTH|SPSTART);
531 break;
532 case '\0':
533 case '|':
534 case ')':
535 FAIL("internal urp"); /* Supposed to be caught earlier. */
536 /* NOTREACHED */
537 break;
538 case '?':
539 case '+':
540 case '*':
541 FAIL("?+* follows nothing");
542 /* NOTREACHED */
543 break;
544 case '\\':
545 if (*regparse == '\0')
546 FAIL("trailing \\");
547 ret = regnode(EXACTLY);
548 regc(*regparse++);
549 regc('\0');
550 *flagp |= HASWIDTH|SIMPLE;
551 break;
552 default: {
553 register int len;
554 register char ender;
555
556 regparse--;
557 len = strcspn(regparse, META);
558 if (len <= 0)
559 FAIL("internal disaster");
560 ender = *(regparse+len);
561 if (len > 1 && ISMULT(ender))
562 len--; /* Back off clear of ?+* operand. */
563 *flagp |= HASWIDTH;
564 if (len == 1)
565 *flagp |= SIMPLE;
566 ret = regnode(EXACTLY);
567 while (len > 0) {
568 regc(*regparse++);
569 len--;
570 }
571 regc('\0');
572 }
573 break;
574 }
575
576 return(ret);
577 }
578
579 /*
580 - regnode - emit a node
581 */
582 static char * /* Location. */
583 regnode(op)
584 char op;
585 {
586 register char *ret;
587 register char *ptr;
588
589 ret = regcode;
590 if (ret == ®dummy) {
591 regsize += 3;
592 return(ret);
593 }
594
595 ptr = ret;
596 *ptr++ = op;
597 *ptr++ = '\0'; /* Null "next" pointer. */
598 *ptr++ = '\0';
599 regcode = ptr;
600
601 return(ret);
602 }
603
604 /*
605 - regc - emit (if appropriate) a byte of code
606 */
607 static void
608 regc(b)
609 char b;
610 {
611 if (regcode != ®dummy)
612 *regcode++ = b;
613 else
614 regsize++;
615 }
616
617 /*
618 - reginsert - insert an operator in front of already-emitted operand
619 *
620 * Means relocating the operand.
621 */
622 static void
623 reginsert(op, opnd)
624 char op;
625 char *opnd;
626 {
627 register char *src;
628 register char *dst;
629 register char *place;
630
631 if (regcode == ®dummy) {
632 regsize += 3;
633 return;
634 }
635
636 src = regcode;
637 regcode += 3;
638 dst = regcode;
639 while (src > opnd)
640 *--dst = *--src;
641
642 place = opnd; /* Op node, where operand used to be. */
643 *place++ = op;
644 *place++ = '\0';
645 *place++ = '\0';
646 }
647
648 /*
649 - regtail - set the next-pointer at the end of a node chain
650 */
651 static void
652 regtail(p, val)
653 char *p;
654 char *val;
655 {
656 register char *scan;
657 register char *temp;
658 register int offset;
659
660 if (p == ®dummy)
661 return;
662
663 /* Find last node. */
664 scan = p;
665 for (;;) {
666 temp = regnext(scan);
667 if (temp == NULL)
668 break;
669 scan = temp;
670 }
671
672 if (OP(scan) == BACK)
673 offset = scan - val;
674 else
675 offset = val - scan;
676 *(scan+1) = (offset>>8)&0377;
677 *(scan+2) = offset&0377;
678 }
679
680 /*
681 - regoptail - regtail on operand of first argument; nop if operandless
682 */
683 static void
684 regoptail(p, val)
685 char *p;
686 char *val;
687 {
688 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
689 if (p == NULL || p == ®dummy || OP(p) != BRANCH)
690 return;
691 regtail(OPERAND(p), val);
692 }
693
694 /*
695 * regexec and friends
696 */
697
698 /*
699 * Global work variables for regexec().
700 */
701 static char *reginput; /* String-input pointer. */
702 static char *regbol; /* Beginning of input, for ^ check. */
703 static char **regstartp; /* Pointer to startp array. */
704 static char **regendp; /* Ditto for endp. */
705
706 /*
707 * Forwards.
708 */
709 STATIC int regtry();
710 STATIC int regmatch();
711 STATIC int regrepeat();
712
713 #ifdef DEBUG
714 int regnarrate = 0;
715 void regdump();
716 STATIC char *regprop();
717 #endif
718
719 /*
720 - regexec - match a regexp against a string
721 */
722 int
723 regexec2(prog, string, notbol)
724 register regexp *prog;
725 register char *string;
726 int notbol;
727 {
728 register char *s;
729
730 /* Be paranoid... */
731 if (prog == NULL || string == NULL) {
732 regerror("NULL parameter");
733 return(0);
734 }
735
736 /* Check validity of program. */
737 if (UCHARAT(prog->program) != MAGIC) {
738 regerror("corrupted program");
739 return(0);
740 }
741
742 /* If there is a "must appear" string, look for it. */
743 if (prog->regmust != NULL) {
744 s = string;
745 while ((s = strchr(s, prog->regmust[0])) != NULL) {
746 if (strncmp(s, prog->regmust, prog->regmlen) == 0)
747 break; /* Found it. */
748 s++;
749 }
750 if (s == NULL) /* Not present. */
751 return(0);
752 }
753
754 /* Mark beginning of line for ^ . */
755 if (notbol)
756 regbol = NULL;
757 else
758 regbol = string;
759
760 /* Simplest case: anchored match need be tried only once. */
761 if (prog->reganch)
762 return(regtry(prog, string));
763
764 /* Messy cases: unanchored match. */
765 s = string;
766 if (prog->regstart != '\0')
767 /* We know what char it must start with. */
768 while ((s = strchr(s, prog->regstart)) != NULL) {
769 if (regtry(prog, s))
770 return(1);
771 s++;
772 }
773 else
774 /* We don't -- general case. */
775 do {
776 if (regtry(prog, s))
777 return(1);
778 } while (*s++ != '\0');
779
780 /* Failure. */
781 return(0);
782 }
783
784 int
785 regexec(prog, string)
786 register regexp *prog;
787 register char *string;
788 {
789 return regexec2(prog, string, 0);
790 }
791
792 /*
793 - regtry - try match at specific point
794 */
795 static int /* 0 failure, 1 success */
796 regtry(prog, string)
797 regexp *prog;
798 char *string;
799 {
800 register int i;
801 register char **sp;
802 register char **ep;
803
804 reginput = string;
805 regstartp = prog->startp;
806 regendp = prog->endp;
807
808 sp = prog->startp;
809 ep = prog->endp;
810 for (i = NSUBEXP; i > 0; i--) {
811 *sp++ = NULL;
812 *ep++ = NULL;
813 }
814 if (regmatch(prog->program + 1)) {
815 prog->startp[0] = string;
816 prog->endp[0] = reginput;
817 return(1);
818 } else
819 return(0);
820 }
821
822 /*
823 - regmatch - main matching routine
824 *
825 * Conceptually the strategy is simple: check to see whether the current
826 * node matches, call self recursively to see whether the rest matches,
827 * and then act accordingly. In practice we make some effort to avoid
828 * recursion, in particular by going through "ordinary" nodes (that don't
829 * need to know whether the rest of the match failed) by a loop instead of
830 * by recursion.
831 */
832 static int /* 0 failure, 1 success */
833 regmatch(prog)
834 char *prog;
835 {
836 register char *scan; /* Current node. */
837 char *next; /* Next node. */
838
839 scan = prog;
840 #ifdef DEBUG
841 if (scan != NULL && regnarrate)
842 fprintf(stderr, "%s(\n", regprop(scan));
843 #endif
844 while (scan != NULL) {
845 #ifdef DEBUG
846 if (regnarrate)
847 fprintf(stderr, "%s...\n", regprop(scan));
848 #endif
849 next = regnext(scan);
850
851 switch (OP(scan)) {
852 case BOL:
853 if (reginput != regbol)
854 return(0);
855 break;
856 case EOL:
857 if (*reginput != '\0')
858 return(0);
859 break;
860 case ANY:
861 if (*reginput == '\0')
862 return(0);
863 reginput++;
864 break;
865 case EXACTLY: {
866 register int len;
867 register char *opnd;
868
869 opnd = OPERAND(scan);
870 /* Inline the first character, for speed. */
871 if (*opnd != *reginput)
872 return(0);
873 len = strlen(opnd);
874 if (len > 1 && strncmp(opnd, reginput, len) != 0)
875 return(0);
876 reginput += len;
877 }
878 break;
879 case ANYOF:
880 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
881 return(0);
882 reginput++;
883 break;
884 case ANYBUT:
885 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
886 return(0);
887 reginput++;
888 break;
889 case NOTHING:
890 break;
891 case BACK:
892 break;
893 case OPEN+1:
894 case OPEN+2:
895 case OPEN+3:
896 case OPEN+4:
897 case OPEN+5:
898 case OPEN+6:
899 case OPEN+7:
900 case OPEN+8:
901 case OPEN+9: {
902 register int no;
903 register char *save;
904
905 no = OP(scan) - OPEN;
906 save = reginput;
907
908 if (regmatch(next)) {
909 /*
910 * Don't set startp if some later
911 * invocation of the same parentheses
912 * already has.
913 */
914 if (regstartp[no] == NULL)
915 regstartp[no] = save;
916 return(1);
917 } else
918 return(0);
919 }
920 /* NOTREACHED */
921 break;
922 case CLOSE+1:
923 case CLOSE+2:
924 case CLOSE+3:
925 case CLOSE+4:
926 case CLOSE+5:
927 case CLOSE+6:
928 case CLOSE+7:
929 case CLOSE+8:
930 case CLOSE+9: {
931 register int no;
932 register char *save;
933
934 no = OP(scan) - CLOSE;
935 save = reginput;
936
937 if (regmatch(next)) {
938 /*
939 * Don't set endp if some later
940 * invocation of the same parentheses
941 * already has.
942 */
943 if (regendp[no] == NULL)
944 regendp[no] = save;
945 return(1);
946 } else
947 return(0);
948 }
949 /* NOTREACHED */
950 break;
951 case BRANCH: {
952 register char *save;
953
954 if (OP(next) != BRANCH) /* No choice. */
955 next = OPERAND(scan); /* Avoid recursion. */
956 else {
957 do {
958 save = reginput;
959 if (regmatch(OPERAND(scan)))
960 return(1);
961 reginput = save;
962 scan = regnext(scan);
963 } while (scan != NULL && OP(scan) == BRANCH);
964 return(0);
965 /* NOTREACHED */
966 }
967 }
968 /* NOTREACHED */
969 break;
970 case STAR:
971 case PLUS: {
972 register char nextch;
973 register int no;
974 register char *save;
975 register int min;
976
977 /*
978 * Lookahead to avoid useless match attempts
979 * when we know what character comes next.
980 */
981 nextch = '\0';
982 if (OP(next) == EXACTLY)
983 nextch = *OPERAND(next);
984 min = (OP(scan) == STAR) ? 0 : 1;
985 save = reginput;
986 no = regrepeat(OPERAND(scan));
987 while (no >= min) {
988 /* If it could work, try it. */
989 if (nextch == '\0' || *reginput == nextch)
990 if (regmatch(next))
991 return(1);
992 /* Couldn't or didn't -- back up. */
993 no--;
994 reginput = save + no;
995 }
996 return(0);
997 }
998 /* NOTREACHED */
999 break;
1000 case END:
1001