comparison src/ploki/re.c @ 4223:ac0403686959

<oerjan> rm -rf src/ploki; mv ploki src
author HackBot
date Fri, 20 Dec 2013 22:18:50 +0000
parents
children
comparison
equal deleted inserted replaced
4222:b0f3e267bb1e 4223:ac0403686959
1 #include "config.h"
2 #include "IO.h"
3 #include "Str.h"
4 #include "hash.h"
5 #include "main_io.h"
6 #include "main_opt.h"
7 #include "re.h"
8 #include "xmalloc.h"
9 #include "zz.h"
10
11 #include <ctype.h>
12 #include <limits.h>
13 #include <stddef.h>
14 #include <stdio.h>
15 #include <string.h>
16 #include <stdlib.h>
17 #include <assert.h>
18
19 enum {CLASS_SIZE = UCHAR_MAX / CHAR_BIT + 1u};
20
21 static unsigned char dc_word[CLASS_SIZE];
22 static unsigned char dc_alpha[CLASS_SIZE];
23 static unsigned char dc_cntrl[CLASS_SIZE];
24 static unsigned char dc_digit[CLASS_SIZE];
25 static unsigned char dc_lower[CLASS_SIZE];
26 static unsigned char dc_print[CLASS_SIZE];
27 static unsigned char dc_space[CLASS_SIZE];
28 static unsigned char dc_upper[CLASS_SIZE];
29 static unsigned char dc_xdigit[CLASS_SIZE];
30
31 static void fill(unsigned char *v, int (*pred)(int)) {
32 unsigned char c;
33
34 for (c = UCHAR_MAX; c; --c) {
35 v[c / CHAR_BIT] |= !!pred(c) << c % CHAR_BIT;
36 }
37 v[0] |= !!pred(0);
38 }
39
40 static void nfill(unsigned char *v, int (*pred)(int)) {
41 unsigned char c;
42
43 for (c = UCHAR_MAX; c; --c) {
44 v[c / CHAR_BIT] |= !pred(c) << c % CHAR_BIT;
45 }
46 v[0] |= !pred(0);
47 }
48
49 ATTR_CONST
50 static int siword(int c) {
51 return c == '_' || isalnum(c);
52 }
53
54 enum {MAGIC = 23};
55
56 static Hash rcache;
57 static String cached[MAGIC];
58 static size_t cachfill, cachlast;
59
60 static size_t hash(const void *s, size_t seed) {
61 return St_hash(s, seed);
62 }
63
64 static int compar(const void *a, const void *b) {
65 return St_cmp(a, b);
66 }
67
68 enum t_re_node {
69 RE_ALTER,
70 RE_ANYCH,
71 RE_AT_BOL,
72 RE_AT_BOS,
73 RE_AT_EOL,
74 RE_AT_EOS,
75 RE_AT_MATCH,
76 RE_AT_NOMATCH,
77 RE_AT_NWBOUND,
78 RE_AT_WBOUND,
79 RE_BACKCHECK,
80 RE_BACKREF,
81 RE_BEGIN_CAPTURE,
82 RE_CLASS,
83 RE_DEFCLASS,
84 RE_END_CAPTURE,
85 RE_INDEP,
86 RE_LITERAL,
87 RE_N_CLASS,
88 RE_N_DEFCLASS,
89 RE_PARTIAL,
90 RE_REP_FRUGAL,
91 RE_REP_GREEDY,
92 RE_SELECT,
93 RE_STUFF_FRUGAL,
94 RE_STUFF_GREEDY,
95 RE_XREP_FRUGAL,
96 RE_XREP_GREEDY
97 };
98
99 #define CMN_HDR enum t_re_node type; union my_node *next
100
101 typedef union my_node {
102 struct {
103 CMN_HDR;
104 } x;
105 struct {
106 CMN_HDR;
107 union my_node *arg;
108 } alter;
109 struct {
110 CMN_HDR;
111 size_t n;
112 } backref;
113 struct {
114 CMN_HDR;
115 size_t n;
116 } capture;
117 struct {
118 CMN_HDR;
119 unsigned char vec[CLASS_SIZE];
120 } class;
121 struct {
122 CMN_HDR;
123 unsigned char *vec;
124 } defclass;
125 struct {
126 CMN_HDR;
127 union my_node *arg;
128 } indep;
129 struct {
130 CMN_HDR;
131 unsigned char *buf;
132 size_t len;
133 } literal;
134 struct {
135 CMN_HDR;
136 union my_node *arg;
137 size_t min, max;
138 size_t n;
139 } rep;
140 struct {
141 CMN_HDR;
142 union my_node *arg;
143 union my_node *cond;
144 } select;
145 } re_node;
146
147 #include "re_block.c.h"
148
149 enum re_flags {
150 FL_NONE = 0,
151 FL_ANCHOR_START = 1
152 };
153
154 struct cap_state {
155 size_t start, end, pending;
156 };
157
158 struct my_regex {
159 re_node *start;
160 size_t repets;
161 size_t *repbuf;
162 t_block alloc;
163 size_t captures;
164 struct cap_state *cap_ptr;
165 size_t refs;
166 enum re_flags flags;
167 size_t minlen;
168 };
169
170 static void delv(void *v) {
171 re_free(v);
172 }
173
174 void re_init(void) {
175 fill(dc_word, siword);
176 fill(dc_alpha, isalpha);
177 fill(dc_cntrl, iscntrl);
178 fill(dc_digit, isdigit);
179 fill(dc_lower, islower);
180 fill(dc_print, isprint);
181 fill(dc_space, isspace);
182 fill(dc_upper, isupper);
183 fill(dc_xdigit, isxdigit);
184
185 h_init(&rcache, hash, compar, NULL, delv);
186 }
187
188 void re_end(void) {
189 h_end(&rcache);
190 while (cachfill) {
191 --cachfill;
192 St_clear(&cached[cachfill]);
193 }
194 cachlast = 0;
195 }
196
197
198 enum {ESCAPE = '!'};
199 #define MEATCHARP(c) strchr("!|({<[]>})*+?:.'^$`", c)
200 #define ENDBRANCHP(c) strchr("|({<[", c)
201
202 struct parse_context {
203 t_regex *base;
204 t_block *block;
205 const String *s;
206 size_t *p;
207 };
208
209 static void skipspace(const String *const s, size_t *const p) {
210 restart:
211 while (isspace(ST_INDEX(s, *p))) {
212 --*p;
213 }
214 if (ST_INDEX(s, *p) == ']' && ST_INDEX(s, *p - 1u) == '#') {
215 *p -= 2u;
216 while (ST_INDEX(s, *p) != EOF) {
217 if (ST_INDEX(s, *p) == '[') {
218 --*p;
219 goto restart;
220 }
221 if (ST_INDEX(s, *p) == ESCAPE && ST_INDEX(s, *p - 1u) != EOF) {
222 --*p;
223 }
224 --*p;
225 }
226 }
227 }
228
229 #define PRAZ(name) static re_node *name(const struct parse_context *con, re_node *next, re_node *curolan_zebeth)
230 #define SNORK const String *const s = con->s; size_t *const p = con->p
231 #define CALL(name, nextn, glob) name(con, nextn, glob)
232 #define TAILC(name) return CALL(name, next, curolan_zebeth)
233
234 PRAZ(quantifire);
235 PRAZ(branch);
236 PRAZ(alternation);
237
238 PRAZ(atom) {
239 SNORK;
240 (void)curolan_zebeth;
241
242 skipspace(s, p);
243 switch (ST_INDEX(s, *p)) {
244 case '(':
245 case '<':
246 case '{':
247 case '[':
248 case EOF: {
249 re_node *cur = bl_node(con->block);
250 cur->literal.type = RE_LITERAL;
251 cur->literal.next = next;
252 cur->literal.buf = NULL;
253 cur->literal.len = 0;
254 return cur;
255 }
256
257 case ESCAPE: {
258 re_node *cur = bl_node(con->block);
259 --*p;
260 if (isdigit(ST_INDEX(s, *p))) {
261 do {
262 --*p;
263 } while (isdigit(ST_INDEX(s, *p)));
264 cur->backref.type = RE_BACKREF;
265 cur->backref.next = next;
266 cur->backref.n = strtoul(St_ptr(s) + *p + 1, NULL, 10);
267 } else if (isalpha(ST_INDEX(s, *p))) {
268 switch (ST_INDEX(s, *p)) {
269 case 'A':
270 cur->x.type = RE_AT_BOL;
271 cur->x.next = next;
272 --*p;
273 break;
274 case 'Z':
275 cur->x.type = RE_AT_EOL;
276 cur->x.next = next;
277 --*p;
278 break;
279
280 case 'b':
281 cur->x.type = RE_AT_WBOUND;
282 cur->x.next = next;
283 --*p;
284 break;
285 case 'B':
286 cur->x.type = RE_AT_NWBOUND;
287 cur->x.next = next;
288 --*p;
289 break;
290
291 case 'c':
292 cur->defclass.type = RE_DEFCLASS;
293 cur->defclass.vec = dc_cntrl;
294 cur->defclass.next = next;
295 --*p;
296 break;
297 case 'C':
298 cur->defclass.type = RE_N_DEFCLASS;
299 cur->defclass.vec = dc_cntrl;
300 cur->defclass.next = next;
301 --*p;
302 break;
303
304 case 'd':
305 cur->defclass.type = RE_DEFCLASS;
306 cur->defclass.vec = dc_digit;
307 cur->defclass.next = next;
308 --*p;
309 break;
310 case 'D':
311 cur->defclass.type = RE_N_DEFCLASS;
312 cur->defclass.vec = dc_digit;
313 cur->defclass.next = next;
314 --*p;
315 break;
316
317 case 'l':
318 cur->defclass.type = RE_DEFCLASS;
319 cur->defclass.vec = dc_lower;
320 cur->defclass.next = next;
321 --*p;
322 break;
323 case 'L':
324 cur->defclass.type = RE_N_DEFCLASS;
325 cur->defclass.vec = dc_lower;
326 cur->defclass.next = next;
327 --*p;
328 break;
329
330 case 'p':
331 cur->defclass.type = RE_DEFCLASS;
332 cur->defclass.vec = dc_print;
333 cur->defclass.next = next;
334 --*p;
335 break;
336 case 'P':
337 cur->defclass.type = RE_N_DEFCLASS;
338 cur->defclass.vec = dc_print;
339 cur->defclass.next = next;
340 --*p;
341 break;
342
343 case 'q':
344 cur->defclass.type = RE_DEFCLASS;
345 cur->defclass.vec = dc_alpha;
346 cur->defclass.next = next;
347 --*p;
348 break;
349 case 'Q':
350 cur->defclass.type = RE_N_DEFCLASS;
351 cur->defclass.vec = dc_alpha;
352 cur->defclass.next = next;
353 --*p;
354 break;
355
356 case 's':
357 cur->defclass.type = RE_DEFCLASS;
358 cur->defclass.vec = dc_space;
359 cur->defclass.next = next;
360 --*p;
361 break;
362 case 'S':
363 cur->defclass.type = RE_N_DEFCLASS;
364 cur->defclass.vec = dc_space;
365 cur->defclass.next = next;
366 --*p;
367 break;
368
369 case 'u':
370 cur->defclass.type = RE_DEFCLASS;
371 cur->defclass.vec = dc_upper;
372 cur->defclass.next = next;
373 --*p;
374 break;
375 case 'U':
376 cur->defclass.type = RE_N_DEFCLASS;
377 cur->defclass.vec = dc_upper;
378 cur->defclass.next = next;
379 --*p;
380 break;
381
382 case 'w':
383 cur->defclass.type = RE_DEFCLASS;
384 cur->defclass.vec = dc_word;
385 cur->defclass.next = next;
386 --*p;
387 break;
388 case 'W':
389 cur->defclass.type = RE_N_DEFCLASS;
390 cur->defclass.vec = dc_word;
391 cur->defclass.next = next;
392 --*p;
393 break;
394
395 case 'x':
396 cur->defclass.type = RE_DEFCLASS;
397 cur->defclass.vec = dc_xdigit;
398 cur->defclass.next = next;
399 --*p;
400 break;
401 case 'X':
402 cur->defclass.type = RE_N_DEFCLASS;
403 cur->defclass.vec = dc_xdigit;
404 cur->defclass.next = next;
405 --*p;
406 break;
407
408 default:
409 goto default_escape;
410 }
411
412 } else default_escape: {
413 cur->literal.type = RE_LITERAL;
414 cur->literal.next = next;
415 cur->literal.buf = bl_string(con->block, cur->literal.len = 1);
416 if (ST_INDEX(s, *p) == EOF) {
417 cur->literal.buf[0] = ESCAPE;
418 } else {
419 cur->literal.buf[0] = ST_INDEX(s, *p);
420 --*p;
421 }
422 }
423 return cur;
424 }
425
426 case ')': {
427 re_node *cur;
428 --*p;
429 cur = CALL(alternation, next, next);
430 skipspace(s, p);
431 if (ST_INDEX(s, *p) == '(') {
432 --*p;
433 }
434 return cur;
435 }
436
437 case ']': {
438 re_node *cur;
439 switch (ST_INDEX(s, *p - 1u)) {
440 case '&':
441 *p -= 2u;
442 cur = bl_node(con->block);
443 cur->indep.type = RE_AT_MATCH;
444 cur->indep.next = next;
445 cur->indep.arg = CALL(alternation, NULL, NULL);
446 break;
447
448 case '^':
449 *p -= 2u;
450 cur = bl_node(con->block);
451 cur->indep.type = RE_AT_NOMATCH;
452 cur->indep.next = next;
453 cur->indep.arg = CALL(alternation, NULL, NULL);
454 break;
455
456 case '?':
457 *p -= 2u;
458 cur = bl_node(con->block);
459 cur->select.type = RE_SELECT;
460 cur->select.cond = CALL(quantifire, NULL, NULL);
461 cur->select.arg = CALL(branch, next, next);
462 skipspace(s, p);
463 if (ST_INDEX(s, *p) == '|') {
464 --*p;
465 cur->select.next = CALL(alternation, next, next);
466 } else {
467 cur->select.next = next;
468 }
469 break;
470
471 case '0': case '1': case '2': case '3': case '4':
472 case '5': case '6': case '7': case '8': case '9': {
473 size_t i;
474
475 for (i = *p - 2u; isdigit(ST_INDEX(s, i)); --i)
476 ;
477 if (ST_INDEX(s, i) != '[') {
478 goto std_atom;
479 }
480 cur = bl_node(con->block);
481 cur->backref.type = RE_BACKCHECK;
482 cur->backref.next = next;
483 cur->backref.n = strtoul(St_ptr(s) + i + 1u, NULL, 10);
484 *p = i - 1u;
485
486 return cur;
487 }
488
489 default:
490 goto std_atom;
491 }
492
493 skipspace(s, p);
494 if (ST_INDEX(s, *p) == '[') {
495 --*p;
496 }
497 return cur;
498 }
499
500 case '>': {
501 re_node *cur = bl_node(con->block);
502 --*p;
503 cur->indep.type = RE_INDEP;
504 cur->indep.next = next;
505 cur->indep.arg = CALL(alternation, NULL, NULL);
506 skipspace(s, p);
507 if (ST_INDEX(s, *p) == '<') {
508 --*p;
509 }
510 return cur;
511 }
512
513 case '}': {
514 re_node *begin = bl_node(con->block);
515 re_node *end = bl_node(con->block);
516 --*p;
517 begin->capture.n = end->capture.n = con->base->captures++;
518 end->capture.type = RE_END_CAPTURE;
519 end->capture.next = next;
520 begin->capture.type = RE_BEGIN_CAPTURE;
521 begin->capture.next = CALL(alternation, end, end);
522 skipspace(s, p);
523 if (ST_INDEX(s, *p) == '{') {
524 --*p;
525 }
526 return begin;
527 }
528
529 case '^': {
530 re_node *cur = bl_node(con->block);
531 cur->x.type = RE_AT_BOS;
532 cur->x.next = next;
533 --*p;
534 return cur;
535 }
536
537 case '$': {
538 re_node *cur = bl_node(con->block);
539 cur->x.type = RE_AT_EOS;
540 cur->x.next = next;
541 --*p;
542 return cur;
543 }
544
545 case '.': {
546 re_node *cur = bl_node(con->block);
547 cur->x.type = RE_ANYCH;
548 cur->x.next = next;
549 --*p;
550 return cur;
551 }
552
553 case '`': {
554 String tmp;
555 int c;
556 re_node *cur = bl_node(con->block);
557 cur->literal.type = RE_PARTIAL;
558 cur->literal.next = next;
559
560 --*p;
561 St_init(&tmp);
562 while (
563 (c = ST_INDEX(s, *p)) != EOF &&
564 c != '`'
565 ) {
566 --*p;
567 if (c == ESCAPE && ST_INDEX(s, *p) != EOF) {
568 c = ST_INDEX(s, *p);
569 --*p;
570 }
571 St_cat_c(&tmp, c);
572 }
573 if (c == '`') {
574 --*p;
575 }
576 St_reverse(&tmp);
577
578 cur->literal.buf = bl_string(con->block, cur->literal.len = St_len(&tmp));
579 memcpy(cur->literal.buf, St_ptr(&tmp), St_len(&tmp));
580 St_clear(&tmp);
581
582 return cur;
583 }
584
585 case '\'': {
586 int c;
587 re_node *cur = bl_node(con->block);
588 cur->class.type = RE_CLASS;
589 cur->class.next = next;
590 memset(cur->class.vec, '\0', sizeof cur->class.vec);
591 for (--*p; (c = ST_INDEX(s, *p)) != EOF && c != '\''; --*p) {
592 if (c == ESCAPE) {
593 switch (c = ST_INDEX(s, *p - 1)) {
594 case EOF:
595 cur->class.vec[ESCAPE / CHAR_BIT] |= 1u << ESCAPE % CHAR_BIT;
596 break;
597
598 case 'c':
599 fill(cur->class.vec, iscntrl);
600 --*p;
601 break;
602 case 'C':
603 nfill(cur->class.vec, iscntrl);
604 --*p;
605 break;
606
607 case 'd':
608 fill(cur->class.vec, isdigit);
609 --*p;
610 break;
611 case 'D':
612 nfill(cur->class.vec, isdigit);
613 --*p;
614 break;
615
616 case 'l':
617 fill(cur->class.vec, islower);
618 --*p;
619 break;
620 case 'L':
621 nfill(cur->class.vec, islower);
622 --*p;
623 break;
624
625 case 'p':
626 fill(cur->class.vec, isprint);
627 --*p;
628 break;
629 case 'P':
630 nfill(cur->class.vec, isprint);
631 --*p;
632 break;
633
634 case 'q':
635 fill(cur->class.vec, isalpha);
636 --*p;
637 break;
638 case 'Q':
639 nfill(cur->class.vec, isalpha);
640 --*p;
641 break;
642
643 case 's':
644 fill(cur->class.vec, isspace);
645 --*p;
646 break;
647 case 'S':
648 nfill(cur->class.vec, isspace);
649 --*p;
650 break;
651
652 case 'u':
653 fill(cur->class.vec, isupper);
654 --*p;
655 break;
656 case 'U':
657 nfill(cur->class.vec, isupper);
658 --*p;
659 break;
660
661 case 'w':
662 fill(cur->class.vec, siword);
663 --*p;
664 break;
665 case 'W':
666 nfill(cur->class.vec, siword);
667 --*p;
668 break;
669
670 case 'x':
671 fill(cur->class.vec, isxdigit);
672 --*p;
673 break;
674 case 'X':
675 nfill(cur->class.vec, isxdigit);
676 --*p;
677 break;
678
679 default:
680 cur->class.vec[c / CHAR_BIT] |= 1u << c % CHAR_BIT;
681 --*p;
682 break;
683 }
684 } else {
685 int b;
686 if (ST_INDEX(s, *p - 1) == '-' && (b = ST_INDEX(s, *p - 2)) != EOF) {
687 if (b == ESCAPE && (b = ST_INDEX(s, *p - 3)) == EOF) {
688 b = ESCAPE;
689 }
690 if (b > c) {
691 goto normal_char;
692 }
693 for (; b < c; ++b) {
694 cur->class.vec[b / CHAR_BIT] |= 1u << b % CHAR_BIT;
695 }
696 cur->class.vec[b / CHAR_BIT] |= 1u << b % CHAR_BIT;
697 *p -= 2;
698 if (ST_INDEX(s, *p) == ESCAPE && ST_INDEX(s, *p - 1) != EOF) {
699 --*p;
700 }
701 } else if (ST_INDEX(s, *p) == ']' && *p >= 8 && ST_INDEX(s, *p - 1) == ':') {
702
703 #define CXCLASS_CHECK(n, t, f, o, b) if (1) { \
704 if (memcmp(St_ptr(s) + *p - (n), (t), (n) - 1) != 0) { \
705 goto normal_char; \
706 } \
707 (b)(cur->class.vec, (f)); \
708 *p -= (o); \
709 break; \
710 } else (void)0
711
712 #define CCLASS_CHECK(t, f, o) CXCLASS_CHECK(sizeof (t), t, f, o, fill)
713 #define CNCLASS_CHECK(t, f, o) CXCLASS_CHECK(sizeof (t), t, f, o, nfill)
714
715 if (ST_INDEX(s, *p - 8) == '[' && ST_INDEX(s, *p - 7) == ':') {
716 switch (ST_INDEX(s, *p - 6)) {
717 case 'a':
718 if (ST_INDEX(s, *p - 5) != 'l') {
719 goto normal_char;
720 }
721 switch (ST_INDEX(s, *p - 4)) {
722 case 'n': CCLASS_CHECK("um", isalnum, 8);
723 case 'p': CCLASS_CHECK("ha", isalpha, 8);
724 default: goto normal_char;
725 }
726 break;
727
728 case 'c': CCLASS_CHECK("ntrl", iscntrl, 8);
729 case 'd': CCLASS_CHECK("igit", isdigit, 8);
730 case 'g': CCLASS_CHECK("raph", isgraph, 8);
731 case 'l': CCLASS_CHECK("ower", islower, 8);
732
733 case 'p':
734 switch (ST_INDEX(s, *p - 5)) {
735 case 'r': CCLASS_CHECK("int", isprint, 8);
736 case 'u': CCLASS_CHECK("nct", ispunct, 8);
737 default: goto normal_char;
738 }
739 break;
740
741 case 's': CCLASS_CHECK("pace", isspace, 8);
742 case 'u': CCLASS_CHECK("pper", isupper, 8);
743 default: goto normal_char;
744 }
745 } else if (*p >= 9 && ST_INDEX(s, *p - 9) == '[' && ST_INDEX(s, *p - 8) == ':') {
746 switch (ST_INDEX(s, *p - 7)) {
747 case 'x':
748 CCLASS_CHECK("digit", isxdigit, 9);
749
750 case '^':
751 switch (ST_INDEX(s, *p - 6)) {
752 case 'a':
753 if (ST_INDEX(s, *p - 5) != 'l') {
754 goto normal_char;
755 }
756 switch (ST_INDEX(s, *p - 4)) {
757 case 'n': CNCLASS_CHECK("um", isalnum, 9);
758 case 'p': CNCLASS_CHECK("ha", isalpha, 9);
759 default: goto normal_char;
760 }
761 break;
762
763 case 'c': CNCLASS_CHECK("ntrl", iscntrl, 9);
764 case 'd': CNCLASS_CHECK("igit", isdigit, 9);
765 case 'g': CNCLASS_CHECK("raph", isgraph, 9);
766 case 'l': CNCLASS_CHECK("ower", islower, 9);
767
768 case 'p':
769 switch (ST_INDEX(s, *p - 5)) {
770 case 'r': CNCLASS_CHECK("int", isprint, 9);
771 case 'u': CNCLASS_CHECK("nct", ispunct, 9);
772 default: goto normal_char;
773 }
774 break;
775
776 case 's': CNCLASS_CHECK("pace", isspace, 9);
777 case 'u': CNCLASS_CHECK("pper", isupper, 9);
778 default: goto normal_char;
779 }
780 break;
781
782 default: goto normal_char;
783 }
784 } else if (*p >= 10 && memcmp(St_ptr(s) + *p - 10, "[:^xdigit", 9) == 0) {
785 nfill(cur->class.vec, isxdigit);
786 *p -= 10;
787 } else {
788 goto normal_char;
789 }
790 } else if (ST_INDEX(s, *p) == '^' && ((b = ST_INDEX(s, *p - 1) == '\'') || b == EOF)) {
791 cur->class.type = RE_N_CLASS;
792 } else normal_char: {
793 cur->class.vec[c / CHAR_BIT] |= 1u << c % CHAR_BIT;
794 }
795 }
796 }
797 if (c == '\'') {
798 --*p;
799 }
800
801 if (cur->class.type == RE_CLASS) {
802 size_t i;
803 for (i = 0; i < CLASS_SIZE; ++i) {
804 if (cur->class.vec[i] != (unsigned char)~0u) {
805 return cur;
806 }
807 }
808 cur->x.type = RE_ANYCH;
809 } else {
810 size_t i;
811 assert(cur->class.type == RE_N_CLASS);
812 for (i = 0; i < CLASS_SIZE; ++i) {
813 if (cur->class.vec[i] != 0) {
814 return cur;
815 }
816 }
817 cur->x.type = RE_ANYCH;
818 }
819
820 return cur;
821 }
822
823 std_atom:
824 default: {
825 re_node *cur = bl_node(con->block);
826 cur->literal.type = RE_LITERAL;
827
828 /*XXX?*/
829 if (next && next->x.type == RE_LITERAL) {
830 if (next->literal.len) {
831 cur->literal.buf = bl_string(con->block, cur->literal.len = 1u + next->literal.len);
832 memcpy(cur->literal.buf + 1, next->literal.buf, next->literal.len);
833 }
834 cur->literal.next = next->literal.next;
835 } else {
836 cur->literal.next = next;
837 cur->literal.buf = bl_string(con->block, cur->literal.len = 1);
838 }
839
840 cur->literal.buf[0] = ST_INDEX(s, *p);
841 --*p;
842 return cur;
843 }
844 }
845 }
846
847 PRAZ(literal) {
848 SNORK;
849 int c;
850 String tmp;
851 re_node *cur;
852
853 skipspace(s, p);
854 if ((ST_INDEX(s, *p) != ESCAPE || isalnum(ST_INDEX(s, *p - 1))) && MEATCHARP(ST_INDEX(s, *p))) {
855 TAILC(atom);
856 }
857
858 cur = bl_node(con->block);
859 cur->literal.type = RE_LITERAL;
860 cur->literal.next = next;
861
862 St_init(&tmp);
863 while (
864 (c = ST_INDEX(s, *p)) != EOF &&
865 (
866 !MEATCHARP(c) ||
867 (c == ESCAPE && !isalnum(ST_INDEX(s, *p - 1)))
868 )
869 ) {
870 --*p;
871 if (c == ESCAPE && ST_INDEX(s, *p) != EOF) {
872 c = ST_INDEX(s, *p);
873 --*p;
874 }
875 St_cat_c(&tmp, c);
876 skipspace(s, p);
877 }
878 St_reverse(&tmp);
879
880 if (next && next->x.type == RE_LITERAL) {
881 if (next->literal.len) {
882 St_cat_m(&tmp, next->literal.buf, next->literal.len);
883 }
884 cur->literal.next = next->literal.next;
885 }
886
887 cur->literal.buf = bl_string(con->block, cur->literal.len = St_len(&tmp));
888 memcpy(cur->literal.buf, St_ptr(&tmp), St_len(&tmp));
889 St_clear(&tmp);
890
891 return cur;
892 }
893
894 #define RETURN_REP(lo, hi, grdy) \
895 do { \
896 re_node *cur = bl_node(con->block); \
897 cur->rep.type = (grdy); \
898 cur->rep.next = next; \
899 cur->rep.min = (lo); \
900 cur->rep.max = (hi); \
901 cur->rep.n = con->base->repets++; \
902 cur->rep.arg = CALL(atom, cur, NULL); \
903 return cur; \
904 } while (0)
905
906 #define STD_REP_CASE(lo, hi, grdy) \
907 do { \
908 --*p; \
909 RETURN_REP(lo, hi, grdy); \
910 } while (0)
911
912 #define CUSTOM_REP_CASE(off, grdy, mismatch) \
913 do { \
914 size_t from, to; \
915 to = *p - 1 - (off); \
916 while (isdigit(ST_INDEX(s, to))) { \
917 --to; \
918 } \
919 if (ST_INDEX(s, to) == ':') { \
920 *p = to - 1; \
921 to = ST_INDEX(s, to + 1) == ':' ? (size_t)-1 : strtoul(St_ptr(s) + to + 1, NULL, 10); \
922 from = to; \
923 } else { \
924 if (ST_INDEX(s, to) != ',') { \
925 mismatch; \
926 } \
927 from = to - 1; \
928 if (!isdigit(ST_INDEX(s, from))) { \
929 mismatch; \
930 } \
931 do { \
932 --from; \
933 } while (isdigit(ST_INDEX(s, from))); \
934 if (ST_INDEX(s, from) != ':') { \
935 mismatch; \
936 } \
937 *p = from - 1; \
938 from = strtoul(St_ptr(s) + from + 1, NULL, 10); \
939 to = ST_INDEX(s, to + 1) == ':' ? (size_t)-1 : strtoul(St_ptr(s) + to + 1, NULL, 10); \
940 } \
941 if (to == 1) { \
942 if (from == 1) { \
943 TAILC(atom); \
944 } \
945 if (from == 0) { \
946 re_node *cur = bl_node(con->block); \
947 cur->alter.type = RE_ALTER; \
948 if ((grdy) == RE_REP_GREEDY) { \
949 cur->alter.next = next; \
950 cur->alter.arg = CALL(atom, next, NULL); \
951 } else { \
952 cur->alter.arg = next; \
953 cur->alter.next = CALL(atom, next, NULL); \
954 } \
955 return cur; \
956 } \
957 } \
958 RETURN_REP(from, to, grdy); \
959 } while (0)
960
961 PRAZ(quantifire) {
962 SNORK;
963
964 skipspace(s, p);
965 switch (ST_INDEX(s, *p)) {
966 case EOF: return next;
967
968 case '?':
969 switch (ST_INDEX(s, *p - 1)) {
970 case ':': CUSTOM_REP_CASE(1, RE_REP_FRUGAL, goto hook);
971 case '*': --*p; STD_REP_CASE(0, -1, RE_REP_FRUGAL);
972 case '+': --*p; STD_REP_CASE(1, -1, RE_REP_FRUGAL);
973
974 case '?': {
975 re_node *cur = bl_node(con->block);
976 *p -= 2;
977 cur->alter.type = RE_ALTER;
978 cur->alter.next = CALL(atom, next, NULL);
979 cur->alter.arg = next;
980 return cur;
981 }
982
983 hook:
984 default: {
985 re_node *cur = bl_node(con->block);
986 --*p;
987 cur->alter.type = RE_ALTER;
988 cur->alter.next = next;
989 cur->alter.arg = CALL(atom, next, NULL);
990 return cur;
991 }
992 }
993
994 case ':': CUSTOM_REP_CASE(0, RE_REP_GREEDY, goto noquanti);
995 case '*': STD_REP_CASE(0, -1, RE_REP_GREEDY);
996 case '+': STD_REP_CASE(1, -1, RE_REP_GREEDY);
997
998 noquanti:
999 default: TAILC(literal);
1000 }
1001 }
1002
1003 PRAZ(branch) {
1004 SNORK;
1005 skipspace(s, p);
1006 while (ST_INDEX(s, *p) != EOF && !ENDBRANCHP(ST_INDEX(s, *p))) {
1007 next = CALL(quantifire, next, curolan_zebeth);
1008 skipspace(s, p);
1009 }
1010 return next;
1011 }
1012
1013 PRAZ(alternation) {
1014 SNORK;
1015 re_node *cur;
1016
1017 skipspace(s, p);
1018 if (ST_INDEX(s, *p) == EOF) {
1019 return next;
1020 }
1021
1022 next = CALL(branch, next, curolan_zebeth);
1023 skipspace(s, p);
1024 if (ST_INDEX(s, *p) != '|') {
1025 return next;
1026 }
1027
1028 --*p;
1029 cur = bl_node(con->block);
1030 cur->alter.type = RE_ALTER;
1031 cur->alter.next = next;
1032 cur->alter.arg = CALL(alternation, curolan_zebeth, curolan_zebeth);
1033 return cur;
1034 }
1035
1036 ATTR_CONST
1037 static size_t minimum(const size_t a, const size_t b) {
1038 return a < b ? a : b;
1039 }
1040
1041 ATTR_CONST
1042 static size_t maximum(const size_t a, const size_t b) {
1043 return a > b ? a : b;
1044 }
1045
1046 ATTR_PURE
1047 static size_t subminlen(const re_node *node, const re_node *end, size_t start) {
1048 if (!node || node == end) {
1049 return start;
1050 }
1051
1052 switch (node->x.type) {
1053 case RE_ALTER:
1054 return minimum(
1055 subminlen(node->alter.arg, end, start),
1056 subminlen(node->alter.next, end, start)
1057 );
1058
1059 case RE_ANYCH:
1060 case RE_CLASS:
1061 case RE_DEFCLASS:
1062 case RE_N_CLASS:
1063 case RE_N_DEFCLASS:
1064 return subminlen(node->x.next, end, start + 1u);
1065
1066 case RE_AT_BOL:
1067 case RE_AT_BOS:
1068 case RE_AT_EOL:
1069 case RE_AT_EOS:
1070 case RE_AT_NWBOUND:
1071 case RE_AT_WBOUND:
1072 case RE_BACKCHECK:
1073 case RE_BACKREF:
1074 case RE_BEGIN_CAPTURE:
1075 case RE_END_CAPTURE:
1076 case RE_PARTIAL:
1077 case RE_STUFF_FRUGAL:
1078 case RE_STUFF_GREEDY:
1079 return subminlen(node->x.next, end, start);
1080
1081 case RE_AT_MATCH:
1082 return maximum(
1083 subminlen(node->indep.arg, end, start),
1084 subminlen(node->indep.next, end, start)
1085 );
1086
1087 case RE_AT_NOMATCH:
1088 return subminlen(node->indep.next, end, start);
1089
1090 case RE_INDEP:
1091 return subminlen(node->indep.next, end, start + subminlen(node->indep.arg, NULL, 0));
1092
1093 case RE_LITERAL:
1094 return subminlen(node->literal.next, end, start + node->literal.len);
1095
1096 case RE_SELECT:
1097 return minimum(
1098 subminlen(node->select.cond, end, start) + subminlen(node->select.arg, end, start),
1099 subminlen(node->select.next, end, start)
1100 );
1101
1102 case RE_REP_FRUGAL:
1103 case RE_REP_GREEDY:
1104 if (node->rep.min == 0) {
1105 return subminlen(node->rep.next, end, start);
1106 }
1107 return subminlen(node->rep.next, end, start + node->rep.min * subminlen(node->rep.arg, node, 0));
1108
1109 case RE_XREP_FRUGAL:
1110 case RE_XREP_GREEDY:
1111 if (node->rep.min == 0) {
1112 return subminlen(node->rep.next, end, start);
1113 }
1114 return subminlen(node->rep.next, end, start + node->rep.min * subminlen(node->rep.arg, NULL, 0));
1115 }
1116
1117 NOTREACHED;
1118 }
1119
1120 ATTR_PURE
1121 static size_t minlen(const re_node *node) {
1122 return subminlen(node, NULL, 0);
1123 }
1124
1125 static void reinit(t_regex *re) {
1126 const struct cap_state null = { -1, -1, -1 };
1127 size_t i;
1128 for (i = 0; i < re->captures; ++i) {
1129 re->cap_ptr[i] = null;
1130 }
1131 }
1132
1133 static re_node **notcomplex(re_node **node, const re_node *end) {
1134 assert(*node != NULL);
1135
1136 if (*node == end) {
1137 return node;
1138 }
1139
1140 switch ((*node)->x.type) {
1141 case RE_ALTER:
1142 case RE_AT_MATCH:
1143 case RE_AT_NOMATCH:
1144 case RE_BACKCHECK:
1145 case RE_BACKREF:
1146 case RE_BEGIN_CAPTURE:
1147 case RE_END_CAPTURE:
1148 case RE_INDEP:
1149 case RE_PARTIAL:
1150 case RE_REP_FRUGAL:
1151 case RE_REP_GREEDY:
1152 case RE_SELECT:
1153 case RE_STUFF_FRUGAL:
1154 case RE_STUFF_GREEDY:
1155 return NULL;
1156
1157 case RE_XREP_FRUGAL:
1158 case RE_XREP_GREEDY:
1159 if ((*node)->rep.min == (*node)->rep.max) {
1160 return notcomplex(&(*node)->rep.next, end);
1161 }
1162 return NULL;
1163
1164 case RE_ANYCH:
1165 case RE_AT_BOL:
1166 case RE_AT_BOS:
1167 case RE_AT_EOL:
1168 case RE_AT_EOS:
1169 case RE_AT_NWBOUND:
1170 case RE_AT_WBOUND:
1171 case RE_CLASS:
1172 case RE_DEFCLASS:
1173 case RE_LITERAL:
1174 case RE_N_CLASS:
1175 case RE_N_DEFCLASS:
1176 return notcomplex(&(*node)->x.next, end);
1177 }
1178
1179 NOTREACHED;
1180 }
1181
1182 static void trysimpl(re_node *node) {
1183 assert(node->x.type == RE_REP_FRUGAL || node->x.type == RE_REP_GREEDY);
1184
1185 {
1186 re_node **const tmp = notcomplex(&node->rep.arg, node);
1187 if (!tmp) {
1188 return;
1189 }
1190 *tmp = NULL;
1191 }
1192
1193 switch (node->rep.type) {
1194 case RE_REP_FRUGAL:
1195 if (
1196 node->rep.arg->x.type == RE_ANYCH &&
1197 node->rep.arg->x.next == NULL &&
1198 node->rep.max == (size_t)-1
1199 ) {
1200 if (node->rep.min == 0) {
1201 node->x.type = RE_STUFF_FRUGAL;
1202 break;
1203 }
1204 if (node->rep.min == 1) {
1205 node->rep.arg->x.next = node->x.next;
1206 node->rep.arg->x.type = RE_STUFF_FRUGAL;
1207 node->x.next = node->rep.arg;
1208 node->x.type = RE_ANYCH;
1209 break;
1210 }
1211 }
1212
1213 node->rep.n = minlen(node->rep.arg);
1214 node->rep.type = RE_XREP_FRUGAL;
1215 break;
1216
1217 case RE_REP_GREEDY:
1218 if (
1219 node->rep.arg->x.type == RE_ANYCH &&
1220 node->rep.arg->x.next == NULL &&
1221 node->rep.max == (size_t)-1
1222 ) {
1223 if (node->rep.min == 0) {
1224 node->x.type = RE_STUFF_GREEDY;
1225 break;
1226 }
1227 if (node->rep.min == 1) {
1228 node->rep.arg->x.next = node->x.next;
1229 node->rep.arg->x.type = RE_STUFF_GREEDY;
1230 node->x.next = node->rep.arg;
1231 node->x.type = RE_ANYCH;
1232 break;
1233 }
1234 }
1235
1236 node->rep.n = minlen(node->rep.arg);
1237 node->rep.type = RE_XREP_GREEDY;
1238 break;
1239
1240 default:
1241 NOTREACHED;
1242 }
1243
1244 }
1245
1246 static void simplerep(re_node *node, const re_node *const end) {
1247 if (!node || node == end) {
1248 return;
1249 }
1250
1251 switch (node->x.type) {
1252 case RE_ALTER:
1253 simplerep(node->alter.arg, end);
1254 simplerep(node->alter.next, end);
1255 return;
1256
1257 case RE_AT_MATCH:
1258 case RE_AT_NOMATCH:
1259 case RE_INDEP:
1260 simplerep(node->indep.arg, end);
1261 simplerep(node->indep.next, end);
1262 return;
1263
1264 case RE_ANYCH:
1265 case RE_AT_BOL:
1266 case RE_AT_BOS:
1267 case RE_AT_EOL:
1268 case RE_AT_EOS:
1269 case RE_AT_NWBOUND:
1270 case RE_AT_WBOUND:
1271 case RE_BACKCHECK:
1272 case RE_BACKREF:
1273 case RE_BEGIN_CAPTURE:
1274 case RE_CLASS:
1275 case RE_DEFCLASS:
1276 case RE_END_CAPTURE:
1277 case RE_LITERAL:
1278 case RE_N_CLASS:
1279 case RE_N_DEFCLASS:
1280 case RE_PARTIAL:
1281 simplerep(node->x.next, end);
1282 return;
1283
1284 case RE_REP_FRUGAL:
1285 case RE_REP_GREEDY:
1286 simplerep(node->rep.next, end);
1287 simplerep(node->rep.arg, node);
1288 trysimpl(node);
1289 return;
1290
1291 case RE_SELECT:
1292 simplerep(node->select.cond, end);
1293 simplerep(node->select.arg, end);
1294 simplerep(node->select.next, end);
1295 return;
1296
1297 case RE_STUFF_FRUGAL:
1298 case RE_STUFF_GREEDY:
1299 case RE_XREP_FRUGAL:
1300 case RE_XREP_GREEDY:
1301 return;
1302 }
1303
1304 NOTREACHED;
1305 }
1306
1307 ATTR_PURE
1308 static int anchorbegp(const re_node *node) {
1309 return
1310 node &&
1311 (
1312 node->x.type == RE_AT_BOS ||
1313 node->x.type == RE_STUFF_FRUGAL ||
1314 node->x.type == RE_STUFF_GREEDY ||
1315 (
1316 node->x.type == RE_ALTER &&
1317 anchorbegp(node->alter.arg) &&
1318 anchorbegp(node->alter.next)
1319 ) ||
1320 (
1321 node->x.type == RE_INDEP &&
1322 anchorbegp(node->indep.arg)
1323 ) ||
1324 (
1325 node->x.type == RE_AT_MATCH &&
1326 (
1327 anchorbegp(node->indep.arg) ||
1328 anchorbegp(node->indep.next)
1329 )
1330 ) ||
1331 (
1332 (
1333 node->x.type == RE_AT_NOMATCH ||
1334 node->x.type == RE_AT_BOL ||
1335 node->x.type == RE_AT_EOL ||
1336 node->x.type == RE_AT_EOS ||
1337 node->x.type == RE_AT_NWBOUND ||
1338 node->x.type == RE_AT_WBOUND ||
1339 node->x.type == RE_BEGIN_CAPTURE ||
1340 node->x.type == RE_BACKCHECK
1341 ) &&
1342 anchorbegp(node->x.next)
1343 )
1344 )
1345 ;
1346 }
1347
1348 static void dostuff(t_regex *re) {
1349 simplerep(re->start, NULL);
1350
1351 if (!re->start || anchorbegp(re->start)) {
1352 re->flags |= FL_ANCHOR_START;
1353 }
1354 }
1355
1356 t_regex *re_compile(const String *s) {
1357 size_t p;
1358 struct parse_context dcon;
1359 t_regex *re;
1360 void *pre;
1361
1362 if (h_get(&rcache, s, &pre) == H_OK) {
1363 re = pre;
1364 ++re->refs;
1365 return re;
1366 }
1367
1368 re = xmalloc(1, sizeof *re);
1369 re->flags = FL_NONE;
1370 re->refs = 0;
1371 re->repets = 0;
1372 bl_init(&re->alloc);
1373 re->cap_ptr = NULL;
1374 re->captures = 0;
1375 dcon.base = re;
1376 dcon.block = &re->alloc;
1377 dcon.s = s;
1378 p = St_len(s) - 1;
1379 dcon.p = &p;
1380
1381 re->start = alternation(&dcon, NULL, NULL);
1382
1383 if (re->captures) {
1384 re->cap_ptr = xmalloc(re->captures, sizeof *re->cap_ptr);
1385 reinit(re);
1386 }
1387
1388 dostuff(re);
1389 re->minlen = minlen(re->start);
1390 re->repbuf = xmalloc(re->repets, sizeof *re->repbuf);
1391
1392 if (cachfill < sizeof cached / sizeof cached[0]) {
1393 St_init(&cached[cachfill]);
1394 St_cpy(&cached[cachfill], s);
1395 ++re->refs;
1396 h_put(&rcache, &cached[cachfill], re, 1);
1397 ++cachfill;
1398 } else {
1399 h_del(&rcache, &cached[cachlast]);
1400 St_cpy(&cached[cachlast], s);
1401 ++re->refs;
1402 h_put(&rcache, &cached[cachlast], re, 1);
1403 cachlast = (cachlast + 1u) % (sizeof cached / sizeof cached[0]);
1404 }
1405
1406 if (Opt.debug & DBG_REGEX) {
1407 io_write_s(Err, "Compiled ");
1408 re_dump(re, io_fp(Err));
1409 }
1410
1411 return re;
1412 }
1413
1414 t_regex *re_dup(t_regex *re) {
1415 ++re->refs;
1416 return re;
1417 }
1418
1419 void re_free(t_regex *re) {
1420 if (re->refs) {
1421 --re->refs;
1422 return;
1423 }
1424 bl_free(&re->alloc);
1425 xfree(re->repbuf);
1426 xfree(re->cap_ptr);
1427 xfree(re);
1428 }
1429
1430 ATTR_PURE
1431 static int match_class(unsigned char c, const unsigned char *vec) {
1432 return vec[c / CHAR_BIT] & 1u << c % CHAR_BIT;
1433 }
1434
1435 static void *mem_dup(const void *p, size_t n, size_t m) {
1436 void *q = xmalloc(n, m);
1437 memcpy(q, p, n * m);
1438 return q;
1439 }
1440
1441 #define NO_MATCH ((size_t)-1)
1442
1443 #define Mmatch0(Xmatch, Xindex) \
1444 size_t tmp; \
1445 \
1446 if (!node) { \
1447 return o; \
1448 } \
1449 \
1450 switch (node->x.type) { \
1451 case RE_ALTER: \
1452 if ((tmp = Xmatch(base, node->alter.arg, s, o)) == NO_MATCH) { \
1453 return Xmatch(base, node->alter.next, s, o); \
1454 } \
1455 return tmp; \
1456 \
1457 case RE_ANYCH: \
1458 if (Xindex(s, o) != EOF) { \
1459 return Xmatch(base, node->x.next, s, o + 1u); \
1460 } \
1461 return NO_MATCH; \
1462 \
1463 case RE_AT_BOL: \
1464 if (o == 0 || Xindex(s, o - 1u) == '\n') { \
1465 return Xmatch(base, node->x.next, s, o); \
1466 } \
1467 return NO_MATCH; \
1468 \
1469 case RE_AT_BOS: \
1470 if (o == 0) { \
1471 return Xmatch(base, node->x.next, s, o); \
1472 } \
1473 return NO_MATCH; \
1474 \
1475 case RE_AT_EOL: \
1476 if ( \
1477 Xindex(s, o) == '\n' || \
1478 ( \
1479 o && Xindex(s, o) == EOF && Xindex(s, o - 1u) != '\n' \
1480 ) \
1481 ) { \
1482 return Xmatch(base, node->x.next, s, o); \
1483 } \
1484 return NO_MATCH; \
1485 \
1486 case RE_AT_EOS: \
1487 if (Xindex(s, o) == EOF) { \
1488 return Xmatch(base, node->x.next, s, o); \
1489 } \
1490 return NO_MATCH; \
1491 \
1492 case RE_AT_MATCH: { \
1493 struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \
1494 tmp = Xmatch(base, node->indep.arg, s, o); \
1495 if (tmp != NO_MATCH) { \
1496 tmp = Xmatch(base, node->indep.next, s, o); \
1497 if (tmp == NO_MATCH) { \
1498 memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \
1499 } \
1500 } \
1501 xfree(prev); \
1502 return tmp; \
1503 } \
1504 \
1505 case RE_AT_NOMATCH: { \
1506 struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \
1507 tmp = Xmatch(base, node->indep.arg, s, o); \
1508 if (tmp == NO_MATCH) { \
1509 tmp = Xmatch(base, node->indep.next, s, o); \
1510 } else { \
1511 memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \
1512 tmp = NO_MATCH; \
1513 } \
1514 xfree(prev); \
1515 return tmp; \
1516 } \
1517 \
1518 case RE_AT_NWBOUND: \
1519 if (siword(Xindex(s, o - 1u)) == siword(Xindex(s, o))) { \
1520 return Xmatch(base, node->x.next, s, o); \
1521 } \
1522 return NO_MATCH; \
1523 \
1524 case RE_AT_WBOUND: \
1525 if (siword(Xindex(s, o - 1u)) != siword(Xindex(s, o))) { \
1526 return Xmatch(base, node->x.next, s, o); \
1527 } \
1528 return NO_MATCH; \
1529 \
1530 case RE_BACKCHECK: \
1531 if ( \
1532 node->backref.n < base->captures && \
1533 base->cap_ptr[node->backref.n].start + 1u && \
1534 base->cap_ptr[node->backref.n].end + 1u && \
1535 base->cap_ptr[node->backref.n].end >= base->cap_ptr[node->backref.n].start \
1536 ) { \
1537 return Xmatch(base, node->backref.next, s, o); \
1538 } \
1539 return NO_MATCH
1540
1541
1542 #define Mmatch1(Xmatch, Xindex) \
1543 case RE_BEGIN_CAPTURE: { \
1544 const size_t prev = base->cap_ptr[node->capture.n].pending; \
1545 base->cap_ptr[node->capture.n].pending = o; \
1546 if ((tmp = Xmatch(base, node->capture.next, s, o)) == NO_MATCH) { \
1547 base->cap_ptr[node->capture.n].pending = prev; \
1548 } \
1549 return tmp; \
1550 } \
1551 \
1552 case RE_CLASS: \
1553 if ( \
1554 Xindex(s, o) != EOF && \
1555 match_class(Xindex(s, o), node->class.vec) \
1556 ) { \
1557 return Xmatch(base, node->class.next, s, o + 1u); \
1558 } \
1559 return NO_MATCH; \
1560 \
1561 case RE_DEFCLASS: \
1562 if ( \
1563 Xindex(s, o) != EOF && \
1564 match_class(Xindex(s, o), node->defclass.vec) \
1565 ) { \
1566 return Xmatch(base, node->defclass.next, s, o + 1u); \
1567 } \
1568 return NO_MATCH; \
1569 \
1570 case RE_END_CAPTURE: { \
1571 const size_t prevbeg = base->cap_ptr[node->capture.n].start; \
1572 const size_t prevend = base->cap_ptr[node->capture.n].end; \
1573 if (base->cap_ptr[node->capture.n].pending + 1u == 0u) { \
1574 return NO_MATCH; \
1575 } \
1576 \
1577 base->cap_ptr[node->capture.n].end = o; \
1578 base->cap_ptr[node->capture.n].start = base->cap_ptr[node->capture.n].pending; \
1579 if ((tmp = Xmatch(base, node->capture.next, s, o)) == NO_MATCH) { \
1580 base->cap_ptr[node->capture.n].end = prevend; \
1581 base->cap_ptr[node->capture.n].start = prevbeg; \
1582 } \
1583 return tmp; \
1584 } \
1585 \
1586 case RE_INDEP: { \
1587 struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \
1588 if ((tmp = Xmatch(base, node->indep.arg, s, o)) != NO_MATCH) { \
1589 tmp = Xmatch(base, node->indep.next, s, tmp); \
1590 if (tmp == NO_MATCH) { \
1591 memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \
1592 } \
1593 } \
1594 xfree(prev); \
1595 return tmp; \
1596 } \
1597 \
1598 case RE_N_CLASS: \
1599 if ( \
1600 Xindex(s, o) != EOF && \
1601 !match_class(Xindex(s, o), node->class.vec) \
1602 ) { \
1603 return Xmatch(base, node->class.next, s, o + 1u); \
1604 } \
1605 return NO_MATCH; \
1606 \
1607 case RE_N_DEFCLASS: \
1608 if ( \
1609 Xindex(s, o) != EOF && \
1610 !match_class(Xindex(s, o), node->defclass.vec) \
1611 ) { \
1612 return Xmatch(base, node->defclass.next, s, o + 1u); \
1613 } \
1614 return NO_MATCH; \
1615 \
1616 case RE_PARTIAL: { \
1617 size_t n; \
1618 for (n = 0; n < node->literal.len; ++n) { \
1619 if (node->literal.buf[n] != Xindex(s, o + n)) { \
1620 break; \
1621 } \
1622 } \
1623 for (; n; --n) { \
1624 if ((tmp = Xmatch(base, node->literal.next, s, o + n)) != NO_MATCH) { \
1625 return tmp; \
1626 } \
1627 } \
1628 return Xmatch(base, node->literal.next, s, o); \
1629 } \
1630 \
1631 case RE_SELECT: { \
1632 struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \
1633 if ((tmp = Xmatch(base, node->select.cond, s, o)) != NO_MATCH) { \
1634 tmp = Xmatch(base, node->select.arg, s, tmp); \
1635 if (tmp == NO_MATCH) { \
1636 memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \
1637 } \
1638 } else { \
1639 tmp = Xmatch(base, node->select.next, s, o); \
1640 } \
1641 xfree(prev); \
1642 return tmp; \
1643 } \
1644 \
1645 case RE_REP_FRUGAL: \
1646 if (base->repbuf[node->rep.n] >= node->rep.max) { \
1647 const size_t n = base->repbuf[node->rep.n]; \
1648 \
1649 base->repbuf[node->rep.n] = 0; \
1650 tmp = Xmatch(base, node->rep.next, s, o); \
1651 base->repbuf[node->rep.n] = n; \
1652 return tmp; \
1653 } else if (base->repbuf[node->rep.n] >= node->rep.min) { \
1654 const size_t n = base->repbuf[node->rep.n]; \
1655 \
1656 base->repbuf[node->rep.n] = 0; \
1657 if ((tmp = Xmatch(base, node->rep.next, s, o)) != NO_MATCH) { \
1658 base->repbuf[node->rep.n] = n; \
1659 return tmp; \
1660 } \
1661 base->repbuf[node->rep.n] = n + 1u; \
1662 tmp = Xmatch(base, node->rep.arg, s, o); \
1663 --base->repbuf[node->rep.n]; \
1664 return tmp; \
1665 } else { \
1666 ++base->repbuf[node->rep.n]; \
1667 tmp = Xmatch(base, node->rep.arg, s, o); \
1668 --base->repbuf[node->rep.n]; \
1669 return tmp; \
1670 } \
1671 \
1672 case RE_REP_GREEDY: \
1673 if (base->repbuf[node->rep.n] >= node->rep.max) { \
1674 const size_t n = base->repbuf[node->rep.n]; \
1675 \
1676 base->repbuf[node->rep.n] = 0; \
1677 tmp = Xmatch(base, node->rep.next, s, o); \
1678 base->repbuf[node->rep.n] = n; \
1679 return tmp; \
1680 } else if (base->repbuf[node->rep.n] >= node->rep.min) { \
1681 const size_t n = base->repbuf[node->rep.n]; \
1682 \
1683 ++base->repbuf[node->rep.n]; \
1684 if ((tmp = Xmatch(base, node->rep.arg, s, o)) != NO_MATCH) { \
1685 --base->repbuf[node->rep.n]; \
1686 return tmp; \
1687 } \
1688 base->repbuf[node->rep.n] = 0; \
1689 tmp = Xmatch(base, node->rep.next, s, o); \
1690 base->repbuf[node->rep.n] = n; \
1691 return tmp; \
1692 } else { \
1693 ++base->repbuf[node->rep.n]; \
1694 tmp = Xmatch(base, node->rep.arg, s, o); \
1695 --base->repbuf[node->rep.n]; \
1696 return tmp; \
1697 } \
1698 \
1699 case RE_XREP_FRUGAL: { \
1700 size_t n; \
1701 \
1702 for (n = 0; n < node->rep.min; ++n) { \
1703 if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \
1704 return NO_MATCH; \
1705 } \
1706 o = tmp; \
1707 } \
1708 \
1709 for (; n < node->rep.max; ++n) { \
1710 if ((tmp = Xmatch(base, node->rep.next, s, o)) != NO_MATCH) { \
1711 return tmp; \
1712 } \
1713 if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \
1714 return NO_MATCH; \
1715 } \
1716 o = tmp; \
1717 } \
1718 return Xmatch(base, node->rep.next, s, o); \
1719 } \
1720 \
1721 case RE_XREP_GREEDY: { \
1722 size_t n; \
1723 \
1724 for (n = 0; n < node->rep.min; ++n) { \
1725 if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \
1726 return NO_MATCH; \
1727 } \
1728 o = tmp; \
1729 } \
1730 \
1731 for (; n < node->rep.max; ++n) { \
1732 if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \
1733 break; \
1734 } \
1735 o = tmp; \
1736 } \
1737 \
1738 for (; n > node->rep.min; --n) { \
1739 if ((tmp = Xmatch(base, node->rep.next, s, o)) != NO_MATCH) { \
1740 return tmp; \
1741 } \
1742 o -= node->rep.n; \
1743 } \
1744 \
1745 return Xmatch(base, node->rep.next, s, o); \
1746 } \
1747 } \
1748 \
1749 NOTREACHED
1750
1751
1752 static size_t match(const t_regex *base, const re_node *node, const String *s, size_t o) {
1753
1754 Mmatch0(match, ST_INDEX);
1755
1756 case RE_BACKREF:
1757 if (
1758 node->backref.n < base->captures &&
1759 base->cap_ptr[node->backref.n].start + 1u &&
1760 base->cap_ptr[node->backref.n].end + 1u &&
1761 base->cap_ptr[node->backref.n].end >= base->cap_ptr[node->backref.n].start &&
1762 (
1763 base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start <= St_len(s) - o &&
1764 memcmp(
1765 St_ptr(s) + base->cap_ptr[node->backref.n].start,
1766 St_ptr(s) + o,
1767 base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start
1768 ) == 0
1769 )
1770 ) {
1771 return match(base, node->backref.next, s, o + base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start);
1772 }
1773 return NO_MATCH;
1774
1775
1776 case RE_LITERAL:
1777 if (
1778 node->literal.len == 0 ||
1779 (
1780 node->literal.len <= St_len(s) - o &&
1781 memcmp(node->literal.buf, St_ptr(s) + o, node->literal.len) == 0
1782 )
1783 ) {
1784 return match(base, node->literal.next, s, o + node->literal.len);
1785 }
1786 return NO_MATCH;
1787
1788 case RE_STUFF_FRUGAL:
1789 if (!node->x.next) {
1790 return o;
1791 }
1792
1793 if (node->x.next->x.type == RE_LITERAL && node->x.next->literal.len) {
1794 for (
1795 o = St_stro_m(s, o, node->x.next->literal.buf, node->x.next->literal.len);
1796 o + 1u;
1797 o = St_stro_m(s, o + 1u, node->x.next->literal.buf, node->x.next->literal.len)
1798 ) {
1799 if ((tmp = match(base, node->x.next->literal.next, s, o + node->x.next->literal.len)) != NO_MATCH) {
1800 return tmp;
1801 }
1802 }
1803 return NO_MATCH;
1804 }
1805
1806 for (; o <= St_len(s); ++o) {
1807 if ((tmp = match(base, node->x.next, s, o)) != NO_MATCH) {
1808 return tmp;
1809 }
1810 }
1811 return NO_MATCH;
1812
1813 case RE_STUFF_GREEDY: {
1814 size_t n;
1815
1816 if (!node->x.next) {
1817 return St_len(s);
1818 }
1819
1820 if (node->x.next->x.type == RE_LITERAL && node->x.next->literal.len) {
1821 for (
1822 n = St_rstr_m(s, node->x.next->literal.buf, node->x.next->literal.len);
1823 n > o && n + 1u;
1824 n = St_rstro_m(s, n - 1u, node->x.next->literal.buf, node->x.next->literal.len)
1825 ) {
1826 if ((tmp = match(base, node->x.next->literal.next, s, n + node->x.next->literal.len)) != NO_MATCH) {
1827 return tmp;
1828 }
1829 if (!n) {
1830 break;
1831 }
1832 }
1833 if (n == o) {
1834 return match(base, node->x.next->literal.next, s, n + node->x.next->literal.len);
1835 }
1836 return NO_MATCH;
1837 }
1838
1839 for (n = St_len(s); n > o; --n) {
1840 if ((tmp = match(base, node->x.next, s, n)) != NO_MATCH) {
1841 return tmp;
1842 }
1843 }
1844 return match(base, node->x.next, s, o);
1845 }
1846
1847 Mmatch1(match, ST_INDEX);
1848 }
1849
1850 static size_t iomatch(const t_regex *base, const re_node *node, IO *s, size_t o) {
1851
1852 Mmatch0(iomatch, io_peek);
1853
1854 case RE_BACKREF:
1855 if (
1856 node->backref.n < base->captures &&
1857 base->cap_ptr[node->backref.n].start + 1u &&
1858 base->cap_ptr[node->backref.n].end + 1u &&
1859 base->cap_ptr[node->backref.n].end >= base->cap_ptr[node->backref.n].start &&
1860 (
1861 io_xcmp(
1862 s,
1863 base->cap_ptr[node->backref.n].start,
1864 o,
1865 base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start
1866 ) == 0
1867 )
1868 ) {
1869 return iomatch(base, node->backref.next, s, o + base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start);
1870 }
1871 return NO_MATCH;
1872
1873
1874 case RE_LITERAL:
1875 if (
1876 node->literal.len == 0 ||
1877 (
1878 io_cmppeek(s, o, node->literal.buf, node->literal.len) == 0
1879 )
1880 ) {
1881 return iomatch(base, node->literal.next, s, o + node->literal.len);
1882 }
1883 return NO_MATCH;
1884
1885 case RE_STUFF_FRUGAL:
1886 if (!node->x.next) {
1887 return o;
1888 }
1889
1890 for (; io_peek(s, o) != EOF; ++o) {
1891 if ((tmp = iomatch(base, node->x.next, s, o)) != NO_MATCH) {
1892 return tmp;
1893 }
1894 }
1895 return iomatch(base, node->x.next, s, o);
1896
1897 case RE_STUFF_GREEDY: {
1898 size_t n;
1899
1900 for (n = o; io_peek(s, n) != EOF; ++n)
1901 ;
1902
1903 if (!node->x.next) {
1904 return n;
1905 }
1906
1907 for (; n > o; --n) {
1908 if ((tmp = iomatch(base, node->x.next, s, n)) != NO_MATCH) {
1909 return tmp;
1910 }
1911 }
1912 return iomatch(base, node->x.next, s, o);
1913 }
1914
1915 Mmatch1(iomatch, io_peek);
1916 }
1917
1918 #define DEFRBUF ((void)0)
1919 #define MRETURN(x) return (x)
1920 #define INITRBUF(re) memset((re)->repbuf, '\0', (re)->repets * sizeof *(re)->repbuf)
1921
1922 int re_iomatch(t_regex *re, IO *io, size_t *ms, size_t *me) {
1923 size_t i;
1924 DEFRBUF;
1925 INITRBUF(re);
1926
1927 assert(re != NULL);
1928 assert(io != NULL);
1929 assert(io_bufred(io));
1930 assert(ms != NULL);
1931 assert(me != NULL);
1932
1933 reinit(re);
1934
1935 if (re->flags & FL_ANCHOR_START) {
1936 if ((*me = iomatch(re, re->start, io, 0)) == NO_MATCH) {
1937 MRETURN(0);
1938 }
1939 *ms = 0;
1940 MRETURN(1);
1941 }
1942
1943 for (i = 0; io_peek(io, i) != EOF; ++i) {
1944 if ((*me = iomatch(re, re->start, io, i)) != NO_MATCH) {
1945 *ms = i;
1946 MRETURN(1);
1947 }
1948 }
1949
1950 if ((*me = iomatch(re, re->start, io, i)) != NO_MATCH) {
1951 *ms = i;
1952 MRETURN(1);
1953 }
1954 MRETURN(0);
1955 }
1956
1957 int re_match(t_regex *re, const String *s, size_t *ms, size_t *me) {
1958 size_t i;
1959 DEFRBUF;
1960 INITRBUF(re);
1961
1962 assert(re != NULL);
1963 assert(s != NULL);
1964 assert(ms != NULL);
1965 assert(me != NULL);
1966
1967 if (re->minlen > St_len(s)) {
1968 MRETURN(0);
1969 }
1970
1971 reinit(re);
1972
1973 if (re->flags & FL_ANCHOR_START) {
1974 if ((*me = match(re, re->start, s, 0)) == NO_MATCH) {
1975 MRETURN(0);
1976 }
1977 *ms = 0;
1978 MRETURN(1);
1979 }
1980
1981 if (re->start->x.type == RE_LITERAL && re->start->literal.len) {
1982 for (
1983 i = St_str_m(s, re->start->literal.buf, re->start->literal.len);
1984 i + 1u;
1985 i = St_stro_m(s, i + 1u, re->start->literal.buf, re->start->literal.len)
1986 ) {
1987 if ((*me = match(re, re->start->literal.next, s, i + re->start->literal.len)) != NO_MATCH) {
1988 *ms = i;
1989 MRETURN(1);
1990 }
1991 }
1992 MRETURN(0);
1993 }
1994
1995 for (i = 0; i <= St_len(s) - re->minlen; ++i) {
1996 if ((*me = match(re, re->start, s, i)) != NO_MATCH) {
1997 *ms = i;
1998 MRETURN(1);
1999 }
2000 }
2001
2002 MRETURN(0);
2003 }
2004
2005 size_t re_cabra(const t_regex *re) {
2006 return re->captures;
2007 }
2008
2009 int re_backref(const t_regex *re, size_t i, size_t *a, size_t *z) {
2010 if (i >= re->captures) {
2011 return -1;
2012 }
2013 if (!(re->cap_ptr[i].start + 1u && re->cap_ptr[i].end + 1u)) {
2014 return 1;
2015 }
2016 *a = re->cap_ptr[i].start;
2017 *z = re->cap_ptr[i].end;
2018 return 0;
2019 }
2020
2021 ATTR_PURE
2022 static const re_node *branchend(const re_node *a, const re_node *b) {
2023 for (; a; a = a->x.next) {
2024 const re_node *q = b;
2025 for (; q; q = q->x.next) {
2026 if (a == q) {
2027 return q;
2028 }
2029 }
2030 }
2031 return NULL;
2032 }
2033
2034 static void decom_class(String *s, const unsigned char vec[HAVE_C99(static CLASS_SIZE)]) {
2035 unsigned char i;
2036 int flag = 0;
2037 char big_enough[100];
2038
2039 for (i = 0; i < UCHAR_MAX; ++i) {
2040 if (vec[i / CHAR_BIT] & 1u << i % CHAR_BIT) {
2041 if (vec[(i + 1u) / CHAR_BIT] & 1u << (i + 1u) % CHAR_BIT) {
2042 if (!flag) {
2043 flag = 1;
2044 goto mess;
2045 }
2046 flag = 1;
2047 } else {
2048 if (flag) {
2049 if (i > 1u && vec[(i - 2u) / CHAR_BIT] & 1u << (i - 2u) % CHAR_BIT) {
2050 St_cat_c(s, '-');
2051 }
2052 flag = 0;
2053 }
2054 mess:
2055 if (!isprint(i)) {
2056 St_cat_c(s, '\\');
2057 switch (i) {
2058 case '\a': St_cat_c(s, 'a'); break;
2059 case '\b': St_cat_c(s, 'b'); break;
2060 case '\f': St_cat_c(s, 'f'); break;
2061 case '\n': St_cat_c(s, 'n'); break;
2062 case '\r': St_cat_c(s, 'r'); break;
2063 case '\t': St_cat_c(s, 't'); break;
2064 case '\v': St_cat_c(s, 'v'); break;
2065 default:
2066 sprintf(big_enough, "%03o", (unsigned)i);
2067 St_cat_s(s, big_enough);
2068 break;
2069 }
2070 } else {
2071 if (strchr("\\\"", (char)i)) {
2072 St_cat_c(s, '\\');
2073 }
2074 St_cat_c(s, i);
2075 if (strchr("!'-", (char)i)) {
2076 St_cat_c(s, ESCAPE);
2077 }
2078 }
2079 }
2080 }
2081 }
2082 if (vec[i / CHAR_BIT] & 1u << i % CHAR_BIT) {
2083 if (flag) {
2084 if (vec[(i - 2u) / CHAR_BIT] & 1u << (i - 2u) % CHAR_BIT) {
2085 St_cat_c(s, '-');
2086 }
2087 flag = 0;
2088 }
2089 if (!isprint(i)) {
2090 St_cat_c(s, '\\');
2091 switch (i) {
2092 case '\a': St_cat_c(s, 'a'); break;
2093 case '\b': St_cat_c(s, 'b'); break;
2094 case '\f': St_cat_c(s, 'f'); break;
2095 case '\n': St_cat_c(s, 'n'); break;
2096 case '\r': St_cat_c(s, 'r'); break;
2097 case '\t': St_cat_c(s, 't'); break;
2098 case '\v': St_cat_c(s, 'v'); break;
2099 default:
2100 sprintf(big_enough, "%03o", (unsigned)i);
2101 St_cat_s(s, big_enough);
2102 break;
2103 }
2104 } else {
2105 if (strchr("\\\"", (char)i)) {
2106 St_cat_c(s, '\\');
2107 }
2108 St_cat_c(s, i);
2109 if (strchr("!'-", (char)i)) {
2110 St_cat_c(s, ESCAPE);
2111 }
2112 }
2113 }
2114 }
2115
2116 static void dumpclass(FILE *fp, const unsigned char vec[HAVE_C99(static CLASS_SIZE)]) {
2117 String tmp;
2118 St_init(&tmp);
2119 decom_class(&tmp, vec);
2120 ST_WRITE(&tmp, fp);
2121 St_clear(&tmp);
2122 }
2123
2124 ATTR_CONST
2125 static int cdefclass(const unsigned char *ptr) {
2126 if (ptr == dc_word) { return 'w'; }
2127 if (ptr == dc_alpha) { return 'q'; }
2128 if (ptr == dc_cntrl) { return 'c'; }
2129 if (ptr == dc_digit) { return 'd'; }
2130 if (ptr == dc_lower) { return 'l'; }
2131 if (ptr == dc_print) { return 'p'; }
2132 if (ptr == dc_space) { return 's'; }
2133 if (ptr == dc_upper) { return 'u'; }
2134 if (ptr == dc_xdigit) { return 'x'; }
2135 NOTREACHED;
2136 }
2137
2138 static void decom_liter(String *s, const re_node *node) {
2139 size_t i;
2140 assert(node->x.type == RE_LITERAL || node->x.type == RE_PARTIAL);
2141 for (i = 0; i < node->literal.len; ++i) {
2142 if (!isprint((unsigned char)node->literal.buf[i])) {
2143 St_cat_c(s, '\\');
2144 switch (node->literal.buf[i]) {
2145 case '\a': St_cat_c(s, 'a'); break;
2146 case '\b': St_cat_c(s, 'b'); break;
2147 case '\f': St_cat_c(s, 'f'); break;
2148 case '\n': St_cat_c(s, 'n'); break;
2149 case '\r': St_cat_c(s, 'r'); break;
2150 case '\t': St_cat_c(s, 't'); break;
2151 case '\v': St_cat_c(s, 'v'); break;
2152 default: {
2153 char big_enough[100];
2154 sprintf(big_enough, "%03o", (unsigned)(unsigned char)node->literal.buf[i]);
2155 St_cat_s(s, big_enough);
2156 break;
2157 }
2158 }
2159 } else {
2160 if (node->x.type == RE_LITERAL && strchr("\"\\", node->literal.buf[i])) {
2161 St_cat_c(s, '\\');
2162 }
2163 St_cat_c(s, node->literal.buf[i]);
2164 if (MEATCHARP(node->literal.buf[i])) {
2165 St_cat_c(s, ESCAPE);
2166 }
2167 }
2168 }
2169 }
2170
2171 static void dumpliter(FILE *fp, const re_node *node) {
2172 String tmp;
2173 St_init(&tmp);
2174 decom_liter(&tmp, node);
2175 ST_WRITE(&tmp, fp);
2176 St_clear(&tmp);
2177 }
2178
2179 static void do_indent(FILE *fp, size_t n) {
2180 for (; n; --n) {
2181 fputs(" ", fp);
2182 }
2183 }
2184
2185 static void dumpnode(FILE *fp, const re_node *node, const re_node *end, const size_t indent) {
2186 if (node == end) {
2187 return;
2188 }
2189
2190 assert(node != NULL);
2191 fprintf(fp, "[%p] ", (const void *)node);
2192 do_indent(fp, indent);
2193
2194 switch (node->x.type) {
2195 case RE_ALTER: {
2196 const re_node *const be = branchend(node->alter.arg, node->alter.next);
2197 fputs("branch {\n", fp);
2198 dumpnode(fp, node->alter.arg, be, indent + 1);
2199 fprintf(fp, "[%p] ", (const void *)node);
2200 do_indent(fp, indent);
2201 fputs("} or {\n", fp);
2202 dumpnode(fp, node->alter.next, be, indent + 1);
2203 fprintf(fp, "[%p] ", (const void *)node);
2204 do_indent(fp, indent);
2205 fputs("} /branch\n", fp);
2206 dumpnode(fp, be, end, indent);
2207 return;
2208 }
2209
2210 case RE_ANYCH:
2211 fputs("anychar\n", fp);
2212 break;
2213
2214 case RE_AT_BOL:
2215 fputs("at beginning of line\n", fp);
2216 break;
2217
2218 case RE_AT_BOS:
2219 fputs("at beginning of string\n", fp);
2220 break;
2221
2222 case RE_AT_EOL:
2223 fputs("at end of line\n", fp);
2224 break;
2225
2226 case RE_AT_EOS:
2227 fputs("at end of string\n", fp);
2228 break;
2229
2230 case RE_AT_MATCH:
2231 fputs("at match {\n", fp);
2232 dumpnode(fp, node->indep.arg, NULL, indent + 1);
2233 fprintf(fp, "[%p] ", (const void *)node);
2234 do_indent(fp, indent);
2235 fputs("} /at match\n", fp);
2236 break;
2237
2238 case RE_AT_NOMATCH:
2239 fputs("not at match {\n", fp);
2240 dumpnode(fp, node->indep.arg, NULL, indent + 1);
2241 fprintf(fp, "[%p] ", (const void *)node);
2242 do_indent(fp, indent);
2243 fputs("} /not at match\n", fp);
2244 break;
2245
2246 case RE_AT_NWBOUND:
2247 fputs("not at word boundary\n", fp);
2248 break;
2249
2250 case RE_AT_WBOUND:
2251 fputs("at word boundary\n", fp);
2252 break;
2253
2254 case RE_BACKCHECK:
2255 fprintf(fp, "check capture (%lu)\n", (unsigned long)node->backref.n);
2256 break;
2257
2258 case RE_BACKREF:
2259 fprintf(fp, "backref (%lu)\n", (unsigned long)node->backref.n);
2260 break;
2261
2262 case RE_BEGIN_CAPTURE:
2263 fprintf(fp, "begin capture (%lu)\n", (unsigned long)node->backref.n);
2264 break;
2265
2266 case RE_CLASS:
2267 fputs("character class '", fp);
2268 dumpclass(fp, node->class.vec);
2269 fputs("'\n", fp);
2270 break;
2271
2272 case RE_DEFCLASS:
2273 fprintf(fp, "character class %c!\n", cdefclass(node->defclass.vec));
2274 break;
2275
2276 case RE_END_CAPTURE:
2277 fprintf(fp, "end capture (%lu)\n", (unsigned long)node->backref.n);
2278 break;
2279
2280 case RE_INDEP:
2281 fputs("independent submatch {\n", fp);
2282 dumpnode(fp, node->indep.arg, NULL, indent + 1);
2283 fprintf(fp, "[%p] ", (const void *)node);
2284 do_indent(fp, indent);
2285 fputs("} /independent submatch\n", fp);
2286 break;
2287
2288 case RE_LITERAL:
2289 fputs("literal \"", fp);
2290 dumpliter(fp, node);
2291 fputs("\"\n", fp);
2292 break;
2293
2294 case RE_N_CLASS:
2295 fputs("negated character class '", fp);
2296 dumpclass(fp, node->class.vec);
2297 fputs("'\n", fp);
2298 break;
2299
2300 case RE_N_DEFCLASS:
2301 fprintf(fp, "character class %c!\n", toupper(cdefclass(node->defclass.vec)));
2302 break;
2303
2304 case RE_PARTIAL:
2305 fputs("abbreviation `", fp);
2306 dumpliter(fp, node);
2307 fputs("`\n", fp);
2308 break;
2309
2310 case RE_REP_FRUGAL:
2311 fputs("repetition ", fp);
2312 if (node->rep.min == node->rep.max) {
2313 fprintf(fp, ":%lu:?", (unsigned long)node->rep.min);
2314 } else if (node->rep.max != (size_t)-1) {
2315 fprintf(fp, ":%lu,%lu:?", (unsigned long)node->rep.min, (unsigned long)node->rep.max);
2316 } else {
2317 switch (node->rep.min) {
2318 case 0: fputs("*?", fp); break;
2319 case 1: fputs("+?", fp); break;
2320 default: fprintf(fp, ":%lu,:?", (unsigned long)node->rep.min); break;
2321 }
2322 }
2323 fputs(" {\n", fp);
2324 dumpnode(fp, node->rep.arg, node, indent + 1);
2325 fprintf(fp, "[%p] ", (const void *)node);
2326 do_indent(fp, indent);
2327 fputs("} /repetition\n", fp);
2328 break;
2329
2330 case RE_REP_GREEDY:
2331 fputs("repetition ", fp);
2332 if (node->rep.min == node->rep.max) {
2333 fprintf(fp, ":%lu:", (unsigned long)node->rep.min);
2334 } else if (node->rep.max != (size_t)-1) {
2335 fprintf(fp, ":%lu,%lu:", (unsigned long)node->rep.min, (unsigned long)node->rep.max);
2336 } else {
2337 switch (node->rep.min) {
2338 case 0: fputs("*", fp); break;
2339 case 1: fputs("+", fp); break;
2340 default: fprintf(fp, ":%lu,:", (unsigned long)node->rep.min); break;
2341 }
2342 }
2343 fputs(" {\n", fp);
2344 dumpnode(fp, node->rep.arg, node, indent + 1);
2345 fprintf(fp, "[%p] ", (const void *)node);
2346 do_indent(fp, indent);
2347 fputs("} /repetition\n", fp);
2348 break;
2349
2350 case RE_SELECT: {
2351 const re_node *const be = branchend(node->select.arg, node->select.next);
2352 fputs("if {\n", fp);
2353 dumpnode(fp, node->select.cond, NULL, indent + 1);
2354 fprintf(fp, "[%p] ", (const void *)node);
2355 do_indent(fp, indent);
2356 fputs("} then {\n", fp);
2357 dumpnode(fp, node->select.arg, be, indent + 1);
2358 if (node->select.next != be) {
2359 fprintf(fp, "[%p] ", (const void *)node);
2360 do_indent(fp, indent);
2361 fputs("} else {\n", fp);
2362 dumpnode(fp, node->select.next, be, indent + 1);
2363 }
2364 fprintf(fp, "[%p] ", (const void *)node);
2365 do_indent(fp, indent);
2366 fputs("} /if\n", fp);
2367 dumpnode(fp, be, end, indent);
2368 return;
2369 }
2370
2371 case RE_STUFF_FRUGAL:
2372 fputs(".*?\n", fp);
2373 break;
2374
2375 case RE_STUFF_GREEDY:
2376 fputs(".*\n", fp);
2377 break;
2378
2379 case RE_XREP_FRUGAL:
2380 fputs("simple repetition ", fp);
2381 if (node->rep.min == node->rep.max) {
2382 fprintf(fp, ":%lu:?", (unsigned long)node->rep.min);
2383 } else if (node->rep.max != (size_t)-1) {
2384 fprintf(fp, ":%lu,%lu:?", (unsigned long)node->rep.min, (unsigned long)node->rep.max);
2385 } else {
2386 switch (node->rep.min) {
2387 case 0: fputs("*?", fp); break;
2388 case 1: fputs("+?", fp); break;
2389 default: fprintf(fp, ":%lu,:?", (unsigned long)node->rep.min); break;
2390 }
2391 }
2392 fputs(" {\n", fp);
2393 dumpnode(fp, node->rep.arg, NULL, indent + 1);
2394 fprintf(fp, "[%p] ", (const void *)node);
2395 do_indent(fp, indent);
2396 fputs("} /simple repetition\n", fp);
2397 break;
2398
2399 case RE_XREP_GREEDY:
2400 fputs("simple repetition ", fp);
2401 if (node->rep.min == node->rep.max) {
2402 fprintf(fp, ":%lu:", (unsigned long)node->rep.min);
2403 } else if (node->rep.max != (size_t)-1) {
2404 fprintf(fp, ":%lu,%lu:", (unsigned long)node->rep.min, (unsigned long)node->rep.max);
2405 } else {
2406 switch (node->rep.min) {
2407 case 0: fputs("*", fp); break;
2408 case 1: fputs("+", fp); break;
2409 default: fprintf(fp, ":%lu,:", (unsigned long)node->rep.min); break;
2410 }
2411 }
2412 fputs(" {\n", fp);
2413 dumpnode(fp, node->rep.arg, NULL, indent + 1);
2414 fprintf(fp, "[%p] ", (const void *)node);
2415 do_indent(fp, indent);
2416 fputs("} /simple repetition\n", fp);
2417 break;
2418 }
2419 dumpnode(fp, node->x.next, end, indent);
2420 }
2421
2422 void re_dump(const t_regex *re, FILE *fp) {
2423 fprintf(fp, "RE: minlen=%lu, anchor=%s\n", (unsigned long)re->minlen, re->flags & FL_ANCHOR_START ? "start" : "none");
2424 dumpnode(fp, re->start, NULL, 0);
2425 }
2426
2427 static void cat_n(String *s, unsigned long n) {
2428 char big_enough[100];
2429 sprintf(big_enough, "%lu", n);
2430 St_cat_s(s, big_enough);
2431 }
2432
2433 static void decom_repeat(String *s, const re_node *node) {
2434 assert(
2435 node->x.type == RE_REP_FRUGAL ||
2436 node->x.type == RE_REP_GREEDY ||
2437 node->x.type == RE_XREP_FRUGAL ||
2438 node->x.type == RE_XREP_GREEDY
2439 );
2440
2441 if (node->rep.min == 0 && node->rep.max == (size_t)-1) {
2442 St_cat_c(s, '*');
2443 } else if (node->rep.min == 0 && node->rep.max == 1) {
2444 St_cat_c(s, '?');
2445 } else if (node->rep.min == 1 && node->rep.max == (size_t)-1) {
2446 St_cat_c(s, '+');
2447 } else if (node->rep.min == node->rep.max) {
2448 St_cat_c(s, ':');
2449 cat_n(s, node->rep.min);
2450 St_cat_c(s, ':');
2451 } else if (node->rep.max == (size_t)-1) {
2452 St_cat_c(s, ':');
2453 cat_n(s, node->rep.min);
2454 St_cat_m(s, ",:", 2);
2455 } else {
2456 St_cat_c(s, ':');
2457 cat_n(s, node->rep.min);
2458 St_cat_c(s, ',');
2459 cat_n(s, node->rep.max);
2460 St_cat_c(s, ':');
2461 }
2462
2463 if (node->rep.type == RE_REP_FRUGAL || node->rep.type == RE_XREP_FRUGAL) {
2464 St_cat_c(s, '?');
2465 }
2466 }
2467
2468 static void decom_node(String *s, const re_node *node, const re_node *end) {
2469 if (node == end) {
2470 return;
2471 }
2472
2473 assert(node != NULL);
2474 switch (node->x.type) {
2475 case RE_ALTER: {
2476 const re_node *const be = branchend(node->alter.arg, node->alter.next);
2477 St_cat_c(s, '(');
2478 decom_node(s, node->alter.arg, be);
2479 St_cat_c(s, '|');
2480 decom_node(s, node->alter.next, be);
2481 St_cat_c(s, ')');
2482 decom_node(s, be, end);
2483 break;
2484 }
2485
2486 case RE_ANYCH:
2487 St_cat_c(s, '.');
2488 decom_node(s, node->x.next, end);
2489 break;
2490
2491 case RE_AT_BOL:
2492 St_cat_m(s, "A!", 2);
2493 decom_node(s, node->x.next, end);
2494 break;
2495
2496 case RE_AT_BOS:
2497 St_cat_c(s, '^');
2498 decom_node(s, node->x.next, end);
2499 break;
2500
2501 case RE_AT_EOL:
2502 St_cat_m(s, "Z!", 2);
2503 decom_node(s, node->x.next, end);
2504 break;
2505
2506 case RE_AT_EOS:
2507 St_cat_c(s, '$');
2508 decom_node(s, node->x.next, end);
2509 break;
2510
2511 case RE_AT_MATCH:
2512 St_cat_c(s, '[');
2513 decom_node(s, node->indep.arg, NULL);
2514 St_cat_m(s, "&]", 2);
2515 decom_node(s, node->indep.next, end);
2516 break;
2517
2518 case RE_AT_NOMATCH:
2519 St_cat_c(s, '[');
2520 decom_node(s, node->indep.arg, NULL);
2521 St_cat_m(s, "^]", 2);
2522 decom_node(s, node->indep.next, end);
2523 break;
2524
2525 case RE_AT_NWBOUND:
2526 St_cat_m(s, "B!", 2);
2527 decom_node(s, node->x.next, end);
2528 break;
2529
2530 case RE_AT_WBOUND:
2531 St_cat_m(s, "b!", 2);
2532 decom_node(s, node->x.next, end);
2533 break;
2534
2535 case RE_BACKREF:
2536 case RE_BACKCHECK: {
2537 String tmp;
2538 if (node->x.type == RE_BACKCHECK) {
2539 St_cat_c(s, '[');
2540 }
2541 St_init(&tmp);
2542 #if HAVE_VSNPRINTF_P
2543 St_xprintf(&tmp, "%lu", (unsigned long)node->backref.n);
2544 #else
2545 St_num(&tmp, node->backref.n);
2546 #endif
2547 St_cat(s, &tmp);
2548 St_clear(&tmp);
2549 if (node->x.type == RE_BACKCHECK) {
2550 St_cat_c(s, ']');
2551 } else {
2552 St_cat_c(s, ESCAPE);
2553 }
2554 decom_node(s, node->backref.next, end);
2555 break;
2556 }
2557
2558 case RE_BEGIN_CAPTURE:
2559 St_cat_c(s, '{');
2560 decom_node(s, node->x.next, end);
2561 break;
2562
2563 case RE_CLASS:
2564 St_cat_c(s, '\'');
2565 decom_class(s, node->class.vec);
2566 St_cat_c(s, '\'');
2567 decom_node(s, node->class.next, end);
2568 break;
2569
2570 case RE_DEFCLASS:
2571 St_cat_c(s, cdefclass(node->defclass.vec));
2572 St_cat_c(s, ESCAPE);
2573 decom_node(s, node->defclass.next, end);
2574 break;
2575
2576 case RE_END_CAPTURE:
2577 St_cat_c(s, '}');
2578 decom_node(s, node->x.next, end);
2579 break;
2580
2581 case RE_INDEP:
2582 St_cat_c(s, '<');
2583 decom_node(s, node->indep.arg, NULL);
2584 St_cat_c(s, '>');
2585 decom_node(s, node->indep.next, end);
2586 break;
2587
2588 case RE_LITERAL:
2589 decom_liter(s, node);
2590 decom_node(s, node->literal.next, end);
2591 break;
2592
2593 case RE_N_CLASS:
2594 St_cat_m(s, "'^", 2);
2595 decom_class(s, node->class.vec);
2596 St_cat_c(s, '\'');
2597 decom_node(s, node->class.next, end);
2598 break;
2599
2600 case RE_N_DEFCLASS:
2601 St_cat_c(s, toupper(cdefclass(node->defclass.vec)));
2602 St_cat_c(s, ESCAPE);
2603 decom_node(s, node->defclass.next, end);
2604 break;
2605
2606 case RE_PARTIAL:
2607 St_cat_c(s, '`');
2608 decom_liter(s, node);
2609 St_cat_c(s, '`');
2610 break;
2611
2612 case RE_REP_FRUGAL:
2613 case RE_REP_GREEDY:
2614 St_cat_c(s, '(');
2615 decom_node(s, node->rep.arg, node);
2616 St_cat_c(s, ')');
2617 decom_repeat(s, node);
2618 decom_node(s, node->rep.next, end);
2619 break;
2620
2621 case RE_SELECT: {
2622 const re_node *const be = branchend(node->select.arg, node->select.next);
2623 St_cat_c(s, '[');
2624 if (be != node->select.next) {
2625 decom_node(s, node->select.next, be);
2626 St_cat_c(s, '|');
2627 }
2628 decom_node(s, node->select.arg, be);
2629 St_cat_c(s, '(');
2630 decom_node(s, node->select.cond, NULL);
2631 St_cat_c(s, ')');
2632 St_cat_m(s, "?]", 2);
2633 decom_node(s, be, end);
2634 break;
2635 }
2636
2637 case RE_STUFF_FRUGAL:
2638 St_cat_m(s, ".*?", 3);
2639 decom_node(s, node->x.next, end);
2640 break;
2641
2642 case RE_STUFF_GREEDY:
2643 St_cat_m(s, ".*", 2);
2644 decom_node(s, node->x.next, end);
2645 break;
2646
2647 case RE_XREP_FRUGAL:
2648 case RE_XREP_GREEDY:
2649 St_cat_c(s, '(');
2650 decom_node(s, node->rep.arg, NULL);
2651 St_cat_c(s, ')');
2652 decom_repeat(s, node);
2653 decom_node(s, node->rep.next, end);
2654 break;
2655 }
2656 }
2657
2658 void re_decompile(const t_regex *re, String *s) {
2659 St_zero(s);
2660 decom_node(s, re->start, NULL);
2661 }