Mercurial > repo
comparison src/ploki/re.c @ 4223:ac0403686959
<oerjan> rm -rf src/ploki; mv ploki src
author | HackBot |
---|---|
date | Fri, 20 Dec 2013 22:18:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
4222:b0f3e267bb1e | 4223:ac0403686959 |
---|---|
1 #include "config.h" | |
2 #include "IO.h" | |
3 #include "Str.h" | |
4 #include "hash.h" | |
5 #include "main_io.h" | |
6 #include "main_opt.h" | |
7 #include "re.h" | |
8 #include "xmalloc.h" | |
9 #include "zz.h" | |
10 | |
11 #include <ctype.h> | |
12 #include <limits.h> | |
13 #include <stddef.h> | |
14 #include <stdio.h> | |
15 #include <string.h> | |
16 #include <stdlib.h> | |
17 #include <assert.h> | |
18 | |
19 enum {CLASS_SIZE = UCHAR_MAX / CHAR_BIT + 1u}; | |
20 | |
21 static unsigned char dc_word[CLASS_SIZE]; | |
22 static unsigned char dc_alpha[CLASS_SIZE]; | |
23 static unsigned char dc_cntrl[CLASS_SIZE]; | |
24 static unsigned char dc_digit[CLASS_SIZE]; | |
25 static unsigned char dc_lower[CLASS_SIZE]; | |
26 static unsigned char dc_print[CLASS_SIZE]; | |
27 static unsigned char dc_space[CLASS_SIZE]; | |
28 static unsigned char dc_upper[CLASS_SIZE]; | |
29 static unsigned char dc_xdigit[CLASS_SIZE]; | |
30 | |
31 static void fill(unsigned char *v, int (*pred)(int)) { | |
32 unsigned char c; | |
33 | |
34 for (c = UCHAR_MAX; c; --c) { | |
35 v[c / CHAR_BIT] |= !!pred(c) << c % CHAR_BIT; | |
36 } | |
37 v[0] |= !!pred(0); | |
38 } | |
39 | |
40 static void nfill(unsigned char *v, int (*pred)(int)) { | |
41 unsigned char c; | |
42 | |
43 for (c = UCHAR_MAX; c; --c) { | |
44 v[c / CHAR_BIT] |= !pred(c) << c % CHAR_BIT; | |
45 } | |
46 v[0] |= !pred(0); | |
47 } | |
48 | |
49 ATTR_CONST | |
50 static int siword(int c) { | |
51 return c == '_' || isalnum(c); | |
52 } | |
53 | |
54 enum {MAGIC = 23}; | |
55 | |
56 static Hash rcache; | |
57 static String cached[MAGIC]; | |
58 static size_t cachfill, cachlast; | |
59 | |
60 static size_t hash(const void *s, size_t seed) { | |
61 return St_hash(s, seed); | |
62 } | |
63 | |
64 static int compar(const void *a, const void *b) { | |
65 return St_cmp(a, b); | |
66 } | |
67 | |
68 enum t_re_node { | |
69 RE_ALTER, | |
70 RE_ANYCH, | |
71 RE_AT_BOL, | |
72 RE_AT_BOS, | |
73 RE_AT_EOL, | |
74 RE_AT_EOS, | |
75 RE_AT_MATCH, | |
76 RE_AT_NOMATCH, | |
77 RE_AT_NWBOUND, | |
78 RE_AT_WBOUND, | |
79 RE_BACKCHECK, | |
80 RE_BACKREF, | |
81 RE_BEGIN_CAPTURE, | |
82 RE_CLASS, | |
83 RE_DEFCLASS, | |
84 RE_END_CAPTURE, | |
85 RE_INDEP, | |
86 RE_LITERAL, | |
87 RE_N_CLASS, | |
88 RE_N_DEFCLASS, | |
89 RE_PARTIAL, | |
90 RE_REP_FRUGAL, | |
91 RE_REP_GREEDY, | |
92 RE_SELECT, | |
93 RE_STUFF_FRUGAL, | |
94 RE_STUFF_GREEDY, | |
95 RE_XREP_FRUGAL, | |
96 RE_XREP_GREEDY | |
97 }; | |
98 | |
99 #define CMN_HDR enum t_re_node type; union my_node *next | |
100 | |
101 typedef union my_node { | |
102 struct { | |
103 CMN_HDR; | |
104 } x; | |
105 struct { | |
106 CMN_HDR; | |
107 union my_node *arg; | |
108 } alter; | |
109 struct { | |
110 CMN_HDR; | |
111 size_t n; | |
112 } backref; | |
113 struct { | |
114 CMN_HDR; | |
115 size_t n; | |
116 } capture; | |
117 struct { | |
118 CMN_HDR; | |
119 unsigned char vec[CLASS_SIZE]; | |
120 } class; | |
121 struct { | |
122 CMN_HDR; | |
123 unsigned char *vec; | |
124 } defclass; | |
125 struct { | |
126 CMN_HDR; | |
127 union my_node *arg; | |
128 } indep; | |
129 struct { | |
130 CMN_HDR; | |
131 unsigned char *buf; | |
132 size_t len; | |
133 } literal; | |
134 struct { | |
135 CMN_HDR; | |
136 union my_node *arg; | |
137 size_t min, max; | |
138 size_t n; | |
139 } rep; | |
140 struct { | |
141 CMN_HDR; | |
142 union my_node *arg; | |
143 union my_node *cond; | |
144 } select; | |
145 } re_node; | |
146 | |
147 #include "re_block.c.h" | |
148 | |
149 enum re_flags { | |
150 FL_NONE = 0, | |
151 FL_ANCHOR_START = 1 | |
152 }; | |
153 | |
154 struct cap_state { | |
155 size_t start, end, pending; | |
156 }; | |
157 | |
158 struct my_regex { | |
159 re_node *start; | |
160 size_t repets; | |
161 size_t *repbuf; | |
162 t_block alloc; | |
163 size_t captures; | |
164 struct cap_state *cap_ptr; | |
165 size_t refs; | |
166 enum re_flags flags; | |
167 size_t minlen; | |
168 }; | |
169 | |
170 static void delv(void *v) { | |
171 re_free(v); | |
172 } | |
173 | |
174 void re_init(void) { | |
175 fill(dc_word, siword); | |
176 fill(dc_alpha, isalpha); | |
177 fill(dc_cntrl, iscntrl); | |
178 fill(dc_digit, isdigit); | |
179 fill(dc_lower, islower); | |
180 fill(dc_print, isprint); | |
181 fill(dc_space, isspace); | |
182 fill(dc_upper, isupper); | |
183 fill(dc_xdigit, isxdigit); | |
184 | |
185 h_init(&rcache, hash, compar, NULL, delv); | |
186 } | |
187 | |
188 void re_end(void) { | |
189 h_end(&rcache); | |
190 while (cachfill) { | |
191 --cachfill; | |
192 St_clear(&cached[cachfill]); | |
193 } | |
194 cachlast = 0; | |
195 } | |
196 | |
197 | |
198 enum {ESCAPE = '!'}; | |
199 #define MEATCHARP(c) strchr("!|({<[]>})*+?:.'^$`", c) | |
200 #define ENDBRANCHP(c) strchr("|({<[", c) | |
201 | |
202 struct parse_context { | |
203 t_regex *base; | |
204 t_block *block; | |
205 const String *s; | |
206 size_t *p; | |
207 }; | |
208 | |
209 static void skipspace(const String *const s, size_t *const p) { | |
210 restart: | |
211 while (isspace(ST_INDEX(s, *p))) { | |
212 --*p; | |
213 } | |
214 if (ST_INDEX(s, *p) == ']' && ST_INDEX(s, *p - 1u) == '#') { | |
215 *p -= 2u; | |
216 while (ST_INDEX(s, *p) != EOF) { | |
217 if (ST_INDEX(s, *p) == '[') { | |
218 --*p; | |
219 goto restart; | |
220 } | |
221 if (ST_INDEX(s, *p) == ESCAPE && ST_INDEX(s, *p - 1u) != EOF) { | |
222 --*p; | |
223 } | |
224 --*p; | |
225 } | |
226 } | |
227 } | |
228 | |
229 #define PRAZ(name) static re_node *name(const struct parse_context *con, re_node *next, re_node *curolan_zebeth) | |
230 #define SNORK const String *const s = con->s; size_t *const p = con->p | |
231 #define CALL(name, nextn, glob) name(con, nextn, glob) | |
232 #define TAILC(name) return CALL(name, next, curolan_zebeth) | |
233 | |
234 PRAZ(quantifire); | |
235 PRAZ(branch); | |
236 PRAZ(alternation); | |
237 | |
238 PRAZ(atom) { | |
239 SNORK; | |
240 (void)curolan_zebeth; | |
241 | |
242 skipspace(s, p); | |
243 switch (ST_INDEX(s, *p)) { | |
244 case '(': | |
245 case '<': | |
246 case '{': | |
247 case '[': | |
248 case EOF: { | |
249 re_node *cur = bl_node(con->block); | |
250 cur->literal.type = RE_LITERAL; | |
251 cur->literal.next = next; | |
252 cur->literal.buf = NULL; | |
253 cur->literal.len = 0; | |
254 return cur; | |
255 } | |
256 | |
257 case ESCAPE: { | |
258 re_node *cur = bl_node(con->block); | |
259 --*p; | |
260 if (isdigit(ST_INDEX(s, *p))) { | |
261 do { | |
262 --*p; | |
263 } while (isdigit(ST_INDEX(s, *p))); | |
264 cur->backref.type = RE_BACKREF; | |
265 cur->backref.next = next; | |
266 cur->backref.n = strtoul(St_ptr(s) + *p + 1, NULL, 10); | |
267 } else if (isalpha(ST_INDEX(s, *p))) { | |
268 switch (ST_INDEX(s, *p)) { | |
269 case 'A': | |
270 cur->x.type = RE_AT_BOL; | |
271 cur->x.next = next; | |
272 --*p; | |
273 break; | |
274 case 'Z': | |
275 cur->x.type = RE_AT_EOL; | |
276 cur->x.next = next; | |
277 --*p; | |
278 break; | |
279 | |
280 case 'b': | |
281 cur->x.type = RE_AT_WBOUND; | |
282 cur->x.next = next; | |
283 --*p; | |
284 break; | |
285 case 'B': | |
286 cur->x.type = RE_AT_NWBOUND; | |
287 cur->x.next = next; | |
288 --*p; | |
289 break; | |
290 | |
291 case 'c': | |
292 cur->defclass.type = RE_DEFCLASS; | |
293 cur->defclass.vec = dc_cntrl; | |
294 cur->defclass.next = next; | |
295 --*p; | |
296 break; | |
297 case 'C': | |
298 cur->defclass.type = RE_N_DEFCLASS; | |
299 cur->defclass.vec = dc_cntrl; | |
300 cur->defclass.next = next; | |
301 --*p; | |
302 break; | |
303 | |
304 case 'd': | |
305 cur->defclass.type = RE_DEFCLASS; | |
306 cur->defclass.vec = dc_digit; | |
307 cur->defclass.next = next; | |
308 --*p; | |
309 break; | |
310 case 'D': | |
311 cur->defclass.type = RE_N_DEFCLASS; | |
312 cur->defclass.vec = dc_digit; | |
313 cur->defclass.next = next; | |
314 --*p; | |
315 break; | |
316 | |
317 case 'l': | |
318 cur->defclass.type = RE_DEFCLASS; | |
319 cur->defclass.vec = dc_lower; | |
320 cur->defclass.next = next; | |
321 --*p; | |
322 break; | |
323 case 'L': | |
324 cur->defclass.type = RE_N_DEFCLASS; | |
325 cur->defclass.vec = dc_lower; | |
326 cur->defclass.next = next; | |
327 --*p; | |
328 break; | |
329 | |
330 case 'p': | |
331 cur->defclass.type = RE_DEFCLASS; | |
332 cur->defclass.vec = dc_print; | |
333 cur->defclass.next = next; | |
334 --*p; | |
335 break; | |
336 case 'P': | |
337 cur->defclass.type = RE_N_DEFCLASS; | |
338 cur->defclass.vec = dc_print; | |
339 cur->defclass.next = next; | |
340 --*p; | |
341 break; | |
342 | |
343 case 'q': | |
344 cur->defclass.type = RE_DEFCLASS; | |
345 cur->defclass.vec = dc_alpha; | |
346 cur->defclass.next = next; | |
347 --*p; | |
348 break; | |
349 case 'Q': | |
350 cur->defclass.type = RE_N_DEFCLASS; | |
351 cur->defclass.vec = dc_alpha; | |
352 cur->defclass.next = next; | |
353 --*p; | |
354 break; | |
355 | |
356 case 's': | |
357 cur->defclass.type = RE_DEFCLASS; | |
358 cur->defclass.vec = dc_space; | |
359 cur->defclass.next = next; | |
360 --*p; | |
361 break; | |
362 case 'S': | |
363 cur->defclass.type = RE_N_DEFCLASS; | |
364 cur->defclass.vec = dc_space; | |
365 cur->defclass.next = next; | |
366 --*p; | |
367 break; | |
368 | |
369 case 'u': | |
370 cur->defclass.type = RE_DEFCLASS; | |
371 cur->defclass.vec = dc_upper; | |
372 cur->defclass.next = next; | |
373 --*p; | |
374 break; | |
375 case 'U': | |
376 cur->defclass.type = RE_N_DEFCLASS; | |
377 cur->defclass.vec = dc_upper; | |
378 cur->defclass.next = next; | |
379 --*p; | |
380 break; | |
381 | |
382 case 'w': | |
383 cur->defclass.type = RE_DEFCLASS; | |
384 cur->defclass.vec = dc_word; | |
385 cur->defclass.next = next; | |
386 --*p; | |
387 break; | |
388 case 'W': | |
389 cur->defclass.type = RE_N_DEFCLASS; | |
390 cur->defclass.vec = dc_word; | |
391 cur->defclass.next = next; | |
392 --*p; | |
393 break; | |
394 | |
395 case 'x': | |
396 cur->defclass.type = RE_DEFCLASS; | |
397 cur->defclass.vec = dc_xdigit; | |
398 cur->defclass.next = next; | |
399 --*p; | |
400 break; | |
401 case 'X': | |
402 cur->defclass.type = RE_N_DEFCLASS; | |
403 cur->defclass.vec = dc_xdigit; | |
404 cur->defclass.next = next; | |
405 --*p; | |
406 break; | |
407 | |
408 default: | |
409 goto default_escape; | |
410 } | |
411 | |
412 } else default_escape: { | |
413 cur->literal.type = RE_LITERAL; | |
414 cur->literal.next = next; | |
415 cur->literal.buf = bl_string(con->block, cur->literal.len = 1); | |
416 if (ST_INDEX(s, *p) == EOF) { | |
417 cur->literal.buf[0] = ESCAPE; | |
418 } else { | |
419 cur->literal.buf[0] = ST_INDEX(s, *p); | |
420 --*p; | |
421 } | |
422 } | |
423 return cur; | |
424 } | |
425 | |
426 case ')': { | |
427 re_node *cur; | |
428 --*p; | |
429 cur = CALL(alternation, next, next); | |
430 skipspace(s, p); | |
431 if (ST_INDEX(s, *p) == '(') { | |
432 --*p; | |
433 } | |
434 return cur; | |
435 } | |
436 | |
437 case ']': { | |
438 re_node *cur; | |
439 switch (ST_INDEX(s, *p - 1u)) { | |
440 case '&': | |
441 *p -= 2u; | |
442 cur = bl_node(con->block); | |
443 cur->indep.type = RE_AT_MATCH; | |
444 cur->indep.next = next; | |
445 cur->indep.arg = CALL(alternation, NULL, NULL); | |
446 break; | |
447 | |
448 case '^': | |
449 *p -= 2u; | |
450 cur = bl_node(con->block); | |
451 cur->indep.type = RE_AT_NOMATCH; | |
452 cur->indep.next = next; | |
453 cur->indep.arg = CALL(alternation, NULL, NULL); | |
454 break; | |
455 | |
456 case '?': | |
457 *p -= 2u; | |
458 cur = bl_node(con->block); | |
459 cur->select.type = RE_SELECT; | |
460 cur->select.cond = CALL(quantifire, NULL, NULL); | |
461 cur->select.arg = CALL(branch, next, next); | |
462 skipspace(s, p); | |
463 if (ST_INDEX(s, *p) == '|') { | |
464 --*p; | |
465 cur->select.next = CALL(alternation, next, next); | |
466 } else { | |
467 cur->select.next = next; | |
468 } | |
469 break; | |
470 | |
471 case '0': case '1': case '2': case '3': case '4': | |
472 case '5': case '6': case '7': case '8': case '9': { | |
473 size_t i; | |
474 | |
475 for (i = *p - 2u; isdigit(ST_INDEX(s, i)); --i) | |
476 ; | |
477 if (ST_INDEX(s, i) != '[') { | |
478 goto std_atom; | |
479 } | |
480 cur = bl_node(con->block); | |
481 cur->backref.type = RE_BACKCHECK; | |
482 cur->backref.next = next; | |
483 cur->backref.n = strtoul(St_ptr(s) + i + 1u, NULL, 10); | |
484 *p = i - 1u; | |
485 | |
486 return cur; | |
487 } | |
488 | |
489 default: | |
490 goto std_atom; | |
491 } | |
492 | |
493 skipspace(s, p); | |
494 if (ST_INDEX(s, *p) == '[') { | |
495 --*p; | |
496 } | |
497 return cur; | |
498 } | |
499 | |
500 case '>': { | |
501 re_node *cur = bl_node(con->block); | |
502 --*p; | |
503 cur->indep.type = RE_INDEP; | |
504 cur->indep.next = next; | |
505 cur->indep.arg = CALL(alternation, NULL, NULL); | |
506 skipspace(s, p); | |
507 if (ST_INDEX(s, *p) == '<') { | |
508 --*p; | |
509 } | |
510 return cur; | |
511 } | |
512 | |
513 case '}': { | |
514 re_node *begin = bl_node(con->block); | |
515 re_node *end = bl_node(con->block); | |
516 --*p; | |
517 begin->capture.n = end->capture.n = con->base->captures++; | |
518 end->capture.type = RE_END_CAPTURE; | |
519 end->capture.next = next; | |
520 begin->capture.type = RE_BEGIN_CAPTURE; | |
521 begin->capture.next = CALL(alternation, end, end); | |
522 skipspace(s, p); | |
523 if (ST_INDEX(s, *p) == '{') { | |
524 --*p; | |
525 } | |
526 return begin; | |
527 } | |
528 | |
529 case '^': { | |
530 re_node *cur = bl_node(con->block); | |
531 cur->x.type = RE_AT_BOS; | |
532 cur->x.next = next; | |
533 --*p; | |
534 return cur; | |
535 } | |
536 | |
537 case '$': { | |
538 re_node *cur = bl_node(con->block); | |
539 cur->x.type = RE_AT_EOS; | |
540 cur->x.next = next; | |
541 --*p; | |
542 return cur; | |
543 } | |
544 | |
545 case '.': { | |
546 re_node *cur = bl_node(con->block); | |
547 cur->x.type = RE_ANYCH; | |
548 cur->x.next = next; | |
549 --*p; | |
550 return cur; | |
551 } | |
552 | |
553 case '`': { | |
554 String tmp; | |
555 int c; | |
556 re_node *cur = bl_node(con->block); | |
557 cur->literal.type = RE_PARTIAL; | |
558 cur->literal.next = next; | |
559 | |
560 --*p; | |
561 St_init(&tmp); | |
562 while ( | |
563 (c = ST_INDEX(s, *p)) != EOF && | |
564 c != '`' | |
565 ) { | |
566 --*p; | |
567 if (c == ESCAPE && ST_INDEX(s, *p) != EOF) { | |
568 c = ST_INDEX(s, *p); | |
569 --*p; | |
570 } | |
571 St_cat_c(&tmp, c); | |
572 } | |
573 if (c == '`') { | |
574 --*p; | |
575 } | |
576 St_reverse(&tmp); | |
577 | |
578 cur->literal.buf = bl_string(con->block, cur->literal.len = St_len(&tmp)); | |
579 memcpy(cur->literal.buf, St_ptr(&tmp), St_len(&tmp)); | |
580 St_clear(&tmp); | |
581 | |
582 return cur; | |
583 } | |
584 | |
585 case '\'': { | |
586 int c; | |
587 re_node *cur = bl_node(con->block); | |
588 cur->class.type = RE_CLASS; | |
589 cur->class.next = next; | |
590 memset(cur->class.vec, '\0', sizeof cur->class.vec); | |
591 for (--*p; (c = ST_INDEX(s, *p)) != EOF && c != '\''; --*p) { | |
592 if (c == ESCAPE) { | |
593 switch (c = ST_INDEX(s, *p - 1)) { | |
594 case EOF: | |
595 cur->class.vec[ESCAPE / CHAR_BIT] |= 1u << ESCAPE % CHAR_BIT; | |
596 break; | |
597 | |
598 case 'c': | |
599 fill(cur->class.vec, iscntrl); | |
600 --*p; | |
601 break; | |
602 case 'C': | |
603 nfill(cur->class.vec, iscntrl); | |
604 --*p; | |
605 break; | |
606 | |
607 case 'd': | |
608 fill(cur->class.vec, isdigit); | |
609 --*p; | |
610 break; | |
611 case 'D': | |
612 nfill(cur->class.vec, isdigit); | |
613 --*p; | |
614 break; | |
615 | |
616 case 'l': | |
617 fill(cur->class.vec, islower); | |
618 --*p; | |
619 break; | |
620 case 'L': | |
621 nfill(cur->class.vec, islower); | |
622 --*p; | |
623 break; | |
624 | |
625 case 'p': | |
626 fill(cur->class.vec, isprint); | |
627 --*p; | |
628 break; | |
629 case 'P': | |
630 nfill(cur->class.vec, isprint); | |
631 --*p; | |
632 break; | |
633 | |
634 case 'q': | |
635 fill(cur->class.vec, isalpha); | |
636 --*p; | |
637 break; | |
638 case 'Q': | |
639 nfill(cur->class.vec, isalpha); | |
640 --*p; | |
641 break; | |
642 | |
643 case 's': | |
644 fill(cur->class.vec, isspace); | |
645 --*p; | |
646 break; | |
647 case 'S': | |
648 nfill(cur->class.vec, isspace); | |
649 --*p; | |
650 break; | |
651 | |
652 case 'u': | |
653 fill(cur->class.vec, isupper); | |
654 --*p; | |
655 break; | |
656 case 'U': | |
657 nfill(cur->class.vec, isupper); | |
658 --*p; | |
659 break; | |
660 | |
661 case 'w': | |
662 fill(cur->class.vec, siword); | |
663 --*p; | |
664 break; | |
665 case 'W': | |
666 nfill(cur->class.vec, siword); | |
667 --*p; | |
668 break; | |
669 | |
670 case 'x': | |
671 fill(cur->class.vec, isxdigit); | |
672 --*p; | |
673 break; | |
674 case 'X': | |
675 nfill(cur->class.vec, isxdigit); | |
676 --*p; | |
677 break; | |
678 | |
679 default: | |
680 cur->class.vec[c / CHAR_BIT] |= 1u << c % CHAR_BIT; | |
681 --*p; | |
682 break; | |
683 } | |
684 } else { | |
685 int b; | |
686 if (ST_INDEX(s, *p - 1) == '-' && (b = ST_INDEX(s, *p - 2)) != EOF) { | |
687 if (b == ESCAPE && (b = ST_INDEX(s, *p - 3)) == EOF) { | |
688 b = ESCAPE; | |
689 } | |
690 if (b > c) { | |
691 goto normal_char; | |
692 } | |
693 for (; b < c; ++b) { | |
694 cur->class.vec[b / CHAR_BIT] |= 1u << b % CHAR_BIT; | |
695 } | |
696 cur->class.vec[b / CHAR_BIT] |= 1u << b % CHAR_BIT; | |
697 *p -= 2; | |
698 if (ST_INDEX(s, *p) == ESCAPE && ST_INDEX(s, *p - 1) != EOF) { | |
699 --*p; | |
700 } | |
701 } else if (ST_INDEX(s, *p) == ']' && *p >= 8 && ST_INDEX(s, *p - 1) == ':') { | |
702 | |
703 #define CXCLASS_CHECK(n, t, f, o, b) if (1) { \ | |
704 if (memcmp(St_ptr(s) + *p - (n), (t), (n) - 1) != 0) { \ | |
705 goto normal_char; \ | |
706 } \ | |
707 (b)(cur->class.vec, (f)); \ | |
708 *p -= (o); \ | |
709 break; \ | |
710 } else (void)0 | |
711 | |
712 #define CCLASS_CHECK(t, f, o) CXCLASS_CHECK(sizeof (t), t, f, o, fill) | |
713 #define CNCLASS_CHECK(t, f, o) CXCLASS_CHECK(sizeof (t), t, f, o, nfill) | |
714 | |
715 if (ST_INDEX(s, *p - 8) == '[' && ST_INDEX(s, *p - 7) == ':') { | |
716 switch (ST_INDEX(s, *p - 6)) { | |
717 case 'a': | |
718 if (ST_INDEX(s, *p - 5) != 'l') { | |
719 goto normal_char; | |
720 } | |
721 switch (ST_INDEX(s, *p - 4)) { | |
722 case 'n': CCLASS_CHECK("um", isalnum, 8); | |
723 case 'p': CCLASS_CHECK("ha", isalpha, 8); | |
724 default: goto normal_char; | |
725 } | |
726 break; | |
727 | |
728 case 'c': CCLASS_CHECK("ntrl", iscntrl, 8); | |
729 case 'd': CCLASS_CHECK("igit", isdigit, 8); | |
730 case 'g': CCLASS_CHECK("raph", isgraph, 8); | |
731 case 'l': CCLASS_CHECK("ower", islower, 8); | |
732 | |
733 case 'p': | |
734 switch (ST_INDEX(s, *p - 5)) { | |
735 case 'r': CCLASS_CHECK("int", isprint, 8); | |
736 case 'u': CCLASS_CHECK("nct", ispunct, 8); | |
737 default: goto normal_char; | |
738 } | |
739 break; | |
740 | |
741 case 's': CCLASS_CHECK("pace", isspace, 8); | |
742 case 'u': CCLASS_CHECK("pper", isupper, 8); | |
743 default: goto normal_char; | |
744 } | |
745 } else if (*p >= 9 && ST_INDEX(s, *p - 9) == '[' && ST_INDEX(s, *p - 8) == ':') { | |
746 switch (ST_INDEX(s, *p - 7)) { | |
747 case 'x': | |
748 CCLASS_CHECK("digit", isxdigit, 9); | |
749 | |
750 case '^': | |
751 switch (ST_INDEX(s, *p - 6)) { | |
752 case 'a': | |
753 if (ST_INDEX(s, *p - 5) != 'l') { | |
754 goto normal_char; | |
755 } | |
756 switch (ST_INDEX(s, *p - 4)) { | |
757 case 'n': CNCLASS_CHECK("um", isalnum, 9); | |
758 case 'p': CNCLASS_CHECK("ha", isalpha, 9); | |
759 default: goto normal_char; | |
760 } | |
761 break; | |
762 | |
763 case 'c': CNCLASS_CHECK("ntrl", iscntrl, 9); | |
764 case 'd': CNCLASS_CHECK("igit", isdigit, 9); | |
765 case 'g': CNCLASS_CHECK("raph", isgraph, 9); | |
766 case 'l': CNCLASS_CHECK("ower", islower, 9); | |
767 | |
768 case 'p': | |
769 switch (ST_INDEX(s, *p - 5)) { | |
770 case 'r': CNCLASS_CHECK("int", isprint, 9); | |
771 case 'u': CNCLASS_CHECK("nct", ispunct, 9); | |
772 default: goto normal_char; | |
773 } | |
774 break; | |
775 | |
776 case 's': CNCLASS_CHECK("pace", isspace, 9); | |
777 case 'u': CNCLASS_CHECK("pper", isupper, 9); | |
778 default: goto normal_char; | |
779 } | |
780 break; | |
781 | |
782 default: goto normal_char; | |
783 } | |
784 } else if (*p >= 10 && memcmp(St_ptr(s) + *p - 10, "[:^xdigit", 9) == 0) { | |
785 nfill(cur->class.vec, isxdigit); | |
786 *p -= 10; | |
787 } else { | |
788 goto normal_char; | |
789 } | |
790 } else if (ST_INDEX(s, *p) == '^' && ((b = ST_INDEX(s, *p - 1) == '\'') || b == EOF)) { | |
791 cur->class.type = RE_N_CLASS; | |
792 } else normal_char: { | |
793 cur->class.vec[c / CHAR_BIT] |= 1u << c % CHAR_BIT; | |
794 } | |
795 } | |
796 } | |
797 if (c == '\'') { | |
798 --*p; | |
799 } | |
800 | |
801 if (cur->class.type == RE_CLASS) { | |
802 size_t i; | |
803 for (i = 0; i < CLASS_SIZE; ++i) { | |
804 if (cur->class.vec[i] != (unsigned char)~0u) { | |
805 return cur; | |
806 } | |
807 } | |
808 cur->x.type = RE_ANYCH; | |
809 } else { | |
810 size_t i; | |
811 assert(cur->class.type == RE_N_CLASS); | |
812 for (i = 0; i < CLASS_SIZE; ++i) { | |
813 if (cur->class.vec[i] != 0) { | |
814 return cur; | |
815 } | |
816 } | |
817 cur->x.type = RE_ANYCH; | |
818 } | |
819 | |
820 return cur; | |
821 } | |
822 | |
823 std_atom: | |
824 default: { | |
825 re_node *cur = bl_node(con->block); | |
826 cur->literal.type = RE_LITERAL; | |
827 | |
828 /*XXX?*/ | |
829 if (next && next->x.type == RE_LITERAL) { | |
830 if (next->literal.len) { | |
831 cur->literal.buf = bl_string(con->block, cur->literal.len = 1u + next->literal.len); | |
832 memcpy(cur->literal.buf + 1, next->literal.buf, next->literal.len); | |
833 } | |
834 cur->literal.next = next->literal.next; | |
835 } else { | |
836 cur->literal.next = next; | |
837 cur->literal.buf = bl_string(con->block, cur->literal.len = 1); | |
838 } | |
839 | |
840 cur->literal.buf[0] = ST_INDEX(s, *p); | |
841 --*p; | |
842 return cur; | |
843 } | |
844 } | |
845 } | |
846 | |
847 PRAZ(literal) { | |
848 SNORK; | |
849 int c; | |
850 String tmp; | |
851 re_node *cur; | |
852 | |
853 skipspace(s, p); | |
854 if ((ST_INDEX(s, *p) != ESCAPE || isalnum(ST_INDEX(s, *p - 1))) && MEATCHARP(ST_INDEX(s, *p))) { | |
855 TAILC(atom); | |
856 } | |
857 | |
858 cur = bl_node(con->block); | |
859 cur->literal.type = RE_LITERAL; | |
860 cur->literal.next = next; | |
861 | |
862 St_init(&tmp); | |
863 while ( | |
864 (c = ST_INDEX(s, *p)) != EOF && | |
865 ( | |
866 !MEATCHARP(c) || | |
867 (c == ESCAPE && !isalnum(ST_INDEX(s, *p - 1))) | |
868 ) | |
869 ) { | |
870 --*p; | |
871 if (c == ESCAPE && ST_INDEX(s, *p) != EOF) { | |
872 c = ST_INDEX(s, *p); | |
873 --*p; | |
874 } | |
875 St_cat_c(&tmp, c); | |
876 skipspace(s, p); | |
877 } | |
878 St_reverse(&tmp); | |
879 | |
880 if (next && next->x.type == RE_LITERAL) { | |
881 if (next->literal.len) { | |
882 St_cat_m(&tmp, next->literal.buf, next->literal.len); | |
883 } | |
884 cur->literal.next = next->literal.next; | |
885 } | |
886 | |
887 cur->literal.buf = bl_string(con->block, cur->literal.len = St_len(&tmp)); | |
888 memcpy(cur->literal.buf, St_ptr(&tmp), St_len(&tmp)); | |
889 St_clear(&tmp); | |
890 | |
891 return cur; | |
892 } | |
893 | |
894 #define RETURN_REP(lo, hi, grdy) \ | |
895 do { \ | |
896 re_node *cur = bl_node(con->block); \ | |
897 cur->rep.type = (grdy); \ | |
898 cur->rep.next = next; \ | |
899 cur->rep.min = (lo); \ | |
900 cur->rep.max = (hi); \ | |
901 cur->rep.n = con->base->repets++; \ | |
902 cur->rep.arg = CALL(atom, cur, NULL); \ | |
903 return cur; \ | |
904 } while (0) | |
905 | |
906 #define STD_REP_CASE(lo, hi, grdy) \ | |
907 do { \ | |
908 --*p; \ | |
909 RETURN_REP(lo, hi, grdy); \ | |
910 } while (0) | |
911 | |
912 #define CUSTOM_REP_CASE(off, grdy, mismatch) \ | |
913 do { \ | |
914 size_t from, to; \ | |
915 to = *p - 1 - (off); \ | |
916 while (isdigit(ST_INDEX(s, to))) { \ | |
917 --to; \ | |
918 } \ | |
919 if (ST_INDEX(s, to) == ':') { \ | |
920 *p = to - 1; \ | |
921 to = ST_INDEX(s, to + 1) == ':' ? (size_t)-1 : strtoul(St_ptr(s) + to + 1, NULL, 10); \ | |
922 from = to; \ | |
923 } else { \ | |
924 if (ST_INDEX(s, to) != ',') { \ | |
925 mismatch; \ | |
926 } \ | |
927 from = to - 1; \ | |
928 if (!isdigit(ST_INDEX(s, from))) { \ | |
929 mismatch; \ | |
930 } \ | |
931 do { \ | |
932 --from; \ | |
933 } while (isdigit(ST_INDEX(s, from))); \ | |
934 if (ST_INDEX(s, from) != ':') { \ | |
935 mismatch; \ | |
936 } \ | |
937 *p = from - 1; \ | |
938 from = strtoul(St_ptr(s) + from + 1, NULL, 10); \ | |
939 to = ST_INDEX(s, to + 1) == ':' ? (size_t)-1 : strtoul(St_ptr(s) + to + 1, NULL, 10); \ | |
940 } \ | |
941 if (to == 1) { \ | |
942 if (from == 1) { \ | |
943 TAILC(atom); \ | |
944 } \ | |
945 if (from == 0) { \ | |
946 re_node *cur = bl_node(con->block); \ | |
947 cur->alter.type = RE_ALTER; \ | |
948 if ((grdy) == RE_REP_GREEDY) { \ | |
949 cur->alter.next = next; \ | |
950 cur->alter.arg = CALL(atom, next, NULL); \ | |
951 } else { \ | |
952 cur->alter.arg = next; \ | |
953 cur->alter.next = CALL(atom, next, NULL); \ | |
954 } \ | |
955 return cur; \ | |
956 } \ | |
957 } \ | |
958 RETURN_REP(from, to, grdy); \ | |
959 } while (0) | |
960 | |
961 PRAZ(quantifire) { | |
962 SNORK; | |
963 | |
964 skipspace(s, p); | |
965 switch (ST_INDEX(s, *p)) { | |
966 case EOF: return next; | |
967 | |
968 case '?': | |
969 switch (ST_INDEX(s, *p - 1)) { | |
970 case ':': CUSTOM_REP_CASE(1, RE_REP_FRUGAL, goto hook); | |
971 case '*': --*p; STD_REP_CASE(0, -1, RE_REP_FRUGAL); | |
972 case '+': --*p; STD_REP_CASE(1, -1, RE_REP_FRUGAL); | |
973 | |
974 case '?': { | |
975 re_node *cur = bl_node(con->block); | |
976 *p -= 2; | |
977 cur->alter.type = RE_ALTER; | |
978 cur->alter.next = CALL(atom, next, NULL); | |
979 cur->alter.arg = next; | |
980 return cur; | |
981 } | |
982 | |
983 hook: | |
984 default: { | |
985 re_node *cur = bl_node(con->block); | |
986 --*p; | |
987 cur->alter.type = RE_ALTER; | |
988 cur->alter.next = next; | |
989 cur->alter.arg = CALL(atom, next, NULL); | |
990 return cur; | |
991 } | |
992 } | |
993 | |
994 case ':': CUSTOM_REP_CASE(0, RE_REP_GREEDY, goto noquanti); | |
995 case '*': STD_REP_CASE(0, -1, RE_REP_GREEDY); | |
996 case '+': STD_REP_CASE(1, -1, RE_REP_GREEDY); | |
997 | |
998 noquanti: | |
999 default: TAILC(literal); | |
1000 } | |
1001 } | |
1002 | |
1003 PRAZ(branch) { | |
1004 SNORK; | |
1005 skipspace(s, p); | |
1006 while (ST_INDEX(s, *p) != EOF && !ENDBRANCHP(ST_INDEX(s, *p))) { | |
1007 next = CALL(quantifire, next, curolan_zebeth); | |
1008 skipspace(s, p); | |
1009 } | |
1010 return next; | |
1011 } | |
1012 | |
1013 PRAZ(alternation) { | |
1014 SNORK; | |
1015 re_node *cur; | |
1016 | |
1017 skipspace(s, p); | |
1018 if (ST_INDEX(s, *p) == EOF) { | |
1019 return next; | |
1020 } | |
1021 | |
1022 next = CALL(branch, next, curolan_zebeth); | |
1023 skipspace(s, p); | |
1024 if (ST_INDEX(s, *p) != '|') { | |
1025 return next; | |
1026 } | |
1027 | |
1028 --*p; | |
1029 cur = bl_node(con->block); | |
1030 cur->alter.type = RE_ALTER; | |
1031 cur->alter.next = next; | |
1032 cur->alter.arg = CALL(alternation, curolan_zebeth, curolan_zebeth); | |
1033 return cur; | |
1034 } | |
1035 | |
1036 ATTR_CONST | |
1037 static size_t minimum(const size_t a, const size_t b) { | |
1038 return a < b ? a : b; | |
1039 } | |
1040 | |
1041 ATTR_CONST | |
1042 static size_t maximum(const size_t a, const size_t b) { | |
1043 return a > b ? a : b; | |
1044 } | |
1045 | |
1046 ATTR_PURE | |
1047 static size_t subminlen(const re_node *node, const re_node *end, size_t start) { | |
1048 if (!node || node == end) { | |
1049 return start; | |
1050 } | |
1051 | |
1052 switch (node->x.type) { | |
1053 case RE_ALTER: | |
1054 return minimum( | |
1055 subminlen(node->alter.arg, end, start), | |
1056 subminlen(node->alter.next, end, start) | |
1057 ); | |
1058 | |
1059 case RE_ANYCH: | |
1060 case RE_CLASS: | |
1061 case RE_DEFCLASS: | |
1062 case RE_N_CLASS: | |
1063 case RE_N_DEFCLASS: | |
1064 return subminlen(node->x.next, end, start + 1u); | |
1065 | |
1066 case RE_AT_BOL: | |
1067 case RE_AT_BOS: | |
1068 case RE_AT_EOL: | |
1069 case RE_AT_EOS: | |
1070 case RE_AT_NWBOUND: | |
1071 case RE_AT_WBOUND: | |
1072 case RE_BACKCHECK: | |
1073 case RE_BACKREF: | |
1074 case RE_BEGIN_CAPTURE: | |
1075 case RE_END_CAPTURE: | |
1076 case RE_PARTIAL: | |
1077 case RE_STUFF_FRUGAL: | |
1078 case RE_STUFF_GREEDY: | |
1079 return subminlen(node->x.next, end, start); | |
1080 | |
1081 case RE_AT_MATCH: | |
1082 return maximum( | |
1083 subminlen(node->indep.arg, end, start), | |
1084 subminlen(node->indep.next, end, start) | |
1085 ); | |
1086 | |
1087 case RE_AT_NOMATCH: | |
1088 return subminlen(node->indep.next, end, start); | |
1089 | |
1090 case RE_INDEP: | |
1091 return subminlen(node->indep.next, end, start + subminlen(node->indep.arg, NULL, 0)); | |
1092 | |
1093 case RE_LITERAL: | |
1094 return subminlen(node->literal.next, end, start + node->literal.len); | |
1095 | |
1096 case RE_SELECT: | |
1097 return minimum( | |
1098 subminlen(node->select.cond, end, start) + subminlen(node->select.arg, end, start), | |
1099 subminlen(node->select.next, end, start) | |
1100 ); | |
1101 | |
1102 case RE_REP_FRUGAL: | |
1103 case RE_REP_GREEDY: | |
1104 if (node->rep.min == 0) { | |
1105 return subminlen(node->rep.next, end, start); | |
1106 } | |
1107 return subminlen(node->rep.next, end, start + node->rep.min * subminlen(node->rep.arg, node, 0)); | |
1108 | |
1109 case RE_XREP_FRUGAL: | |
1110 case RE_XREP_GREEDY: | |
1111 if (node->rep.min == 0) { | |
1112 return subminlen(node->rep.next, end, start); | |
1113 } | |
1114 return subminlen(node->rep.next, end, start + node->rep.min * subminlen(node->rep.arg, NULL, 0)); | |
1115 } | |
1116 | |
1117 NOTREACHED; | |
1118 } | |
1119 | |
1120 ATTR_PURE | |
1121 static size_t minlen(const re_node *node) { | |
1122 return subminlen(node, NULL, 0); | |
1123 } | |
1124 | |
1125 static void reinit(t_regex *re) { | |
1126 const struct cap_state null = { -1, -1, -1 }; | |
1127 size_t i; | |
1128 for (i = 0; i < re->captures; ++i) { | |
1129 re->cap_ptr[i] = null; | |
1130 } | |
1131 } | |
1132 | |
1133 static re_node **notcomplex(re_node **node, const re_node *end) { | |
1134 assert(*node != NULL); | |
1135 | |
1136 if (*node == end) { | |
1137 return node; | |
1138 } | |
1139 | |
1140 switch ((*node)->x.type) { | |
1141 case RE_ALTER: | |
1142 case RE_AT_MATCH: | |
1143 case RE_AT_NOMATCH: | |
1144 case RE_BACKCHECK: | |
1145 case RE_BACKREF: | |
1146 case RE_BEGIN_CAPTURE: | |
1147 case RE_END_CAPTURE: | |
1148 case RE_INDEP: | |
1149 case RE_PARTIAL: | |
1150 case RE_REP_FRUGAL: | |
1151 case RE_REP_GREEDY: | |
1152 case RE_SELECT: | |
1153 case RE_STUFF_FRUGAL: | |
1154 case RE_STUFF_GREEDY: | |
1155 return NULL; | |
1156 | |
1157 case RE_XREP_FRUGAL: | |
1158 case RE_XREP_GREEDY: | |
1159 if ((*node)->rep.min == (*node)->rep.max) { | |
1160 return notcomplex(&(*node)->rep.next, end); | |
1161 } | |
1162 return NULL; | |
1163 | |
1164 case RE_ANYCH: | |
1165 case RE_AT_BOL: | |
1166 case RE_AT_BOS: | |
1167 case RE_AT_EOL: | |
1168 case RE_AT_EOS: | |
1169 case RE_AT_NWBOUND: | |
1170 case RE_AT_WBOUND: | |
1171 case RE_CLASS: | |
1172 case RE_DEFCLASS: | |
1173 case RE_LITERAL: | |
1174 case RE_N_CLASS: | |
1175 case RE_N_DEFCLASS: | |
1176 return notcomplex(&(*node)->x.next, end); | |
1177 } | |
1178 | |
1179 NOTREACHED; | |
1180 } | |
1181 | |
1182 static void trysimpl(re_node *node) { | |
1183 assert(node->x.type == RE_REP_FRUGAL || node->x.type == RE_REP_GREEDY); | |
1184 | |
1185 { | |
1186 re_node **const tmp = notcomplex(&node->rep.arg, node); | |
1187 if (!tmp) { | |
1188 return; | |
1189 } | |
1190 *tmp = NULL; | |
1191 } | |
1192 | |
1193 switch (node->rep.type) { | |
1194 case RE_REP_FRUGAL: | |
1195 if ( | |
1196 node->rep.arg->x.type == RE_ANYCH && | |
1197 node->rep.arg->x.next == NULL && | |
1198 node->rep.max == (size_t)-1 | |
1199 ) { | |
1200 if (node->rep.min == 0) { | |
1201 node->x.type = RE_STUFF_FRUGAL; | |
1202 break; | |
1203 } | |
1204 if (node->rep.min == 1) { | |
1205 node->rep.arg->x.next = node->x.next; | |
1206 node->rep.arg->x.type = RE_STUFF_FRUGAL; | |
1207 node->x.next = node->rep.arg; | |
1208 node->x.type = RE_ANYCH; | |
1209 break; | |
1210 } | |
1211 } | |
1212 | |
1213 node->rep.n = minlen(node->rep.arg); | |
1214 node->rep.type = RE_XREP_FRUGAL; | |
1215 break; | |
1216 | |
1217 case RE_REP_GREEDY: | |
1218 if ( | |
1219 node->rep.arg->x.type == RE_ANYCH && | |
1220 node->rep.arg->x.next == NULL && | |
1221 node->rep.max == (size_t)-1 | |
1222 ) { | |
1223 if (node->rep.min == 0) { | |
1224 node->x.type = RE_STUFF_GREEDY; | |
1225 break; | |
1226 } | |
1227 if (node->rep.min == 1) { | |
1228 node->rep.arg->x.next = node->x.next; | |
1229 node->rep.arg->x.type = RE_STUFF_GREEDY; | |
1230 node->x.next = node->rep.arg; | |
1231 node->x.type = RE_ANYCH; | |
1232 break; | |
1233 } | |
1234 } | |
1235 | |
1236 node->rep.n = minlen(node->rep.arg); | |
1237 node->rep.type = RE_XREP_GREEDY; | |
1238 break; | |
1239 | |
1240 default: | |
1241 NOTREACHED; | |
1242 } | |
1243 | |
1244 } | |
1245 | |
1246 static void simplerep(re_node *node, const re_node *const end) { | |
1247 if (!node || node == end) { | |
1248 return; | |
1249 } | |
1250 | |
1251 switch (node->x.type) { | |
1252 case RE_ALTER: | |
1253 simplerep(node->alter.arg, end); | |
1254 simplerep(node->alter.next, end); | |
1255 return; | |
1256 | |
1257 case RE_AT_MATCH: | |
1258 case RE_AT_NOMATCH: | |
1259 case RE_INDEP: | |
1260 simplerep(node->indep.arg, end); | |
1261 simplerep(node->indep.next, end); | |
1262 return; | |
1263 | |
1264 case RE_ANYCH: | |
1265 case RE_AT_BOL: | |
1266 case RE_AT_BOS: | |
1267 case RE_AT_EOL: | |
1268 case RE_AT_EOS: | |
1269 case RE_AT_NWBOUND: | |
1270 case RE_AT_WBOUND: | |
1271 case RE_BACKCHECK: | |
1272 case RE_BACKREF: | |
1273 case RE_BEGIN_CAPTURE: | |
1274 case RE_CLASS: | |
1275 case RE_DEFCLASS: | |
1276 case RE_END_CAPTURE: | |
1277 case RE_LITERAL: | |
1278 case RE_N_CLASS: | |
1279 case RE_N_DEFCLASS: | |
1280 case RE_PARTIAL: | |
1281 simplerep(node->x.next, end); | |
1282 return; | |
1283 | |
1284 case RE_REP_FRUGAL: | |
1285 case RE_REP_GREEDY: | |
1286 simplerep(node->rep.next, end); | |
1287 simplerep(node->rep.arg, node); | |
1288 trysimpl(node); | |
1289 return; | |
1290 | |
1291 case RE_SELECT: | |
1292 simplerep(node->select.cond, end); | |
1293 simplerep(node->select.arg, end); | |
1294 simplerep(node->select.next, end); | |
1295 return; | |
1296 | |
1297 case RE_STUFF_FRUGAL: | |
1298 case RE_STUFF_GREEDY: | |
1299 case RE_XREP_FRUGAL: | |
1300 case RE_XREP_GREEDY: | |
1301 return; | |
1302 } | |
1303 | |
1304 NOTREACHED; | |
1305 } | |
1306 | |
1307 ATTR_PURE | |
1308 static int anchorbegp(const re_node *node) { | |
1309 return | |
1310 node && | |
1311 ( | |
1312 node->x.type == RE_AT_BOS || | |
1313 node->x.type == RE_STUFF_FRUGAL || | |
1314 node->x.type == RE_STUFF_GREEDY || | |
1315 ( | |
1316 node->x.type == RE_ALTER && | |
1317 anchorbegp(node->alter.arg) && | |
1318 anchorbegp(node->alter.next) | |
1319 ) || | |
1320 ( | |
1321 node->x.type == RE_INDEP && | |
1322 anchorbegp(node->indep.arg) | |
1323 ) || | |
1324 ( | |
1325 node->x.type == RE_AT_MATCH && | |
1326 ( | |
1327 anchorbegp(node->indep.arg) || | |
1328 anchorbegp(node->indep.next) | |
1329 ) | |
1330 ) || | |
1331 ( | |
1332 ( | |
1333 node->x.type == RE_AT_NOMATCH || | |
1334 node->x.type == RE_AT_BOL || | |
1335 node->x.type == RE_AT_EOL || | |
1336 node->x.type == RE_AT_EOS || | |
1337 node->x.type == RE_AT_NWBOUND || | |
1338 node->x.type == RE_AT_WBOUND || | |
1339 node->x.type == RE_BEGIN_CAPTURE || | |
1340 node->x.type == RE_BACKCHECK | |
1341 ) && | |
1342 anchorbegp(node->x.next) | |
1343 ) | |
1344 ) | |
1345 ; | |
1346 } | |
1347 | |
1348 static void dostuff(t_regex *re) { | |
1349 simplerep(re->start, NULL); | |
1350 | |
1351 if (!re->start || anchorbegp(re->start)) { | |
1352 re->flags |= FL_ANCHOR_START; | |
1353 } | |
1354 } | |
1355 | |
1356 t_regex *re_compile(const String *s) { | |
1357 size_t p; | |
1358 struct parse_context dcon; | |
1359 t_regex *re; | |
1360 void *pre; | |
1361 | |
1362 if (h_get(&rcache, s, &pre) == H_OK) { | |
1363 re = pre; | |
1364 ++re->refs; | |
1365 return re; | |
1366 } | |
1367 | |
1368 re = xmalloc(1, sizeof *re); | |
1369 re->flags = FL_NONE; | |
1370 re->refs = 0; | |
1371 re->repets = 0; | |
1372 bl_init(&re->alloc); | |
1373 re->cap_ptr = NULL; | |
1374 re->captures = 0; | |
1375 dcon.base = re; | |
1376 dcon.block = &re->alloc; | |
1377 dcon.s = s; | |
1378 p = St_len(s) - 1; | |
1379 dcon.p = &p; | |
1380 | |
1381 re->start = alternation(&dcon, NULL, NULL); | |
1382 | |
1383 if (re->captures) { | |
1384 re->cap_ptr = xmalloc(re->captures, sizeof *re->cap_ptr); | |
1385 reinit(re); | |
1386 } | |
1387 | |
1388 dostuff(re); | |
1389 re->minlen = minlen(re->start); | |
1390 re->repbuf = xmalloc(re->repets, sizeof *re->repbuf); | |
1391 | |
1392 if (cachfill < sizeof cached / sizeof cached[0]) { | |
1393 St_init(&cached[cachfill]); | |
1394 St_cpy(&cached[cachfill], s); | |
1395 ++re->refs; | |
1396 h_put(&rcache, &cached[cachfill], re, 1); | |
1397 ++cachfill; | |
1398 } else { | |
1399 h_del(&rcache, &cached[cachlast]); | |
1400 St_cpy(&cached[cachlast], s); | |
1401 ++re->refs; | |
1402 h_put(&rcache, &cached[cachlast], re, 1); | |
1403 cachlast = (cachlast + 1u) % (sizeof cached / sizeof cached[0]); | |
1404 } | |
1405 | |
1406 if (Opt.debug & DBG_REGEX) { | |
1407 io_write_s(Err, "Compiled "); | |
1408 re_dump(re, io_fp(Err)); | |
1409 } | |
1410 | |
1411 return re; | |
1412 } | |
1413 | |
1414 t_regex *re_dup(t_regex *re) { | |
1415 ++re->refs; | |
1416 return re; | |
1417 } | |
1418 | |
1419 void re_free(t_regex *re) { | |
1420 if (re->refs) { | |
1421 --re->refs; | |
1422 return; | |
1423 } | |
1424 bl_free(&re->alloc); | |
1425 xfree(re->repbuf); | |
1426 xfree(re->cap_ptr); | |
1427 xfree(re); | |
1428 } | |
1429 | |
1430 ATTR_PURE | |
1431 static int match_class(unsigned char c, const unsigned char *vec) { | |
1432 return vec[c / CHAR_BIT] & 1u << c % CHAR_BIT; | |
1433 } | |
1434 | |
1435 static void *mem_dup(const void *p, size_t n, size_t m) { | |
1436 void *q = xmalloc(n, m); | |
1437 memcpy(q, p, n * m); | |
1438 return q; | |
1439 } | |
1440 | |
1441 #define NO_MATCH ((size_t)-1) | |
1442 | |
1443 #define Mmatch0(Xmatch, Xindex) \ | |
1444 size_t tmp; \ | |
1445 \ | |
1446 if (!node) { \ | |
1447 return o; \ | |
1448 } \ | |
1449 \ | |
1450 switch (node->x.type) { \ | |
1451 case RE_ALTER: \ | |
1452 if ((tmp = Xmatch(base, node->alter.arg, s, o)) == NO_MATCH) { \ | |
1453 return Xmatch(base, node->alter.next, s, o); \ | |
1454 } \ | |
1455 return tmp; \ | |
1456 \ | |
1457 case RE_ANYCH: \ | |
1458 if (Xindex(s, o) != EOF) { \ | |
1459 return Xmatch(base, node->x.next, s, o + 1u); \ | |
1460 } \ | |
1461 return NO_MATCH; \ | |
1462 \ | |
1463 case RE_AT_BOL: \ | |
1464 if (o == 0 || Xindex(s, o - 1u) == '\n') { \ | |
1465 return Xmatch(base, node->x.next, s, o); \ | |
1466 } \ | |
1467 return NO_MATCH; \ | |
1468 \ | |
1469 case RE_AT_BOS: \ | |
1470 if (o == 0) { \ | |
1471 return Xmatch(base, node->x.next, s, o); \ | |
1472 } \ | |
1473 return NO_MATCH; \ | |
1474 \ | |
1475 case RE_AT_EOL: \ | |
1476 if ( \ | |
1477 Xindex(s, o) == '\n' || \ | |
1478 ( \ | |
1479 o && Xindex(s, o) == EOF && Xindex(s, o - 1u) != '\n' \ | |
1480 ) \ | |
1481 ) { \ | |
1482 return Xmatch(base, node->x.next, s, o); \ | |
1483 } \ | |
1484 return NO_MATCH; \ | |
1485 \ | |
1486 case RE_AT_EOS: \ | |
1487 if (Xindex(s, o) == EOF) { \ | |
1488 return Xmatch(base, node->x.next, s, o); \ | |
1489 } \ | |
1490 return NO_MATCH; \ | |
1491 \ | |
1492 case RE_AT_MATCH: { \ | |
1493 struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \ | |
1494 tmp = Xmatch(base, node->indep.arg, s, o); \ | |
1495 if (tmp != NO_MATCH) { \ | |
1496 tmp = Xmatch(base, node->indep.next, s, o); \ | |
1497 if (tmp == NO_MATCH) { \ | |
1498 memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \ | |
1499 } \ | |
1500 } \ | |
1501 xfree(prev); \ | |
1502 return tmp; \ | |
1503 } \ | |
1504 \ | |
1505 case RE_AT_NOMATCH: { \ | |
1506 struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \ | |
1507 tmp = Xmatch(base, node->indep.arg, s, o); \ | |
1508 if (tmp == NO_MATCH) { \ | |
1509 tmp = Xmatch(base, node->indep.next, s, o); \ | |
1510 } else { \ | |
1511 memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \ | |
1512 tmp = NO_MATCH; \ | |
1513 } \ | |
1514 xfree(prev); \ | |
1515 return tmp; \ | |
1516 } \ | |
1517 \ | |
1518 case RE_AT_NWBOUND: \ | |
1519 if (siword(Xindex(s, o - 1u)) == siword(Xindex(s, o))) { \ | |
1520 return Xmatch(base, node->x.next, s, o); \ | |
1521 } \ | |
1522 return NO_MATCH; \ | |
1523 \ | |
1524 case RE_AT_WBOUND: \ | |
1525 if (siword(Xindex(s, o - 1u)) != siword(Xindex(s, o))) { \ | |
1526 return Xmatch(base, node->x.next, s, o); \ | |
1527 } \ | |
1528 return NO_MATCH; \ | |
1529 \ | |
1530 case RE_BACKCHECK: \ | |
1531 if ( \ | |
1532 node->backref.n < base->captures && \ | |
1533 base->cap_ptr[node->backref.n].start + 1u && \ | |
1534 base->cap_ptr[node->backref.n].end + 1u && \ | |
1535 base->cap_ptr[node->backref.n].end >= base->cap_ptr[node->backref.n].start \ | |
1536 ) { \ | |
1537 return Xmatch(base, node->backref.next, s, o); \ | |
1538 } \ | |
1539 return NO_MATCH | |
1540 | |
1541 | |
1542 #define Mmatch1(Xmatch, Xindex) \ | |
1543 case RE_BEGIN_CAPTURE: { \ | |
1544 const size_t prev = base->cap_ptr[node->capture.n].pending; \ | |
1545 base->cap_ptr[node->capture.n].pending = o; \ | |
1546 if ((tmp = Xmatch(base, node->capture.next, s, o)) == NO_MATCH) { \ | |
1547 base->cap_ptr[node->capture.n].pending = prev; \ | |
1548 } \ | |
1549 return tmp; \ | |
1550 } \ | |
1551 \ | |
1552 case RE_CLASS: \ | |
1553 if ( \ | |
1554 Xindex(s, o) != EOF && \ | |
1555 match_class(Xindex(s, o), node->class.vec) \ | |
1556 ) { \ | |
1557 return Xmatch(base, node->class.next, s, o + 1u); \ | |
1558 } \ | |
1559 return NO_MATCH; \ | |
1560 \ | |
1561 case RE_DEFCLASS: \ | |
1562 if ( \ | |
1563 Xindex(s, o) != EOF && \ | |
1564 match_class(Xindex(s, o), node->defclass.vec) \ | |
1565 ) { \ | |
1566 return Xmatch(base, node->defclass.next, s, o + 1u); \ | |
1567 } \ | |
1568 return NO_MATCH; \ | |
1569 \ | |
1570 case RE_END_CAPTURE: { \ | |
1571 const size_t prevbeg = base->cap_ptr[node->capture.n].start; \ | |
1572 const size_t prevend = base->cap_ptr[node->capture.n].end; \ | |
1573 if (base->cap_ptr[node->capture.n].pending + 1u == 0u) { \ | |
1574 return NO_MATCH; \ | |
1575 } \ | |
1576 \ | |
1577 base->cap_ptr[node->capture.n].end = o; \ | |
1578 base->cap_ptr[node->capture.n].start = base->cap_ptr[node->capture.n].pending; \ | |
1579 if ((tmp = Xmatch(base, node->capture.next, s, o)) == NO_MATCH) { \ | |
1580 base->cap_ptr[node->capture.n].end = prevend; \ | |
1581 base->cap_ptr[node->capture.n].start = prevbeg; \ | |
1582 } \ | |
1583 return tmp; \ | |
1584 } \ | |
1585 \ | |
1586 case RE_INDEP: { \ | |
1587 struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \ | |
1588 if ((tmp = Xmatch(base, node->indep.arg, s, o)) != NO_MATCH) { \ | |
1589 tmp = Xmatch(base, node->indep.next, s, tmp); \ | |
1590 if (tmp == NO_MATCH) { \ | |
1591 memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \ | |
1592 } \ | |
1593 } \ | |
1594 xfree(prev); \ | |
1595 return tmp; \ | |
1596 } \ | |
1597 \ | |
1598 case RE_N_CLASS: \ | |
1599 if ( \ | |
1600 Xindex(s, o) != EOF && \ | |
1601 !match_class(Xindex(s, o), node->class.vec) \ | |
1602 ) { \ | |
1603 return Xmatch(base, node->class.next, s, o + 1u); \ | |
1604 } \ | |
1605 return NO_MATCH; \ | |
1606 \ | |
1607 case RE_N_DEFCLASS: \ | |
1608 if ( \ | |
1609 Xindex(s, o) != EOF && \ | |
1610 !match_class(Xindex(s, o), node->defclass.vec) \ | |
1611 ) { \ | |
1612 return Xmatch(base, node->defclass.next, s, o + 1u); \ | |
1613 } \ | |
1614 return NO_MATCH; \ | |
1615 \ | |
1616 case RE_PARTIAL: { \ | |
1617 size_t n; \ | |
1618 for (n = 0; n < node->literal.len; ++n) { \ | |
1619 if (node->literal.buf[n] != Xindex(s, o + n)) { \ | |
1620 break; \ | |
1621 } \ | |
1622 } \ | |
1623 for (; n; --n) { \ | |
1624 if ((tmp = Xmatch(base, node->literal.next, s, o + n)) != NO_MATCH) { \ | |
1625 return tmp; \ | |
1626 } \ | |
1627 } \ | |
1628 return Xmatch(base, node->literal.next, s, o); \ | |
1629 } \ | |
1630 \ | |
1631 case RE_SELECT: { \ | |
1632 struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \ | |
1633 if ((tmp = Xmatch(base, node->select.cond, s, o)) != NO_MATCH) { \ | |
1634 tmp = Xmatch(base, node->select.arg, s, tmp); \ | |
1635 if (tmp == NO_MATCH) { \ | |
1636 memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \ | |
1637 } \ | |
1638 } else { \ | |
1639 tmp = Xmatch(base, node->select.next, s, o); \ | |
1640 } \ | |
1641 xfree(prev); \ | |
1642 return tmp; \ | |
1643 } \ | |
1644 \ | |
1645 case RE_REP_FRUGAL: \ | |
1646 if (base->repbuf[node->rep.n] >= node->rep.max) { \ | |
1647 const size_t n = base->repbuf[node->rep.n]; \ | |
1648 \ | |
1649 base->repbuf[node->rep.n] = 0; \ | |
1650 tmp = Xmatch(base, node->rep.next, s, o); \ | |
1651 base->repbuf[node->rep.n] = n; \ | |
1652 return tmp; \ | |
1653 } else if (base->repbuf[node->rep.n] >= node->rep.min) { \ | |
1654 const size_t n = base->repbuf[node->rep.n]; \ | |
1655 \ | |
1656 base->repbuf[node->rep.n] = 0; \ | |
1657 if ((tmp = Xmatch(base, node->rep.next, s, o)) != NO_MATCH) { \ | |
1658 base->repbuf[node->rep.n] = n; \ | |
1659 return tmp; \ | |
1660 } \ | |
1661 base->repbuf[node->rep.n] = n + 1u; \ | |
1662 tmp = Xmatch(base, node->rep.arg, s, o); \ | |
1663 --base->repbuf[node->rep.n]; \ | |
1664 return tmp; \ | |
1665 } else { \ | |
1666 ++base->repbuf[node->rep.n]; \ | |
1667 tmp = Xmatch(base, node->rep.arg, s, o); \ | |
1668 --base->repbuf[node->rep.n]; \ | |
1669 return tmp; \ | |
1670 } \ | |
1671 \ | |
1672 case RE_REP_GREEDY: \ | |
1673 if (base->repbuf[node->rep.n] >= node->rep.max) { \ | |
1674 const size_t n = base->repbuf[node->rep.n]; \ | |
1675 \ | |
1676 base->repbuf[node->rep.n] = 0; \ | |
1677 tmp = Xmatch(base, node->rep.next, s, o); \ | |
1678 base->repbuf[node->rep.n] = n; \ | |
1679 return tmp; \ | |
1680 } else if (base->repbuf[node->rep.n] >= node->rep.min) { \ | |
1681 const size_t n = base->repbuf[node->rep.n]; \ | |
1682 \ | |
1683 ++base->repbuf[node->rep.n]; \ | |
1684 if ((tmp = Xmatch(base, node->rep.arg, s, o)) != NO_MATCH) { \ | |
1685 --base->repbuf[node->rep.n]; \ | |
1686 return tmp; \ | |
1687 } \ | |
1688 base->repbuf[node->rep.n] = 0; \ | |
1689 tmp = Xmatch(base, node->rep.next, s, o); \ | |
1690 base->repbuf[node->rep.n] = n; \ | |
1691 return tmp; \ | |
1692 } else { \ | |
1693 ++base->repbuf[node->rep.n]; \ | |
1694 tmp = Xmatch(base, node->rep.arg, s, o); \ | |
1695 --base->repbuf[node->rep.n]; \ | |
1696 return tmp; \ | |
1697 } \ | |
1698 \ | |
1699 case RE_XREP_FRUGAL: { \ | |
1700 size_t n; \ | |
1701 \ | |
1702 for (n = 0; n < node->rep.min; ++n) { \ | |
1703 if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \ | |
1704 return NO_MATCH; \ | |
1705 } \ | |
1706 o = tmp; \ | |
1707 } \ | |
1708 \ | |
1709 for (; n < node->rep.max; ++n) { \ | |
1710 if ((tmp = Xmatch(base, node->rep.next, s, o)) != NO_MATCH) { \ | |
1711 return tmp; \ | |
1712 } \ | |
1713 if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \ | |
1714 return NO_MATCH; \ | |
1715 } \ | |
1716 o = tmp; \ | |
1717 } \ | |
1718 return Xmatch(base, node->rep.next, s, o); \ | |
1719 } \ | |
1720 \ | |
1721 case RE_XREP_GREEDY: { \ | |
1722 size_t n; \ | |
1723 \ | |
1724 for (n = 0; n < node->rep.min; ++n) { \ | |
1725 if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \ | |
1726 return NO_MATCH; \ | |
1727 } \ | |
1728 o = tmp; \ | |
1729 } \ | |
1730 \ | |
1731 for (; n < node->rep.max; ++n) { \ | |
1732 if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \ | |
1733 break; \ | |
1734 } \ | |
1735 o = tmp; \ | |
1736 } \ | |
1737 \ | |
1738 for (; n > node->rep.min; --n) { \ | |
1739 if ((tmp = Xmatch(base, node->rep.next, s, o)) != NO_MATCH) { \ | |
1740 return tmp; \ | |
1741 } \ | |
1742 o -= node->rep.n; \ | |
1743 } \ | |
1744 \ | |
1745 return Xmatch(base, node->rep.next, s, o); \ | |
1746 } \ | |
1747 } \ | |
1748 \ | |
1749 NOTREACHED | |
1750 | |
1751 | |
1752 static size_t match(const t_regex *base, const re_node *node, const String *s, size_t o) { | |
1753 | |
1754 Mmatch0(match, ST_INDEX); | |
1755 | |
1756 case RE_BACKREF: | |
1757 if ( | |
1758 node->backref.n < base->captures && | |
1759 base->cap_ptr[node->backref.n].start + 1u && | |
1760 base->cap_ptr[node->backref.n].end + 1u && | |
1761 base->cap_ptr[node->backref.n].end >= base->cap_ptr[node->backref.n].start && | |
1762 ( | |
1763 base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start <= St_len(s) - o && | |
1764 memcmp( | |
1765 St_ptr(s) + base->cap_ptr[node->backref.n].start, | |
1766 St_ptr(s) + o, | |
1767 base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start | |
1768 ) == 0 | |
1769 ) | |
1770 ) { | |
1771 return match(base, node->backref.next, s, o + base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start); | |
1772 } | |
1773 return NO_MATCH; | |
1774 | |
1775 | |
1776 case RE_LITERAL: | |
1777 if ( | |
1778 node->literal.len == 0 || | |
1779 ( | |
1780 node->literal.len <= St_len(s) - o && | |
1781 memcmp(node->literal.buf, St_ptr(s) + o, node->literal.len) == 0 | |
1782 ) | |
1783 ) { | |
1784 return match(base, node->literal.next, s, o + node->literal.len); | |
1785 } | |
1786 return NO_MATCH; | |
1787 | |
1788 case RE_STUFF_FRUGAL: | |
1789 if (!node->x.next) { | |
1790 return o; | |
1791 } | |
1792 | |
1793 if (node->x.next->x.type == RE_LITERAL && node->x.next->literal.len) { | |
1794 for ( | |
1795 o = St_stro_m(s, o, node->x.next->literal.buf, node->x.next->literal.len); | |
1796 o + 1u; | |
1797 o = St_stro_m(s, o + 1u, node->x.next->literal.buf, node->x.next->literal.len) | |
1798 ) { | |
1799 if ((tmp = match(base, node->x.next->literal.next, s, o + node->x.next->literal.len)) != NO_MATCH) { | |
1800 return tmp; | |
1801 } | |
1802 } | |
1803 return NO_MATCH; | |
1804 } | |
1805 | |
1806 for (; o <= St_len(s); ++o) { | |
1807 if ((tmp = match(base, node->x.next, s, o)) != NO_MATCH) { | |
1808 return tmp; | |
1809 } | |
1810 } | |
1811 return NO_MATCH; | |
1812 | |
1813 case RE_STUFF_GREEDY: { | |
1814 size_t n; | |
1815 | |
1816 if (!node->x.next) { | |
1817 return St_len(s); | |
1818 } | |
1819 | |
1820 if (node->x.next->x.type == RE_LITERAL && node->x.next->literal.len) { | |
1821 for ( | |
1822 n = St_rstr_m(s, node->x.next->literal.buf, node->x.next->literal.len); | |
1823 n > o && n + 1u; | |
1824 n = St_rstro_m(s, n - 1u, node->x.next->literal.buf, node->x.next->literal.len) | |
1825 ) { | |
1826 if ((tmp = match(base, node->x.next->literal.next, s, n + node->x.next->literal.len)) != NO_MATCH) { | |
1827 return tmp; | |
1828 } | |
1829 if (!n) { | |
1830 break; | |
1831 } | |
1832 } | |
1833 if (n == o) { | |
1834 return match(base, node->x.next->literal.next, s, n + node->x.next->literal.len); | |
1835 } | |
1836 return NO_MATCH; | |
1837 } | |
1838 | |
1839 for (n = St_len(s); n > o; --n) { | |
1840 if ((tmp = match(base, node->x.next, s, n)) != NO_MATCH) { | |
1841 return tmp; | |
1842 } | |
1843 } | |
1844 return match(base, node->x.next, s, o); | |
1845 } | |
1846 | |
1847 Mmatch1(match, ST_INDEX); | |
1848 } | |
1849 | |
1850 static size_t iomatch(const t_regex *base, const re_node *node, IO *s, size_t o) { | |
1851 | |
1852 Mmatch0(iomatch, io_peek); | |
1853 | |
1854 case RE_BACKREF: | |
1855 if ( | |
1856 node->backref.n < base->captures && | |
1857 base->cap_ptr[node->backref.n].start + 1u && | |
1858 base->cap_ptr[node->backref.n].end + 1u && | |
1859 base->cap_ptr[node->backref.n].end >= base->cap_ptr[node->backref.n].start && | |
1860 ( | |
1861 io_xcmp( | |
1862 s, | |
1863 base->cap_ptr[node->backref.n].start, | |
1864 o, | |
1865 base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start | |
1866 ) == 0 | |
1867 ) | |
1868 ) { | |
1869 return iomatch(base, node->backref.next, s, o + base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start); | |
1870 } | |
1871 return NO_MATCH; | |
1872 | |
1873 | |
1874 case RE_LITERAL: | |
1875 if ( | |
1876 node->literal.len == 0 || | |
1877 ( | |
1878 io_cmppeek(s, o, node->literal.buf, node->literal.len) == 0 | |
1879 ) | |
1880 ) { | |
1881 return iomatch(base, node->literal.next, s, o + node->literal.len); | |
1882 } | |
1883 return NO_MATCH; | |
1884 | |
1885 case RE_STUFF_FRUGAL: | |
1886 if (!node->x.next) { | |
1887 return o; | |
1888 } | |
1889 | |
1890 for (; io_peek(s, o) != EOF; ++o) { | |
1891 if ((tmp = iomatch(base, node->x.next, s, o)) != NO_MATCH) { | |
1892 return tmp; | |
1893 } | |
1894 } | |
1895 return iomatch(base, node->x.next, s, o); | |
1896 | |
1897 case RE_STUFF_GREEDY: { | |
1898 size_t n; | |
1899 | |
1900 for (n = o; io_peek(s, n) != EOF; ++n) | |
1901 ; | |
1902 | |
1903 if (!node->x.next) { | |
1904 return n; | |
1905 } | |
1906 | |
1907 for (; n > o; --n) { | |
1908 if ((tmp = iomatch(base, node->x.next, s, n)) != NO_MATCH) { | |
1909 return tmp; | |
1910 } | |
1911 } | |
1912 return iomatch(base, node->x.next, s, o); | |
1913 } | |
1914 | |
1915 Mmatch1(iomatch, io_peek); | |
1916 } | |
1917 | |
1918 #define DEFRBUF ((void)0) | |
1919 #define MRETURN(x) return (x) | |
1920 #define INITRBUF(re) memset((re)->repbuf, '\0', (re)->repets * sizeof *(re)->repbuf) | |
1921 | |
1922 int re_iomatch(t_regex *re, IO *io, size_t *ms, size_t *me) { | |
1923 size_t i; | |
1924 DEFRBUF; | |
1925 INITRBUF(re); | |
1926 | |
1927 assert(re != NULL); | |
1928 assert(io != NULL); | |
1929 assert(io_bufred(io)); | |
1930 assert(ms != NULL); | |
1931 assert(me != NULL); | |
1932 | |
1933 reinit(re); | |
1934 | |
1935 if (re->flags & FL_ANCHOR_START) { | |
1936 if ((*me = iomatch(re, re->start, io, 0)) == NO_MATCH) { | |
1937 MRETURN(0); | |
1938 } | |
1939 *ms = 0; | |
1940 MRETURN(1); | |
1941 } | |
1942 | |
1943 for (i = 0; io_peek(io, i) != EOF; ++i) { | |
1944 if ((*me = iomatch(re, re->start, io, i)) != NO_MATCH) { | |
1945 *ms = i; | |
1946 MRETURN(1); | |
1947 } | |
1948 } | |
1949 | |
1950 if ((*me = iomatch(re, re->start, io, i)) != NO_MATCH) { | |
1951 *ms = i; | |
1952 MRETURN(1); | |
1953 } | |
1954 MRETURN(0); | |
1955 } | |
1956 | |
1957 int re_match(t_regex *re, const String *s, size_t *ms, size_t *me) { | |
1958 size_t i; | |
1959 DEFRBUF; | |
1960 INITRBUF(re); | |
1961 | |
1962 assert(re != NULL); | |
1963 assert(s != NULL); | |
1964 assert(ms != NULL); | |
1965 assert(me != NULL); | |
1966 | |
1967 if (re->minlen > St_len(s)) { | |
1968 MRETURN(0); | |
1969 } | |
1970 | |
1971 reinit(re); | |
1972 | |
1973 if (re->flags & FL_ANCHOR_START) { | |
1974 if ((*me = match(re, re->start, s, 0)) == NO_MATCH) { | |
1975 MRETURN(0); | |
1976 } | |
1977 *ms = 0; | |
1978 MRETURN(1); | |
1979 } | |
1980 | |
1981 if (re->start->x.type == RE_LITERAL && re->start->literal.len) { | |
1982 for ( | |
1983 i = St_str_m(s, re->start->literal.buf, re->start->literal.len); | |
1984 i + 1u; | |
1985 i = St_stro_m(s, i + 1u, re->start->literal.buf, re->start->literal.len) | |
1986 ) { | |
1987 if ((*me = match(re, re->start->literal.next, s, i + re->start->literal.len)) != NO_MATCH) { | |
1988 *ms = i; | |
1989 MRETURN(1); | |
1990 } | |
1991 } | |
1992 MRETURN(0); | |
1993 } | |
1994 | |
1995 for (i = 0; i <= St_len(s) - re->minlen; ++i) { | |
1996 if ((*me = match(re, re->start, s, i)) != NO_MATCH) { | |
1997 *ms = i; | |
1998 MRETURN(1); | |
1999 } | |
2000 } | |
2001 | |
2002 MRETURN(0); | |
2003 } | |
2004 | |
2005 size_t re_cabra(const t_regex *re) { | |
2006 return re->captures; | |
2007 } | |
2008 | |
2009 int re_backref(const t_regex *re, size_t i, size_t *a, size_t *z) { | |
2010 if (i >= re->captures) { | |
2011 return -1; | |
2012 } | |
2013 if (!(re->cap_ptr[i].start + 1u && re->cap_ptr[i].end + 1u)) { | |
2014 return 1; | |
2015 } | |
2016 *a = re->cap_ptr[i].start; | |
2017 *z = re->cap_ptr[i].end; | |
2018 return 0; | |
2019 } | |
2020 | |
2021 ATTR_PURE | |
2022 static const re_node *branchend(const re_node *a, const re_node *b) { | |
2023 for (; a; a = a->x.next) { | |
2024 const re_node *q = b; | |
2025 for (; q; q = q->x.next) { | |
2026 if (a == q) { | |
2027 return q; | |
2028 } | |
2029 } | |
2030 } | |
2031 return NULL; | |
2032 } | |
2033 | |
2034 static void decom_class(String *s, const unsigned char vec[HAVE_C99(static CLASS_SIZE)]) { | |
2035 unsigned char i; | |
2036 int flag = 0; | |
2037 char big_enough[100]; | |
2038 | |
2039 for (i = 0; i < UCHAR_MAX; ++i) { | |
2040 if (vec[i / CHAR_BIT] & 1u << i % CHAR_BIT) { | |
2041 if (vec[(i + 1u) / CHAR_BIT] & 1u << (i + 1u) % CHAR_BIT) { | |
2042 if (!flag) { | |
2043 flag = 1; | |
2044 goto mess; | |
2045 } | |
2046 flag = 1; | |
2047 } else { | |
2048 if (flag) { | |
2049 if (i > 1u && vec[(i - 2u) / CHAR_BIT] & 1u << (i - 2u) % CHAR_BIT) { | |
2050 St_cat_c(s, '-'); | |
2051 } | |
2052 flag = 0; | |
2053 } | |
2054 mess: | |
2055 if (!isprint(i)) { | |
2056 St_cat_c(s, '\\'); | |
2057 switch (i) { | |
2058 case '\a': St_cat_c(s, 'a'); break; | |
2059 case '\b': St_cat_c(s, 'b'); break; | |
2060 case '\f': St_cat_c(s, 'f'); break; | |
2061 case '\n': St_cat_c(s, 'n'); break; | |
2062 case '\r': St_cat_c(s, 'r'); break; | |
2063 case '\t': St_cat_c(s, 't'); break; | |
2064 case '\v': St_cat_c(s, 'v'); break; | |
2065 default: | |
2066 sprintf(big_enough, "%03o", (unsigned)i); | |
2067 St_cat_s(s, big_enough); | |
2068 break; | |
2069 } | |
2070 } else { | |
2071 if (strchr("\\\"", (char)i)) { | |
2072 St_cat_c(s, '\\'); | |
2073 } | |
2074 St_cat_c(s, i); | |
2075 if (strchr("!'-", (char)i)) { | |
2076 St_cat_c(s, ESCAPE); | |
2077 } | |
2078 } | |
2079 } | |
2080 } | |
2081 } | |
2082 if (vec[i / CHAR_BIT] & 1u << i % CHAR_BIT) { | |
2083 if (flag) { | |
2084 if (vec[(i - 2u) / CHAR_BIT] & 1u << (i - 2u) % CHAR_BIT) { | |
2085 St_cat_c(s, '-'); | |
2086 } | |
2087 flag = 0; | |
2088 } | |
2089 if (!isprint(i)) { | |
2090 St_cat_c(s, '\\'); | |
2091 switch (i) { | |
2092 case '\a': St_cat_c(s, 'a'); break; | |
2093 case '\b': St_cat_c(s, 'b'); break; | |
2094 case '\f': St_cat_c(s, 'f'); break; | |
2095 case '\n': St_cat_c(s, 'n'); break; | |
2096 case '\r': St_cat_c(s, 'r'); break; | |
2097 case '\t': St_cat_c(s, 't'); break; | |
2098 case '\v': St_cat_c(s, 'v'); break; | |
2099 default: | |
2100 sprintf(big_enough, "%03o", (unsigned)i); | |
2101 St_cat_s(s, big_enough); | |
2102 break; | |
2103 } | |
2104 } else { | |
2105 if (strchr("\\\"", (char)i)) { | |
2106 St_cat_c(s, '\\'); | |
2107 } | |
2108 St_cat_c(s, i); | |
2109 if (strchr("!'-", (char)i)) { | |
2110 St_cat_c(s, ESCAPE); | |
2111 } | |
2112 } | |
2113 } | |
2114 } | |
2115 | |
2116 static void dumpclass(FILE *fp, const unsigned char vec[HAVE_C99(static CLASS_SIZE)]) { | |
2117 String tmp; | |
2118 St_init(&tmp); | |
2119 decom_class(&tmp, vec); | |
2120 ST_WRITE(&tmp, fp); | |
2121 St_clear(&tmp); | |
2122 } | |
2123 | |
2124 ATTR_CONST | |
2125 static int cdefclass(const unsigned char *ptr) { | |
2126 if (ptr == dc_word) { return 'w'; } | |
2127 if (ptr == dc_alpha) { return 'q'; } | |
2128 if (ptr == dc_cntrl) { return 'c'; } | |
2129 if (ptr == dc_digit) { return 'd'; } | |
2130 if (ptr == dc_lower) { return 'l'; } | |
2131 if (ptr == dc_print) { return 'p'; } | |
2132 if (ptr == dc_space) { return 's'; } | |
2133 if (ptr == dc_upper) { return 'u'; } | |
2134 if (ptr == dc_xdigit) { return 'x'; } | |
2135 NOTREACHED; | |
2136 } | |
2137 | |
2138 static void decom_liter(String *s, const re_node *node) { | |
2139 size_t i; | |
2140 assert(node->x.type == RE_LITERAL || node->x.type == RE_PARTIAL); | |
2141 for (i = 0; i < node->literal.len; ++i) { | |
2142 if (!isprint((unsigned char)node->literal.buf[i])) { | |
2143 St_cat_c(s, '\\'); | |
2144 switch (node->literal.buf[i]) { | |
2145 case '\a': St_cat_c(s, 'a'); break; | |
2146 case '\b': St_cat_c(s, 'b'); break; | |
2147 case '\f': St_cat_c(s, 'f'); break; | |
2148 case '\n': St_cat_c(s, 'n'); break; | |
2149 case '\r': St_cat_c(s, 'r'); break; | |
2150 case '\t': St_cat_c(s, 't'); break; | |
2151 case '\v': St_cat_c(s, 'v'); break; | |
2152 default: { | |
2153 char big_enough[100]; | |
2154 sprintf(big_enough, "%03o", (unsigned)(unsigned char)node->literal.buf[i]); | |
2155 St_cat_s(s, big_enough); | |
2156 break; | |
2157 } | |
2158 } | |
2159 } else { | |
2160 if (node->x.type == RE_LITERAL && strchr("\"\\", node->literal.buf[i])) { | |
2161 St_cat_c(s, '\\'); | |
2162 } | |
2163 St_cat_c(s, node->literal.buf[i]); | |
2164 if (MEATCHARP(node->literal.buf[i])) { | |
2165 St_cat_c(s, ESCAPE); | |
2166 } | |
2167 } | |
2168 } | |
2169 } | |
2170 | |
2171 static void dumpliter(FILE *fp, const re_node *node) { | |
2172 String tmp; | |
2173 St_init(&tmp); | |
2174 decom_liter(&tmp, node); | |
2175 ST_WRITE(&tmp, fp); | |
2176 St_clear(&tmp); | |
2177 } | |
2178 | |
2179 static void do_indent(FILE *fp, size_t n) { | |
2180 for (; n; --n) { | |
2181 fputs(" ", fp); | |
2182 } | |
2183 } | |
2184 | |
2185 static void dumpnode(FILE *fp, const re_node *node, const re_node *end, const size_t indent) { | |
2186 if (node == end) { | |
2187 return; | |
2188 } | |
2189 | |
2190 assert(node != NULL); | |
2191 fprintf(fp, "[%p] ", (const void *)node); | |
2192 do_indent(fp, indent); | |
2193 | |
2194 switch (node->x.type) { | |
2195 case RE_ALTER: { | |
2196 const re_node *const be = branchend(node->alter.arg, node->alter.next); | |
2197 fputs("branch {\n", fp); | |
2198 dumpnode(fp, node->alter.arg, be, indent + 1); | |
2199 fprintf(fp, "[%p] ", (const void *)node); | |
2200 do_indent(fp, indent); | |
2201 fputs("} or {\n", fp); | |
2202 dumpnode(fp, node->alter.next, be, indent + 1); | |
2203 fprintf(fp, "[%p] ", (const void *)node); | |
2204 do_indent(fp, indent); | |
2205 fputs("} /branch\n", fp); | |
2206 dumpnode(fp, be, end, indent); | |
2207 return; | |
2208 } | |
2209 | |
2210 case RE_ANYCH: | |
2211 fputs("anychar\n", fp); | |
2212 break; | |
2213 | |
2214 case RE_AT_BOL: | |
2215 fputs("at beginning of line\n", fp); | |
2216 break; | |
2217 | |
2218 case RE_AT_BOS: | |
2219 fputs("at beginning of string\n", fp); | |
2220 break; | |
2221 | |
2222 case RE_AT_EOL: | |
2223 fputs("at end of line\n", fp); | |
2224 break; | |
2225 | |
2226 case RE_AT_EOS: | |
2227 fputs("at end of string\n", fp); | |
2228 break; | |
2229 | |
2230 case RE_AT_MATCH: | |
2231 fputs("at match {\n", fp); | |
2232 dumpnode(fp, node->indep.arg, NULL, indent + 1); | |
2233 fprintf(fp, "[%p] ", (const void *)node); | |
2234 do_indent(fp, indent); | |
2235 fputs("} /at match\n", fp); | |
2236 break; | |
2237 | |
2238 case RE_AT_NOMATCH: | |
2239 fputs("not at match {\n", fp); | |
2240 dumpnode(fp, node->indep.arg, NULL, indent + 1); | |
2241 fprintf(fp, "[%p] ", (const void *)node); | |
2242 do_indent(fp, indent); | |
2243 fputs("} /not at match\n", fp); | |
2244 break; | |
2245 | |
2246 case RE_AT_NWBOUND: | |
2247 fputs("not at word boundary\n", fp); | |
2248 break; | |
2249 | |
2250 case RE_AT_WBOUND: | |
2251 fputs("at word boundary\n", fp); | |
2252 break; | |
2253 | |
2254 case RE_BACKCHECK: | |
2255 fprintf(fp, "check capture (%lu)\n", (unsigned long)node->backref.n); | |
2256 break; | |
2257 | |
2258 case RE_BACKREF: | |
2259 fprintf(fp, "backref (%lu)\n", (unsigned long)node->backref.n); | |
2260 break; | |
2261 | |
2262 case RE_BEGIN_CAPTURE: | |
2263 fprintf(fp, "begin capture (%lu)\n", (unsigned long)node->backref.n); | |
2264 break; | |
2265 | |
2266 case RE_CLASS: | |
2267 fputs("character class '", fp); | |
2268 dumpclass(fp, node->class.vec); | |
2269 fputs("'\n", fp); | |
2270 break; | |
2271 | |
2272 case RE_DEFCLASS: | |
2273 fprintf(fp, "character class %c!\n", cdefclass(node->defclass.vec)); | |
2274 break; | |
2275 | |
2276 case RE_END_CAPTURE: | |
2277 fprintf(fp, "end capture (%lu)\n", (unsigned long)node->backref.n); | |
2278 break; | |
2279 | |
2280 case RE_INDEP: | |
2281 fputs("independent submatch {\n", fp); | |
2282 dumpnode(fp, node->indep.arg, NULL, indent + 1); | |
2283 fprintf(fp, "[%p] ", (const void *)node); | |
2284 do_indent(fp, indent); | |
2285 fputs("} /independent submatch\n", fp); | |
2286 break; | |
2287 | |
2288 case RE_LITERAL: | |
2289 fputs("literal \"", fp); | |
2290 dumpliter(fp, node); | |
2291 fputs("\"\n", fp); | |
2292 break; | |
2293 | |
2294 case RE_N_CLASS: | |
2295 fputs("negated character class '", fp); | |
2296 dumpclass(fp, node->class.vec); | |
2297 fputs("'\n", fp); | |
2298 break; | |
2299 | |
2300 case RE_N_DEFCLASS: | |
2301 fprintf(fp, "character class %c!\n", toupper(cdefclass(node->defclass.vec))); | |
2302 break; | |
2303 | |
2304 case RE_PARTIAL: | |
2305 fputs("abbreviation `", fp); | |
2306 dumpliter(fp, node); | |
2307 fputs("`\n", fp); | |
2308 break; | |
2309 | |
2310 case RE_REP_FRUGAL: | |
2311 fputs("repetition ", fp); | |
2312 if (node->rep.min == node->rep.max) { | |
2313 fprintf(fp, ":%lu:?", (unsigned long)node->rep.min); | |
2314 } else if (node->rep.max != (size_t)-1) { | |
2315 fprintf(fp, ":%lu,%lu:?", (unsigned long)node->rep.min, (unsigned long)node->rep.max); | |
2316 } else { | |
2317 switch (node->rep.min) { | |
2318 case 0: fputs("*?", fp); break; | |
2319 case 1: fputs("+?", fp); break; | |
2320 default: fprintf(fp, ":%lu,:?", (unsigned long)node->rep.min); break; | |
2321 } | |
2322 } | |
2323 fputs(" {\n", fp); | |
2324 dumpnode(fp, node->rep.arg, node, indent + 1); | |
2325 fprintf(fp, "[%p] ", (const void *)node); | |
2326 do_indent(fp, indent); | |
2327 fputs("} /repetition\n", fp); | |
2328 break; | |
2329 | |
2330 case RE_REP_GREEDY: | |
2331 fputs("repetition ", fp); | |
2332 if (node->rep.min == node->rep.max) { | |
2333 fprintf(fp, ":%lu:", (unsigned long)node->rep.min); | |
2334 } else if (node->rep.max != (size_t)-1) { | |
2335 fprintf(fp, ":%lu,%lu:", (unsigned long)node->rep.min, (unsigned long)node->rep.max); | |
2336 } else { | |
2337 switch (node->rep.min) { | |
2338 case 0: fputs("*", fp); break; | |
2339 case 1: fputs("+", fp); break; | |
2340 default: fprintf(fp, ":%lu,:", (unsigned long)node->rep.min); break; | |
2341 } | |
2342 } | |
2343 fputs(" {\n", fp); | |
2344 dumpnode(fp, node->rep.arg, node, indent + 1); | |
2345 fprintf(fp, "[%p] ", (const void *)node); | |
2346 do_indent(fp, indent); | |
2347 fputs("} /repetition\n", fp); | |
2348 break; | |
2349 | |
2350 case RE_SELECT: { | |
2351 const re_node *const be = branchend(node->select.arg, node->select.next); | |
2352 fputs("if {\n", fp); | |
2353 dumpnode(fp, node->select.cond, NULL, indent + 1); | |
2354 fprintf(fp, "[%p] ", (const void *)node); | |
2355 do_indent(fp, indent); | |
2356 fputs("} then {\n", fp); | |
2357 dumpnode(fp, node->select.arg, be, indent + 1); | |
2358 if (node->select.next != be) { | |
2359 fprintf(fp, "[%p] ", (const void *)node); | |
2360 do_indent(fp, indent); | |
2361 fputs("} else {\n", fp); | |
2362 dumpnode(fp, node->select.next, be, indent + 1); | |
2363 } | |
2364 fprintf(fp, "[%p] ", (const void *)node); | |
2365 do_indent(fp, indent); | |
2366 fputs("} /if\n", fp); | |
2367 dumpnode(fp, be, end, indent); | |
2368 return; | |
2369 } | |
2370 | |
2371 case RE_STUFF_FRUGAL: | |
2372 fputs(".*?\n", fp); | |
2373 break; | |
2374 | |
2375 case RE_STUFF_GREEDY: | |
2376 fputs(".*\n", fp); | |
2377 break; | |
2378 | |
2379 case RE_XREP_FRUGAL: | |
2380 fputs("simple repetition ", fp); | |
2381 if (node->rep.min == node->rep.max) { | |
2382 fprintf(fp, ":%lu:?", (unsigned long)node->rep.min); | |
2383 } else if (node->rep.max != (size_t)-1) { | |
2384 fprintf(fp, ":%lu,%lu:?", (unsigned long)node->rep.min, (unsigned long)node->rep.max); | |
2385 } else { | |
2386 switch (node->rep.min) { | |
2387 case 0: fputs("*?", fp); break; | |
2388 case 1: fputs("+?", fp); break; | |
2389 default: fprintf(fp, ":%lu,:?", (unsigned long)node->rep.min); break; | |
2390 } | |
2391 } | |
2392 fputs(" {\n", fp); | |
2393 dumpnode(fp, node->rep.arg, NULL, indent + 1); | |
2394 fprintf(fp, "[%p] ", (const void *)node); | |
2395 do_indent(fp, indent); | |
2396 fputs("} /simple repetition\n", fp); | |
2397 break; | |
2398 | |
2399 case RE_XREP_GREEDY: | |
2400 fputs("simple repetition ", fp); | |
2401 if (node->rep.min == node->rep.max) { | |
2402 fprintf(fp, ":%lu:", (unsigned long)node->rep.min); | |
2403 } else if (node->rep.max != (size_t)-1) { | |
2404 fprintf(fp, ":%lu,%lu:", (unsigned long)node->rep.min, (unsigned long)node->rep.max); | |
2405 } else { | |
2406 switch (node->rep.min) { | |
2407 case 0: fputs("*", fp); break; | |
2408 case 1: fputs("+", fp); break; | |
2409 default: fprintf(fp, ":%lu,:", (unsigned long)node->rep.min); break; | |
2410 } | |
2411 } | |
2412 fputs(" {\n", fp); | |
2413 dumpnode(fp, node->rep.arg, NULL, indent + 1); | |
2414 fprintf(fp, "[%p] ", (const void *)node); | |
2415 do_indent(fp, indent); | |
2416 fputs("} /simple repetition\n", fp); | |
2417 break; | |
2418 } | |
2419 dumpnode(fp, node->x.next, end, indent); | |
2420 } | |
2421 | |
2422 void re_dump(const t_regex *re, FILE *fp) { | |
2423 fprintf(fp, "RE: minlen=%lu, anchor=%s\n", (unsigned long)re->minlen, re->flags & FL_ANCHOR_START ? "start" : "none"); | |
2424 dumpnode(fp, re->start, NULL, 0); | |
2425 } | |
2426 | |
2427 static void cat_n(String *s, unsigned long n) { | |
2428 char big_enough[100]; | |
2429 sprintf(big_enough, "%lu", n); | |
2430 St_cat_s(s, big_enough); | |
2431 } | |
2432 | |
2433 static void decom_repeat(String *s, const re_node *node) { | |
2434 assert( | |
2435 node->x.type == RE_REP_FRUGAL || | |
2436 node->x.type == RE_REP_GREEDY || | |
2437 node->x.type == RE_XREP_FRUGAL || | |
2438 node->x.type == RE_XREP_GREEDY | |
2439 ); | |
2440 | |
2441 if (node->rep.min == 0 && node->rep.max == (size_t)-1) { | |
2442 St_cat_c(s, '*'); | |
2443 } else if (node->rep.min == 0 && node->rep.max == 1) { | |
2444 St_cat_c(s, '?'); | |
2445 } else if (node->rep.min == 1 && node->rep.max == (size_t)-1) { | |
2446 St_cat_c(s, '+'); | |
2447 } else if (node->rep.min == node->rep.max) { | |
2448 St_cat_c(s, ':'); | |
2449 cat_n(s, node->rep.min); | |
2450 St_cat_c(s, ':'); | |
2451 } else if (node->rep.max == (size_t)-1) { | |
2452 St_cat_c(s, ':'); | |
2453 cat_n(s, node->rep.min); | |
2454 St_cat_m(s, ",:", 2); | |
2455 } else { | |
2456 St_cat_c(s, ':'); | |
2457 cat_n(s, node->rep.min); | |
2458 St_cat_c(s, ','); | |
2459 cat_n(s, node->rep.max); | |
2460 St_cat_c(s, ':'); | |
2461 } | |
2462 | |
2463 if (node->rep.type == RE_REP_FRUGAL || node->rep.type == RE_XREP_FRUGAL) { | |
2464 St_cat_c(s, '?'); | |
2465 } | |
2466 } | |
2467 | |
2468 static void decom_node(String *s, const re_node *node, const re_node *end) { | |
2469 if (node == end) { | |
2470 return; | |
2471 } | |
2472 | |
2473 assert(node != NULL); | |
2474 switch (node->x.type) { | |
2475 case RE_ALTER: { | |
2476 const re_node *const be = branchend(node->alter.arg, node->alter.next); | |
2477 St_cat_c(s, '('); | |
2478 decom_node(s, node->alter.arg, be); | |
2479 St_cat_c(s, '|'); | |
2480 decom_node(s, node->alter.next, be); | |
2481 St_cat_c(s, ')'); | |
2482 decom_node(s, be, end); | |
2483 break; | |
2484 } | |
2485 | |
2486 case RE_ANYCH: | |
2487 St_cat_c(s, '.'); | |
2488 decom_node(s, node->x.next, end); | |
2489 break; | |
2490 | |
2491 case RE_AT_BOL: | |
2492 St_cat_m(s, "A!", 2); | |
2493 decom_node(s, node->x.next, end); | |
2494 break; | |
2495 | |
2496 case RE_AT_BOS: | |
2497 St_cat_c(s, '^'); | |
2498 decom_node(s, node->x.next, end); | |
2499 break; | |
2500 | |
2501 case RE_AT_EOL: | |
2502 St_cat_m(s, "Z!", 2); | |
2503 decom_node(s, node->x.next, end); | |
2504 break; | |
2505 | |
2506 case RE_AT_EOS: | |
2507 St_cat_c(s, '$'); | |
2508 decom_node(s, node->x.next, end); | |
2509 break; | |
2510 | |
2511 case RE_AT_MATCH: | |
2512 St_cat_c(s, '['); | |
2513 decom_node(s, node->indep.arg, NULL); | |
2514 St_cat_m(s, "&]", 2); | |
2515 decom_node(s, node->indep.next, end); | |
2516 break; | |
2517 | |
2518 case RE_AT_NOMATCH: | |
2519 St_cat_c(s, '['); | |
2520 decom_node(s, node->indep.arg, NULL); | |
2521 St_cat_m(s, "^]", 2); | |
2522 decom_node(s, node->indep.next, end); | |
2523 break; | |
2524 | |
2525 case RE_AT_NWBOUND: | |
2526 St_cat_m(s, "B!", 2); | |
2527 decom_node(s, node->x.next, end); | |
2528 break; | |
2529 | |
2530 case RE_AT_WBOUND: | |
2531 St_cat_m(s, "b!", 2); | |
2532 decom_node(s, node->x.next, end); | |
2533 break; | |
2534 | |
2535 case RE_BACKREF: | |
2536 case RE_BACKCHECK: { | |
2537 String tmp; | |
2538 if (node->x.type == RE_BACKCHECK) { | |
2539 St_cat_c(s, '['); | |
2540 } | |
2541 St_init(&tmp); | |
2542 #if HAVE_VSNPRINTF_P | |
2543 St_xprintf(&tmp, "%lu", (unsigned long)node->backref.n); | |
2544 #else | |
2545 St_num(&tmp, node->backref.n); | |
2546 #endif | |
2547 St_cat(s, &tmp); | |
2548 St_clear(&tmp); | |
2549 if (node->x.type == RE_BACKCHECK) { | |
2550 St_cat_c(s, ']'); | |
2551 } else { | |
2552 St_cat_c(s, ESCAPE); | |
2553 } | |
2554 decom_node(s, node->backref.next, end); | |
2555 break; | |
2556 } | |
2557 | |
2558 case RE_BEGIN_CAPTURE: | |
2559 St_cat_c(s, '{'); | |
2560 decom_node(s, node->x.next, end); | |
2561 break; | |
2562 | |
2563 case RE_CLASS: | |
2564 St_cat_c(s, '\''); | |
2565 decom_class(s, node->class.vec); | |
2566 St_cat_c(s, '\''); | |
2567 decom_node(s, node->class.next, end); | |
2568 break; | |
2569 | |
2570 case RE_DEFCLASS: | |
2571 St_cat_c(s, cdefclass(node->defclass.vec)); | |
2572 St_cat_c(s, ESCAPE); | |
2573 decom_node(s, node->defclass.next, end); | |
2574 break; | |
2575 | |
2576 case RE_END_CAPTURE: | |
2577 St_cat_c(s, '}'); | |
2578 decom_node(s, node->x.next, end); | |
2579 break; | |
2580 | |
2581 case RE_INDEP: | |
2582 St_cat_c(s, '<'); | |
2583 decom_node(s, node->indep.arg, NULL); | |
2584 St_cat_c(s, '>'); | |
2585 decom_node(s, node->indep.next, end); | |
2586 break; | |
2587 | |
2588 case RE_LITERAL: | |
2589 decom_liter(s, node); | |
2590 decom_node(s, node->literal.next, end); | |
2591 break; | |
2592 | |
2593 case RE_N_CLASS: | |
2594 St_cat_m(s, "'^", 2); | |
2595 decom_class(s, node->class.vec); | |
2596 St_cat_c(s, '\''); | |
2597 decom_node(s, node->class.next, end); | |
2598 break; | |
2599 | |
2600 case RE_N_DEFCLASS: | |
2601 St_cat_c(s, toupper(cdefclass(node->defclass.vec))); | |
2602 St_cat_c(s, ESCAPE); | |
2603 decom_node(s, node->defclass.next, end); | |
2604 break; | |
2605 | |
2606 case RE_PARTIAL: | |
2607 St_cat_c(s, '`'); | |
2608 decom_liter(s, node); | |
2609 St_cat_c(s, '`'); | |
2610 break; | |
2611 | |
2612 case RE_REP_FRUGAL: | |
2613 case RE_REP_GREEDY: | |
2614 St_cat_c(s, '('); | |
2615 decom_node(s, node->rep.arg, node); | |
2616 St_cat_c(s, ')'); | |
2617 decom_repeat(s, node); | |
2618 decom_node(s, node->rep.next, end); | |
2619 break; | |
2620 | |
2621 case RE_SELECT: { | |
2622 const re_node *const be = branchend(node->select.arg, node->select.next); | |
2623 St_cat_c(s, '['); | |
2624 if (be != node->select.next) { | |
2625 decom_node(s, node->select.next, be); | |
2626 St_cat_c(s, '|'); | |
2627 } | |
2628 decom_node(s, node->select.arg, be); | |
2629 St_cat_c(s, '('); | |
2630 decom_node(s, node->select.cond, NULL); | |
2631 St_cat_c(s, ')'); | |
2632 St_cat_m(s, "?]", 2); | |
2633 decom_node(s, be, end); | |
2634 break; | |
2635 } | |
2636 | |
2637 case RE_STUFF_FRUGAL: | |
2638 St_cat_m(s, ".*?", 3); | |
2639 decom_node(s, node->x.next, end); | |
2640 break; | |
2641 | |
2642 case RE_STUFF_GREEDY: | |
2643 St_cat_m(s, ".*", 2); | |
2644 decom_node(s, node->x.next, end); | |
2645 break; | |
2646 | |
2647 case RE_XREP_FRUGAL: | |
2648 case RE_XREP_GREEDY: | |
2649 St_cat_c(s, '('); | |
2650 decom_node(s, node->rep.arg, NULL); | |
2651 St_cat_c(s, ')'); | |
2652 decom_repeat(s, node); | |
2653 decom_node(s, node->rep.next, end); | |
2654 break; | |
2655 } | |
2656 } | |
2657 | |
2658 void re_decompile(const t_regex *re, String *s) { | |
2659 St_zero(s); | |
2660 decom_node(s, re->start, NULL); | |
2661 } |