4223
|
1 #include "config.h"
|
|
2 #include "IO.h"
|
|
3 #include "Str.h"
|
|
4 #include "hash.h"
|
|
5 #include "main_io.h"
|
|
6 #include "main_opt.h"
|
|
7 #include "re.h"
|
|
8 #include "xmalloc.h"
|
|
9 #include "zz.h"
|
|
10
|
|
11 #include <ctype.h>
|
|
12 #include <limits.h>
|
|
13 #include <stddef.h>
|
|
14 #include <stdio.h>
|
|
15 #include <string.h>
|
|
16 #include <stdlib.h>
|
|
17 #include <assert.h>
|
|
18
|
|
19 enum {CLASS_SIZE = UCHAR_MAX / CHAR_BIT + 1u};
|
|
20
|
|
21 static unsigned char dc_word[CLASS_SIZE];
|
|
22 static unsigned char dc_alpha[CLASS_SIZE];
|
|
23 static unsigned char dc_cntrl[CLASS_SIZE];
|
|
24 static unsigned char dc_digit[CLASS_SIZE];
|
|
25 static unsigned char dc_lower[CLASS_SIZE];
|
|
26 static unsigned char dc_print[CLASS_SIZE];
|
|
27 static unsigned char dc_space[CLASS_SIZE];
|
|
28 static unsigned char dc_upper[CLASS_SIZE];
|
|
29 static unsigned char dc_xdigit[CLASS_SIZE];
|
|
30
|
|
31 static void fill(unsigned char *v, int (*pred)(int)) {
|
|
32 unsigned char c;
|
|
33
|
|
34 for (c = UCHAR_MAX; c; --c) {
|
|
35 v[c / CHAR_BIT] |= !!pred(c) << c % CHAR_BIT;
|
|
36 }
|
|
37 v[0] |= !!pred(0);
|
|
38 }
|
|
39
|
|
40 static void nfill(unsigned char *v, int (*pred)(int)) {
|
|
41 unsigned char c;
|
|
42
|
|
43 for (c = UCHAR_MAX; c; --c) {
|
|
44 v[c / CHAR_BIT] |= !pred(c) << c % CHAR_BIT;
|
|
45 }
|
|
46 v[0] |= !pred(0);
|
|
47 }
|
|
48
|
|
49 ATTR_CONST
|
|
50 static int siword(int c) {
|
|
51 return c == '_' || isalnum(c);
|
|
52 }
|
|
53
|
|
54 enum {MAGIC = 23};
|
|
55
|
|
56 static Hash rcache;
|
|
57 static String cached[MAGIC];
|
|
58 static size_t cachfill, cachlast;
|
|
59
|
|
60 static size_t hash(const void *s, size_t seed) {
|
|
61 return St_hash(s, seed);
|
|
62 }
|
|
63
|
|
64 static int compar(const void *a, const void *b) {
|
|
65 return St_cmp(a, b);
|
|
66 }
|
|
67
|
|
68 enum t_re_node {
|
|
69 RE_ALTER,
|
|
70 RE_ANYCH,
|
|
71 RE_AT_BOL,
|
|
72 RE_AT_BOS,
|
|
73 RE_AT_EOL,
|
|
74 RE_AT_EOS,
|
|
75 RE_AT_MATCH,
|
|
76 RE_AT_NOMATCH,
|
|
77 RE_AT_NWBOUND,
|
|
78 RE_AT_WBOUND,
|
|
79 RE_BACKCHECK,
|
|
80 RE_BACKREF,
|
|
81 RE_BEGIN_CAPTURE,
|
|
82 RE_CLASS,
|
|
83 RE_DEFCLASS,
|
|
84 RE_END_CAPTURE,
|
|
85 RE_INDEP,
|
|
86 RE_LITERAL,
|
|
87 RE_N_CLASS,
|
|
88 RE_N_DEFCLASS,
|
|
89 RE_PARTIAL,
|
|
90 RE_REP_FRUGAL,
|
|
91 RE_REP_GREEDY,
|
|
92 RE_SELECT,
|
|
93 RE_STUFF_FRUGAL,
|
|
94 RE_STUFF_GREEDY,
|
|
95 RE_XREP_FRUGAL,
|
|
96 RE_XREP_GREEDY
|
|
97 };
|
|
98
|
|
99 #define CMN_HDR enum t_re_node type; union my_node *next
|
|
100
|
|
101 typedef union my_node {
|
|
102 struct {
|
|
103 CMN_HDR;
|
|
104 } x;
|
|
105 struct {
|
|
106 CMN_HDR;
|
|
107 union my_node *arg;
|
|
108 } alter;
|
|
109 struct {
|
|
110 CMN_HDR;
|
|
111 size_t n;
|
|
112 } backref;
|
|
113 struct {
|
|
114 CMN_HDR;
|
|
115 size_t n;
|
|
116 } capture;
|
|
117 struct {
|
|
118 CMN_HDR;
|
|
119 unsigned char vec[CLASS_SIZE];
|
|
120 } class;
|
|
121 struct {
|
|
122 CMN_HDR;
|
|
123 unsigned char *vec;
|
|
124 } defclass;
|
|
125 struct {
|
|
126 CMN_HDR;
|
|
127 union my_node *arg;
|
|
128 } indep;
|
|
129 struct {
|
|
130 CMN_HDR;
|
|
131 unsigned char *buf;
|
|
132 size_t len;
|
|
133 } literal;
|
|
134 struct {
|
|
135 CMN_HDR;
|
|
136 union my_node *arg;
|
|
137 size_t min, max;
|
|
138 size_t n;
|
|
139 } rep;
|
|
140 struct {
|
|
141 CMN_HDR;
|
|
142 union my_node *arg;
|
|
143 union my_node *cond;
|
|
144 } select;
|
|
145 } re_node;
|
|
146
|
|
147 #include "re_block.c.h"
|
|
148
|
|
149 enum re_flags {
|
|
150 FL_NONE = 0,
|
|
151 FL_ANCHOR_START = 1
|
|
152 };
|
|
153
|
|
154 struct cap_state {
|
|
155 size_t start, end, pending;
|
|
156 };
|
|
157
|
|
158 struct my_regex {
|
|
159 re_node *start;
|
|
160 size_t repets;
|
|
161 size_t *repbuf;
|
|
162 t_block alloc;
|
|
163 size_t captures;
|
|
164 struct cap_state *cap_ptr;
|
|
165 size_t refs;
|
|
166 enum re_flags flags;
|
|
167 size_t minlen;
|
|
168 };
|
|
169
|
|
170 static void delv(void *v) {
|
|
171 re_free(v);
|
|
172 }
|
|
173
|
|
174 void re_init(void) {
|
|
175 fill(dc_word, siword);
|
|
176 fill(dc_alpha, isalpha);
|
|
177 fill(dc_cntrl, iscntrl);
|
|
178 fill(dc_digit, isdigit);
|
|
179 fill(dc_lower, islower);
|
|
180 fill(dc_print, isprint);
|
|
181 fill(dc_space, isspace);
|
|
182 fill(dc_upper, isupper);
|
|
183 fill(dc_xdigit, isxdigit);
|
|
184
|
|
185 h_init(&rcache, hash, compar, NULL, delv);
|
|
186 }
|
|
187
|
|
188 void re_end(void) {
|
|
189 h_end(&rcache);
|
|
190 while (cachfill) {
|
|
191 --cachfill;
|
|
192 St_clear(&cached[cachfill]);
|
|
193 }
|
|
194 cachlast = 0;
|
|
195 }
|
|
196
|
|
197
|
|
198 enum {ESCAPE = '!'};
|
|
199 #define MEATCHARP(c) strchr("!|({<[]>})*+?:.'^$`", c)
|
|
200 #define ENDBRANCHP(c) strchr("|({<[", c)
|
|
201
|
|
202 struct parse_context {
|
|
203 t_regex *base;
|
|
204 t_block *block;
|
|
205 const String *s;
|
|
206 size_t *p;
|
|
207 };
|
|
208
|
|
209 static void skipspace(const String *const s, size_t *const p) {
|
|
210 restart:
|
|
211 while (isspace(ST_INDEX(s, *p))) {
|
|
212 --*p;
|
|
213 }
|
|
214 if (ST_INDEX(s, *p) == ']' && ST_INDEX(s, *p - 1u) == '#') {
|
|
215 *p -= 2u;
|
|
216 while (ST_INDEX(s, *p) != EOF) {
|
|
217 if (ST_INDEX(s, *p) == '[') {
|
|
218 --*p;
|
|
219 goto restart;
|
|
220 }
|
|
221 if (ST_INDEX(s, *p) == ESCAPE && ST_INDEX(s, *p - 1u) != EOF) {
|
|
222 --*p;
|
|
223 }
|
|
224 --*p;
|
|
225 }
|
|
226 }
|
|
227 }
|
|
228
|
|
229 #define PRAZ(name) static re_node *name(const struct parse_context *con, re_node *next, re_node *curolan_zebeth)
|
|
230 #define SNORK const String *const s = con->s; size_t *const p = con->p
|
|
231 #define CALL(name, nextn, glob) name(con, nextn, glob)
|
|
232 #define TAILC(name) return CALL(name, next, curolan_zebeth)
|
|
233
|
|
234 PRAZ(quantifire);
|
|
235 PRAZ(branch);
|
|
236 PRAZ(alternation);
|
|
237
|
|
238 PRAZ(atom) {
|
|
239 SNORK;
|
|
240 (void)curolan_zebeth;
|
|
241
|
|
242 skipspace(s, p);
|
|
243 switch (ST_INDEX(s, *p)) {
|
|
244 case '(':
|
|
245 case '<':
|
|
246 case '{':
|
|
247 case '[':
|
|
248 case EOF: {
|
|
249 re_node *cur = bl_node(con->block);
|
|
250 cur->literal.type = RE_LITERAL;
|
|
251 cur->literal.next = next;
|
|
252 cur->literal.buf = NULL;
|
|
253 cur->literal.len = 0;
|
|
254 return cur;
|
|
255 }
|
|
256
|
|
257 case ESCAPE: {
|
|
258 re_node *cur = bl_node(con->block);
|
|
259 --*p;
|
|
260 if (isdigit(ST_INDEX(s, *p))) {
|
|
261 do {
|
|
262 --*p;
|
|
263 } while (isdigit(ST_INDEX(s, *p)));
|
|
264 cur->backref.type = RE_BACKREF;
|
|
265 cur->backref.next = next;
|
|
266 cur->backref.n = strtoul(St_ptr(s) + *p + 1, NULL, 10);
|
|
267 } else if (isalpha(ST_INDEX(s, *p))) {
|
|
268 switch (ST_INDEX(s, *p)) {
|
|
269 case 'A':
|
|
270 cur->x.type = RE_AT_BOL;
|
|
271 cur->x.next = next;
|
|
272 --*p;
|
|
273 break;
|
|
274 case 'Z':
|
|
275 cur->x.type = RE_AT_EOL;
|
|
276 cur->x.next = next;
|
|
277 --*p;
|
|
278 break;
|
|
279
|
|
280 case 'b':
|
|
281 cur->x.type = RE_AT_WBOUND;
|
|
282 cur->x.next = next;
|
|
283 --*p;
|
|
284 break;
|
|
285 case 'B':
|
|
286 cur->x.type = RE_AT_NWBOUND;
|
|
287 cur->x.next = next;
|
|
288 --*p;
|
|
289 break;
|
|
290
|
|
291 case 'c':
|
|
292 cur->defclass.type = RE_DEFCLASS;
|
|
293 cur->defclass.vec = dc_cntrl;
|
|
294 cur->defclass.next = next;
|
|
295 --*p;
|
|
296 break;
|
|
297 case 'C':
|
|
298 cur->defclass.type = RE_N_DEFCLASS;
|
|
299 cur->defclass.vec = dc_cntrl;
|
|
300 cur->defclass.next = next;
|
|
301 --*p;
|
|
302 break;
|
|
303
|
|
304 case 'd':
|
|
305 cur->defclass.type = RE_DEFCLASS;
|
|
306 cur->defclass.vec = dc_digit;
|
|
307 cur->defclass.next = next;
|
|
308 --*p;
|
|
309 break;
|
|
310 case 'D':
|
|
311 cur->defclass.type = RE_N_DEFCLASS;
|
|
312 cur->defclass.vec = dc_digit;
|
|
313 cur->defclass.next = next;
|
|
314 --*p;
|
|
315 break;
|
|
316
|
|
317 case 'l':
|
|
318 cur->defclass.type = RE_DEFCLASS;
|
|
319 cur->defclass.vec = dc_lower;
|
|
320 cur->defclass.next = next;
|
|
321 --*p;
|
|
322 break;
|
|
323 case 'L':
|
|
324 cur->defclass.type = RE_N_DEFCLASS;
|
|
325 cur->defclass.vec = dc_lower;
|
|
326 cur->defclass.next = next;
|
|
327 --*p;
|
|
328 break;
|
|
329
|
|
330 case 'p':
|
|
331 cur->defclass.type = RE_DEFCLASS;
|
|
332 cur->defclass.vec = dc_print;
|
|
333 cur->defclass.next = next;
|
|
334 --*p;
|
|
335 break;
|
|
336 case 'P':
|
|
337 cur->defclass.type = RE_N_DEFCLASS;
|
|
338 cur->defclass.vec = dc_print;
|
|
339 cur->defclass.next = next;
|
|
340 --*p;
|
|
341 break;
|
|
342
|
|
343 case 'q':
|
|
344 cur->defclass.type = RE_DEFCLASS;
|
|
345 cur->defclass.vec = dc_alpha;
|
|
346 cur->defclass.next = next;
|
|
347 --*p;
|
|
348 break;
|
|
349 case 'Q':
|
|
350 cur->defclass.type = RE_N_DEFCLASS;
|
|
351 cur->defclass.vec = dc_alpha;
|
|
352 cur->defclass.next = next;
|
|
353 --*p;
|
|
354 break;
|
|
355
|
|
356 case 's':
|
|
357 cur->defclass.type = RE_DEFCLASS;
|
|
358 cur->defclass.vec = dc_space;
|
|
359 cur->defclass.next = next;
|
|
360 --*p;
|
|
361 break;
|
|
362 case 'S':
|
|
363 cur->defclass.type = RE_N_DEFCLASS;
|
|
364 cur->defclass.vec = dc_space;
|
|
365 cur->defclass.next = next;
|
|
366 --*p;
|
|
367 break;
|
|
368
|
|
369 case 'u':
|
|
370 cur->defclass.type = RE_DEFCLASS;
|
|
371 cur->defclass.vec = dc_upper;
|
|
372 cur->defclass.next = next;
|
|
373 --*p;
|
|
374 break;
|
|
375 case 'U':
|
|
376 cur->defclass.type = RE_N_DEFCLASS;
|
|
377 cur->defclass.vec = dc_upper;
|
|
378 cur->defclass.next = next;
|
|
379 --*p;
|
|
380 break;
|
|
381
|
|
382 case 'w':
|
|
383 cur->defclass.type = RE_DEFCLASS;
|
|
384 cur->defclass.vec = dc_word;
|
|
385 cur->defclass.next = next;
|
|
386 --*p;
|
|
387 break;
|
|
388 case 'W':
|
|
389 cur->defclass.type = RE_N_DEFCLASS;
|
|
390 cur->defclass.vec = dc_word;
|
|
391 cur->defclass.next = next;
|
|
392 --*p;
|
|
393 break;
|
|
394
|
|
395 case 'x':
|
|
396 cur->defclass.type = RE_DEFCLASS;
|
|
397 cur->defclass.vec = dc_xdigit;
|
|
398 cur->defclass.next = next;
|
|
399 --*p;
|
|
400 break;
|
|
401 case 'X':
|
|
402 cur->defclass.type = RE_N_DEFCLASS;
|
|
403 cur->defclass.vec = dc_xdigit;
|
|
404 cur->defclass.next = next;
|
|
405 --*p;
|
|
406 break;
|
|
407
|
|
408 default:
|
|
409 goto default_escape;
|
|
410 }
|
|
411
|
|
412 } else default_escape: {
|
|
413 cur->literal.type = RE_LITERAL;
|
|
414 cur->literal.next = next;
|
|
415 cur->literal.buf = bl_string(con->block, cur->literal.len = 1);
|
|
416 if (ST_INDEX(s, *p) == EOF) {
|
|
417 cur->literal.buf[0] = ESCAPE;
|
|
418 } else {
|
|
419 cur->literal.buf[0] = ST_INDEX(s, *p);
|
|
420 --*p;
|
|
421 }
|
|
422 }
|
|
423 return cur;
|
|
424 }
|
|
425
|
|
426 case ')': {
|
|
427 re_node *cur;
|
|
428 --*p;
|
|
429 cur = CALL(alternation, next, next);
|
|
430 skipspace(s, p);
|
|
431 if (ST_INDEX(s, *p) == '(') {
|
|
432 --*p;
|
|
433 }
|
|
434 return cur;
|
|
435 }
|
|
436
|
|
437 case ']': {
|
|
438 re_node *cur;
|
|
439 switch (ST_INDEX(s, *p - 1u)) {
|
|
440 case '&':
|
|
441 *p -= 2u;
|
|
442 cur = bl_node(con->block);
|
|
443 cur->indep.type = RE_AT_MATCH;
|
|
444 cur->indep.next = next;
|
|
445 cur->indep.arg = CALL(alternation, NULL, NULL);
|
|
446 break;
|
|
447
|
|
448 case '^':
|
|
449 *p -= 2u;
|
|
450 cur = bl_node(con->block);
|
|
451 cur->indep.type = RE_AT_NOMATCH;
|
|
452 cur->indep.next = next;
|
|
453 cur->indep.arg = CALL(alternation, NULL, NULL);
|
|
454 break;
|
|
455
|
|
456 case '?':
|
|
457 *p -= 2u;
|
|
458 cur = bl_node(con->block);
|
|
459 cur->select.type = RE_SELECT;
|
|
460 cur->select.cond = CALL(quantifire, NULL, NULL);
|
|
461 cur->select.arg = CALL(branch, next, next);
|
|
462 skipspace(s, p);
|
|
463 if (ST_INDEX(s, *p) == '|') {
|
|
464 --*p;
|
|
465 cur->select.next = CALL(alternation, next, next);
|
|
466 } else {
|
|
467 cur->select.next = next;
|
|
468 }
|
|
469 break;
|
|
470
|
|
471 case '0': case '1': case '2': case '3': case '4':
|
|
472 case '5': case '6': case '7': case '8': case '9': {
|
|
473 size_t i;
|
|
474
|
|
475 for (i = *p - 2u; isdigit(ST_INDEX(s, i)); --i)
|
|
476 ;
|
|
477 if (ST_INDEX(s, i) != '[') {
|
|
478 goto std_atom;
|
|
479 }
|
|
480 cur = bl_node(con->block);
|
|
481 cur->backref.type = RE_BACKCHECK;
|
|
482 cur->backref.next = next;
|
|
483 cur->backref.n = strtoul(St_ptr(s) + i + 1u, NULL, 10);
|
|
484 *p = i - 1u;
|
|
485
|
|
486 return cur;
|
|
487 }
|
|
488
|
|
489 default:
|
|
490 goto std_atom;
|
|
491 }
|
|
492
|
|
493 skipspace(s, p);
|
|
494 if (ST_INDEX(s, *p) == '[') {
|
|
495 --*p;
|
|
496 }
|
|
497 return cur;
|
|
498 }
|
|
499
|
|
500 case '>': {
|
|
501 re_node *cur = bl_node(con->block);
|
|
502 --*p;
|
|
503 cur->indep.type = RE_INDEP;
|
|
504 cur->indep.next = next;
|
|
505 cur->indep.arg = CALL(alternation, NULL, NULL);
|
|
506 skipspace(s, p);
|
|
507 if (ST_INDEX(s, *p) == '<') {
|
|
508 --*p;
|
|
509 }
|
|
510 return cur;
|
|
511 }
|
|
512
|
|
513 case '}': {
|
|
514 re_node *begin = bl_node(con->block);
|
|
515 re_node *end = bl_node(con->block);
|
|
516 --*p;
|
|
517 begin->capture.n = end->capture.n = con->base->captures++;
|
|
518 end->capture.type = RE_END_CAPTURE;
|
|
519 end->capture.next = next;
|
|
520 begin->capture.type = RE_BEGIN_CAPTURE;
|
|
521 begin->capture.next = CALL(alternation, end, end);
|
|
522 skipspace(s, p);
|
|
523 if (ST_INDEX(s, *p) == '{') {
|
|
524 --*p;
|
|
525 }
|
|
526 return begin;
|
|
527 }
|
|
528
|
|
529 case '^': {
|
|
530 re_node *cur = bl_node(con->block);
|
|
531 cur->x.type = RE_AT_BOS;
|
|
532 cur->x.next = next;
|
|
533 --*p;
|
|
534 return cur;
|
|
535 }
|
|
536
|
|
537 case '$': {
|
|
538 re_node *cur = bl_node(con->block);
|
|
539 cur->x.type = RE_AT_EOS;
|
|
540 cur->x.next = next;
|
|
541 --*p;
|
|
542 return cur;
|
|
543 }
|
|
544
|
|
545 case '.': {
|
|
546 re_node *cur = bl_node(con->block);
|
|
547 cur->x.type = RE_ANYCH;
|
|
548 cur->x.next = next;
|
|
549 --*p;
|
|
550 return cur;
|
|
551 }
|
|
552
|
|
553 case '`': {
|
|
554 String tmp;
|
|
555 int c;
|
|
556 re_node *cur = bl_node(con->block);
|
|
557 cur->literal.type = RE_PARTIAL;
|
|
558 cur->literal.next = next;
|
|
559
|
|
560 --*p;
|
|
561 St_init(&tmp);
|
|
562 while (
|
|
563 (c = ST_INDEX(s, *p)) != EOF &&
|
|
564 c != '`'
|
|
565 ) {
|
|
566 --*p;
|
|
567 if (c == ESCAPE && ST_INDEX(s, *p) != EOF) {
|
|
568 c = ST_INDEX(s, *p);
|
|
569 --*p;
|
|
570 }
|
|
571 St_cat_c(&tmp, c);
|
|
572 }
|
|
573 if (c == '`') {
|
|
574 --*p;
|
|
575 }
|
|
576 St_reverse(&tmp);
|
|
577
|
|
578 cur->literal.buf = bl_string(con->block, cur->literal.len = St_len(&tmp));
|
|
579 memcpy(cur->literal.buf, St_ptr(&tmp), St_len(&tmp));
|
|
580 St_clear(&tmp);
|
|
581
|
|
582 return cur;
|
|
583 }
|
|
584
|
|
585 case '\'': {
|
|
586 int c;
|
|
587 re_node *cur = bl_node(con->block);
|
|
588 cur->class.type = RE_CLASS;
|
|
589 cur->class.next = next;
|
|
590 memset(cur->class.vec, '\0', sizeof cur->class.vec);
|
|
591 for (--*p; (c = ST_INDEX(s, *p)) != EOF && c != '\''; --*p) {
|
|
592 if (c == ESCAPE) {
|
|
593 switch (c = ST_INDEX(s, *p - 1)) {
|
|
594 case EOF:
|
|
595 cur->class.vec[ESCAPE / CHAR_BIT] |= 1u << ESCAPE % CHAR_BIT;
|
|
596 break;
|
|
597
|
|
598 case 'c':
|
|
599 fill(cur->class.vec, iscntrl);
|
|
600 --*p;
|
|
601 break;
|
|
602 case 'C':
|
|
603 nfill(cur->class.vec, iscntrl);
|
|
604 --*p;
|
|
605 break;
|
|
606
|
|
607 case 'd':
|
|
608 fill(cur->class.vec, isdigit);
|
|
609 --*p;
|
|
610 break;
|
|
611 case 'D':
|
|
612 nfill(cur->class.vec, isdigit);
|
|
613 --*p;
|
|
614 break;
|
|
615
|
|
616 case 'l':
|
|
617 fill(cur->class.vec, islower);
|
|
618 --*p;
|
|
619 break;
|
|
620 case 'L':
|
|
621 nfill(cur->class.vec, islower);
|
|
622 --*p;
|
|
623 break;
|
|
624
|
|
625 case 'p':
|
|
626 fill(cur->class.vec, isprint);
|
|
627 --*p;
|
|
628 break;
|
|
629 case 'P':
|
|
630 nfill(cur->class.vec, isprint);
|
|
631 --*p;
|
|
632 break;
|
|
633
|
|
634 case 'q':
|
|
635 fill(cur->class.vec, isalpha);
|
|
636 --*p;
|
|
637 break;
|
|
638 case 'Q':
|
|
639 nfill(cur->class.vec, isalpha);
|
|
640 --*p;
|
|
641 break;
|
|
642
|
|
643 case 's':
|
|
644 fill(cur->class.vec, isspace);
|
|
645 --*p;
|
|
646 break;
|
|
647 case 'S':
|
|
648 nfill(cur->class.vec, isspace);
|
|
649 --*p;
|
|
650 break;
|
|
651
|
|
652 case 'u':
|
|
653 fill(cur->class.vec, isupper);
|
|
654 --*p;
|
|
655 break;
|
|
656 case 'U':
|
|
657 nfill(cur->class.vec, isupper);
|
|
658 --*p;
|
|
659 break;
|
|
660
|
|
661 case 'w':
|
|
662 fill(cur->class.vec, siword);
|
|
663 --*p;
|
|
664 break;
|
|
665 case 'W':
|
|
666 nfill(cur->class.vec, siword);
|
|
667 --*p;
|
|
668 break;
|
|
669
|
|
670 case 'x':
|
|
671 fill(cur->class.vec, isxdigit);
|
|
672 --*p;
|
|
673 break;
|
|
674 case 'X':
|
|
675 nfill(cur->class.vec, isxdigit);
|
|
676 --*p;
|
|
677 break;
|
|
678
|
|
679 default:
|
|
680 cur->class.vec[c / CHAR_BIT] |= 1u << c % CHAR_BIT;
|
|
681 --*p;
|
|
682 break;
|
|
683 }
|
|
684 } else {
|
|
685 int b;
|
|
686 if (ST_INDEX(s, *p - 1) == '-' && (b = ST_INDEX(s, *p - 2)) != EOF) {
|
|
687 if (b == ESCAPE && (b = ST_INDEX(s, *p - 3)) == EOF) {
|
|
688 b = ESCAPE;
|
|
689 }
|
|
690 if (b > c) {
|
|
691 goto normal_char;
|
|
692 }
|
|
693 for (; b < c; ++b) {
|
|
694 cur->class.vec[b / CHAR_BIT] |= 1u << b % CHAR_BIT;
|
|
695 }
|
|
696 cur->class.vec[b / CHAR_BIT] |= 1u << b % CHAR_BIT;
|
|
697 *p -= 2;
|
|
698 if (ST_INDEX(s, *p) == ESCAPE && ST_INDEX(s, *p - 1) != EOF) {
|
|
699 --*p;
|
|
700 }
|
|
701 } else if (ST_INDEX(s, *p) == ']' && *p >= 8 && ST_INDEX(s, *p - 1) == ':') {
|
|
702
|
|
703 #define CXCLASS_CHECK(n, t, f, o, b) if (1) { \
|
|
704 if (memcmp(St_ptr(s) + *p - (n), (t), (n) - 1) != 0) { \
|
|
705 goto normal_char; \
|
|
706 } \
|
|
707 (b)(cur->class.vec, (f)); \
|
|
708 *p -= (o); \
|
|
709 break; \
|
|
710 } else (void)0
|
|
711
|
|
712 #define CCLASS_CHECK(t, f, o) CXCLASS_CHECK(sizeof (t), t, f, o, fill)
|
|
713 #define CNCLASS_CHECK(t, f, o) CXCLASS_CHECK(sizeof (t), t, f, o, nfill)
|
|
714
|
|
715 if (ST_INDEX(s, *p - 8) == '[' && ST_INDEX(s, *p - 7) == ':') {
|
|
716 switch (ST_INDEX(s, *p - 6)) {
|
|
717 case 'a':
|
|
718 if (ST_INDEX(s, *p - 5) != 'l') {
|
|
719 goto normal_char;
|
|
720 }
|
|
721 switch (ST_INDEX(s, *p - 4)) {
|
|
722 case 'n': CCLASS_CHECK("um", isalnum, 8);
|
|
723 case 'p': CCLASS_CHECK("ha", isalpha, 8);
|
|
724 default: goto normal_char;
|
|
725 }
|
|
726 break;
|
|
727
|
|
728 case 'c': CCLASS_CHECK("ntrl", iscntrl, 8);
|
|
729 case 'd': CCLASS_CHECK("igit", isdigit, 8);
|
|
730 case 'g': CCLASS_CHECK("raph", isgraph, 8);
|
|
731 case 'l': CCLASS_CHECK("ower", islower, 8);
|
|
732
|
|
733 case 'p':
|
|
734 switch (ST_INDEX(s, *p - 5)) {
|
|
735 case 'r': CCLASS_CHECK("int", isprint, 8);
|
|
736 case 'u': CCLASS_CHECK("nct", ispunct, 8);
|
|
737 default: goto normal_char;
|
|
738 }
|
|
739 break;
|
|
740
|
|
741 case 's': CCLASS_CHECK("pace", isspace, 8);
|
|
742 case 'u': CCLASS_CHECK("pper", isupper, 8);
|
|
743 default: goto normal_char;
|
|
744 }
|
|
745 } else if (*p >= 9 && ST_INDEX(s, *p - 9) == '[' && ST_INDEX(s, *p - 8) == ':') {
|
|
746 switch (ST_INDEX(s, *p - 7)) {
|
|
747 case 'x':
|
|
748 CCLASS_CHECK("digit", isxdigit, 9);
|
|
749
|
|
750 case '^':
|
|
751 switch (ST_INDEX(s, *p - 6)) {
|
|
752 case 'a':
|
|
753 if (ST_INDEX(s, *p - 5) != 'l') {
|
|
754 goto normal_char;
|
|
755 }
|
|
756 switch (ST_INDEX(s, *p - 4)) {
|
|
757 case 'n': CNCLASS_CHECK("um", isalnum, 9);
|
|
758 case 'p': CNCLASS_CHECK("ha", isalpha, 9);
|
|
759 default: goto normal_char;
|
|
760 }
|
|
761 break;
|
|
762
|
|
763 case 'c': CNCLASS_CHECK("ntrl", iscntrl, 9);
|
|
764 case 'd': CNCLASS_CHECK("igit", isdigit, 9);
|
|
765 case 'g': CNCLASS_CHECK("raph", isgraph, 9);
|
|
766 case 'l': CNCLASS_CHECK("ower", islower, 9);
|
|
767
|
|
768 case 'p':
|
|
769 switch (ST_INDEX(s, *p - 5)) {
|
|
770 case 'r': CNCLASS_CHECK("int", isprint, 9);
|
|
771 case 'u': CNCLASS_CHECK("nct", ispunct, 9);
|
|
772 default: goto normal_char;
|
|
773 }
|
|
774 break;
|
|
775
|
|
776 case 's': CNCLASS_CHECK("pace", isspace, 9);
|
|
777 case 'u': CNCLASS_CHECK("pper", isupper, 9);
|
|
778 default: goto normal_char;
|
|
779 }
|
|
780 break;
|
|
781
|
|
782 default: goto normal_char;
|
|
783 }
|
|
784 } else if (*p >= 10 && memcmp(St_ptr(s) + *p - 10, "[:^xdigit", 9) == 0) {
|
|
785 nfill(cur->class.vec, isxdigit);
|
|
786 *p -= 10;
|
|
787 } else {
|
|
788 goto normal_char;
|
|
789 }
|
|
790 } else if (ST_INDEX(s, *p) == '^' && ((b = ST_INDEX(s, *p - 1) == '\'') || b == EOF)) {
|
|
791 cur->class.type = RE_N_CLASS;
|
|
792 } else normal_char: {
|
|
793 cur->class.vec[c / CHAR_BIT] |= 1u << c % CHAR_BIT;
|
|
794 }
|
|
795 }
|
|
796 }
|
|
797 if (c == '\'') {
|
|
798 --*p;
|
|
799 }
|
|
800
|
|
801 if (cur->class.type == RE_CLASS) {
|
|
802 size_t i;
|
|
803 for (i = 0; i < CLASS_SIZE; ++i) {
|
|
804 if (cur->class.vec[i] != (unsigned char)~0u) {
|
|
805 return cur;
|
|
806 }
|
|
807 }
|
|
808 cur->x.type = RE_ANYCH;
|
|
809 } else {
|
|
810 size_t i;
|
|
811 assert(cur->class.type == RE_N_CLASS);
|
|
812 for (i = 0; i < CLASS_SIZE; ++i) {
|
|
813 if (cur->class.vec[i] != 0) {
|
|
814 return cur;
|
|
815 }
|
|
816 }
|
|
817 cur->x.type = RE_ANYCH;
|
|
818 }
|
|
819
|
|
820 return cur;
|
|
821 }
|
|
822
|
|
823 std_atom:
|
|
824 default: {
|
|
825 re_node *cur = bl_node(con->block);
|
|
826 cur->literal.type = RE_LITERAL;
|
|
827
|
|
828 /*XXX?*/
|
|
829 if (next && next->x.type == RE_LITERAL) {
|
|
830 if (next->literal.len) {
|
|
831 cur->literal.buf = bl_string(con->block, cur->literal.len = 1u + next->literal.len);
|
|
832 memcpy(cur->literal.buf + 1, next->literal.buf, next->literal.len);
|
|
833 }
|
|
834 cur->literal.next = next->literal.next;
|
|
835 } else {
|
|
836 cur->literal.next = next;
|
|
837 cur->literal.buf = bl_string(con->block, cur->literal.len = 1);
|
|
838 }
|
|
839
|
|
840 cur->literal.buf[0] = ST_INDEX(s, *p);
|
|
841 --*p;
|
|
842 return cur;
|
|
843 }
|
|
844 }
|
|
845 }
|
|
846
|
|
847 PRAZ(literal) {
|
|
848 SNORK;
|
|
849 int c;
|
|
850 String tmp;
|
|
851 re_node *cur;
|
|
852
|
|
853 skipspace(s, p);
|
|
854 if ((ST_INDEX(s, *p) != ESCAPE || isalnum(ST_INDEX(s, *p - 1))) && MEATCHARP(ST_INDEX(s, *p))) {
|
|
855 TAILC(atom);
|
|
856 }
|
|
857
|
|
858 cur = bl_node(con->block);
|
|
859 cur->literal.type = RE_LITERAL;
|
|
860 cur->literal.next = next;
|
|
861
|
|
862 St_init(&tmp);
|
|
863 while (
|
|
864 (c = ST_INDEX(s, *p)) != EOF &&
|
|
865 (
|
|
866 !MEATCHARP(c) ||
|
|
867 (c == ESCAPE && !isalnum(ST_INDEX(s, *p - 1)))
|
|
868 )
|
|
869 ) {
|
|
870 --*p;
|
|
871 if (c == ESCAPE && ST_INDEX(s, *p) != EOF) {
|
|
872 c = ST_INDEX(s, *p);
|
|
873 --*p;
|
|
874 }
|
|
875 St_cat_c(&tmp, c);
|
|
876 skipspace(s, p);
|
|
877 }
|
|
878 St_reverse(&tmp);
|
|
879
|
|
880 if (next && next->x.type == RE_LITERAL) {
|
|
881 if (next->literal.len) {
|
|
882 St_cat_m(&tmp, next->literal.buf, next->literal.len);
|
|
883 }
|
|
884 cur->literal.next = next->literal.next;
|
|
885 }
|
|
886
|
|
887 cur->literal.buf = bl_string(con->block, cur->literal.len = St_len(&tmp));
|
|
888 memcpy(cur->literal.buf, St_ptr(&tmp), St_len(&tmp));
|
|
889 St_clear(&tmp);
|
|
890
|
|
891 return cur;
|
|
892 }
|
|
893
|
|
894 #define RETURN_REP(lo, hi, grdy) \
|
|
895 do { \
|
|
896 re_node *cur = bl_node(con->block); \
|
|
897 cur->rep.type = (grdy); \
|
|
898 cur->rep.next = next; \
|
|
899 cur->rep.min = (lo); \
|
|
900 cur->rep.max = (hi); \
|
|
901 cur->rep.n = con->base->repets++; \
|
|
902 cur->rep.arg = CALL(atom, cur, NULL); \
|
|
903 return cur; \
|
|
904 } while (0)
|
|
905
|
|
906 #define STD_REP_CASE(lo, hi, grdy) \
|
|
907 do { \
|
|
908 --*p; \
|
|
909 RETURN_REP(lo, hi, grdy); \
|
|
910 } while (0)
|
|
911
|
|
912 #define CUSTOM_REP_CASE(off, grdy, mismatch) \
|
|
913 do { \
|
|
914 size_t from, to; \
|
|
915 to = *p - 1 - (off); \
|
|
916 while (isdigit(ST_INDEX(s, to))) { \
|
|
917 --to; \
|
|
918 } \
|
|
919 if (ST_INDEX(s, to) == ':') { \
|
|
920 *p = to - 1; \
|
|
921 to = ST_INDEX(s, to + 1) == ':' ? (size_t)-1 : strtoul(St_ptr(s) + to + 1, NULL, 10); \
|
|
922 from = to; \
|
|
923 } else { \
|
|
924 if (ST_INDEX(s, to) != ',') { \
|
|
925 mismatch; \
|
|
926 } \
|
|
927 from = to - 1; \
|
|
928 if (!isdigit(ST_INDEX(s, from))) { \
|
|
929 mismatch; \
|
|
930 } \
|
|
931 do { \
|
|
932 --from; \
|
|
933 } while (isdigit(ST_INDEX(s, from))); \
|
|
934 if (ST_INDEX(s, from) != ':') { \
|
|
935 mismatch; \
|
|
936 } \
|
|
937 *p = from - 1; \
|
|
938 from = strtoul(St_ptr(s) + from + 1, NULL, 10); \
|
|
939 to = ST_INDEX(s, to + 1) == ':' ? (size_t)-1 : strtoul(St_ptr(s) + to + 1, NULL, 10); \
|
|
940 } \
|
|
941 if (to == 1) { \
|
|
942 if (from == 1) { \
|
|
943 TAILC(atom); \
|
|
944 } \
|
|
945 if (from == 0) { \
|
|
946 re_node *cur = bl_node(con->block); \
|
|
947 cur->alter.type = RE_ALTER; \
|
|
948 if ((grdy) == RE_REP_GREEDY) { \
|
|
949 cur->alter.next = next; \
|
|
950 cur->alter.arg = CALL(atom, next, NULL); \
|
|
951 } else { \
|
|
952 cur->alter.arg = next; \
|
|
953 cur->alter.next = CALL(atom, next, NULL); \
|
|
954 } \
|
|
955 return cur; \
|
|
956 } \
|
|
957 } \
|
|
958 RETURN_REP(from, to, grdy); \
|
|
959 } while (0)
|
|
960
|
|
961 PRAZ(quantifire) {
|
|
962 SNORK;
|
|
963
|
|
964 skipspace(s, p);
|
|
965 switch (ST_INDEX(s, *p)) {
|
|
966 case EOF: return next;
|
|
967
|
|
968 case '?':
|
|
969 switch (ST_INDEX(s, *p - 1)) {
|
|
970 case ':': CUSTOM_REP_CASE(1, RE_REP_FRUGAL, goto hook);
|
|
971 case '*': --*p; STD_REP_CASE(0, -1, RE_REP_FRUGAL);
|
|
972 case '+': --*p; STD_REP_CASE(1, -1, RE_REP_FRUGAL);
|
|
973
|
|
974 case '?': {
|
|
975 re_node *cur = bl_node(con->block);
|
|
976 *p -= 2;
|
|
977 cur->alter.type = RE_ALTER;
|
|
978 cur->alter.next = CALL(atom, next, NULL);
|
|
979 cur->alter.arg = next;
|
|
980 return cur;
|
|
981 }
|
|
982
|
|
983 hook:
|
|
984 default: {
|
|
985 re_node *cur = bl_node(con->block);
|
|
986 --*p;
|
|
987 cur->alter.type = RE_ALTER;
|
|
988 cur->alter.next = next;
|
|
989 cur->alter.arg = CALL(atom, next, NULL);
|
|
990 return cur;
|
|
991 }
|
|
992 }
|
|
993
|
|
994 case ':': CUSTOM_REP_CASE(0, RE_REP_GREEDY, goto noquanti);
|
|
995 case '*': STD_REP_CASE(0, -1, RE_REP_GREEDY);
|
|
996 case '+': STD_REP_CASE(1, -1, RE_REP_GREEDY);
|
|
997
|
|
998 noquanti:
|
|
999 default: TAILC(literal);
|
|
1000 }
|
|
1001 }
|
|
1002
|
|
1003 PRAZ(branch) {
|
|
1004 SNORK;
|
|
1005 skipspace(s, p);
|
|
1006 while (ST_INDEX(s, *p) != EOF && !ENDBRANCHP(ST_INDEX(s, *p))) {
|
|
1007 next = CALL(quantifire, next, curolan_zebeth);
|
|
1008 skipspace(s, p);
|
|
1009 }
|
|
1010 return next;
|
|
1011 }
|
|
1012
|
|
1013 PRAZ(alternation) {
|
|
1014 SNORK;
|
|
1015 re_node *cur;
|
|
1016
|
|
1017 skipspace(s, p);
|
|
1018 if (ST_INDEX(s, *p) == EOF) {
|
|
1019 return next;
|
|
1020 }
|
|
1021
|
|
1022 next = CALL(branch, next, curolan_zebeth);
|
|
1023 skipspace(s, p);
|
|
1024 if (ST_INDEX(s, *p) != '|') {
|
|
1025 return next;
|
|
1026 }
|
|
1027
|
|
1028 --*p;
|
|
1029 cur = bl_node(con->block);
|
|
1030 cur->alter.type = RE_ALTER;
|
|
1031 cur->alter.next = next;
|
|
1032 cur->alter.arg = CALL(alternation, curolan_zebeth, curolan_zebeth);
|
|
1033 return cur;
|
|
1034 }
|
|
1035
|
|
1036 ATTR_CONST
|
|
1037 static size_t minimum(const size_t a, const size_t b) {
|
|
1038 return a < b ? a : b;
|
|
1039 }
|
|
1040
|
|
1041 ATTR_CONST
|
|
1042 static size_t maximum(const size_t a, const size_t b) {
|
|
1043 return a > b ? a : b;
|
|
1044 }
|
|
1045
|
|
1046 ATTR_PURE
|
|
1047 static size_t subminlen(const re_node *node, const re_node *end, size_t start) {
|
|
1048 if (!node || node == end) {
|
|
1049 return start;
|
|
1050 }
|
|
1051
|
|
1052 switch (node->x.type) {
|
|
1053 case RE_ALTER:
|
|
1054 return minimum(
|
|
1055 subminlen(node->alter.arg, end, start),
|
|
1056 subminlen(node->alter.next, end, start)
|
|
1057 );
|
|
1058
|
|
1059 case RE_ANYCH:
|
|
1060 case RE_CLASS:
|
|
1061 case RE_DEFCLASS:
|
|
1062 case RE_N_CLASS:
|
|
1063 case RE_N_DEFCLASS:
|
|
1064 return subminlen(node->x.next, end, start + 1u);
|
|
1065
|
|
1066 case RE_AT_BOL:
|
|
1067 case RE_AT_BOS:
|
|
1068 case RE_AT_EOL:
|
|
1069 case RE_AT_EOS:
|
|
1070 case RE_AT_NWBOUND:
|
|
1071 case RE_AT_WBOUND:
|
|
1072 case RE_BACKCHECK:
|
|
1073 case RE_BACKREF:
|
|
1074 case RE_BEGIN_CAPTURE:
|
|
1075 case RE_END_CAPTURE:
|
|
1076 case RE_PARTIAL:
|
|
1077 case RE_STUFF_FRUGAL:
|
|
1078 case RE_STUFF_GREEDY:
|
|
1079 return subminlen(node->x.next, end, start);
|
|
1080
|
|
1081 case RE_AT_MATCH:
|
|
1082 return maximum(
|
|
1083 subminlen(node->indep.arg, end, start),
|
|
1084 subminlen(node->indep.next, end, start)
|
|
1085 );
|
|
1086
|
|
1087 case RE_AT_NOMATCH:
|
|
1088 return subminlen(node->indep.next, end, start);
|
|
1089
|
|
1090 case RE_INDEP:
|
|
1091 return subminlen(node->indep.next, end, start + subminlen(node->indep.arg, NULL, 0));
|
|
1092
|
|
1093 case RE_LITERAL:
|
|
1094 return subminlen(node->literal.next, end, start + node->literal.len);
|
|
1095
|
|
1096 case RE_SELECT:
|
|
1097 return minimum(
|
|
1098 subminlen(node->select.cond, end, start) + subminlen(node->select.arg, end, start),
|
|
1099 subminlen(node->select.next, end, start)
|
|
1100 );
|
|
1101
|
|
1102 case RE_REP_FRUGAL:
|
|
1103 case RE_REP_GREEDY:
|
|
1104 if (node->rep.min == 0) {
|
|
1105 return subminlen(node->rep.next, end, start);
|
|
1106 }
|
|
1107 return subminlen(node->rep.next, end, start + node->rep.min * subminlen(node->rep.arg, node, 0));
|
|
1108
|
|
1109 case RE_XREP_FRUGAL:
|
|
1110 case RE_XREP_GREEDY:
|
|
1111 if (node->rep.min == 0) {
|
|
1112 return subminlen(node->rep.next, end, start);
|
|
1113 }
|
|
1114 return subminlen(node->rep.next, end, start + node->rep.min * subminlen(node->rep.arg, NULL, 0));
|
|
1115 }
|
|
1116
|
|
1117 NOTREACHED;
|
|
1118 }
|
|
1119
|
|
1120 ATTR_PURE
|
|
1121 static size_t minlen(const re_node *node) {
|
|
1122 return subminlen(node, NULL, 0);
|
|
1123 }
|
|
1124
|
|
1125 static void reinit(t_regex *re) {
|
|
1126 const struct cap_state null = { -1, -1, -1 };
|
|
1127 size_t i;
|
|
1128 for (i = 0; i < re->captures; ++i) {
|
|
1129 re->cap_ptr[i] = null;
|
|
1130 }
|
|
1131 }
|
|
1132
|
|
1133 static re_node **notcomplex(re_node **node, const re_node *end) {
|
|
1134 assert(*node != NULL);
|
|
1135
|
|
1136 if (*node == end) {
|
|
1137 return node;
|
|
1138 }
|
|
1139
|
|
1140 switch ((*node)->x.type) {
|
|
1141 case RE_ALTER:
|
|
1142 case RE_AT_MATCH:
|
|
1143 case RE_AT_NOMATCH:
|
|
1144 case RE_BACKCHECK:
|
|
1145 case RE_BACKREF:
|
|
1146 case RE_BEGIN_CAPTURE:
|
|
1147 case RE_END_CAPTURE:
|
|
1148 case RE_INDEP:
|
|
1149 case RE_PARTIAL:
|
|
1150 case RE_REP_FRUGAL:
|
|
1151 case RE_REP_GREEDY:
|
|
1152 case RE_SELECT:
|
|
1153 case RE_STUFF_FRUGAL:
|
|
1154 case RE_STUFF_GREEDY:
|
|
1155 return NULL;
|
|
1156
|
|
1157 case RE_XREP_FRUGAL:
|
|
1158 case RE_XREP_GREEDY:
|
|
1159 if ((*node)->rep.min == (*node)->rep.max) {
|
|
1160 return notcomplex(&(*node)->rep.next, end);
|
|
1161 }
|
|
1162 return NULL;
|
|
1163
|
|
1164 case RE_ANYCH:
|
|
1165 case RE_AT_BOL:
|
|
1166 case RE_AT_BOS:
|
|
1167 case RE_AT_EOL:
|
|
1168 case RE_AT_EOS:
|
|
1169 case RE_AT_NWBOUND:
|
|
1170 case RE_AT_WBOUND:
|
|
1171 case RE_CLASS:
|
|
1172 case RE_DEFCLASS:
|
|
1173 case RE_LITERAL:
|
|
1174 case RE_N_CLASS:
|
|
1175 case RE_N_DEFCLASS:
|
|
1176 return notcomplex(&(*node)->x.next, end);
|
|
1177 }
|
|
1178
|
|
1179 NOTREACHED;
|
|
1180 }
|
|
1181
|
|
1182 static void trysimpl(re_node *node) {
|
|
1183 assert(node->x.type == RE_REP_FRUGAL || node->x.type == RE_REP_GREEDY);
|
|
1184
|
|
1185 {
|
|
1186 re_node **const tmp = notcomplex(&node->rep.arg, node);
|
|
1187 if (!tmp) {
|
|
1188 return;
|
|
1189 }
|
|
1190 *tmp = NULL;
|
|
1191 }
|
|
1192
|
|
1193 switch (node->rep.type) {
|
|
1194 case RE_REP_FRUGAL:
|
|
1195 if (
|
|
1196 node->rep.arg->x.type == RE_ANYCH &&
|
|
1197 node->rep.arg->x.next == NULL &&
|
|
1198 node->rep.max == (size_t)-1
|
|
1199 ) {
|
|
1200 if (node->rep.min == 0) {
|
|
1201 node->x.type = RE_STUFF_FRUGAL;
|
|
1202 break;
|
|
1203 }
|
|
1204 if (node->rep.min == 1) {
|
|
1205 node->rep.arg->x.next = node->x.next;
|
|
1206 node->rep.arg->x.type = RE_STUFF_FRUGAL;
|
|
1207 node->x.next = node->rep.arg;
|
|
1208 node->x.type = RE_ANYCH;
|
|
1209 break;
|
|
1210 }
|
|
1211 }
|
|
1212
|
|
1213 node->rep.n = minlen(node->rep.arg);
|
|
1214 node->rep.type = RE_XREP_FRUGAL;
|
|
1215 break;
|
|
1216
|
|
1217 case RE_REP_GREEDY:
|
|
1218 if (
|
|
1219 node->rep.arg->x.type == RE_ANYCH &&
|
|
1220 node->rep.arg->x.next == NULL &&
|
|
1221 node->rep.max == (size_t)-1
|
|
1222 ) {
|
|
1223 if (node->rep.min == 0) {
|
|
1224 node->x.type = RE_STUFF_GREEDY;
|
|
1225 break;
|
|
1226 }
|
|
1227 if (node->rep.min == 1) {
|
|
1228 node->rep.arg->x.next = node->x.next;
|
|
1229 node->rep.arg->x.type = RE_STUFF_GREEDY;
|
|
1230 node->x.next = node->rep.arg;
|
|
1231 node->x.type = RE_ANYCH;
|
|
1232 break;
|
|
1233 }
|
|
1234 }
|
|
1235
|
|
1236 node->rep.n = minlen(node->rep.arg);
|
|
1237 node->rep.type = RE_XREP_GREEDY;
|
|
1238 break;
|
|
1239
|
|
1240 default:
|
|
1241 NOTREACHED;
|
|
1242 }
|
|
1243
|
|
1244 }
|
|
1245
|
|
1246 static void simplerep(re_node *node, const re_node *const end) {
|
|
1247 if (!node || node == end) {
|
|
1248 return;
|
|
1249 }
|
|
1250
|
|
1251 switch (node->x.type) {
|
|
1252 case RE_ALTER:
|
|
1253 simplerep(node->alter.arg, end);
|
|
1254 simplerep(node->alter.next, end);
|
|
1255 return;
|
|
1256
|
|
1257 case RE_AT_MATCH:
|
|
1258 case RE_AT_NOMATCH:
|
|
1259 case RE_INDEP:
|
|
1260 simplerep(node->indep.arg, end);
|
|
1261 simplerep(node->indep.next, end);
|
|
1262 return;
|
|
1263
|
|
1264 case RE_ANYCH:
|
|
1265 case RE_AT_BOL:
|
|
1266 case RE_AT_BOS:
|
|
1267 case RE_AT_EOL:
|
|
1268 case RE_AT_EOS:
|
|
1269 case RE_AT_NWBOUND:
|
|
1270 case RE_AT_WBOUND:
|
|
1271 case RE_BACKCHECK:
|
|
1272 case RE_BACKREF:
|
|
1273 case RE_BEGIN_CAPTURE:
|
|
1274 case RE_CLASS:
|
|
1275 case RE_DEFCLASS:
|
|
1276 case RE_END_CAPTURE:
|
|
1277 case RE_LITERAL:
|
|
1278 case RE_N_CLASS:
|
|
1279 case RE_N_DEFCLASS:
|
|
1280 case RE_PARTIAL:
|
|
1281 simplerep(node->x.next, end);
|
|
1282 return;
|
|
1283
|
|
1284 case RE_REP_FRUGAL:
|
|
1285 case RE_REP_GREEDY:
|
|
1286 simplerep(node->rep.next, end);
|
|
1287 simplerep(node->rep.arg, node);
|
|
1288 trysimpl(node);
|
|
1289 return;
|
|
1290
|
|
1291 case RE_SELECT:
|
|
1292 simplerep(node->select.cond, end);
|
|
1293 simplerep(node->select.arg, end);
|
|
1294 simplerep(node->select.next, end);
|
|
1295 return;
|
|
1296
|
|
1297 case RE_STUFF_FRUGAL:
|
|
1298 case RE_STUFF_GREEDY:
|
|
1299 case RE_XREP_FRUGAL:
|
|
1300 case RE_XREP_GREEDY:
|
|
1301 return;
|
|
1302 }
|
|
1303
|
|
1304 NOTREACHED;
|
|
1305 }
|
|
1306
|
|
1307 ATTR_PURE
|
|
1308 static int anchorbegp(const re_node *node) {
|
|
1309 return
|
|
1310 node &&
|
|
1311 (
|
|
1312 node->x.type == RE_AT_BOS ||
|
|
1313 node->x.type == RE_STUFF_FRUGAL ||
|
|
1314 node->x.type == RE_STUFF_GREEDY ||
|
|
1315 (
|
|
1316 node->x.type == RE_ALTER &&
|
|
1317 anchorbegp(node->alter.arg) &&
|
|
1318 anchorbegp(node->alter.next)
|
|
1319 ) ||
|
|
1320 (
|
|
1321 node->x.type == RE_INDEP &&
|
|
1322 anchorbegp(node->indep.arg)
|
|
1323 ) ||
|
|
1324 (
|
|
1325 node->x.type == RE_AT_MATCH &&
|
|
1326 (
|
|
1327 anchorbegp(node->indep.arg) ||
|
|
1328 anchorbegp(node->indep.next)
|
|
1329 )
|
|
1330 ) ||
|
|
1331 (
|
|
1332 (
|
|
1333 node->x.type == RE_AT_NOMATCH ||
|
|
1334 node->x.type == RE_AT_BOL ||
|
|
1335 node->x.type == RE_AT_EOL ||
|
|
1336 node->x.type == RE_AT_EOS ||
|
|
1337 node->x.type == RE_AT_NWBOUND ||
|
|
1338 node->x.type == RE_AT_WBOUND ||
|
|
1339 node->x.type == RE_BEGIN_CAPTURE ||
|
|
1340 node->x.type == RE_BACKCHECK
|
|
1341 ) &&
|
|
1342 anchorbegp(node->x.next)
|
|
1343 )
|
|
1344 )
|
|
1345 ;
|
|
1346 }
|
|
1347
|
|
1348 static void dostuff(t_regex *re) {
|
|
1349 simplerep(re->start, NULL);
|
|
1350
|
|
1351 if (!re->start || anchorbegp(re->start)) {
|
|
1352 re->flags |= FL_ANCHOR_START;
|
|
1353 }
|
|
1354 }
|
|
1355
|
|
1356 t_regex *re_compile(const String *s) {
|
|
1357 size_t p;
|
|
1358 struct parse_context dcon;
|
|
1359 t_regex *re;
|
|
1360 void *pre;
|
|
1361
|
|
1362 if (h_get(&rcache, s, &pre) == H_OK) {
|
|
1363 re = pre;
|
|
1364 ++re->refs;
|
|
1365 return re;
|
|
1366 }
|
|
1367
|
|
1368 re = xmalloc(1, sizeof *re);
|
|
1369 re->flags = FL_NONE;
|
|
1370 re->refs = 0;
|
|
1371 re->repets = 0;
|
|
1372 bl_init(&re->alloc);
|
|
1373 re->cap_ptr = NULL;
|
|
1374 re->captures = 0;
|
|
1375 dcon.base = re;
|
|
1376 dcon.block = &re->alloc;
|
|
1377 dcon.s = s;
|
|
1378 p = St_len(s) - 1;
|
|
1379 dcon.p = &p;
|
|
1380
|
|
1381 re->start = alternation(&dcon, NULL, NULL);
|
|
1382
|
|
1383 if (re->captures) {
|
|
1384 re->cap_ptr = xmalloc(re->captures, sizeof *re->cap_ptr);
|
|
1385 reinit(re);
|
|
1386 }
|
|
1387
|
|
1388 dostuff(re);
|
|
1389 re->minlen = minlen(re->start);
|
|
1390 re->repbuf = xmalloc(re->repets, sizeof *re->repbuf);
|
|
1391
|
|
1392 if (cachfill < sizeof cached / sizeof cached[0]) {
|
|
1393 St_init(&cached[cachfill]);
|
|
1394 St_cpy(&cached[cachfill], s);
|
|
1395 ++re->refs;
|
|
1396 h_put(&rcache, &cached[cachfill], re, 1);
|
|
1397 ++cachfill;
|
|
1398 } else {
|
|
1399 h_del(&rcache, &cached[cachlast]);
|
|
1400 St_cpy(&cached[cachlast], s);
|
|
1401 ++re->refs;
|
|
1402 h_put(&rcache, &cached[cachlast], re, 1);
|
|
1403 cachlast = (cachlast + 1u) % (sizeof cached / sizeof cached[0]);
|
|
1404 }
|
|
1405
|
|
1406 if (Opt.debug & DBG_REGEX) {
|
|
1407 io_write_s(Err, "Compiled ");
|
|
1408 re_dump(re, io_fp(Err));
|
|
1409 }
|
|
1410
|
|
1411 return re;
|
|
1412 }
|
|
1413
|
|
1414 t_regex *re_dup(t_regex *re) {
|
|
1415 ++re->refs;
|
|
1416 return re;
|
|
1417 }
|
|
1418
|
|
1419 void re_free(t_regex *re) {
|
|
1420 if (re->refs) {
|
|
1421 --re->refs;
|
|
1422 return;
|
|
1423 }
|
|
1424 bl_free(&re->alloc);
|
|
1425 xfree(re->repbuf);
|
|
1426 xfree(re->cap_ptr);
|
|
1427 xfree(re);
|
|
1428 }
|
|
1429
|
|
1430 ATTR_PURE
|
|
1431 static int match_class(unsigned char c, const unsigned char *vec) {
|
|
1432 return vec[c / CHAR_BIT] & 1u << c % CHAR_BIT;
|
|
1433 }
|
|
1434
|
|
1435 static void *mem_dup(const void *p, size_t n, size_t m) {
|
|
1436 void *q = xmalloc(n, m);
|
|
1437 memcpy(q, p, n * m);
|
|
1438 return q;
|
|
1439 }
|
|
1440
|
|
1441 #define NO_MATCH ((size_t)-1)
|
|
1442
|
|
1443 #define Mmatch0(Xmatch, Xindex) \
|
|
1444 size_t tmp; \
|
|
1445 \
|
|
1446 if (!node) { \
|
|
1447 return o; \
|
|
1448 } \
|
|
1449 \
|
|
1450 switch (node->x.type) { \
|
|
1451 case RE_ALTER: \
|
|
1452 if ((tmp = Xmatch(base, node->alter.arg, s, o)) == NO_MATCH) { \
|
|
1453 return Xmatch(base, node->alter.next, s, o); \
|
|
1454 } \
|
|
1455 return tmp; \
|
|
1456 \
|
|
1457 case RE_ANYCH: \
|
|
1458 if (Xindex(s, o) != EOF) { \
|
|
1459 return Xmatch(base, node->x.next, s, o + 1u); \
|
|
1460 } \
|
|
1461 return NO_MATCH; \
|
|
1462 \
|
|
1463 case RE_AT_BOL: \
|
|
1464 if (o == 0 || Xindex(s, o - 1u) == '\n') { \
|
|
1465 return Xmatch(base, node->x.next, s, o); \
|
|
1466 } \
|
|
1467 return NO_MATCH; \
|
|
1468 \
|
|
1469 case RE_AT_BOS: \
|
|
1470 if (o == 0) { \
|
|
1471 return Xmatch(base, node->x.next, s, o); \
|
|
1472 } \
|
|
1473 return NO_MATCH; \
|
|
1474 \
|
|
1475 case RE_AT_EOL: \
|
|
1476 if ( \
|
|
1477 Xindex(s, o) == '\n' || \
|
|
1478 ( \
|
|
1479 o && Xindex(s, o) == EOF && Xindex(s, o - 1u) != '\n' \
|
|
1480 ) \
|
|
1481 ) { \
|
|
1482 return Xmatch(base, node->x.next, s, o); \
|
|
1483 } \
|
|
1484 return NO_MATCH; \
|
|
1485 \
|
|
1486 case RE_AT_EOS: \
|
|
1487 if (Xindex(s, o) == EOF) { \
|
|
1488 return Xmatch(base, node->x.next, s, o); \
|
|
1489 } \
|
|
1490 return NO_MATCH; \
|
|
1491 \
|
|
1492 case RE_AT_MATCH: { \
|
|
1493 struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \
|
|
1494 tmp = Xmatch(base, node->indep.arg, s, o); \
|
|
1495 if (tmp != NO_MATCH) { \
|
|
1496 tmp = Xmatch(base, node->indep.next, s, o); \
|
|
1497 if (tmp == NO_MATCH) { \
|
|
1498 memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \
|
|
1499 } \
|
|
1500 } \
|
|
1501 xfree(prev); \
|
|
1502 return tmp; \
|
|
1503 } \
|
|
1504 \
|
|
1505 case RE_AT_NOMATCH: { \
|
|
1506 struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \
|
|
1507 tmp = Xmatch(base, node->indep.arg, s, o); \
|
|
1508 if (tmp == NO_MATCH) { \
|
|
1509 tmp = Xmatch(base, node->indep.next, s, o); \
|
|
1510 } else { \
|
|
1511 memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \
|
|
1512 tmp = NO_MATCH; \
|
|
1513 } \
|
|
1514 xfree(prev); \
|
|
1515 return tmp; \
|
|
1516 } \
|
|
1517 \
|
|
1518 case RE_AT_NWBOUND: \
|
|
1519 if (siword(Xindex(s, o - 1u)) == siword(Xindex(s, o))) { \
|
|
1520 return Xmatch(base, node->x.next, s, o); \
|
|
1521 } \
|
|
1522 return NO_MATCH; \
|
|
1523 \
|
|
1524 case RE_AT_WBOUND: \
|
|
1525 if (siword(Xindex(s, o - 1u)) != siword(Xindex(s, o))) { \
|
|
1526 return Xmatch(base, node->x.next, s, o); \
|
|
1527 } \
|
|
1528 return NO_MATCH; \
|
|
1529 \
|
|
1530 case RE_BACKCHECK: \
|
|
1531 if ( \
|
|
1532 node->backref.n < base->captures && \
|
|
1533 base->cap_ptr[node->backref.n].start + 1u && \
|
|
1534 base->cap_ptr[node->backref.n].end + 1u && \
|
|
1535 base->cap_ptr[node->backref.n].end >= base->cap_ptr[node->backref.n].start \
|
|
1536 ) { \
|
|
1537 return Xmatch(base, node->backref.next, s, o); \
|
|
1538 } \
|
|
1539 return NO_MATCH
|
|
1540
|
|
1541
|
|
1542 #define Mmatch1(Xmatch, Xindex) \
|
|
1543 case RE_BEGIN_CAPTURE: { \
|
|
1544 const size_t prev = base->cap_ptr[node->capture.n].pending; \
|
|
1545 base->cap_ptr[node->capture.n].pending = o; \
|
|
1546 if ((tmp = Xmatch(base, node->capture.next, s, o)) == NO_MATCH) { \
|
|
1547 base->cap_ptr[node->capture.n].pending = prev; \
|
|
1548 } \
|
|
1549 return tmp; \
|
|
1550 } \
|
|
1551 \
|
|
1552 case RE_CLASS: \
|
|
1553 if ( \
|
|
1554 Xindex(s, o) != EOF && \
|
|
1555 match_class(Xindex(s, o), node->class.vec) \
|
|
1556 ) { \
|
|
1557 return Xmatch(base, node->class.next, s, o + 1u); \
|
|
1558 } \
|
|
1559 return NO_MATCH; \
|
|
1560 \
|
|
1561 case RE_DEFCLASS: \
|
|
1562 if ( \
|
|
1563 Xindex(s, o) != EOF && \
|
|
1564 match_class(Xindex(s, o), node->defclass.vec) \
|
|
1565 ) { \
|
|
1566 return Xmatch(base, node->defclass.next, s, o + 1u); \
|
|
1567 } \
|
|
1568 return NO_MATCH; \
|
|
1569 \
|
|
1570 case RE_END_CAPTURE: { \
|
|
1571 const size_t prevbeg = base->cap_ptr[node->capture.n].start; \
|
|
1572 const size_t prevend = base->cap_ptr[node->capture.n].end; \
|
|
1573 if (base->cap_ptr[node->capture.n].pending + 1u == 0u) { \
|
|
1574 return NO_MATCH; \
|
|
1575 } \
|
|
1576 \
|
|
1577 base->cap_ptr[node->capture.n].end = o; \
|
|
1578 base->cap_ptr[node->capture.n].start = base->cap_ptr[node->capture.n].pending; \
|
|
1579 if ((tmp = Xmatch(base, node->capture.next, s, o)) == NO_MATCH) { \
|
|
1580 base->cap_ptr[node->capture.n].end = prevend; \
|
|
1581 base->cap_ptr[node->capture.n].start = prevbeg; \
|
|
1582 } \
|
|
1583 return tmp; \
|
|
1584 } \
|
|
1585 \
|
|
1586 case RE_INDEP: { \
|
|
1587 struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \
|
|
1588 if ((tmp = Xmatch(base, node->indep.arg, s, o)) != NO_MATCH) { \
|
|
1589 tmp = Xmatch(base, node->indep.next, s, tmp); \
|
|
1590 if (tmp == NO_MATCH) { \
|
|
1591 memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \
|
|
1592 } \
|
|
1593 } \
|
|
1594 xfree(prev); \
|
|
1595 return tmp; \
|
|
1596 } \
|
|
1597 \
|
|
1598 case RE_N_CLASS: \
|
|
1599 if ( \
|
|
1600 Xindex(s, o) != EOF && \
|
|
1601 !match_class(Xindex(s, o), node->class.vec) \
|
|
1602 ) { \
|
|
1603 return Xmatch(base, node->class.next, s, o + 1u); \
|
|
1604 } \
|
|
1605 return NO_MATCH; \
|
|
1606 \
|
|
1607 case RE_N_DEFCLASS: \
|
|
1608 if ( \
|
|
1609 Xindex(s, o) != EOF && \
|
|
1610 !match_class(Xindex(s, o), node->defclass.vec) \
|
|
1611 ) { \
|
|
1612 return Xmatch(base, node->defclass.next, s, o + 1u); \
|
|
1613 } \
|
|
1614 return NO_MATCH; \
|
|
1615 \
|
|
1616 case RE_PARTIAL: { \
|
|
1617 size_t n; \
|
|
1618 for (n = 0; n < node->literal.len; ++n) { \
|
|
1619 if (node->literal.buf[n] != Xindex(s, o + n)) { \
|
|
1620 break; \
|
|
1621 } \
|
|
1622 } \
|
|
1623 for (; n; --n) { \
|
|
1624 if ((tmp = Xmatch(base, node->literal.next, s, o + n)) != NO_MATCH) { \
|
|
1625 return tmp; \
|
|
1626 } \
|
|
1627 } \
|
|
1628 return Xmatch(base, node->literal.next, s, o); \
|
|
1629 } \
|
|
1630 \
|
|
1631 case RE_SELECT: { \
|
|
1632 struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \
|
|
1633 if ((tmp = Xmatch(base, node->select.cond, s, o)) != NO_MATCH) { \
|
|
1634 tmp = Xmatch(base, node->select.arg, s, tmp); \
|
|
1635 if (tmp == NO_MATCH) { \
|
|
1636 memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \
|
|
1637 } \
|
|
1638 } else { \
|
|
1639 tmp = Xmatch(base, node->select.next, s, o); \
|
|
1640 } \
|
|
1641 xfree(prev); \
|
|
1642 return tmp; \
|
|
1643 } \
|
|
1644 \
|
|
1645 case RE_REP_FRUGAL: \
|
|
1646 if (base->repbuf[node->rep.n] >= node->rep.max) { \
|
|
1647 const size_t n = base->repbuf[node->rep.n]; \
|
|
1648 \
|
|
1649 base->repbuf[node->rep.n] = 0; \
|
|
1650 tmp = Xmatch(base, node->rep.next, s, o); \
|
|
1651 base->repbuf[node->rep.n] = n; \
|
|
1652 return tmp; \
|
|
1653 } else if (base->repbuf[node->rep.n] >= node->rep.min) { \
|
|
1654 const size_t n = base->repbuf[node->rep.n]; \
|
|
1655 \
|
|
1656 base->repbuf[node->rep.n] = 0; \
|
|
1657 if ((tmp = Xmatch(base, node->rep.next, s, o)) != NO_MATCH) { \
|
|
1658 base->repbuf[node->rep.n] = n; \
|
|
1659 return tmp; \
|
|
1660 } \
|
|
1661 base->repbuf[node->rep.n] = n + 1u; \
|
|
1662 tmp = Xmatch(base, node->rep.arg, s, o); \
|
|
1663 --base->repbuf[node->rep.n]; \
|
|
1664 return tmp; \
|
|
1665 } else { \
|
|
1666 ++base->repbuf[node->rep.n]; \
|
|
1667 tmp = Xmatch(base, node->rep.arg, s, o); \
|
|
1668 --base->repbuf[node->rep.n]; \
|
|
1669 return tmp; \
|
|
1670 } \
|
|
1671 \
|
|
1672 case RE_REP_GREEDY: \
|
|
1673 if (base->repbuf[node->rep.n] >= node->rep.max) { \
|
|
1674 const size_t n = base->repbuf[node->rep.n]; \
|
|
1675 \
|
|
1676 base->repbuf[node->rep.n] = 0; \
|
|
1677 tmp = Xmatch(base, node->rep.next, s, o); \
|
|
1678 base->repbuf[node->rep.n] = n; \
|
|
1679 return tmp; \
|
|
1680 } else if (base->repbuf[node->rep.n] >= node->rep.min) { \
|
|
1681 const size_t n = base->repbuf[node->rep.n]; \
|
|
1682 \
|
|
1683 ++base->repbuf[node->rep.n]; \
|
|
1684 if ((tmp = Xmatch(base, node->rep.arg, s, o)) != NO_MATCH) { \
|
|
1685 --base->repbuf[node->rep.n]; \
|
|
1686 return tmp; \
|
|
1687 } \
|
|
1688 base->repbuf[node->rep.n] = 0; \
|
|
1689 tmp = Xmatch(base, node->rep.next, s, o); \
|
|
1690 base->repbuf[node->rep.n] = n; \
|
|
1691 return tmp; \
|
|
1692 } else { \
|
|
1693 ++base->repbuf[node->rep.n]; \
|
|
1694 tmp = Xmatch(base, node->rep.arg, s, o); \
|
|
1695 --base->repbuf[node->rep.n]; \
|
|
1696 return tmp; \
|
|
1697 } \
|
|
1698 \
|
|
1699 case RE_XREP_FRUGAL: { \
|
|
1700 size_t n; \
|
|
1701 \
|
|
1702 for (n = 0; n < node->rep.min; ++n) { \
|
|
1703 if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \
|
|
1704 return NO_MATCH; \
|
|
1705 } \
|
|
1706 o = tmp; \
|
|
1707 } \
|
|
1708 \
|
|
1709 for (; n < node->rep.max; ++n) { \
|
|
1710 if ((tmp = Xmatch(base, node->rep.next, s, o)) != NO_MATCH) { \
|
|
1711 return tmp; \
|
|
1712 } \
|
|
1713 if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \
|
|
1714 return NO_MATCH; \
|
|
1715 } \
|
|
1716 o = tmp; \
|
|
1717 } \
|
|
1718 return Xmatch(base, node->rep.next, s, o); \
|
|
1719 } \
|
|
1720 \
|
|
1721 case RE_XREP_GREEDY: { \
|
|
1722 size_t n; \
|
|
1723 \
|
|
1724 for (n = 0; n < node->rep.min; ++n) { \
|
|
1725 if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \
|
|
1726 return NO_MATCH; \
|
|
1727 } \
|
|
1728 o = tmp; \
|
|
1729 } \
|
|
1730 \
|
|
1731 for (; n < node->rep.max; ++n) { \
|
|
1732 if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \
|
|
1733 break; \
|
|
1734 } \
|
|
1735 o = tmp; \
|
|
1736 } \
|
|
1737 \
|
|
1738 for (; n > node->rep.min; --n) { \
|
|
1739 if ((tmp = Xmatch(base, node->rep.next, s, o)) != NO_MATCH) { \
|
|
1740 return tmp; \
|
|
1741 } \
|
|
1742 o -= node->rep.n; \
|
|
1743 } \
|
|
1744 \
|
|
1745 return Xmatch(base, node->rep.next, s, o); \
|
|
1746 } \
|
|
1747 } \
|
|
1748 \
|
|
1749 NOTREACHED
|
|
1750
|
|
1751
|
|
1752 static size_t match(const t_regex *base, const re_node *node, const String *s, size_t o) {
|
|
1753
|
|
1754 Mmatch0(match, ST_INDEX);
|
|
1755
|
|
1756 case RE_BACKREF:
|
|
1757 if (
|
|
1758 node->backref.n < base->captures &&
|
|
1759 base->cap_ptr[node->backref.n].start + 1u &&
|
|
1760 base->cap_ptr[node->backref.n].end + 1u &&
|
|
1761 base->cap_ptr[node->backref.n].end >= base->cap_ptr[node->backref.n].start &&
|
|
1762 (
|
|
1763 base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start <= St_len(s) - o &&
|
|
1764 memcmp(
|
|
1765 St_ptr(s) + base->cap_ptr[node->backref.n].start,
|
|
1766 St_ptr(s) + o,
|
|
1767 base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start
|
|
1768 ) == 0
|
|
1769 )
|
|
1770 ) {
|
|
1771 return match(base, node->backref.next, s, o + base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start);
|
|
1772 }
|
|
1773 return NO_MATCH;
|
|
1774
|
|
1775
|
|
1776 case RE_LITERAL:
|
|
1777 if (
|
|
1778 node->literal.len == 0 ||
|
|
1779 (
|
|
1780 node->literal.len <= St_len(s) - o &&
|
|
1781 memcmp(node->literal.buf, St_ptr(s) + o, node->literal.len) == 0
|
|
1782 )
|
|
1783 ) {
|
|
1784 return match(base, node->literal.next, s, o + node->literal.len);
|
|
1785 }
|
|
1786 return NO_MATCH;
|
|
1787
|
|
1788 case RE_STUFF_FRUGAL:
|
|
1789 if (!node->x.next) {
|
|
1790 return o;
|
|
1791 }
|
|
1792
|
|
1793 if (node->x.next->x.type == RE_LITERAL && node->x.next->literal.len) {
|
|
1794 for (
|
|
1795 o = St_stro_m(s, o, node->x.next->literal.buf, node->x.next->literal.len);
|
|
1796 o + 1u;
|
|
1797 o = St_stro_m(s, o + 1u, node->x.next->literal.buf, node->x.next->literal.len)
|
|
1798 ) {
|
|
1799 if ((tmp = match(base, node->x.next->literal.next, s, o + node->x.next->literal.len)) != NO_MATCH) {
|
|
1800 return tmp;
|
|
1801 }
|
|
1802 }
|
|
1803 return NO_MATCH;
|
|
1804 }
|
|
1805
|
|
1806 for (; o <= St_len(s); ++o) {
|
|
1807 if ((tmp = match(base, node->x.next, s, o)) != NO_MATCH) {
|
|
1808 return tmp;
|
|
1809 }
|
|
1810 }
|
|
1811 return NO_MATCH;
|
|
1812
|
|
1813 case RE_STUFF_GREEDY: {
|
|
1814 size_t n;
|
|
1815
|
|
1816 if (!node->x.next) {
|
|
1817 return St_len(s);
|
|
1818 }
|
|
1819
|
|
1820 if (node->x.next->x.type == RE_LITERAL && node->x.next->literal.len) {
|
|
1821 for (
|
|
1822 n = St_rstr_m(s, node->x.next->literal.buf, node->x.next->literal.len);
|
|
1823 n > o && n + 1u;
|
|
1824 n = St_rstro_m(s, n - 1u, node->x.next->literal.buf, node->x.next->literal.len)
|
|
1825 ) {
|
|
1826 if ((tmp = match(base, node->x.next->literal.next, s, n + node->x.next->literal.len)) != NO_MATCH) {
|
|
1827 return tmp;
|
|
1828 }
|
|
1829 if (!n) {
|
|
1830 break;
|
|
1831 }
|
|
1832 }
|
|
1833 if (n == o) {
|
|
1834 return match(base, node->x.next->literal.next, s, n + node->x.next->literal.len);
|
|
1835 }
|
|
1836 return NO_MATCH;
|
|
1837 }
|
|
1838
|
|
1839 for (n = St_len(s); n > o; --n) {
|
|
1840 if ((tmp = match(base, node->x.next, s, n)) != NO_MATCH) {
|
|
1841 return tmp;
|
|
1842 }
|
|
1843 }
|
|
1844 return match(base, node->x.next, s, o);
|
|
1845 }
|
|
1846
|
|
1847 Mmatch1(match, ST_INDEX);
|
|
1848 }
|
|
1849
|
|
1850 static size_t iomatch(const t_regex *base, const re_node *node, IO *s, size_t o) {
|
|
1851
|
|
1852 Mmatch0(iomatch, io_peek);
|
|
1853
|
|
1854 case RE_BACKREF:
|
|
1855 if (
|
|
1856 node->backref.n < base->captures &&
|
|
1857 base->cap_ptr[node->backref.n].start + 1u &&
|
|
1858 base->cap_ptr[node->backref.n].end + 1u &&
|
|
1859 base->cap_ptr[node->backref.n].end >= base->cap_ptr[node->backref.n].start &&
|
|
1860 (
|
|
1861 io_xcmp(
|
|
1862 s,
|
|
1863 base->cap_ptr[node->backref.n].start,
|
|
1864 o,
|
|
1865 base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start
|
|
1866 ) == 0
|
|
1867 )
|
|
1868 ) {
|
|
1869 return iomatch(base, node->backref.next, s, o + base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start);
|
|
1870 }
|
|
1871 return NO_MATCH;
|
|
1872
|
|
1873
|
|
1874 case RE_LITERAL:
|
|
1875 if (
|
|
1876 node->literal.len == 0 ||
|
|
1877 (
|
|
1878 io_cmppeek(s, o, node->literal.buf, node->literal.len) == 0
|
|
1879 )
|
|
1880 ) {
|
|
1881 return iomatch(base, node->literal.next, s, o + node->literal.len);
|
|
1882 }
|
|
1883 return NO_MATCH;
|
|
1884
|
|
1885 case RE_STUFF_FRUGAL:
|
|
1886 if (!node->x.next) {
|
|
1887 return o;
|
|
1888 }
|
|
1889
|
|
1890 for (; io_peek(s, o) != EOF; ++o) {
|
|
1891 if ((tmp = iomatch(base, node->x.next, s, o)) != NO_MATCH) {
|
|
1892 return tmp;
|
|
1893 }
|
|
1894 }
|
|
1895 return iomatch(base, node->x.next, s, o);
|
|
1896
|
|
1897 case RE_STUFF_GREEDY: {
|
|
1898 size_t n;
|
|
1899
|
|
1900 for (n = o; io_peek(s, n) != EOF; ++n)
|
|
1901 ;
|
|
1902
|
|
1903 if (!node->x.next) {
|
|
1904 return n;
|
|
1905 }
|
|
1906
|
|
1907 for (; n > o; --n) {
|
|
1908 if ((tmp = iomatch(base, node->x.next, s, n)) != NO_MATCH) {
|
|
1909 return tmp;
|
|
1910 }
|
|
1911 }
|
|
1912 return iomatch(base, node->x.next, s, o);
|
|
1913 }
|
|
1914
|
|
1915 Mmatch1(iomatch, io_peek);
|
|
1916 }
|
|
1917
|
|
1918 #define DEFRBUF ((void)0)
|
|
1919 #define MRETURN(x) return (x)
|
|
1920 #define INITRBUF(re) memset((re)->repbuf, '\0', (re)->repets * sizeof *(re)->repbuf)
|
|
1921
|
|
1922 int re_iomatch(t_regex *re, IO *io, size_t *ms, size_t *me) {
|
|
1923 size_t i;
|
|
1924 DEFRBUF;
|
|
1925 INITRBUF(re);
|
|
1926
|
|
1927 assert(re != NULL);
|
|
1928 assert(io != NULL);
|
|
1929 assert(io_bufred(io));
|
|
1930 assert(ms != NULL);
|
|
1931 assert(me != NULL);
|
|
1932
|
|
1933 reinit(re);
|
|
1934
|
|
1935 if (re->flags & FL_ANCHOR_START) {
|
|
1936 if ((*me = iomatch(re, re->start, io, 0)) == NO_MATCH) {
|
|
1937 MRETURN(0);
|
|
1938 }
|
|
1939 *ms = 0;
|
|
1940 MRETURN(1);
|
|
1941 }
|
|
1942
|
|
1943 for (i = 0; io_peek(io, i) != EOF; ++i) {
|
|
1944 if ((*me = iomatch(re, re->start, io, i)) != NO_MATCH) {
|
|
1945 *ms = i;
|
|
1946 MRETURN(1);
|
|
1947 }
|
|
1948 }
|
|
1949
|
|
1950 if ((*me = iomatch(re, re->start, io, i)) != NO_MATCH) {
|
|
1951 *ms = i;
|
|
1952 MRETURN(1);
|
|
1953 }
|
|
1954 MRETURN(0);
|
|
1955 }
|
|
1956
|
|
1957 int re_match(t_regex *re, const String *s, size_t *ms, size_t *me) {
|
|
1958 size_t i;
|
|
1959 DEFRBUF;
|
|
1960 INITRBUF(re);
|
|
1961
|
|
1962 assert(re != NULL);
|
|
1963 assert(s != NULL);
|
|
1964 assert(ms != NULL);
|
|
1965 assert(me != NULL);
|
|
1966
|
|
1967 if (re->minlen > St_len(s)) {
|
|
1968 MRETURN(0);
|
|
1969 }
|
|
1970
|
|
1971 reinit(re);
|
|
1972
|
|
1973 if (re->flags & FL_ANCHOR_START) {
|
|
1974 if ((*me = match(re, re->start, s, 0)) == NO_MATCH) {
|
|
1975 MRETURN(0);
|
|
1976 }
|
|
1977 *ms = 0;
|
|
1978 MRETURN(1);
|
|
1979 }
|
|
1980
|
|
1981 if (re->start->x.type == RE_LITERAL && re->start->literal.len) {
|
|
1982 for (
|
|
1983 i = St_str_m(s, re->start->literal.buf, re->start->literal.len);
|
|
1984 i + 1u;
|
|
1985 i = St_stro_m(s, i + 1u, re->start->literal.buf, re->start->literal.len)
|
|
1986 ) {
|
|
1987 if ((*me = match(re, re->start->literal.next, s, i + re->start->literal.len)) != NO_MATCH) {
|
|
1988 *ms = i;
|
|
1989 MRETURN(1);
|
|
1990 }
|
|
1991 }
|
|
1992 MRETURN(0);
|
|
1993 }
|
|
1994
|
|
1995 for (i = 0; i <= St_len(s) - re->minlen; ++i) {
|
|
1996 if ((*me = match(re, re->start, s, i)) != NO_MATCH) {
|
|
1997 *ms = i;
|
|
1998 MRETURN(1);
|
|
1999 }
|
|
2000 }
|
|
2001
|
|
2002 MRETURN(0);
|
|
2003 }
|
|
2004
|
|
2005 size_t re_cabra(const t_regex *re) {
|
|
2006 return re->captures;
|
|
2007 }
|
|
2008
|
|
2009 int re_backref(const t_regex *re, size_t i, size_t *a, size_t *z) {
|
|
2010 if (i >= re->captures) {
|
|
2011 return -1;
|
|
2012 }
|
|
2013 if (!(re->cap_ptr[i].start + 1u && re->cap_ptr[i].end + 1u)) {
|
|
2014 return 1;
|
|
2015 }
|
|
2016 *a = re->cap_ptr[i].start;
|
|
2017 *z = re->cap_ptr[i].end;
|
|
2018 return 0;
|
|
2019 }
|
|
2020
|
|
2021 ATTR_PURE
|
|
2022 static const re_node *branchend(const re_node *a, const re_node *b) {
|
|
2023 for (; a; a = a->x.next) {
|
|
2024 const re_node *q = b;
|
|
2025 for (; q; q = q->x.next) {
|
|
2026 if (a == q) {
|
|
2027 return q;
|
|
2028 }
|
|
2029 }
|
|
2030 }
|
|
2031 return NULL;
|
|
2032 }
|
|
2033
|
|
2034 static void decom_class(String *s, const unsigned char vec[HAVE_C99(static CLASS_SIZE)]) {
|
|
2035 unsigned char i;
|
|
2036 int flag = 0;
|
|
2037 char big_enough[100];
|
|
2038
|
|
2039 for (i = 0; i < UCHAR_MAX; ++i) {
|
|
2040 if (vec[i / CHAR_BIT] & 1u << i % CHAR_BIT) {
|
|
2041 if (vec[(i + 1u) / CHAR_BIT] & 1u << (i + 1u) % CHAR_BIT) {
|
|
2042 if (!flag) {
|
|
2043 flag = 1;
|
|
2044 goto mess;
|
|
2045 }
|
|
2046 flag = 1;
|
|
2047 } else {
|
|
2048 if (flag) {
|
|
2049 if (i > 1u && vec[(i - 2u) / CHAR_BIT] & 1u << (i - 2u) % CHAR_BIT) {
|
|
2050 St_cat_c(s, '-');
|
|
2051 }
|
|
2052 flag = 0;
|
|
2053 }
|
|
2054 mess:
|
|
2055 if (!isprint(i)) {
|
|
2056 St_cat_c(s, '\\');
|
|
2057 switch (i) {
|
|
2058 case '\a': St_cat_c(s, 'a'); break;
|
|
2059 case '\b': St_cat_c(s, 'b'); break;
|
|
2060 case '\f': St_cat_c(s, 'f'); break;
|
|
2061 case '\n': St_cat_c(s, 'n'); break;
|
|
2062 case '\r': St_cat_c(s, 'r'); break;
|
|
2063 case '\t': St_cat_c(s, 't'); break;
|
|
2064 case '\v': St_cat_c(s, 'v'); break;
|
|
2065 default:
|
|
2066 sprintf(big_enough, "%03o", (unsigned)i);
|
|
2067 St_cat_s(s, big_enough);
|
|
2068 break;
|
|
2069 }
|
|
2070 } else {
|
|
2071 if (strchr("\\\"", (char)i)) {
|
|
2072 St_cat_c(s, '\\');
|
|
2073 }
|
|
2074 St_cat_c(s, i);
|
|
2075 if (strchr("!'-", (char)i)) {
|
|
2076 St_cat_c(s, ESCAPE);
|
|
2077 }
|
|
2078 }
|
|
2079 }
|
|
2080 }
|
|
2081 }
|
|
2082 if (vec[i / CHAR_BIT] & 1u << i % CHAR_BIT) {
|
|
2083 if (flag) {
|
|
2084 if (vec[(i - 2u) / CHAR_BIT] & 1u << (i - 2u) % CHAR_BIT) {
|
|
2085 St_cat_c(s, '-');
|
|
2086 }
|
|
2087 flag = 0;
|
|
2088 }
|
|
2089 if (!isprint(i)) {
|
|
2090 St_cat_c(s, '\\');
|
|
2091 switch (i) {
|
|
2092 case '\a': St_cat_c(s, 'a'); break;
|
|
2093 case '\b': St_cat_c(s, 'b'); break;
|
|
2094 case '\f': St_cat_c(s, 'f'); break;
|
|
2095 case '\n': St_cat_c(s, 'n'); break;
|
|
2096 case '\r': St_cat_c(s, 'r'); break;
|
|
2097 case '\t': St_cat_c(s, 't'); break;
|
|
2098 case '\v': St_cat_c(s, 'v'); break;
|
|
2099 default:
|
|
2100 sprintf(big_enough, "%03o", (unsigned)i);
|
|
2101 St_cat_s(s, big_enough);
|
|
2102 break;
|
|
2103 }
|
|
2104 } else {
|
|
2105 if (strchr("\\\"", (char)i)) {
|
|
2106 St_cat_c(s, '\\');
|
|
2107 }
|
|
2108 St_cat_c(s, i);
|
|
2109 if (strchr("!'-", (char)i)) {
|
|
2110 St_cat_c(s, ESCAPE);
|
|
2111 }
|
|
2112 }
|
|
2113 }
|
|
2114 }
|
|
2115
|
|
2116 static void dumpclass(FILE *fp, const unsigned char vec[HAVE_C99(static CLASS_SIZE)]) {
|
|
2117 String tmp;
|
|
2118 St_init(&tmp);
|
|
2119 decom_class(&tmp, vec);
|
|
2120 ST_WRITE(&tmp, fp);
|
|
2121 St_clear(&tmp);
|
|
2122 }
|
|
2123
|
|
2124 ATTR_CONST
|
|
2125 static int cdefclass(const unsigned char *ptr) {
|
|
2126 if (ptr == dc_word) { return 'w'; }
|
|
2127 if (ptr == dc_alpha) { return 'q'; }
|
|
2128 if (ptr == dc_cntrl) { return 'c'; }
|
|
2129 if (ptr == dc_digit) { return 'd'; }
|
|
2130 if (ptr == dc_lower) { return 'l'; }
|
|
2131 if (ptr == dc_print) { return 'p'; }
|
|
2132 if (ptr == dc_space) { return 's'; }
|
|
2133 if (ptr == dc_upper) { return 'u'; }
|
|
2134 if (ptr == dc_xdigit) { return 'x'; }
|
|
2135 NOTREACHED;
|
|
2136 }
|
|
2137
|
|
2138 static void decom_liter(String *s, const re_node *node) {
|
|
2139 size_t i;
|
|
2140 assert(node->x.type == RE_LITERAL || node->x.type == RE_PARTIAL);
|
|
2141 for (i = 0; i < node->literal.len; ++i) {
|
|
2142 if (!isprint((unsigned char)node->literal.buf[i])) {
|
|
2143 St_cat_c(s, '\\');
|
|
2144 switch (node->literal.buf[i]) {
|
|
2145 case '\a': St_cat_c(s, 'a'); break;
|
|
2146 case '\b': St_cat_c(s, 'b'); break;
|
|
2147 case '\f': St_cat_c(s, 'f'); break;
|
|
2148 case '\n': St_cat_c(s, 'n'); break;
|
|
2149 case '\r': St_cat_c(s, 'r'); break;
|
|
2150 case '\t': St_cat_c(s, 't'); break;
|
|
2151 case '\v': St_cat_c(s, 'v'); break;
|
|
2152 default: {
|
|
2153 char big_enough[100];
|
|
2154 sprintf(big_enough, "%03o", (unsigned)(unsigned char)node->literal.buf[i]);
|
|
2155 St_cat_s(s, big_enough);
|
|
2156 break;
|
|
2157 }
|
|
2158 }
|
|
2159 } else {
|
|
2160 if (node->x.type == RE_LITERAL && strchr("\"\\", node->literal.buf[i])) {
|
|
2161 St_cat_c(s, '\\');
|
|
2162 }
|
|
2163 St_cat_c(s, node->literal.buf[i]);
|
|
2164 if (MEATCHARP(node->literal.buf[i])) {
|
|
2165 St_cat_c(s, ESCAPE);
|
|
2166 }
|
|
2167 }
|
|
2168 }
|
|
2169 }
|
|
2170
|
|
2171 static void dumpliter(FILE *fp, const re_node *node) {
|
|
2172 String tmp;
|
|
2173 St_init(&tmp);
|
|
2174 decom_liter(&tmp, node);
|
|
2175 ST_WRITE(&tmp, fp);
|
|
2176 St_clear(&tmp);
|
|
2177 }
|
|
2178
|
|
2179 static void do_indent(FILE *fp, size_t n) {
|
|
2180 for (; n; --n) {
|
|
2181 fputs(" ", fp);
|
|
2182 }
|
|
2183 }
|
|
2184
|
|
2185 static void dumpnode(FILE *fp, const re_node *node, const re_node *end, const size_t indent) {
|
|
2186 if (node == end) {
|
|
2187 return;
|
|
2188 }
|
|
2189
|
|
2190 assert(node != NULL);
|
|
2191 fprintf(fp, "[%p] ", (const void *)node);
|
|
2192 do_indent(fp, indent);
|
|
2193
|
|
2194 switch (node->x.type) {
|
|
2195 case RE_ALTER: {
|
|
2196 const re_node *const be = branchend(node->alter.arg, node->alter.next);
|
|
2197 fputs("branch {\n", fp);
|
|
2198 dumpnode(fp, node->alter.arg, be, indent + 1);
|
|
2199 fprintf(fp, "[%p] ", (const void *)node);
|
|
2200 do_indent(fp, indent);
|
|
2201 fputs("} or {\n", fp);
|
|
2202 dumpnode(fp, node->alter.next, be, indent + 1);
|
|
2203 fprintf(fp, "[%p] ", (const void *)node);
|
|
2204 do_indent(fp, indent);
|
|
2205 fputs("} /branch\n", fp);
|
|
2206 dumpnode(fp, be, end, indent);
|
|
2207 return;
|
|
2208 }
|
|
2209
|
|
2210 case RE_ANYCH:
|
|
2211 fputs("anychar\n", fp);
|
|
2212 break;
|
|
2213
|
|
2214 case RE_AT_BOL:
|
|
2215 fputs("at beginning of line\n", fp);
|
|
2216 break;
|
|
2217
|
|
2218 case RE_AT_BOS:
|
|
2219 fputs("at beginning of string\n", fp);
|
|
2220 break;
|
|
2221
|
|
2222 case RE_AT_EOL:
|
|
2223 fputs("at end of line\n", fp);
|
|
2224 break;
|
|
2225
|
|
2226 case RE_AT_EOS:
|
|
2227 fputs("at end of string\n", fp);
|
|
2228 break;
|
|
2229
|
|
2230 case RE_AT_MATCH:
|
|
2231 fputs("at match {\n", fp);
|
|
2232 dumpnode(fp, node->indep.arg, NULL, indent + 1);
|
|
2233 fprintf(fp, "[%p] ", (const void *)node);
|
|
2234 do_indent(fp, indent);
|
|
2235 fputs("} /at match\n", fp);
|
|
2236 break;
|
|
2237
|
|
2238 case RE_AT_NOMATCH:
|
|
2239 fputs("not at match {\n", fp);
|
|
2240 dumpnode(fp, node->indep.arg, NULL, indent + 1);
|
|
2241 fprintf(fp, "[%p] ", (const void *)node);
|
|
2242 do_indent(fp, indent);
|
|
2243 fputs("} /not at match\n", fp);
|
|
2244 break;
|
|
2245
|
|
2246 case RE_AT_NWBOUND:
|
|
2247 fputs("not at word boundary\n", fp);
|
|
2248 break;
|
|
2249
|
|
2250 case RE_AT_WBOUND:
|
|
2251 fputs("at word boundary\n", fp);
|
|
2252 break;
|
|
2253
|
|
2254 case RE_BACKCHECK:
|
|
2255 fprintf(fp, "check capture (%lu)\n", (unsigned long)node->backref.n);
|
|
2256 break;
|
|
2257
|
|
2258 case RE_BACKREF:
|
|
2259 fprintf(fp, "backref (%lu)\n", (unsigned long)node->backref.n);
|
|
2260 break;
|
|
2261
|
|
2262 case RE_BEGIN_CAPTURE:
|
|
2263 fprintf(fp, "begin capture (%lu)\n", (unsigned long)node->backref.n);
|
|
2264 break;
|
|
2265
|
|
2266 case RE_CLASS:
|
|
2267 fputs("character class '", fp);
|
|
2268 dumpclass(fp, node->class.vec);
|
|
2269 fputs("'\n", fp);
|
|
2270 break;
|
|
2271
|
|
2272 case RE_DEFCLASS:
|
|
2273 fprintf(fp, "character class %c!\n", cdefclass(node->defclass.vec));
|
|
2274 break;
|
|
2275
|
|
2276 case RE_END_CAPTURE:
|
|
2277 fprintf(fp, "end capture (%lu)\n", (unsigned long)node->backref.n);
|
|
2278 break;
|
|
2279
|
|
2280 case RE_INDEP:
|
|
2281 fputs("independent submatch {\n", fp);
|
|
2282 dumpnode(fp, node->indep.arg, NULL, indent + 1);
|
|
2283 fprintf(fp, "[%p] ", (const void *)node);
|
|
2284 do_indent(fp, indent);
|
|
2285 fputs("} /independent submatch\n", fp);
|
|
2286 break;
|
|
2287
|
|
2288 case RE_LITERAL:
|
|
2289 fputs("literal \"", fp);
|
|
2290 dumpliter(fp, node);
|
|
2291 fputs("\"\n", fp);
|
|
2292 break;
|
|
2293
|
|
2294 case RE_N_CLASS:
|
|
2295 fputs("negated character class '", fp);
|
|
2296 dumpclass(fp, node->class.vec);
|
|
2297 fputs("'\n", fp);
|
|
2298 break;
|
|
2299
|
|
2300 case RE_N_DEFCLASS:
|
|
2301 fprintf(fp, "character class %c!\n", toupper(cdefclass(node->defclass.vec)));
|
|
2302 break;
|
|
2303
|
|
2304 case RE_PARTIAL:
|
|
2305 fputs("abbreviation `", fp);
|
|
2306 dumpliter(fp, node);
|
|
2307 fputs("`\n", fp);
|
|
2308 break;
|
|
2309
|
|
2310 case RE_REP_FRUGAL:
|
|
2311 fputs("repetition ", fp);
|
|
2312 if (node->rep.min == node->rep.max) {
|
|
2313 fprintf(fp, ":%lu:?", (unsigned long)node->rep.min);
|
|
2314 } else if (node->rep.max != (size_t)-1) {
|
|
2315 fprintf(fp, ":%lu,%lu:?", (unsigned long)node->rep.min, (unsigned long)node->rep.max);
|
|
2316 } else {
|
|
2317 switch (node->rep.min) {
|
|
2318 case 0: fputs("*?", fp); break;
|
|
2319 case 1: fputs("+?", fp); break;
|
|
2320 default: fprintf(fp, ":%lu,:?", (unsigned long)node->rep.min); break;
|
|
2321 }
|
|
2322 }
|
|
2323 fputs(" {\n", fp);
|
|
2324 dumpnode(fp, node->rep.arg, node, indent + 1);
|
|
2325 fprintf(fp, "[%p] ", (const void *)node);
|
|
2326 do_indent(fp, indent);
|
|
2327 fputs("} /repetition\n", fp);
|
|
2328 break;
|
|
2329
|
|
2330 case RE_REP_GREEDY:
|
|
2331 fputs("repetition ", fp);
|
|
2332 if (node->rep.min == node->rep.max) {
|
|
2333 fprintf(fp, ":%lu:", (unsigned long)node->rep.min);
|
|
2334 } else if (node->rep.max != (size_t)-1) {
|
|
2335 fprintf(fp, ":%lu,%lu:", (unsigned long)node->rep.min, (unsigned long)node->rep.max);
|
|
2336 } else {
|
|
2337 switch (node->rep.min) {
|
|
2338 case 0: fputs("*", fp); break;
|
|
2339 case 1: fputs("+", fp); break;
|
|
2340 default: fprintf(fp, ":%lu,:", (unsigned long)node->rep.min); break;
|
|
2341 }
|
|
2342 }
|
|
2343 fputs(" {\n", fp);
|
|
2344 dumpnode(fp, node->rep.arg, node, indent + 1);
|
|
2345 fprintf(fp, "[%p] ", (const void *)node);
|
|
2346 do_indent(fp, indent);
|
|
2347 fputs("} /repetition\n", fp);
|
|
2348 break;
|
|
2349
|
|
2350 case RE_SELECT: {
|
|
2351 const re_node *const be = branchend(node->select.arg, node->select.next);
|
|
2352 fputs("if {\n", fp);
|
|
2353 dumpnode(fp, node->select.cond, NULL, indent + 1);
|
|
2354 fprintf(fp, "[%p] ", (const void *)node);
|
|
2355 do_indent(fp, indent);
|
|
2356 fputs("} then {\n", fp);
|
|
2357 dumpnode(fp, node->select.arg, be, indent + 1);
|
|
2358 if (node->select.next != be) {
|
|
2359 fprintf(fp, "[%p] ", (const void *)node);
|
|
2360 do_indent(fp, indent);
|
|
2361 fputs("} else {\n", fp);
|
|
2362 dumpnode(fp, node->select.next, be, indent + 1);
|
|
2363 }
|
|
2364 fprintf(fp, "[%p] ", (const void *)node);
|
|
2365 do_indent(fp, indent);
|
|
2366 fputs("} /if\n", fp);
|
|
2367 dumpnode(fp, be, end, indent);
|
|
2368 return;
|
|
2369 }
|
|
2370
|
|
2371 case RE_STUFF_FRUGAL:
|
|
2372 fputs(".*?\n", fp);
|
|
2373 break;
|
|
2374
|
|
2375 case RE_STUFF_GREEDY:
|
|
2376 fputs(".*\n", fp);
|
|
2377 break;
|
|
2378
|
|
2379 case RE_XREP_FRUGAL:
|
|
2380 fputs("simple repetition ", fp);
|
|
2381 if (node->rep.min == node->rep.max) {
|
|
2382 fprintf(fp, ":%lu:?", (unsigned long)node->rep.min);
|
|
2383 } else if (node->rep.max != (size_t)-1) {
|
|
2384 fprintf(fp, ":%lu,%lu:?", (unsigned long)node->rep.min, (unsigned long)node->rep.max);
|
|
2385 } else {
|
|
2386 switch (node->rep.min) {
|
|
2387 case 0: fputs("*?", fp); break;
|
|
2388 case 1: fputs("+?", fp); break;
|
|
2389 default: fprintf(fp, ":%lu,:?", (unsigned long)node->rep.min); break;
|
|
2390 }
|
|
2391 }
|
|
2392 fputs(" {\n", fp);
|
|
2393 dumpnode(fp, node->rep.arg, NULL, indent + 1);
|
|
2394 fprintf(fp, "[%p] ", (const void *)node);
|
|
2395 do_indent(fp, indent);
|
|
2396 fputs("} /simple repetition\n", fp);
|
|
2397 break;
|
|
2398
|
|
2399 case RE_XREP_GREEDY:
|
|
2400 fputs("simple repetition ", fp);
|
|
2401 if (node->rep.min == node->rep.max) {
|
|
2402 fprintf(fp, ":%lu:", (unsigned long)node->rep.min);
|
|
2403 } else if (node->rep.max != (size_t)-1) {
|
|
2404 fprintf(fp, ":%lu,%lu:", (unsigned long)node->rep.min, (unsigned long)node->rep.max);
|
|
2405 } else {
|
|
2406 switch (node->rep.min) {
|
|
2407 case 0: fputs("*", fp); break;
|
|
2408 case 1: fputs("+", fp); break;
|
|
2409 default: fprintf(fp, ":%lu,:", (unsigned long)node->rep.min); break;
|
|
2410 }
|
|
2411 }
|
|
2412 fputs(" {\n", fp);
|
|
2413 dumpnode(fp, node->rep.arg, NULL, indent + 1);
|
|
2414 fprintf(fp, "[%p] ", (const void *)node);
|
|
2415 do_indent(fp, indent);
|
|
2416 fputs("} /simple repetition\n", fp);
|
|
2417 break;
|
|
2418 }
|
|
2419 dumpnode(fp, node->x.next, end, indent);
|
|
2420 }
|
|
2421
|
|
2422 void re_dump(const t_regex *re, FILE *fp) {
|
|
2423 fprintf(fp, "RE: minlen=%lu, anchor=%s\n", (unsigned long)re->minlen, re->flags & FL_ANCHOR_START ? "start" : "none");
|
|
2424 dumpnode(fp, re->start, NULL, 0);
|
|
2425 }
|
|
2426
|
|
2427 static void cat_n(String *s, unsigned long n) {
|
|
2428 char big_enough[100];
|
|
2429 sprintf(big_enough, "%lu", n);
|
|
2430 St_cat_s(s, big_enough);
|
|
2431 }
|
|
2432
|
|
2433 static void decom_repeat(String *s, const re_node *node) {
|
|
2434 assert(
|
|
2435 node->x.type == RE_REP_FRUGAL ||
|
|
2436 node->x.type == RE_REP_GREEDY ||
|
|
2437 node->x.type == RE_XREP_FRUGAL ||
|
|
2438 node->x.type == RE_XREP_GREEDY
|
|
2439 );
|
|
2440
|
|
2441 if (node->rep.min == 0 && node->rep.max == (size_t)-1) {
|
|
2442 St_cat_c(s, '*');
|
|
2443 } else if (node->rep.min == 0 && node->rep.max == 1) {
|
|
2444 St_cat_c(s, '?');
|
|
2445 } else if (node->rep.min == 1 && node->rep.max == (size_t)-1) {
|
|
2446 St_cat_c(s, '+');
|
|
2447 } else if (node->rep.min == node->rep.max) {
|
|
2448 St_cat_c(s, ':');
|
|
2449 cat_n(s, node->rep.min);
|
|
2450 St_cat_c(s, ':');
|
|
2451 } else if (node->rep.max == (size_t)-1) {
|
|
2452 St_cat_c(s, ':');
|
|
2453 cat_n(s, node->rep.min);
|
|
2454 St_cat_m(s, ",:", 2);
|
|
2455 } else {
|
|
2456 St_cat_c(s, ':');
|
|
2457 cat_n(s, node->rep.min);
|
|
2458 St_cat_c(s, ',');
|
|
2459 cat_n(s, node->rep.max);
|
|
2460 St_cat_c(s, ':');
|
|
2461 }
|
|
2462
|
|
2463 if (node->rep.type == RE_REP_FRUGAL || node->rep.type == RE_XREP_FRUGAL) {
|
|
2464 St_cat_c(s, '?');
|
|
2465 }
|
|
2466 }
|
|
2467
|
|
2468 static void decom_node(String *s, const re_node *node, const re_node *end) {
|
|
2469 if (node == end) {
|
|
2470 return;
|
|
2471 }
|
|
2472
|
|
2473 assert(node != NULL);
|
|
2474 switch (node->x.type) {
|
|
2475 case RE_ALTER: {
|
|
2476 const re_node *const be = branchend(node->alter.arg, node->alter.next);
|
|
2477 St_cat_c(s, '(');
|
|
2478 decom_node(s, node->alter.arg, be);
|
|
2479 St_cat_c(s, '|');
|
|
2480 decom_node(s, node->alter.next, be);
|
|
2481 St_cat_c(s, ')');
|
|
2482 decom_node(s, be, end);
|
|
2483 break;
|
|
2484 }
|
|
2485
|
|
2486 case RE_ANYCH:
|
|
2487 St_cat_c(s, '.');
|
|
2488 decom_node(s, node->x.next, end);
|
|
2489 break;
|
|
2490
|
|
2491 case RE_AT_BOL:
|
|
2492 St_cat_m(s, "A!", 2);
|
|
2493 decom_node(s, node->x.next, end);
|
|
2494 break;
|
|
2495
|
|
2496 case RE_AT_BOS:
|
|
2497 St_cat_c(s, '^');
|
|
2498 decom_node(s, node->x.next, end);
|
|
2499 break;
|
|
2500
|
|
2501 case RE_AT_EOL:
|
|
2502 St_cat_m(s, "Z!", 2);
|
|
2503 decom_node(s, node->x.next, end);
|
|
2504 break;
|
|
2505
|
|
2506 case RE_AT_EOS:
|
|
2507 St_cat_c(s, '$');
|
|
2508 decom_node(s, node->x.next, end);
|
|
2509 break;
|
|
2510
|
|
2511 case RE_AT_MATCH:
|
|
2512 St_cat_c(s, '[');
|
|
2513 decom_node(s, node->indep.arg, NULL);
|
|
2514 St_cat_m(s, "&]", 2);
|
|
2515 decom_node(s, node->indep.next, end);
|
|
2516 break;
|
|
2517
|
|
2518 case RE_AT_NOMATCH:
|
|
2519 St_cat_c(s, '[');
|
|
2520 decom_node(s, node->indep.arg, NULL);
|
|
2521 St_cat_m(s, "^]", 2);
|
|
2522 decom_node(s, node->indep.next, end);
|
|
2523 break;
|
|
2524
|
|
2525 case RE_AT_NWBOUND:
|
|
2526 St_cat_m(s, "B!", 2);
|
|
2527 decom_node(s, node->x.next, end);
|
|
2528 break;
|
|
2529
|
|
2530 case RE_AT_WBOUND:
|
|
2531 St_cat_m(s, "b!", 2);
|
|
2532 decom_node(s, node->x.next, end);
|
|
2533 break;
|
|
2534
|
|
2535 case RE_BACKREF:
|
|
2536 case RE_BACKCHECK: {
|
|
2537 String tmp;
|
|
2538 if (node->x.type == RE_BACKCHECK) {
|
|
2539 St_cat_c(s, '[');
|
|
2540 }
|
|
2541 St_init(&tmp);
|
|
2542 #if HAVE_VSNPRINTF_P
|
|
2543 St_xprintf(&tmp, "%lu", (unsigned long)node->backref.n);
|
|
2544 #else
|
|
2545 St_num(&tmp, node->backref.n);
|
|
2546 #endif
|
|
2547 St_cat(s, &tmp);
|
|
2548 St_clear(&tmp);
|
|
2549 if (node->x.type == RE_BACKCHECK) {
|
|
2550 St_cat_c(s, ']');
|
|
2551 } else {
|
|
2552 St_cat_c(s, ESCAPE);
|
|
2553 }
|
|
2554 decom_node(s, node->backref.next, end);
|
|
2555 break;
|
|
2556 }
|
|
2557
|
|
2558 case RE_BEGIN_CAPTURE:
|
|
2559 St_cat_c(s, '{');
|
|
2560 decom_node(s, node->x.next, end);
|
|
2561 break;
|
|
2562
|
|
2563 case RE_CLASS:
|
|
2564 St_cat_c(s, '\'');
|
|
2565 decom_class(s, node->class.vec);
|
|
2566 St_cat_c(s, '\'');
|
|
2567 decom_node(s, node->class.next, end);
|
|
2568 break;
|
|
2569
|
|
2570 case RE_DEFCLASS:
|
|
2571 St_cat_c(s, cdefclass(node->defclass.vec));
|
|
2572 St_cat_c(s, ESCAPE);
|
|
2573 decom_node(s, node->defclass.next, end);
|
|
2574 break;
|
|
2575
|
|
2576 case RE_END_CAPTURE:
|
|
2577 St_cat_c(s, '}');
|
|
2578 decom_node(s, node->x.next, end);
|
|
2579 break;
|
|
2580
|
|
2581 case RE_INDEP:
|
|
2582 St_cat_c(s, '<');
|
|
2583 decom_node(s, node->indep.arg, NULL);
|
|
2584 St_cat_c(s, '>');
|
|
2585 decom_node(s, node->indep.next, end);
|
|
2586 break;
|
|
2587
|
|
2588 case RE_LITERAL:
|
|
2589 decom_liter(s, node);
|
|
2590 decom_node(s, node->literal.next, end);
|
|
2591 break;
|
|
2592
|
|
2593 case RE_N_CLASS:
|
|
2594 St_cat_m(s, "'^", 2);
|
|
2595 decom_class(s, node->class.vec);
|
|
2596 St_cat_c(s, '\'');
|
|
2597 decom_node(s, node->class.next, end);
|
|
2598 break;
|
|
2599
|
|
2600 case RE_N_DEFCLASS:
|
|
2601 St_cat_c(s, toupper(cdefclass(node->defclass.vec)));
|
|
2602 St_cat_c(s, ESCAPE);
|
|
2603 decom_node(s, node->defclass.next, end);
|
|
2604 break;
|
|
2605
|
|
2606 case RE_PARTIAL:
|
|
2607 St_cat_c(s, '`');
|
|
2608 decom_liter(s, node);
|
|
2609 St_cat_c(s, '`');
|
|
2610 break;
|
|
2611
|
|
2612 case RE_REP_FRUGAL:
|
|
2613 case RE_REP_GREEDY:
|
|
2614 St_cat_c(s, '(');
|
|
2615 decom_node(s, node->rep.arg, node);
|
|
2616 St_cat_c(s, ')');
|
|
2617 decom_repeat(s, node);
|
|
2618 decom_node(s, node->rep.next, end);
|
|
2619 break;
|
|
2620
|
|
2621 case RE_SELECT: {
|
|
2622 const re_node *const be = branchend(node->select.arg, node->select.next);
|
|
2623 St_cat_c(s, '[');
|
|
2624 if (be != node->select.next) {
|
|
2625 decom_node(s, node->select.next, be);
|
|
2626 St_cat_c(s, '|');
|
|
2627 }
|
|
2628 decom_node(s, node->select.arg, be);
|
|
2629 St_cat_c(s, '(');
|
|
2630 decom_node(s, node->select.cond, NULL);
|
|
2631 St_cat_c(s, ')');
|
|
2632 St_cat_m(s, "?]", 2);
|
|
2633 decom_node(s, be, end);
|
|
2634 break;
|
|
2635 }
|
|
2636
|
|
2637 case RE_STUFF_FRUGAL:
|
|
2638 St_cat_m(s, ".*?", 3);
|
|
2639 decom_node(s, node->x.next, end);
|
|
2640 break;
|
|
2641
|
|
2642 case RE_STUFF_GREEDY:
|
|
2643 St_cat_m(s, ".*", 2);
|
|
2644 decom_node(s, node->x.next, end);
|
|
2645 break;
|
|
2646
|
|
2647 case RE_XREP_FRUGAL:
|
|
2648 case RE_XREP_GREEDY:
|
|
2649 St_cat_c(s, '(');
|
|
2650 decom_node(s, node->rep.arg, NULL);
|
|
2651 St_cat_c(s, ')');
|
|
2652 decom_repeat(s, node);
|
|
2653 decom_node(s, node->rep.next, end);
|
|
2654 break;
|
|
2655 }
|
|
2656 }
|
|
2657
|
|
2658 void re_decompile(const t_regex *re, String *s) {
|
|
2659 St_zero(s);
|
|
2660 decom_node(s, re->start, NULL);
|
|
2661 }
|