Mercurial > repo
diff src/ploki/re.c @ 4223:ac0403686959
<oerjan> rm -rf src/ploki; mv ploki src
author | HackBot |
---|---|
date | Fri, 20 Dec 2013 22:18:50 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/ploki/re.c Fri Dec 20 22:18:50 2013 +0000 @@ -0,0 +1,2661 @@ +#include "config.h" +#include "IO.h" +#include "Str.h" +#include "hash.h" +#include "main_io.h" +#include "main_opt.h" +#include "re.h" +#include "xmalloc.h" +#include "zz.h" + +#include <ctype.h> +#include <limits.h> +#include <stddef.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <assert.h> + +enum {CLASS_SIZE = UCHAR_MAX / CHAR_BIT + 1u}; + +static unsigned char dc_word[CLASS_SIZE]; +static unsigned char dc_alpha[CLASS_SIZE]; +static unsigned char dc_cntrl[CLASS_SIZE]; +static unsigned char dc_digit[CLASS_SIZE]; +static unsigned char dc_lower[CLASS_SIZE]; +static unsigned char dc_print[CLASS_SIZE]; +static unsigned char dc_space[CLASS_SIZE]; +static unsigned char dc_upper[CLASS_SIZE]; +static unsigned char dc_xdigit[CLASS_SIZE]; + +static void fill(unsigned char *v, int (*pred)(int)) { + unsigned char c; + + for (c = UCHAR_MAX; c; --c) { + v[c / CHAR_BIT] |= !!pred(c) << c % CHAR_BIT; + } + v[0] |= !!pred(0); +} + +static void nfill(unsigned char *v, int (*pred)(int)) { + unsigned char c; + + for (c = UCHAR_MAX; c; --c) { + v[c / CHAR_BIT] |= !pred(c) << c % CHAR_BIT; + } + v[0] |= !pred(0); +} + +ATTR_CONST +static int siword(int c) { + return c == '_' || isalnum(c); +} + +enum {MAGIC = 23}; + +static Hash rcache; +static String cached[MAGIC]; +static size_t cachfill, cachlast; + +static size_t hash(const void *s, size_t seed) { + return St_hash(s, seed); +} + +static int compar(const void *a, const void *b) { + return St_cmp(a, b); +} + +enum t_re_node { + RE_ALTER, + RE_ANYCH, + RE_AT_BOL, + RE_AT_BOS, + RE_AT_EOL, + RE_AT_EOS, + RE_AT_MATCH, + RE_AT_NOMATCH, + RE_AT_NWBOUND, + RE_AT_WBOUND, + RE_BACKCHECK, + RE_BACKREF, + RE_BEGIN_CAPTURE, + RE_CLASS, + RE_DEFCLASS, + RE_END_CAPTURE, + RE_INDEP, + RE_LITERAL, + RE_N_CLASS, + RE_N_DEFCLASS, + RE_PARTIAL, + RE_REP_FRUGAL, + RE_REP_GREEDY, + RE_SELECT, + RE_STUFF_FRUGAL, + RE_STUFF_GREEDY, + RE_XREP_FRUGAL, + RE_XREP_GREEDY +}; + +#define CMN_HDR enum t_re_node type; union my_node *next + +typedef union my_node { + struct { + CMN_HDR; + } x; + struct { + CMN_HDR; + union my_node *arg; + } alter; + struct { + CMN_HDR; + size_t n; + } backref; + struct { + CMN_HDR; + size_t n; + } capture; + struct { + CMN_HDR; + unsigned char vec[CLASS_SIZE]; + } class; + struct { + CMN_HDR; + unsigned char *vec; + } defclass; + struct { + CMN_HDR; + union my_node *arg; + } indep; + struct { + CMN_HDR; + unsigned char *buf; + size_t len; + } literal; + struct { + CMN_HDR; + union my_node *arg; + size_t min, max; + size_t n; + } rep; + struct { + CMN_HDR; + union my_node *arg; + union my_node *cond; + } select; +} re_node; + +#include "re_block.c.h" + +enum re_flags { + FL_NONE = 0, + FL_ANCHOR_START = 1 +}; + +struct cap_state { + size_t start, end, pending; +}; + +struct my_regex { + re_node *start; + size_t repets; + size_t *repbuf; + t_block alloc; + size_t captures; + struct cap_state *cap_ptr; + size_t refs; + enum re_flags flags; + size_t minlen; +}; + +static void delv(void *v) { + re_free(v); +} + +void re_init(void) { + fill(dc_word, siword); + fill(dc_alpha, isalpha); + fill(dc_cntrl, iscntrl); + fill(dc_digit, isdigit); + fill(dc_lower, islower); + fill(dc_print, isprint); + fill(dc_space, isspace); + fill(dc_upper, isupper); + fill(dc_xdigit, isxdigit); + + h_init(&rcache, hash, compar, NULL, delv); +} + +void re_end(void) { + h_end(&rcache); + while (cachfill) { + --cachfill; + St_clear(&cached[cachfill]); + } + cachlast = 0; +} + + +enum {ESCAPE = '!'}; +#define MEATCHARP(c) strchr("!|({<[]>})*+?:.'^$`", c) +#define ENDBRANCHP(c) strchr("|({<[", c) + +struct parse_context { + t_regex *base; + t_block *block; + const String *s; + size_t *p; +}; + +static void skipspace(const String *const s, size_t *const p) { +restart: + while (isspace(ST_INDEX(s, *p))) { + --*p; + } + if (ST_INDEX(s, *p) == ']' && ST_INDEX(s, *p - 1u) == '#') { + *p -= 2u; + while (ST_INDEX(s, *p) != EOF) { + if (ST_INDEX(s, *p) == '[') { + --*p; + goto restart; + } + if (ST_INDEX(s, *p) == ESCAPE && ST_INDEX(s, *p - 1u) != EOF) { + --*p; + } + --*p; + } + } +} + +#define PRAZ(name) static re_node *name(const struct parse_context *con, re_node *next, re_node *curolan_zebeth) +#define SNORK const String *const s = con->s; size_t *const p = con->p +#define CALL(name, nextn, glob) name(con, nextn, glob) +#define TAILC(name) return CALL(name, next, curolan_zebeth) + +PRAZ(quantifire); +PRAZ(branch); +PRAZ(alternation); + +PRAZ(atom) { + SNORK; + (void)curolan_zebeth; + + skipspace(s, p); + switch (ST_INDEX(s, *p)) { + case '(': + case '<': + case '{': + case '[': + case EOF: { + re_node *cur = bl_node(con->block); + cur->literal.type = RE_LITERAL; + cur->literal.next = next; + cur->literal.buf = NULL; + cur->literal.len = 0; + return cur; + } + + case ESCAPE: { + re_node *cur = bl_node(con->block); + --*p; + if (isdigit(ST_INDEX(s, *p))) { + do { + --*p; + } while (isdigit(ST_INDEX(s, *p))); + cur->backref.type = RE_BACKREF; + cur->backref.next = next; + cur->backref.n = strtoul(St_ptr(s) + *p + 1, NULL, 10); + } else if (isalpha(ST_INDEX(s, *p))) { + switch (ST_INDEX(s, *p)) { + case 'A': + cur->x.type = RE_AT_BOL; + cur->x.next = next; + --*p; + break; + case 'Z': + cur->x.type = RE_AT_EOL; + cur->x.next = next; + --*p; + break; + + case 'b': + cur->x.type = RE_AT_WBOUND; + cur->x.next = next; + --*p; + break; + case 'B': + cur->x.type = RE_AT_NWBOUND; + cur->x.next = next; + --*p; + break; + + case 'c': + cur->defclass.type = RE_DEFCLASS; + cur->defclass.vec = dc_cntrl; + cur->defclass.next = next; + --*p; + break; + case 'C': + cur->defclass.type = RE_N_DEFCLASS; + cur->defclass.vec = dc_cntrl; + cur->defclass.next = next; + --*p; + break; + + case 'd': + cur->defclass.type = RE_DEFCLASS; + cur->defclass.vec = dc_digit; + cur->defclass.next = next; + --*p; + break; + case 'D': + cur->defclass.type = RE_N_DEFCLASS; + cur->defclass.vec = dc_digit; + cur->defclass.next = next; + --*p; + break; + + case 'l': + cur->defclass.type = RE_DEFCLASS; + cur->defclass.vec = dc_lower; + cur->defclass.next = next; + --*p; + break; + case 'L': + cur->defclass.type = RE_N_DEFCLASS; + cur->defclass.vec = dc_lower; + cur->defclass.next = next; + --*p; + break; + + case 'p': + cur->defclass.type = RE_DEFCLASS; + cur->defclass.vec = dc_print; + cur->defclass.next = next; + --*p; + break; + case 'P': + cur->defclass.type = RE_N_DEFCLASS; + cur->defclass.vec = dc_print; + cur->defclass.next = next; + --*p; + break; + + case 'q': + cur->defclass.type = RE_DEFCLASS; + cur->defclass.vec = dc_alpha; + cur->defclass.next = next; + --*p; + break; + case 'Q': + cur->defclass.type = RE_N_DEFCLASS; + cur->defclass.vec = dc_alpha; + cur->defclass.next = next; + --*p; + break; + + case 's': + cur->defclass.type = RE_DEFCLASS; + cur->defclass.vec = dc_space; + cur->defclass.next = next; + --*p; + break; + case 'S': + cur->defclass.type = RE_N_DEFCLASS; + cur->defclass.vec = dc_space; + cur->defclass.next = next; + --*p; + break; + + case 'u': + cur->defclass.type = RE_DEFCLASS; + cur->defclass.vec = dc_upper; + cur->defclass.next = next; + --*p; + break; + case 'U': + cur->defclass.type = RE_N_DEFCLASS; + cur->defclass.vec = dc_upper; + cur->defclass.next = next; + --*p; + break; + + case 'w': + cur->defclass.type = RE_DEFCLASS; + cur->defclass.vec = dc_word; + cur->defclass.next = next; + --*p; + break; + case 'W': + cur->defclass.type = RE_N_DEFCLASS; + cur->defclass.vec = dc_word; + cur->defclass.next = next; + --*p; + break; + + case 'x': + cur->defclass.type = RE_DEFCLASS; + cur->defclass.vec = dc_xdigit; + cur->defclass.next = next; + --*p; + break; + case 'X': + cur->defclass.type = RE_N_DEFCLASS; + cur->defclass.vec = dc_xdigit; + cur->defclass.next = next; + --*p; + break; + + default: + goto default_escape; + } + + } else default_escape: { + cur->literal.type = RE_LITERAL; + cur->literal.next = next; + cur->literal.buf = bl_string(con->block, cur->literal.len = 1); + if (ST_INDEX(s, *p) == EOF) { + cur->literal.buf[0] = ESCAPE; + } else { + cur->literal.buf[0] = ST_INDEX(s, *p); + --*p; + } + } + return cur; + } + + case ')': { + re_node *cur; + --*p; + cur = CALL(alternation, next, next); + skipspace(s, p); + if (ST_INDEX(s, *p) == '(') { + --*p; + } + return cur; + } + + case ']': { + re_node *cur; + switch (ST_INDEX(s, *p - 1u)) { + case '&': + *p -= 2u; + cur = bl_node(con->block); + cur->indep.type = RE_AT_MATCH; + cur->indep.next = next; + cur->indep.arg = CALL(alternation, NULL, NULL); + break; + + case '^': + *p -= 2u; + cur = bl_node(con->block); + cur->indep.type = RE_AT_NOMATCH; + cur->indep.next = next; + cur->indep.arg = CALL(alternation, NULL, NULL); + break; + + case '?': + *p -= 2u; + cur = bl_node(con->block); + cur->select.type = RE_SELECT; + cur->select.cond = CALL(quantifire, NULL, NULL); + cur->select.arg = CALL(branch, next, next); + skipspace(s, p); + if (ST_INDEX(s, *p) == '|') { + --*p; + cur->select.next = CALL(alternation, next, next); + } else { + cur->select.next = next; + } + break; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': { + size_t i; + + for (i = *p - 2u; isdigit(ST_INDEX(s, i)); --i) + ; + if (ST_INDEX(s, i) != '[') { + goto std_atom; + } + cur = bl_node(con->block); + cur->backref.type = RE_BACKCHECK; + cur->backref.next = next; + cur->backref.n = strtoul(St_ptr(s) + i + 1u, NULL, 10); + *p = i - 1u; + + return cur; + } + + default: + goto std_atom; + } + + skipspace(s, p); + if (ST_INDEX(s, *p) == '[') { + --*p; + } + return cur; + } + + case '>': { + re_node *cur = bl_node(con->block); + --*p; + cur->indep.type = RE_INDEP; + cur->indep.next = next; + cur->indep.arg = CALL(alternation, NULL, NULL); + skipspace(s, p); + if (ST_INDEX(s, *p) == '<') { + --*p; + } + return cur; + } + + case '}': { + re_node *begin = bl_node(con->block); + re_node *end = bl_node(con->block); + --*p; + begin->capture.n = end->capture.n = con->base->captures++; + end->capture.type = RE_END_CAPTURE; + end->capture.next = next; + begin->capture.type = RE_BEGIN_CAPTURE; + begin->capture.next = CALL(alternation, end, end); + skipspace(s, p); + if (ST_INDEX(s, *p) == '{') { + --*p; + } + return begin; + } + + case '^': { + re_node *cur = bl_node(con->block); + cur->x.type = RE_AT_BOS; + cur->x.next = next; + --*p; + return cur; + } + + case '$': { + re_node *cur = bl_node(con->block); + cur->x.type = RE_AT_EOS; + cur->x.next = next; + --*p; + return cur; + } + + case '.': { + re_node *cur = bl_node(con->block); + cur->x.type = RE_ANYCH; + cur->x.next = next; + --*p; + return cur; + } + + case '`': { + String tmp; + int c; + re_node *cur = bl_node(con->block); + cur->literal.type = RE_PARTIAL; + cur->literal.next = next; + + --*p; + St_init(&tmp); + while ( + (c = ST_INDEX(s, *p)) != EOF && + c != '`' + ) { + --*p; + if (c == ESCAPE && ST_INDEX(s, *p) != EOF) { + c = ST_INDEX(s, *p); + --*p; + } + St_cat_c(&tmp, c); + } + if (c == '`') { + --*p; + } + St_reverse(&tmp); + + cur->literal.buf = bl_string(con->block, cur->literal.len = St_len(&tmp)); + memcpy(cur->literal.buf, St_ptr(&tmp), St_len(&tmp)); + St_clear(&tmp); + + return cur; + } + + case '\'': { + int c; + re_node *cur = bl_node(con->block); + cur->class.type = RE_CLASS; + cur->class.next = next; + memset(cur->class.vec, '\0', sizeof cur->class.vec); + for (--*p; (c = ST_INDEX(s, *p)) != EOF && c != '\''; --*p) { + if (c == ESCAPE) { + switch (c = ST_INDEX(s, *p - 1)) { + case EOF: + cur->class.vec[ESCAPE / CHAR_BIT] |= 1u << ESCAPE % CHAR_BIT; + break; + + case 'c': + fill(cur->class.vec, iscntrl); + --*p; + break; + case 'C': + nfill(cur->class.vec, iscntrl); + --*p; + break; + + case 'd': + fill(cur->class.vec, isdigit); + --*p; + break; + case 'D': + nfill(cur->class.vec, isdigit); + --*p; + break; + + case 'l': + fill(cur->class.vec, islower); + --*p; + break; + case 'L': + nfill(cur->class.vec, islower); + --*p; + break; + + case 'p': + fill(cur->class.vec, isprint); + --*p; + break; + case 'P': + nfill(cur->class.vec, isprint); + --*p; + break; + + case 'q': + fill(cur->class.vec, isalpha); + --*p; + break; + case 'Q': + nfill(cur->class.vec, isalpha); + --*p; + break; + + case 's': + fill(cur->class.vec, isspace); + --*p; + break; + case 'S': + nfill(cur->class.vec, isspace); + --*p; + break; + + case 'u': + fill(cur->class.vec, isupper); + --*p; + break; + case 'U': + nfill(cur->class.vec, isupper); + --*p; + break; + + case 'w': + fill(cur->class.vec, siword); + --*p; + break; + case 'W': + nfill(cur->class.vec, siword); + --*p; + break; + + case 'x': + fill(cur->class.vec, isxdigit); + --*p; + break; + case 'X': + nfill(cur->class.vec, isxdigit); + --*p; + break; + + default: + cur->class.vec[c / CHAR_BIT] |= 1u << c % CHAR_BIT; + --*p; + break; + } + } else { + int b; + if (ST_INDEX(s, *p - 1) == '-' && (b = ST_INDEX(s, *p - 2)) != EOF) { + if (b == ESCAPE && (b = ST_INDEX(s, *p - 3)) == EOF) { + b = ESCAPE; + } + if (b > c) { + goto normal_char; + } + for (; b < c; ++b) { + cur->class.vec[b / CHAR_BIT] |= 1u << b % CHAR_BIT; + } + cur->class.vec[b / CHAR_BIT] |= 1u << b % CHAR_BIT; + *p -= 2; + if (ST_INDEX(s, *p) == ESCAPE && ST_INDEX(s, *p - 1) != EOF) { + --*p; + } + } else if (ST_INDEX(s, *p) == ']' && *p >= 8 && ST_INDEX(s, *p - 1) == ':') { + +#define CXCLASS_CHECK(n, t, f, o, b) if (1) { \ + if (memcmp(St_ptr(s) + *p - (n), (t), (n) - 1) != 0) { \ + goto normal_char; \ + } \ + (b)(cur->class.vec, (f)); \ + *p -= (o); \ + break; \ +} else (void)0 + +#define CCLASS_CHECK(t, f, o) CXCLASS_CHECK(sizeof (t), t, f, o, fill) +#define CNCLASS_CHECK(t, f, o) CXCLASS_CHECK(sizeof (t), t, f, o, nfill) + + if (ST_INDEX(s, *p - 8) == '[' && ST_INDEX(s, *p - 7) == ':') { + switch (ST_INDEX(s, *p - 6)) { + case 'a': + if (ST_INDEX(s, *p - 5) != 'l') { + goto normal_char; + } + switch (ST_INDEX(s, *p - 4)) { + case 'n': CCLASS_CHECK("um", isalnum, 8); + case 'p': CCLASS_CHECK("ha", isalpha, 8); + default: goto normal_char; + } + break; + + case 'c': CCLASS_CHECK("ntrl", iscntrl, 8); + case 'd': CCLASS_CHECK("igit", isdigit, 8); + case 'g': CCLASS_CHECK("raph", isgraph, 8); + case 'l': CCLASS_CHECK("ower", islower, 8); + + case 'p': + switch (ST_INDEX(s, *p - 5)) { + case 'r': CCLASS_CHECK("int", isprint, 8); + case 'u': CCLASS_CHECK("nct", ispunct, 8); + default: goto normal_char; + } + break; + + case 's': CCLASS_CHECK("pace", isspace, 8); + case 'u': CCLASS_CHECK("pper", isupper, 8); + default: goto normal_char; + } + } else if (*p >= 9 && ST_INDEX(s, *p - 9) == '[' && ST_INDEX(s, *p - 8) == ':') { + switch (ST_INDEX(s, *p - 7)) { + case 'x': + CCLASS_CHECK("digit", isxdigit, 9); + + case '^': + switch (ST_INDEX(s, *p - 6)) { + case 'a': + if (ST_INDEX(s, *p - 5) != 'l') { + goto normal_char; + } + switch (ST_INDEX(s, *p - 4)) { + case 'n': CNCLASS_CHECK("um", isalnum, 9); + case 'p': CNCLASS_CHECK("ha", isalpha, 9); + default: goto normal_char; + } + break; + + case 'c': CNCLASS_CHECK("ntrl", iscntrl, 9); + case 'd': CNCLASS_CHECK("igit", isdigit, 9); + case 'g': CNCLASS_CHECK("raph", isgraph, 9); + case 'l': CNCLASS_CHECK("ower", islower, 9); + + case 'p': + switch (ST_INDEX(s, *p - 5)) { + case 'r': CNCLASS_CHECK("int", isprint, 9); + case 'u': CNCLASS_CHECK("nct", ispunct, 9); + default: goto normal_char; + } + break; + + case 's': CNCLASS_CHECK("pace", isspace, 9); + case 'u': CNCLASS_CHECK("pper", isupper, 9); + default: goto normal_char; + } + break; + + default: goto normal_char; + } + } else if (*p >= 10 && memcmp(St_ptr(s) + *p - 10, "[:^xdigit", 9) == 0) { + nfill(cur->class.vec, isxdigit); + *p -= 10; + } else { + goto normal_char; + } + } else if (ST_INDEX(s, *p) == '^' && ((b = ST_INDEX(s, *p - 1) == '\'') || b == EOF)) { + cur->class.type = RE_N_CLASS; + } else normal_char: { + cur->class.vec[c / CHAR_BIT] |= 1u << c % CHAR_BIT; + } + } + } + if (c == '\'') { + --*p; + } + + if (cur->class.type == RE_CLASS) { + size_t i; + for (i = 0; i < CLASS_SIZE; ++i) { + if (cur->class.vec[i] != (unsigned char)~0u) { + return cur; + } + } + cur->x.type = RE_ANYCH; + } else { + size_t i; + assert(cur->class.type == RE_N_CLASS); + for (i = 0; i < CLASS_SIZE; ++i) { + if (cur->class.vec[i] != 0) { + return cur; + } + } + cur->x.type = RE_ANYCH; + } + + return cur; + } + +std_atom: + default: { + re_node *cur = bl_node(con->block); + cur->literal.type = RE_LITERAL; + + /*XXX?*/ + if (next && next->x.type == RE_LITERAL) { + if (next->literal.len) { + cur->literal.buf = bl_string(con->block, cur->literal.len = 1u + next->literal.len); + memcpy(cur->literal.buf + 1, next->literal.buf, next->literal.len); + } + cur->literal.next = next->literal.next; + } else { + cur->literal.next = next; + cur->literal.buf = bl_string(con->block, cur->literal.len = 1); + } + + cur->literal.buf[0] = ST_INDEX(s, *p); + --*p; + return cur; + } + } +} + +PRAZ(literal) { + SNORK; + int c; + String tmp; + re_node *cur; + + skipspace(s, p); + if ((ST_INDEX(s, *p) != ESCAPE || isalnum(ST_INDEX(s, *p - 1))) && MEATCHARP(ST_INDEX(s, *p))) { + TAILC(atom); + } + + cur = bl_node(con->block); + cur->literal.type = RE_LITERAL; + cur->literal.next = next; + + St_init(&tmp); + while ( + (c = ST_INDEX(s, *p)) != EOF && + ( + !MEATCHARP(c) || + (c == ESCAPE && !isalnum(ST_INDEX(s, *p - 1))) + ) + ) { + --*p; + if (c == ESCAPE && ST_INDEX(s, *p) != EOF) { + c = ST_INDEX(s, *p); + --*p; + } + St_cat_c(&tmp, c); + skipspace(s, p); + } + St_reverse(&tmp); + + if (next && next->x.type == RE_LITERAL) { + if (next->literal.len) { + St_cat_m(&tmp, next->literal.buf, next->literal.len); + } + cur->literal.next = next->literal.next; + } + + cur->literal.buf = bl_string(con->block, cur->literal.len = St_len(&tmp)); + memcpy(cur->literal.buf, St_ptr(&tmp), St_len(&tmp)); + St_clear(&tmp); + + return cur; +} + +#define RETURN_REP(lo, hi, grdy) \ +do { \ + re_node *cur = bl_node(con->block); \ + cur->rep.type = (grdy); \ + cur->rep.next = next; \ + cur->rep.min = (lo); \ + cur->rep.max = (hi); \ + cur->rep.n = con->base->repets++; \ + cur->rep.arg = CALL(atom, cur, NULL); \ + return cur; \ +} while (0) + +#define STD_REP_CASE(lo, hi, grdy) \ +do { \ + --*p; \ + RETURN_REP(lo, hi, grdy); \ +} while (0) + +#define CUSTOM_REP_CASE(off, grdy, mismatch) \ +do { \ + size_t from, to; \ + to = *p - 1 - (off); \ + while (isdigit(ST_INDEX(s, to))) { \ + --to; \ + } \ + if (ST_INDEX(s, to) == ':') { \ + *p = to - 1; \ + to = ST_INDEX(s, to + 1) == ':' ? (size_t)-1 : strtoul(St_ptr(s) + to + 1, NULL, 10); \ + from = to; \ + } else { \ + if (ST_INDEX(s, to) != ',') { \ + mismatch; \ + } \ + from = to - 1; \ + if (!isdigit(ST_INDEX(s, from))) { \ + mismatch; \ + } \ + do { \ + --from; \ + } while (isdigit(ST_INDEX(s, from))); \ + if (ST_INDEX(s, from) != ':') { \ + mismatch; \ + } \ + *p = from - 1; \ + from = strtoul(St_ptr(s) + from + 1, NULL, 10); \ + to = ST_INDEX(s, to + 1) == ':' ? (size_t)-1 : strtoul(St_ptr(s) + to + 1, NULL, 10); \ + } \ + if (to == 1) { \ + if (from == 1) { \ + TAILC(atom); \ + } \ + if (from == 0) { \ + re_node *cur = bl_node(con->block); \ + cur->alter.type = RE_ALTER; \ + if ((grdy) == RE_REP_GREEDY) { \ + cur->alter.next = next; \ + cur->alter.arg = CALL(atom, next, NULL); \ + } else { \ + cur->alter.arg = next; \ + cur->alter.next = CALL(atom, next, NULL); \ + } \ + return cur; \ + } \ + } \ + RETURN_REP(from, to, grdy); \ +} while (0) + +PRAZ(quantifire) { + SNORK; + + skipspace(s, p); + switch (ST_INDEX(s, *p)) { + case EOF: return next; + + case '?': + switch (ST_INDEX(s, *p - 1)) { + case ':': CUSTOM_REP_CASE(1, RE_REP_FRUGAL, goto hook); + case '*': --*p; STD_REP_CASE(0, -1, RE_REP_FRUGAL); + case '+': --*p; STD_REP_CASE(1, -1, RE_REP_FRUGAL); + + case '?': { + re_node *cur = bl_node(con->block); + *p -= 2; + cur->alter.type = RE_ALTER; + cur->alter.next = CALL(atom, next, NULL); + cur->alter.arg = next; + return cur; + } + +hook: + default: { + re_node *cur = bl_node(con->block); + --*p; + cur->alter.type = RE_ALTER; + cur->alter.next = next; + cur->alter.arg = CALL(atom, next, NULL); + return cur; + } + } + + case ':': CUSTOM_REP_CASE(0, RE_REP_GREEDY, goto noquanti); + case '*': STD_REP_CASE(0, -1, RE_REP_GREEDY); + case '+': STD_REP_CASE(1, -1, RE_REP_GREEDY); + +noquanti: + default: TAILC(literal); + } +} + +PRAZ(branch) { + SNORK; + skipspace(s, p); + while (ST_INDEX(s, *p) != EOF && !ENDBRANCHP(ST_INDEX(s, *p))) { + next = CALL(quantifire, next, curolan_zebeth); + skipspace(s, p); + } + return next; +} + +PRAZ(alternation) { + SNORK; + re_node *cur; + + skipspace(s, p); + if (ST_INDEX(s, *p) == EOF) { + return next; + } + + next = CALL(branch, next, curolan_zebeth); + skipspace(s, p); + if (ST_INDEX(s, *p) != '|') { + return next; + } + + --*p; + cur = bl_node(con->block); + cur->alter.type = RE_ALTER; + cur->alter.next = next; + cur->alter.arg = CALL(alternation, curolan_zebeth, curolan_zebeth); + return cur; +} + +ATTR_CONST +static size_t minimum(const size_t a, const size_t b) { + return a < b ? a : b; +} + +ATTR_CONST +static size_t maximum(const size_t a, const size_t b) { + return a > b ? a : b; +} + +ATTR_PURE +static size_t subminlen(const re_node *node, const re_node *end, size_t start) { + if (!node || node == end) { + return start; + } + + switch (node->x.type) { + case RE_ALTER: + return minimum( + subminlen(node->alter.arg, end, start), + subminlen(node->alter.next, end, start) + ); + + case RE_ANYCH: + case RE_CLASS: + case RE_DEFCLASS: + case RE_N_CLASS: + case RE_N_DEFCLASS: + return subminlen(node->x.next, end, start + 1u); + + case RE_AT_BOL: + case RE_AT_BOS: + case RE_AT_EOL: + case RE_AT_EOS: + case RE_AT_NWBOUND: + case RE_AT_WBOUND: + case RE_BACKCHECK: + case RE_BACKREF: + case RE_BEGIN_CAPTURE: + case RE_END_CAPTURE: + case RE_PARTIAL: + case RE_STUFF_FRUGAL: + case RE_STUFF_GREEDY: + return subminlen(node->x.next, end, start); + + case RE_AT_MATCH: + return maximum( + subminlen(node->indep.arg, end, start), + subminlen(node->indep.next, end, start) + ); + + case RE_AT_NOMATCH: + return subminlen(node->indep.next, end, start); + + case RE_INDEP: + return subminlen(node->indep.next, end, start + subminlen(node->indep.arg, NULL, 0)); + + case RE_LITERAL: + return subminlen(node->literal.next, end, start + node->literal.len); + + case RE_SELECT: + return minimum( + subminlen(node->select.cond, end, start) + subminlen(node->select.arg, end, start), + subminlen(node->select.next, end, start) + ); + + case RE_REP_FRUGAL: + case RE_REP_GREEDY: + if (node->rep.min == 0) { + return subminlen(node->rep.next, end, start); + } + return subminlen(node->rep.next, end, start + node->rep.min * subminlen(node->rep.arg, node, 0)); + + case RE_XREP_FRUGAL: + case RE_XREP_GREEDY: + if (node->rep.min == 0) { + return subminlen(node->rep.next, end, start); + } + return subminlen(node->rep.next, end, start + node->rep.min * subminlen(node->rep.arg, NULL, 0)); + } + + NOTREACHED; +} + +ATTR_PURE +static size_t minlen(const re_node *node) { + return subminlen(node, NULL, 0); +} + +static void reinit(t_regex *re) { + const struct cap_state null = { -1, -1, -1 }; + size_t i; + for (i = 0; i < re->captures; ++i) { + re->cap_ptr[i] = null; + } +} + +static re_node **notcomplex(re_node **node, const re_node *end) { + assert(*node != NULL); + + if (*node == end) { + return node; + } + + switch ((*node)->x.type) { + case RE_ALTER: + case RE_AT_MATCH: + case RE_AT_NOMATCH: + case RE_BACKCHECK: + case RE_BACKREF: + case RE_BEGIN_CAPTURE: + case RE_END_CAPTURE: + case RE_INDEP: + case RE_PARTIAL: + case RE_REP_FRUGAL: + case RE_REP_GREEDY: + case RE_SELECT: + case RE_STUFF_FRUGAL: + case RE_STUFF_GREEDY: + return NULL; + + case RE_XREP_FRUGAL: + case RE_XREP_GREEDY: + if ((*node)->rep.min == (*node)->rep.max) { + return notcomplex(&(*node)->rep.next, end); + } + return NULL; + + case RE_ANYCH: + case RE_AT_BOL: + case RE_AT_BOS: + case RE_AT_EOL: + case RE_AT_EOS: + case RE_AT_NWBOUND: + case RE_AT_WBOUND: + case RE_CLASS: + case RE_DEFCLASS: + case RE_LITERAL: + case RE_N_CLASS: + case RE_N_DEFCLASS: + return notcomplex(&(*node)->x.next, end); + } + + NOTREACHED; +} + +static void trysimpl(re_node *node) { + assert(node->x.type == RE_REP_FRUGAL || node->x.type == RE_REP_GREEDY); + + { + re_node **const tmp = notcomplex(&node->rep.arg, node); + if (!tmp) { + return; + } + *tmp = NULL; + } + + switch (node->rep.type) { + case RE_REP_FRUGAL: + if ( + node->rep.arg->x.type == RE_ANYCH && + node->rep.arg->x.next == NULL && + node->rep.max == (size_t)-1 + ) { + if (node->rep.min == 0) { + node->x.type = RE_STUFF_FRUGAL; + break; + } + if (node->rep.min == 1) { + node->rep.arg->x.next = node->x.next; + node->rep.arg->x.type = RE_STUFF_FRUGAL; + node->x.next = node->rep.arg; + node->x.type = RE_ANYCH; + break; + } + } + + node->rep.n = minlen(node->rep.arg); + node->rep.type = RE_XREP_FRUGAL; + break; + + case RE_REP_GREEDY: + if ( + node->rep.arg->x.type == RE_ANYCH && + node->rep.arg->x.next == NULL && + node->rep.max == (size_t)-1 + ) { + if (node->rep.min == 0) { + node->x.type = RE_STUFF_GREEDY; + break; + } + if (node->rep.min == 1) { + node->rep.arg->x.next = node->x.next; + node->rep.arg->x.type = RE_STUFF_GREEDY; + node->x.next = node->rep.arg; + node->x.type = RE_ANYCH; + break; + } + } + + node->rep.n = minlen(node->rep.arg); + node->rep.type = RE_XREP_GREEDY; + break; + + default: + NOTREACHED; + } + +} + +static void simplerep(re_node *node, const re_node *const end) { + if (!node || node == end) { + return; + } + + switch (node->x.type) { + case RE_ALTER: + simplerep(node->alter.arg, end); + simplerep(node->alter.next, end); + return; + + case RE_AT_MATCH: + case RE_AT_NOMATCH: + case RE_INDEP: + simplerep(node->indep.arg, end); + simplerep(node->indep.next, end); + return; + + case RE_ANYCH: + case RE_AT_BOL: + case RE_AT_BOS: + case RE_AT_EOL: + case RE_AT_EOS: + case RE_AT_NWBOUND: + case RE_AT_WBOUND: + case RE_BACKCHECK: + case RE_BACKREF: + case RE_BEGIN_CAPTURE: + case RE_CLASS: + case RE_DEFCLASS: + case RE_END_CAPTURE: + case RE_LITERAL: + case RE_N_CLASS: + case RE_N_DEFCLASS: + case RE_PARTIAL: + simplerep(node->x.next, end); + return; + + case RE_REP_FRUGAL: + case RE_REP_GREEDY: + simplerep(node->rep.next, end); + simplerep(node->rep.arg, node); + trysimpl(node); + return; + + case RE_SELECT: + simplerep(node->select.cond, end); + simplerep(node->select.arg, end); + simplerep(node->select.next, end); + return; + + case RE_STUFF_FRUGAL: + case RE_STUFF_GREEDY: + case RE_XREP_FRUGAL: + case RE_XREP_GREEDY: + return; + } + + NOTREACHED; +} + +ATTR_PURE +static int anchorbegp(const re_node *node) { + return + node && + ( + node->x.type == RE_AT_BOS || + node->x.type == RE_STUFF_FRUGAL || + node->x.type == RE_STUFF_GREEDY || + ( + node->x.type == RE_ALTER && + anchorbegp(node->alter.arg) && + anchorbegp(node->alter.next) + ) || + ( + node->x.type == RE_INDEP && + anchorbegp(node->indep.arg) + ) || + ( + node->x.type == RE_AT_MATCH && + ( + anchorbegp(node->indep.arg) || + anchorbegp(node->indep.next) + ) + ) || + ( + ( + node->x.type == RE_AT_NOMATCH || + node->x.type == RE_AT_BOL || + node->x.type == RE_AT_EOL || + node->x.type == RE_AT_EOS || + node->x.type == RE_AT_NWBOUND || + node->x.type == RE_AT_WBOUND || + node->x.type == RE_BEGIN_CAPTURE || + node->x.type == RE_BACKCHECK + ) && + anchorbegp(node->x.next) + ) + ) + ; +} + +static void dostuff(t_regex *re) { + simplerep(re->start, NULL); + + if (!re->start || anchorbegp(re->start)) { + re->flags |= FL_ANCHOR_START; + } +} + +t_regex *re_compile(const String *s) { + size_t p; + struct parse_context dcon; + t_regex *re; + void *pre; + + if (h_get(&rcache, s, &pre) == H_OK) { + re = pre; + ++re->refs; + return re; + } + + re = xmalloc(1, sizeof *re); + re->flags = FL_NONE; + re->refs = 0; + re->repets = 0; + bl_init(&re->alloc); + re->cap_ptr = NULL; + re->captures = 0; + dcon.base = re; + dcon.block = &re->alloc; + dcon.s = s; + p = St_len(s) - 1; + dcon.p = &p; + + re->start = alternation(&dcon, NULL, NULL); + + if (re->captures) { + re->cap_ptr = xmalloc(re->captures, sizeof *re->cap_ptr); + reinit(re); + } + + dostuff(re); + re->minlen = minlen(re->start); + re->repbuf = xmalloc(re->repets, sizeof *re->repbuf); + + if (cachfill < sizeof cached / sizeof cached[0]) { + St_init(&cached[cachfill]); + St_cpy(&cached[cachfill], s); + ++re->refs; + h_put(&rcache, &cached[cachfill], re, 1); + ++cachfill; + } else { + h_del(&rcache, &cached[cachlast]); + St_cpy(&cached[cachlast], s); + ++re->refs; + h_put(&rcache, &cached[cachlast], re, 1); + cachlast = (cachlast + 1u) % (sizeof cached / sizeof cached[0]); + } + + if (Opt.debug & DBG_REGEX) { + io_write_s(Err, "Compiled "); + re_dump(re, io_fp(Err)); + } + + return re; +} + +t_regex *re_dup(t_regex *re) { + ++re->refs; + return re; +} + +void re_free(t_regex *re) { + if (re->refs) { + --re->refs; + return; + } + bl_free(&re->alloc); + xfree(re->repbuf); + xfree(re->cap_ptr); + xfree(re); +} + +ATTR_PURE +static int match_class(unsigned char c, const unsigned char *vec) { + return vec[c / CHAR_BIT] & 1u << c % CHAR_BIT; +} + +static void *mem_dup(const void *p, size_t n, size_t m) { + void *q = xmalloc(n, m); + memcpy(q, p, n * m); + return q; +} + +#define NO_MATCH ((size_t)-1) + +#define Mmatch0(Xmatch, Xindex) \ + size_t tmp; \ + \ + if (!node) { \ + return o; \ + } \ + \ + switch (node->x.type) { \ + case RE_ALTER: \ + if ((tmp = Xmatch(base, node->alter.arg, s, o)) == NO_MATCH) { \ + return Xmatch(base, node->alter.next, s, o); \ + } \ + return tmp; \ + \ + case RE_ANYCH: \ + if (Xindex(s, o) != EOF) { \ + return Xmatch(base, node->x.next, s, o + 1u); \ + } \ + return NO_MATCH; \ + \ + case RE_AT_BOL: \ + if (o == 0 || Xindex(s, o - 1u) == '\n') { \ + return Xmatch(base, node->x.next, s, o); \ + } \ + return NO_MATCH; \ + \ + case RE_AT_BOS: \ + if (o == 0) { \ + return Xmatch(base, node->x.next, s, o); \ + } \ + return NO_MATCH; \ + \ + case RE_AT_EOL: \ + if ( \ + Xindex(s, o) == '\n' || \ + ( \ + o && Xindex(s, o) == EOF && Xindex(s, o - 1u) != '\n' \ + ) \ + ) { \ + return Xmatch(base, node->x.next, s, o); \ + } \ + return NO_MATCH; \ + \ + case RE_AT_EOS: \ + if (Xindex(s, o) == EOF) { \ + return Xmatch(base, node->x.next, s, o); \ + } \ + return NO_MATCH; \ + \ + case RE_AT_MATCH: { \ + struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \ + tmp = Xmatch(base, node->indep.arg, s, o); \ + if (tmp != NO_MATCH) { \ + tmp = Xmatch(base, node->indep.next, s, o); \ + if (tmp == NO_MATCH) { \ + memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \ + } \ + } \ + xfree(prev); \ + return tmp; \ + } \ + \ + case RE_AT_NOMATCH: { \ + struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \ + tmp = Xmatch(base, node->indep.arg, s, o); \ + if (tmp == NO_MATCH) { \ + tmp = Xmatch(base, node->indep.next, s, o); \ + } else { \ + memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \ + tmp = NO_MATCH; \ + } \ + xfree(prev); \ + return tmp; \ + } \ + \ + case RE_AT_NWBOUND: \ + if (siword(Xindex(s, o - 1u)) == siword(Xindex(s, o))) { \ + return Xmatch(base, node->x.next, s, o); \ + } \ + return NO_MATCH; \ + \ + case RE_AT_WBOUND: \ + if (siword(Xindex(s, o - 1u)) != siword(Xindex(s, o))) { \ + return Xmatch(base, node->x.next, s, o); \ + } \ + return NO_MATCH; \ + \ + case RE_BACKCHECK: \ + if ( \ + node->backref.n < base->captures && \ + base->cap_ptr[node->backref.n].start + 1u && \ + base->cap_ptr[node->backref.n].end + 1u && \ + base->cap_ptr[node->backref.n].end >= base->cap_ptr[node->backref.n].start \ + ) { \ + return Xmatch(base, node->backref.next, s, o); \ + } \ + return NO_MATCH + + +#define Mmatch1(Xmatch, Xindex) \ + case RE_BEGIN_CAPTURE: { \ + const size_t prev = base->cap_ptr[node->capture.n].pending; \ + base->cap_ptr[node->capture.n].pending = o; \ + if ((tmp = Xmatch(base, node->capture.next, s, o)) == NO_MATCH) { \ + base->cap_ptr[node->capture.n].pending = prev; \ + } \ + return tmp; \ + } \ + \ + case RE_CLASS: \ + if ( \ + Xindex(s, o) != EOF && \ + match_class(Xindex(s, o), node->class.vec) \ + ) { \ + return Xmatch(base, node->class.next, s, o + 1u); \ + } \ + return NO_MATCH; \ + \ + case RE_DEFCLASS: \ + if ( \ + Xindex(s, o) != EOF && \ + match_class(Xindex(s, o), node->defclass.vec) \ + ) { \ + return Xmatch(base, node->defclass.next, s, o + 1u); \ + } \ + return NO_MATCH; \ + \ + case RE_END_CAPTURE: { \ + const size_t prevbeg = base->cap_ptr[node->capture.n].start; \ + const size_t prevend = base->cap_ptr[node->capture.n].end; \ + if (base->cap_ptr[node->capture.n].pending + 1u == 0u) { \ + return NO_MATCH; \ + } \ + \ + base->cap_ptr[node->capture.n].end = o; \ + base->cap_ptr[node->capture.n].start = base->cap_ptr[node->capture.n].pending; \ + if ((tmp = Xmatch(base, node->capture.next, s, o)) == NO_MATCH) { \ + base->cap_ptr[node->capture.n].end = prevend; \ + base->cap_ptr[node->capture.n].start = prevbeg; \ + } \ + return tmp; \ + } \ + \ + case RE_INDEP: { \ + struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \ + if ((tmp = Xmatch(base, node->indep.arg, s, o)) != NO_MATCH) { \ + tmp = Xmatch(base, node->indep.next, s, tmp); \ + if (tmp == NO_MATCH) { \ + memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \ + } \ + } \ + xfree(prev); \ + return tmp; \ + } \ + \ + case RE_N_CLASS: \ + if ( \ + Xindex(s, o) != EOF && \ + !match_class(Xindex(s, o), node->class.vec) \ + ) { \ + return Xmatch(base, node->class.next, s, o + 1u); \ + } \ + return NO_MATCH; \ + \ + case RE_N_DEFCLASS: \ + if ( \ + Xindex(s, o) != EOF && \ + !match_class(Xindex(s, o), node->defclass.vec) \ + ) { \ + return Xmatch(base, node->defclass.next, s, o + 1u); \ + } \ + return NO_MATCH; \ + \ + case RE_PARTIAL: { \ + size_t n; \ + for (n = 0; n < node->literal.len; ++n) { \ + if (node->literal.buf[n] != Xindex(s, o + n)) { \ + break; \ + } \ + } \ + for (; n; --n) { \ + if ((tmp = Xmatch(base, node->literal.next, s, o + n)) != NO_MATCH) { \ + return tmp; \ + } \ + } \ + return Xmatch(base, node->literal.next, s, o); \ + } \ + \ + case RE_SELECT: { \ + struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \ + if ((tmp = Xmatch(base, node->select.cond, s, o)) != NO_MATCH) { \ + tmp = Xmatch(base, node->select.arg, s, tmp); \ + if (tmp == NO_MATCH) { \ + memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \ + } \ + } else { \ + tmp = Xmatch(base, node->select.next, s, o); \ + } \ + xfree(prev); \ + return tmp; \ + } \ + \ + case RE_REP_FRUGAL: \ + if (base->repbuf[node->rep.n] >= node->rep.max) { \ + const size_t n = base->repbuf[node->rep.n]; \ + \ + base->repbuf[node->rep.n] = 0; \ + tmp = Xmatch(base, node->rep.next, s, o); \ + base->repbuf[node->rep.n] = n; \ + return tmp; \ + } else if (base->repbuf[node->rep.n] >= node->rep.min) { \ + const size_t n = base->repbuf[node->rep.n]; \ + \ + base->repbuf[node->rep.n] = 0; \ + if ((tmp = Xmatch(base, node->rep.next, s, o)) != NO_MATCH) { \ + base->repbuf[node->rep.n] = n; \ + return tmp; \ + } \ + base->repbuf[node->rep.n] = n + 1u; \ + tmp = Xmatch(base, node->rep.arg, s, o); \ + --base->repbuf[node->rep.n]; \ + return tmp; \ + } else { \ + ++base->repbuf[node->rep.n]; \ + tmp = Xmatch(base, node->rep.arg, s, o); \ + --base->repbuf[node->rep.n]; \ + return tmp; \ + } \ + \ + case RE_REP_GREEDY: \ + if (base->repbuf[node->rep.n] >= node->rep.max) { \ + const size_t n = base->repbuf[node->rep.n]; \ + \ + base->repbuf[node->rep.n] = 0; \ + tmp = Xmatch(base, node->rep.next, s, o); \ + base->repbuf[node->rep.n] = n; \ + return tmp; \ + } else if (base->repbuf[node->rep.n] >= node->rep.min) { \ + const size_t n = base->repbuf[node->rep.n]; \ + \ + ++base->repbuf[node->rep.n]; \ + if ((tmp = Xmatch(base, node->rep.arg, s, o)) != NO_MATCH) { \ + --base->repbuf[node->rep.n]; \ + return tmp; \ + } \ + base->repbuf[node->rep.n] = 0; \ + tmp = Xmatch(base, node->rep.next, s, o); \ + base->repbuf[node->rep.n] = n; \ + return tmp; \ + } else { \ + ++base->repbuf[node->rep.n]; \ + tmp = Xmatch(base, node->rep.arg, s, o); \ + --base->repbuf[node->rep.n]; \ + return tmp; \ + } \ + \ + case RE_XREP_FRUGAL: { \ + size_t n; \ + \ + for (n = 0; n < node->rep.min; ++n) { \ + if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \ + return NO_MATCH; \ + } \ + o = tmp; \ + } \ + \ + for (; n < node->rep.max; ++n) { \ + if ((tmp = Xmatch(base, node->rep.next, s, o)) != NO_MATCH) { \ + return tmp; \ + } \ + if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \ + return NO_MATCH; \ + } \ + o = tmp; \ + } \ + return Xmatch(base, node->rep.next, s, o); \ + } \ + \ + case RE_XREP_GREEDY: { \ + size_t n; \ + \ + for (n = 0; n < node->rep.min; ++n) { \ + if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \ + return NO_MATCH; \ + } \ + o = tmp; \ + } \ + \ + for (; n < node->rep.max; ++n) { \ + if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \ + break; \ + } \ + o = tmp; \ + } \ + \ + for (; n > node->rep.min; --n) { \ + if ((tmp = Xmatch(base, node->rep.next, s, o)) != NO_MATCH) { \ + return tmp; \ + } \ + o -= node->rep.n; \ + } \ + \ + return Xmatch(base, node->rep.next, s, o); \ + } \ + } \ + \ + NOTREACHED + + +static size_t match(const t_regex *base, const re_node *node, const String *s, size_t o) { + + Mmatch0(match, ST_INDEX); + + case RE_BACKREF: + if ( + node->backref.n < base->captures && + base->cap_ptr[node->backref.n].start + 1u && + base->cap_ptr[node->backref.n].end + 1u && + base->cap_ptr[node->backref.n].end >= base->cap_ptr[node->backref.n].start && + ( + base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start <= St_len(s) - o && + memcmp( + St_ptr(s) + base->cap_ptr[node->backref.n].start, + St_ptr(s) + o, + base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start + ) == 0 + ) + ) { + return match(base, node->backref.next, s, o + base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start); + } + return NO_MATCH; + + + case RE_LITERAL: + if ( + node->literal.len == 0 || + ( + node->literal.len <= St_len(s) - o && + memcmp(node->literal.buf, St_ptr(s) + o, node->literal.len) == 0 + ) + ) { + return match(base, node->literal.next, s, o + node->literal.len); + } + return NO_MATCH; + + case RE_STUFF_FRUGAL: + if (!node->x.next) { + return o; + } + + if (node->x.next->x.type == RE_LITERAL && node->x.next->literal.len) { + for ( + o = St_stro_m(s, o, node->x.next->literal.buf, node->x.next->literal.len); + o + 1u; + o = St_stro_m(s, o + 1u, node->x.next->literal.buf, node->x.next->literal.len) + ) { + if ((tmp = match(base, node->x.next->literal.next, s, o + node->x.next->literal.len)) != NO_MATCH) { + return tmp; + } + } + return NO_MATCH; + } + + for (; o <= St_len(s); ++o) { + if ((tmp = match(base, node->x.next, s, o)) != NO_MATCH) { + return tmp; + } + } + return NO_MATCH; + + case RE_STUFF_GREEDY: { + size_t n; + + if (!node->x.next) { + return St_len(s); + } + + if (node->x.next->x.type == RE_LITERAL && node->x.next->literal.len) { + for ( + n = St_rstr_m(s, node->x.next->literal.buf, node->x.next->literal.len); + n > o && n + 1u; + n = St_rstro_m(s, n - 1u, node->x.next->literal.buf, node->x.next->literal.len) + ) { + if ((tmp = match(base, node->x.next->literal.next, s, n + node->x.next->literal.len)) != NO_MATCH) { + return tmp; + } + if (!n) { + break; + } + } + if (n == o) { + return match(base, node->x.next->literal.next, s, n + node->x.next->literal.len); + } + return NO_MATCH; + } + + for (n = St_len(s); n > o; --n) { + if ((tmp = match(base, node->x.next, s, n)) != NO_MATCH) { + return tmp; + } + } + return match(base, node->x.next, s, o); + } + + Mmatch1(match, ST_INDEX); +} + +static size_t iomatch(const t_regex *base, const re_node *node, IO *s, size_t o) { + + Mmatch0(iomatch, io_peek); + + case RE_BACKREF: + if ( + node->backref.n < base->captures && + base->cap_ptr[node->backref.n].start + 1u && + base->cap_ptr[node->backref.n].end + 1u && + base->cap_ptr[node->backref.n].end >= base->cap_ptr[node->backref.n].start && + ( + io_xcmp( + s, + base->cap_ptr[node->backref.n].start, + o, + base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start + ) == 0 + ) + ) { + return iomatch(base, node->backref.next, s, o + base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start); + } + return NO_MATCH; + + + case RE_LITERAL: + if ( + node->literal.len == 0 || + ( + io_cmppeek(s, o, node->literal.buf, node->literal.len) == 0 + ) + ) { + return iomatch(base, node->literal.next, s, o + node->literal.len); + } + return NO_MATCH; + + case RE_STUFF_FRUGAL: + if (!node->x.next) { + return o; + } + + for (; io_peek(s, o) != EOF; ++o) { + if ((tmp = iomatch(base, node->x.next, s, o)) != NO_MATCH) { + return tmp; + } + } + return iomatch(base, node->x.next, s, o); + + case RE_STUFF_GREEDY: { + size_t n; + + for (n = o; io_peek(s, n) != EOF; ++n) + ; + + if (!node->x.next) { + return n; + } + + for (; n > o; --n) { + if ((tmp = iomatch(base, node->x.next, s, n)) != NO_MATCH) { + return tmp; + } + } + return iomatch(base, node->x.next, s, o); + } + + Mmatch1(iomatch, io_peek); +} + +#define DEFRBUF ((void)0) +#define MRETURN(x) return (x) +#define INITRBUF(re) memset((re)->repbuf, '\0', (re)->repets * sizeof *(re)->repbuf) + +int re_iomatch(t_regex *re, IO *io, size_t *ms, size_t *me) { + size_t i; + DEFRBUF; + INITRBUF(re); + + assert(re != NULL); + assert(io != NULL); + assert(io_bufred(io)); + assert(ms != NULL); + assert(me != NULL); + + reinit(re); + + if (re->flags & FL_ANCHOR_START) { + if ((*me = iomatch(re, re->start, io, 0)) == NO_MATCH) { + MRETURN(0); + } + *ms = 0; + MRETURN(1); + } + + for (i = 0; io_peek(io, i) != EOF; ++i) { + if ((*me = iomatch(re, re->start, io, i)) != NO_MATCH) { + *ms = i; + MRETURN(1); + } + } + + if ((*me = iomatch(re, re->start, io, i)) != NO_MATCH) { + *ms = i; + MRETURN(1); + } + MRETURN(0); +} + +int re_match(t_regex *re, const String *s, size_t *ms, size_t *me) { + size_t i; + DEFRBUF; + INITRBUF(re); + + assert(re != NULL); + assert(s != NULL); + assert(ms != NULL); + assert(me != NULL); + + if (re->minlen > St_len(s)) { + MRETURN(0); + } + + reinit(re); + + if (re->flags & FL_ANCHOR_START) { + if ((*me = match(re, re->start, s, 0)) == NO_MATCH) { + MRETURN(0); + } + *ms = 0; + MRETURN(1); + } + + if (re->start->x.type == RE_LITERAL && re->start->literal.len) { + for ( + i = St_str_m(s, re->start->literal.buf, re->start->literal.len); + i + 1u; + i = St_stro_m(s, i + 1u, re->start->literal.buf, re->start->literal.len) + ) { + if ((*me = match(re, re->start->literal.next, s, i + re->start->literal.len)) != NO_MATCH) { + *ms = i; + MRETURN(1); + } + } + MRETURN(0); + } + + for (i = 0; i <= St_len(s) - re->minlen; ++i) { + if ((*me = match(re, re->start, s, i)) != NO_MATCH) { + *ms = i; + MRETURN(1); + } + } + + MRETURN(0); +} + +size_t re_cabra(const t_regex *re) { + return re->captures; +} + +int re_backref(const t_regex *re, size_t i, size_t *a, size_t *z) { + if (i >= re->captures) { + return -1; + } + if (!(re->cap_ptr[i].start + 1u && re->cap_ptr[i].end + 1u)) { + return 1; + } + *a = re->cap_ptr[i].start; + *z = re->cap_ptr[i].end; + return 0; +} + +ATTR_PURE +static const re_node *branchend(const re_node *a, const re_node *b) { + for (; a; a = a->x.next) { + const re_node *q = b; + for (; q; q = q->x.next) { + if (a == q) { + return q; + } + } + } + return NULL; +} + +static void decom_class(String *s, const unsigned char vec[HAVE_C99(static CLASS_SIZE)]) { + unsigned char i; + int flag = 0; + char big_enough[100]; + + for (i = 0; i < UCHAR_MAX; ++i) { + if (vec[i / CHAR_BIT] & 1u << i % CHAR_BIT) { + if (vec[(i + 1u) / CHAR_BIT] & 1u << (i + 1u) % CHAR_BIT) { + if (!flag) { + flag = 1; + goto mess; + } + flag = 1; + } else { + if (flag) { + if (i > 1u && vec[(i - 2u) / CHAR_BIT] & 1u << (i - 2u) % CHAR_BIT) { + St_cat_c(s, '-'); + } + flag = 0; + } +mess: + if (!isprint(i)) { + St_cat_c(s, '\\'); + switch (i) { + case '\a': St_cat_c(s, 'a'); break; + case '\b': St_cat_c(s, 'b'); break; + case '\f': St_cat_c(s, 'f'); break; + case '\n': St_cat_c(s, 'n'); break; + case '\r': St_cat_c(s, 'r'); break; + case '\t': St_cat_c(s, 't'); break; + case '\v': St_cat_c(s, 'v'); break; + default: + sprintf(big_enough, "%03o", (unsigned)i); + St_cat_s(s, big_enough); + break; + } + } else { + if (strchr("\\\"", (char)i)) { + St_cat_c(s, '\\'); + } + St_cat_c(s, i); + if (strchr("!'-", (char)i)) { + St_cat_c(s, ESCAPE); + } + } + } + } + } + if (vec[i / CHAR_BIT] & 1u << i % CHAR_BIT) { + if (flag) { + if (vec[(i - 2u) / CHAR_BIT] & 1u << (i - 2u) % CHAR_BIT) { + St_cat_c(s, '-'); + } + flag = 0; + } + if (!isprint(i)) { + St_cat_c(s, '\\'); + switch (i) { + case '\a': St_cat_c(s, 'a'); break; + case '\b': St_cat_c(s, 'b'); break; + case '\f': St_cat_c(s, 'f'); break; + case '\n': St_cat_c(s, 'n'); break; + case '\r': St_cat_c(s, 'r'); break; + case '\t': St_cat_c(s, 't'); break; + case '\v': St_cat_c(s, 'v'); break; + default: + sprintf(big_enough, "%03o", (unsigned)i); + St_cat_s(s, big_enough); + break; + } + } else { + if (strchr("\\\"", (char)i)) { + St_cat_c(s, '\\'); + } + St_cat_c(s, i); + if (strchr("!'-", (char)i)) { + St_cat_c(s, ESCAPE); + } + } + } +} + +static void dumpclass(FILE *fp, const unsigned char vec[HAVE_C99(static CLASS_SIZE)]) { + String tmp; + St_init(&tmp); + decom_class(&tmp, vec); + ST_WRITE(&tmp, fp); + St_clear(&tmp); +} + +ATTR_CONST +static int cdefclass(const unsigned char *ptr) { + if (ptr == dc_word) { return 'w'; } + if (ptr == dc_alpha) { return 'q'; } + if (ptr == dc_cntrl) { return 'c'; } + if (ptr == dc_digit) { return 'd'; } + if (ptr == dc_lower) { return 'l'; } + if (ptr == dc_print) { return 'p'; } + if (ptr == dc_space) { return 's'; } + if (ptr == dc_upper) { return 'u'; } + if (ptr == dc_xdigit) { return 'x'; } + NOTREACHED; +} + +static void decom_liter(String *s, const re_node *node) { + size_t i; + assert(node->x.type == RE_LITERAL || node->x.type == RE_PARTIAL); + for (i = 0; i < node->literal.len; ++i) { + if (!isprint((unsigned char)node->literal.buf[i])) { + St_cat_c(s, '\\'); + switch (node->literal.buf[i]) { + case '\a': St_cat_c(s, 'a'); break; + case '\b': St_cat_c(s, 'b'); break; + case '\f': St_cat_c(s, 'f'); break; + case '\n': St_cat_c(s, 'n'); break; + case '\r': St_cat_c(s, 'r'); break; + case '\t': St_cat_c(s, 't'); break; + case '\v': St_cat_c(s, 'v'); break; + default: { + char big_enough[100]; + sprintf(big_enough, "%03o", (unsigned)(unsigned char)node->literal.buf[i]); + St_cat_s(s, big_enough); + break; + } + } + } else { + if (node->x.type == RE_LITERAL && strchr("\"\\", node->literal.buf[i])) { + St_cat_c(s, '\\'); + } + St_cat_c(s, node->literal.buf[i]); + if (MEATCHARP(node->literal.buf[i])) { + St_cat_c(s, ESCAPE); + } + } + } +} + +static void dumpliter(FILE *fp, const re_node *node) { + String tmp; + St_init(&tmp); + decom_liter(&tmp, node); + ST_WRITE(&tmp, fp); + St_clear(&tmp); +} + +static void do_indent(FILE *fp, size_t n) { + for (; n; --n) { + fputs(" ", fp); + } +} + +static void dumpnode(FILE *fp, const re_node *node, const re_node *end, const size_t indent) { + if (node == end) { + return; + } + + assert(node != NULL); + fprintf(fp, "[%p] ", (const void *)node); + do_indent(fp, indent); + + switch (node->x.type) { + case RE_ALTER: { + const re_node *const be = branchend(node->alter.arg, node->alter.next); + fputs("branch {\n", fp); + dumpnode(fp, node->alter.arg, be, indent + 1); + fprintf(fp, "[%p] ", (const void *)node); + do_indent(fp, indent); + fputs("} or {\n", fp); + dumpnode(fp, node->alter.next, be, indent + 1); + fprintf(fp, "[%p] ", (const void *)node); + do_indent(fp, indent); + fputs("} /branch\n", fp); + dumpnode(fp, be, end, indent); + return; + } + + case RE_ANYCH: + fputs("anychar\n", fp); + break; + + case RE_AT_BOL: + fputs("at beginning of line\n", fp); + break; + + case RE_AT_BOS: + fputs("at beginning of string\n", fp); + break; + + case RE_AT_EOL: + fputs("at end of line\n", fp); + break; + + case RE_AT_EOS: + fputs("at end of string\n", fp); + break; + + case RE_AT_MATCH: + fputs("at match {\n", fp); + dumpnode(fp, node->indep.arg, NULL, indent + 1); + fprintf(fp, "[%p] ", (const void *)node); + do_indent(fp, indent); + fputs("} /at match\n", fp); + break; + + case RE_AT_NOMATCH: + fputs("not at match {\n", fp); + dumpnode(fp, node->indep.arg, NULL, indent + 1); + fprintf(fp, "[%p] ", (const void *)node); + do_indent(fp, indent); + fputs("} /not at match\n", fp); + break; + + case RE_AT_NWBOUND: + fputs("not at word boundary\n", fp); + break; + + case RE_AT_WBOUND: + fputs("at word boundary\n", fp); + break; + + case RE_BACKCHECK: + fprintf(fp, "check capture (%lu)\n", (unsigned long)node->backref.n); + break; + + case RE_BACKREF: + fprintf(fp, "backref (%lu)\n", (unsigned long)node->backref.n); + break; + + case RE_BEGIN_CAPTURE: + fprintf(fp, "begin capture (%lu)\n", (unsigned long)node->backref.n); + break; + + case RE_CLASS: + fputs("character class '", fp); + dumpclass(fp, node->class.vec); + fputs("'\n", fp); + break; + + case RE_DEFCLASS: + fprintf(fp, "character class %c!\n", cdefclass(node->defclass.vec)); + break; + + case RE_END_CAPTURE: + fprintf(fp, "end capture (%lu)\n", (unsigned long)node->backref.n); + break; + + case RE_INDEP: + fputs("independent submatch {\n", fp); + dumpnode(fp, node->indep.arg, NULL, indent + 1); + fprintf(fp, "[%p] ", (const void *)node); + do_indent(fp, indent); + fputs("} /independent submatch\n", fp); + break; + + case RE_LITERAL: + fputs("literal \"", fp); + dumpliter(fp, node); + fputs("\"\n", fp); + break; + + case RE_N_CLASS: + fputs("negated character class '", fp); + dumpclass(fp, node->class.vec); + fputs("'\n", fp); + break; + + case RE_N_DEFCLASS: + fprintf(fp, "character class %c!\n", toupper(cdefclass(node->defclass.vec))); + break; + + case RE_PARTIAL: + fputs("abbreviation `", fp); + dumpliter(fp, node); + fputs("`\n", fp); + break; + + case RE_REP_FRUGAL: + fputs("repetition ", fp); + if (node->rep.min == node->rep.max) { + fprintf(fp, ":%lu:?", (unsigned long)node->rep.min); + } else if (node->rep.max != (size_t)-1) { + fprintf(fp, ":%lu,%lu:?", (unsigned long)node->rep.min, (unsigned long)node->rep.max); + } else { + switch (node->rep.min) { + case 0: fputs("*?", fp); break; + case 1: fputs("+?", fp); break; + default: fprintf(fp, ":%lu,:?", (unsigned long)node->rep.min); break; + } + } + fputs(" {\n", fp); + dumpnode(fp, node->rep.arg, node, indent + 1); + fprintf(fp, "[%p] ", (const void *)node); + do_indent(fp, indent); + fputs("} /repetition\n", fp); + break; + + case RE_REP_GREEDY: + fputs("repetition ", fp); + if (node->rep.min == node->rep.max) { + fprintf(fp, ":%lu:", (unsigned long)node->rep.min); + } else if (node->rep.max != (size_t)-1) { + fprintf(fp, ":%lu,%lu:", (unsigned long)node->rep.min, (unsigned long)node->rep.max); + } else { + switch (node->rep.min) { + case 0: fputs("*", fp); break; + case 1: fputs("+", fp); break; + default: fprintf(fp, ":%lu,:", (unsigned long)node->rep.min); break; + } + } + fputs(" {\n", fp); + dumpnode(fp, node->rep.arg, node, indent + 1); + fprintf(fp, "[%p] ", (const void *)node); + do_indent(fp, indent); + fputs("} /repetition\n", fp); + break; + + case RE_SELECT: { + const re_node *const be = branchend(node->select.arg, node->select.next); + fputs("if {\n", fp); + dumpnode(fp, node->select.cond, NULL, indent + 1); + fprintf(fp, "[%p] ", (const void *)node); + do_indent(fp, indent); + fputs("} then {\n", fp); + dumpnode(fp, node->select.arg, be, indent + 1); + if (node->select.next != be) { + fprintf(fp, "[%p] ", (const void *)node); + do_indent(fp, indent); + fputs("} else {\n", fp); + dumpnode(fp, node->select.next, be, indent + 1); + } + fprintf(fp, "[%p] ", (const void *)node); + do_indent(fp, indent); + fputs("} /if\n", fp); + dumpnode(fp, be, end, indent); + return; + } + + case RE_STUFF_FRUGAL: + fputs(".*?\n", fp); + break; + + case RE_STUFF_GREEDY: + fputs(".*\n", fp); + break; + + case RE_XREP_FRUGAL: + fputs("simple repetition ", fp); + if (node->rep.min == node->rep.max) { + fprintf(fp, ":%lu:?", (unsigned long)node->rep.min); + } else if (node->rep.max != (size_t)-1) { + fprintf(fp, ":%lu,%lu:?", (unsigned long)node->rep.min, (unsigned long)node->rep.max); + } else { + switch (node->rep.min) { + case 0: fputs("*?", fp); break; + case 1: fputs("+?", fp); break; + default: fprintf(fp, ":%lu,:?", (unsigned long)node->rep.min); break; + } + } + fputs(" {\n", fp); + dumpnode(fp, node->rep.arg, NULL, indent + 1); + fprintf(fp, "[%p] ", (const void *)node); + do_indent(fp, indent); + fputs("} /simple repetition\n", fp); + break; + + case RE_XREP_GREEDY: + fputs("simple repetition ", fp); + if (node->rep.min == node->rep.max) { + fprintf(fp, ":%lu:", (unsigned long)node->rep.min); + } else if (node->rep.max != (size_t)-1) { + fprintf(fp, ":%lu,%lu:", (unsigned long)node->rep.min, (unsigned long)node->rep.max); + } else { + switch (node->rep.min) { + case 0: fputs("*", fp); break; + case 1: fputs("+", fp); break; + default: fprintf(fp, ":%lu,:", (unsigned long)node->rep.min); break; + } + } + fputs(" {\n", fp); + dumpnode(fp, node->rep.arg, NULL, indent + 1); + fprintf(fp, "[%p] ", (const void *)node); + do_indent(fp, indent); + fputs("} /simple repetition\n", fp); + break; + } + dumpnode(fp, node->x.next, end, indent); +} + +void re_dump(const t_regex *re, FILE *fp) { + fprintf(fp, "RE: minlen=%lu, anchor=%s\n", (unsigned long)re->minlen, re->flags & FL_ANCHOR_START ? "start" : "none"); + dumpnode(fp, re->start, NULL, 0); +} + +static void cat_n(String *s, unsigned long n) { + char big_enough[100]; + sprintf(big_enough, "%lu", n); + St_cat_s(s, big_enough); +} + +static void decom_repeat(String *s, const re_node *node) { + assert( + node->x.type == RE_REP_FRUGAL || + node->x.type == RE_REP_GREEDY || + node->x.type == RE_XREP_FRUGAL || + node->x.type == RE_XREP_GREEDY + ); + + if (node->rep.min == 0 && node->rep.max == (size_t)-1) { + St_cat_c(s, '*'); + } else if (node->rep.min == 0 && node->rep.max == 1) { + St_cat_c(s, '?'); + } else if (node->rep.min == 1 && node->rep.max == (size_t)-1) { + St_cat_c(s, '+'); + } else if (node->rep.min == node->rep.max) { + St_cat_c(s, ':'); + cat_n(s, node->rep.min); + St_cat_c(s, ':'); + } else if (node->rep.max == (size_t)-1) { + St_cat_c(s, ':'); + cat_n(s, node->rep.min); + St_cat_m(s, ",:", 2); + } else { + St_cat_c(s, ':'); + cat_n(s, node->rep.min); + St_cat_c(s, ','); + cat_n(s, node->rep.max); + St_cat_c(s, ':'); + } + + if (node->rep.type == RE_REP_FRUGAL || node->rep.type == RE_XREP_FRUGAL) { + St_cat_c(s, '?'); + } +} + +static void decom_node(String *s, const re_node *node, const re_node *end) { + if (node == end) { + return; + } + + assert(node != NULL); + switch (node->x.type) { + case RE_ALTER: { + const re_node *const be = branchend(node->alter.arg, node->alter.next); + St_cat_c(s, '('); + decom_node(s, node->alter.arg, be); + St_cat_c(s, '|'); + decom_node(s, node->alter.next, be); + St_cat_c(s, ')'); + decom_node(s, be, end); + break; + } + + case RE_ANYCH: + St_cat_c(s, '.'); + decom_node(s, node->x.next, end); + break; + + case RE_AT_BOL: + St_cat_m(s, "A!", 2); + decom_node(s, node->x.next, end); + break; + + case RE_AT_BOS: + St_cat_c(s, '^'); + decom_node(s, node->x.next, end); + break; + + case RE_AT_EOL: + St_cat_m(s, "Z!", 2); + decom_node(s, node->x.next, end); + break; + + case RE_AT_EOS: + St_cat_c(s, '$'); + decom_node(s, node->x.next, end); + break; + + case RE_AT_MATCH: + St_cat_c(s, '['); + decom_node(s, node->indep.arg, NULL); + St_cat_m(s, "&]", 2); + decom_node(s, node->indep.next, end); + break; + + case RE_AT_NOMATCH: + St_cat_c(s, '['); + decom_node(s, node->indep.arg, NULL); + St_cat_m(s, "^]", 2); + decom_node(s, node->indep.next, end); + break; + + case RE_AT_NWBOUND: + St_cat_m(s, "B!", 2); + decom_node(s, node->x.next, end); + break; + + case RE_AT_WBOUND: + St_cat_m(s, "b!", 2); + decom_node(s, node->x.next, end); + break; + + case RE_BACKREF: + case RE_BACKCHECK: { + String tmp; + if (node->x.type == RE_BACKCHECK) { + St_cat_c(s, '['); + } + St_init(&tmp); +#if HAVE_VSNPRINTF_P + St_xprintf(&tmp, "%lu", (unsigned long)node->backref.n); +#else + St_num(&tmp, node->backref.n); +#endif + St_cat(s, &tmp); + St_clear(&tmp); + if (node->x.type == RE_BACKCHECK) { + St_cat_c(s, ']'); + } else { + St_cat_c(s, ESCAPE); + } + decom_node(s, node->backref.next, end); + break; + } + + case RE_BEGIN_CAPTURE: + St_cat_c(s, '{'); + decom_node(s, node->x.next, end); + break; + + case RE_CLASS: + St_cat_c(s, '\''); + decom_class(s, node->class.vec); + St_cat_c(s, '\''); + decom_node(s, node->class.next, end); + break; + + case RE_DEFCLASS: + St_cat_c(s, cdefclass(node->defclass.vec)); + St_cat_c(s, ESCAPE); + decom_node(s, node->defclass.next, end); + break; + + case RE_END_CAPTURE: + St_cat_c(s, '}'); + decom_node(s, node->x.next, end); + break; + + case RE_INDEP: + St_cat_c(s, '<'); + decom_node(s, node->indep.arg, NULL); + St_cat_c(s, '>'); + decom_node(s, node->indep.next, end); + break; + + case RE_LITERAL: + decom_liter(s, node); + decom_node(s, node->literal.next, end); + break; + + case RE_N_CLASS: + St_cat_m(s, "'^", 2); + decom_class(s, node->class.vec); + St_cat_c(s, '\''); + decom_node(s, node->class.next, end); + break; + + case RE_N_DEFCLASS: + St_cat_c(s, toupper(cdefclass(node->defclass.vec))); + St_cat_c(s, ESCAPE); + decom_node(s, node->defclass.next, end); + break; + + case RE_PARTIAL: + St_cat_c(s, '`'); + decom_liter(s, node); + St_cat_c(s, '`'); + break; + + case RE_REP_FRUGAL: + case RE_REP_GREEDY: + St_cat_c(s, '('); + decom_node(s, node->rep.arg, node); + St_cat_c(s, ')'); + decom_repeat(s, node); + decom_node(s, node->rep.next, end); + break; + + case RE_SELECT: { + const re_node *const be = branchend(node->select.arg, node->select.next); + St_cat_c(s, '['); + if (be != node->select.next) { + decom_node(s, node->select.next, be); + St_cat_c(s, '|'); + } + decom_node(s, node->select.arg, be); + St_cat_c(s, '('); + decom_node(s, node->select.cond, NULL); + St_cat_c(s, ')'); + St_cat_m(s, "?]", 2); + decom_node(s, be, end); + break; + } + + case RE_STUFF_FRUGAL: + St_cat_m(s, ".*?", 3); + decom_node(s, node->x.next, end); + break; + + case RE_STUFF_GREEDY: + St_cat_m(s, ".*", 2); + decom_node(s, node->x.next, end); + break; + + case RE_XREP_FRUGAL: + case RE_XREP_GREEDY: + St_cat_c(s, '('); + decom_node(s, node->rep.arg, NULL); + St_cat_c(s, ')'); + decom_repeat(s, node); + decom_node(s, node->rep.next, end); + break; + } +} + +void re_decompile(const t_regex *re, String *s) { + St_zero(s); + decom_node(s, re->start, NULL); +}