Mercurial > repo
view src/ploki/re.c @ 7072:53e47c85558e
<oerjan> learn optional.
author | HackBot |
---|---|
date | Thu, 03 Mar 2016 00:05:00 +0000 |
parents | ac0403686959 |
children |
line wrap: on
line source
#include "config.h" #include "IO.h" #include "Str.h" #include "hash.h" #include "main_io.h" #include "main_opt.h" #include "re.h" #include "xmalloc.h" #include "zz.h" #include <ctype.h> #include <limits.h> #include <stddef.h> #include <stdio.h> #include <string.h> #include <stdlib.h> #include <assert.h> enum {CLASS_SIZE = UCHAR_MAX / CHAR_BIT + 1u}; static unsigned char dc_word[CLASS_SIZE]; static unsigned char dc_alpha[CLASS_SIZE]; static unsigned char dc_cntrl[CLASS_SIZE]; static unsigned char dc_digit[CLASS_SIZE]; static unsigned char dc_lower[CLASS_SIZE]; static unsigned char dc_print[CLASS_SIZE]; static unsigned char dc_space[CLASS_SIZE]; static unsigned char dc_upper[CLASS_SIZE]; static unsigned char dc_xdigit[CLASS_SIZE]; static void fill(unsigned char *v, int (*pred)(int)) { unsigned char c; for (c = UCHAR_MAX; c; --c) { v[c / CHAR_BIT] |= !!pred(c) << c % CHAR_BIT; } v[0] |= !!pred(0); } static void nfill(unsigned char *v, int (*pred)(int)) { unsigned char c; for (c = UCHAR_MAX; c; --c) { v[c / CHAR_BIT] |= !pred(c) << c % CHAR_BIT; } v[0] |= !pred(0); } ATTR_CONST static int siword(int c) { return c == '_' || isalnum(c); } enum {MAGIC = 23}; static Hash rcache; static String cached[MAGIC]; static size_t cachfill, cachlast; static size_t hash(const void *s, size_t seed) { return St_hash(s, seed); } static int compar(const void *a, const void *b) { return St_cmp(a, b); } enum t_re_node { RE_ALTER, RE_ANYCH, RE_AT_BOL, RE_AT_BOS, RE_AT_EOL, RE_AT_EOS, RE_AT_MATCH, RE_AT_NOMATCH, RE_AT_NWBOUND, RE_AT_WBOUND, RE_BACKCHECK, RE_BACKREF, RE_BEGIN_CAPTURE, RE_CLASS, RE_DEFCLASS, RE_END_CAPTURE, RE_INDEP, RE_LITERAL, RE_N_CLASS, RE_N_DEFCLASS, RE_PARTIAL, RE_REP_FRUGAL, RE_REP_GREEDY, RE_SELECT, RE_STUFF_FRUGAL, RE_STUFF_GREEDY, RE_XREP_FRUGAL, RE_XREP_GREEDY }; #define CMN_HDR enum t_re_node type; union my_node *next typedef union my_node { struct { CMN_HDR; } x; struct { CMN_HDR; union my_node *arg; } alter; struct { CMN_HDR; size_t n; } backref; struct { CMN_HDR; size_t n; } capture; struct { CMN_HDR; unsigned char vec[CLASS_SIZE]; } class; struct { CMN_HDR; unsigned char *vec; } defclass; struct { CMN_HDR; union my_node *arg; } indep; struct { CMN_HDR; unsigned char *buf; size_t len; } literal; struct { CMN_HDR; union my_node *arg; size_t min, max; size_t n; } rep; struct { CMN_HDR; union my_node *arg; union my_node *cond; } select; } re_node; #include "re_block.c.h" enum re_flags { FL_NONE = 0, FL_ANCHOR_START = 1 }; struct cap_state { size_t start, end, pending; }; struct my_regex { re_node *start; size_t repets; size_t *repbuf; t_block alloc; size_t captures; struct cap_state *cap_ptr; size_t refs; enum re_flags flags; size_t minlen; }; static void delv(void *v) { re_free(v); } void re_init(void) { fill(dc_word, siword); fill(dc_alpha, isalpha); fill(dc_cntrl, iscntrl); fill(dc_digit, isdigit); fill(dc_lower, islower); fill(dc_print, isprint); fill(dc_space, isspace); fill(dc_upper, isupper); fill(dc_xdigit, isxdigit); h_init(&rcache, hash, compar, NULL, delv); } void re_end(void) { h_end(&rcache); while (cachfill) { --cachfill; St_clear(&cached[cachfill]); } cachlast = 0; } enum {ESCAPE = '!'}; #define MEATCHARP(c) strchr("!|({<[]>})*+?:.'^$`", c) #define ENDBRANCHP(c) strchr("|({<[", c) struct parse_context { t_regex *base; t_block *block; const String *s; size_t *p; }; static void skipspace(const String *const s, size_t *const p) { restart: while (isspace(ST_INDEX(s, *p))) { --*p; } if (ST_INDEX(s, *p) == ']' && ST_INDEX(s, *p - 1u) == '#') { *p -= 2u; while (ST_INDEX(s, *p) != EOF) { if (ST_INDEX(s, *p) == '[') { --*p; goto restart; } if (ST_INDEX(s, *p) == ESCAPE && ST_INDEX(s, *p - 1u) != EOF) { --*p; } --*p; } } } #define PRAZ(name) static re_node *name(const struct parse_context *con, re_node *next, re_node *curolan_zebeth) #define SNORK const String *const s = con->s; size_t *const p = con->p #define CALL(name, nextn, glob) name(con, nextn, glob) #define TAILC(name) return CALL(name, next, curolan_zebeth) PRAZ(quantifire); PRAZ(branch); PRAZ(alternation); PRAZ(atom) { SNORK; (void)curolan_zebeth; skipspace(s, p); switch (ST_INDEX(s, *p)) { case '(': case '<': case '{': case '[': case EOF: { re_node *cur = bl_node(con->block); cur->literal.type = RE_LITERAL; cur->literal.next = next; cur->literal.buf = NULL; cur->literal.len = 0; return cur; } case ESCAPE: { re_node *cur = bl_node(con->block); --*p; if (isdigit(ST_INDEX(s, *p))) { do { --*p; } while (isdigit(ST_INDEX(s, *p))); cur->backref.type = RE_BACKREF; cur->backref.next = next; cur->backref.n = strtoul(St_ptr(s) + *p + 1, NULL, 10); } else if (isalpha(ST_INDEX(s, *p))) { switch (ST_INDEX(s, *p)) { case 'A': cur->x.type = RE_AT_BOL; cur->x.next = next; --*p; break; case 'Z': cur->x.type = RE_AT_EOL; cur->x.next = next; --*p; break; case 'b': cur->x.type = RE_AT_WBOUND; cur->x.next = next; --*p; break; case 'B': cur->x.type = RE_AT_NWBOUND; cur->x.next = next; --*p; break; case 'c': cur->defclass.type = RE_DEFCLASS; cur->defclass.vec = dc_cntrl; cur->defclass.next = next; --*p; break; case 'C': cur->defclass.type = RE_N_DEFCLASS; cur->defclass.vec = dc_cntrl; cur->defclass.next = next; --*p; break; case 'd': cur->defclass.type = RE_DEFCLASS; cur->defclass.vec = dc_digit; cur->defclass.next = next; --*p; break; case 'D': cur->defclass.type = RE_N_DEFCLASS; cur->defclass.vec = dc_digit; cur->defclass.next = next; --*p; break; case 'l': cur->defclass.type = RE_DEFCLASS; cur->defclass.vec = dc_lower; cur->defclass.next = next; --*p; break; case 'L': cur->defclass.type = RE_N_DEFCLASS; cur->defclass.vec = dc_lower; cur->defclass.next = next; --*p; break; case 'p': cur->defclass.type = RE_DEFCLASS; cur->defclass.vec = dc_print; cur->defclass.next = next; --*p; break; case 'P': cur->defclass.type = RE_N_DEFCLASS; cur->defclass.vec = dc_print; cur->defclass.next = next; --*p; break; case 'q': cur->defclass.type = RE_DEFCLASS; cur->defclass.vec = dc_alpha; cur->defclass.next = next; --*p; break; case 'Q': cur->defclass.type = RE_N_DEFCLASS; cur->defclass.vec = dc_alpha; cur->defclass.next = next; --*p; break; case 's': cur->defclass.type = RE_DEFCLASS; cur->defclass.vec = dc_space; cur->defclass.next = next; --*p; break; case 'S': cur->defclass.type = RE_N_DEFCLASS; cur->defclass.vec = dc_space; cur->defclass.next = next; --*p; break; case 'u': cur->defclass.type = RE_DEFCLASS; cur->defclass.vec = dc_upper; cur->defclass.next = next; --*p; break; case 'U': cur->defclass.type = RE_N_DEFCLASS; cur->defclass.vec = dc_upper; cur->defclass.next = next; --*p; break; case 'w': cur->defclass.type = RE_DEFCLASS; cur->defclass.vec = dc_word; cur->defclass.next = next; --*p; break; case 'W': cur->defclass.type = RE_N_DEFCLASS; cur->defclass.vec = dc_word; cur->defclass.next = next; --*p; break; case 'x': cur->defclass.type = RE_DEFCLASS; cur->defclass.vec = dc_xdigit; cur->defclass.next = next; --*p; break; case 'X': cur->defclass.type = RE_N_DEFCLASS; cur->defclass.vec = dc_xdigit; cur->defclass.next = next; --*p; break; default: goto default_escape; } } else default_escape: { cur->literal.type = RE_LITERAL; cur->literal.next = next; cur->literal.buf = bl_string(con->block, cur->literal.len = 1); if (ST_INDEX(s, *p) == EOF) { cur->literal.buf[0] = ESCAPE; } else { cur->literal.buf[0] = ST_INDEX(s, *p); --*p; } } return cur; } case ')': { re_node *cur; --*p; cur = CALL(alternation, next, next); skipspace(s, p); if (ST_INDEX(s, *p) == '(') { --*p; } return cur; } case ']': { re_node *cur; switch (ST_INDEX(s, *p - 1u)) { case '&': *p -= 2u; cur = bl_node(con->block); cur->indep.type = RE_AT_MATCH; cur->indep.next = next; cur->indep.arg = CALL(alternation, NULL, NULL); break; case '^': *p -= 2u; cur = bl_node(con->block); cur->indep.type = RE_AT_NOMATCH; cur->indep.next = next; cur->indep.arg = CALL(alternation, NULL, NULL); break; case '?': *p -= 2u; cur = bl_node(con->block); cur->select.type = RE_SELECT; cur->select.cond = CALL(quantifire, NULL, NULL); cur->select.arg = CALL(branch, next, next); skipspace(s, p); if (ST_INDEX(s, *p) == '|') { --*p; cur->select.next = CALL(alternation, next, next); } else { cur->select.next = next; } break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { size_t i; for (i = *p - 2u; isdigit(ST_INDEX(s, i)); --i) ; if (ST_INDEX(s, i) != '[') { goto std_atom; } cur = bl_node(con->block); cur->backref.type = RE_BACKCHECK; cur->backref.next = next; cur->backref.n = strtoul(St_ptr(s) + i + 1u, NULL, 10); *p = i - 1u; return cur; } default: goto std_atom; } skipspace(s, p); if (ST_INDEX(s, *p) == '[') { --*p; } return cur; } case '>': { re_node *cur = bl_node(con->block); --*p; cur->indep.type = RE_INDEP; cur->indep.next = next; cur->indep.arg = CALL(alternation, NULL, NULL); skipspace(s, p); if (ST_INDEX(s, *p) == '<') { --*p; } return cur; } case '}': { re_node *begin = bl_node(con->block); re_node *end = bl_node(con->block); --*p; begin->capture.n = end->capture.n = con->base->captures++; end->capture.type = RE_END_CAPTURE; end->capture.next = next; begin->capture.type = RE_BEGIN_CAPTURE; begin->capture.next = CALL(alternation, end, end); skipspace(s, p); if (ST_INDEX(s, *p) == '{') { --*p; } return begin; } case '^': { re_node *cur = bl_node(con->block); cur->x.type = RE_AT_BOS; cur->x.next = next; --*p; return cur; } case '$': { re_node *cur = bl_node(con->block); cur->x.type = RE_AT_EOS; cur->x.next = next; --*p; return cur; } case '.': { re_node *cur = bl_node(con->block); cur->x.type = RE_ANYCH; cur->x.next = next; --*p; return cur; } case '`': { String tmp; int c; re_node *cur = bl_node(con->block); cur->literal.type = RE_PARTIAL; cur->literal.next = next; --*p; St_init(&tmp); while ( (c = ST_INDEX(s, *p)) != EOF && c != '`' ) { --*p; if (c == ESCAPE && ST_INDEX(s, *p) != EOF) { c = ST_INDEX(s, *p); --*p; } St_cat_c(&tmp, c); } if (c == '`') { --*p; } St_reverse(&tmp); cur->literal.buf = bl_string(con->block, cur->literal.len = St_len(&tmp)); memcpy(cur->literal.buf, St_ptr(&tmp), St_len(&tmp)); St_clear(&tmp); return cur; } case '\'': { int c; re_node *cur = bl_node(con->block); cur->class.type = RE_CLASS; cur->class.next = next; memset(cur->class.vec, '\0', sizeof cur->class.vec); for (--*p; (c = ST_INDEX(s, *p)) != EOF && c != '\''; --*p) { if (c == ESCAPE) { switch (c = ST_INDEX(s, *p - 1)) { case EOF: cur->class.vec[ESCAPE / CHAR_BIT] |= 1u << ESCAPE % CHAR_BIT; break; case 'c': fill(cur->class.vec, iscntrl); --*p; break; case 'C': nfill(cur->class.vec, iscntrl); --*p; break; case 'd': fill(cur->class.vec, isdigit); --*p; break; case 'D': nfill(cur->class.vec, isdigit); --*p; break; case 'l': fill(cur->class.vec, islower); --*p; break; case 'L': nfill(cur->class.vec, islower); --*p; break; case 'p': fill(cur->class.vec, isprint); --*p; break; case 'P': nfill(cur->class.vec, isprint); --*p; break; case 'q': fill(cur->class.vec, isalpha); --*p; break; case 'Q': nfill(cur->class.vec, isalpha); --*p; break; case 's': fill(cur->class.vec, isspace); --*p; break; case 'S': nfill(cur->class.vec, isspace); --*p; break; case 'u': fill(cur->class.vec, isupper); --*p; break; case 'U': nfill(cur->class.vec, isupper); --*p; break; case 'w': fill(cur->class.vec, siword); --*p; break; case 'W': nfill(cur->class.vec, siword); --*p; break; case 'x': fill(cur->class.vec, isxdigit); --*p; break; case 'X': nfill(cur->class.vec, isxdigit); --*p; break; default: cur->class.vec[c / CHAR_BIT] |= 1u << c % CHAR_BIT; --*p; break; } } else { int b; if (ST_INDEX(s, *p - 1) == '-' && (b = ST_INDEX(s, *p - 2)) != EOF) { if (b == ESCAPE && (b = ST_INDEX(s, *p - 3)) == EOF) { b = ESCAPE; } if (b > c) { goto normal_char; } for (; b < c; ++b) { cur->class.vec[b / CHAR_BIT] |= 1u << b % CHAR_BIT; } cur->class.vec[b / CHAR_BIT] |= 1u << b % CHAR_BIT; *p -= 2; if (ST_INDEX(s, *p) == ESCAPE && ST_INDEX(s, *p - 1) != EOF) { --*p; } } else if (ST_INDEX(s, *p) == ']' && *p >= 8 && ST_INDEX(s, *p - 1) == ':') { #define CXCLASS_CHECK(n, t, f, o, b) if (1) { \ if (memcmp(St_ptr(s) + *p - (n), (t), (n) - 1) != 0) { \ goto normal_char; \ } \ (b)(cur->class.vec, (f)); \ *p -= (o); \ break; \ } else (void)0 #define CCLASS_CHECK(t, f, o) CXCLASS_CHECK(sizeof (t), t, f, o, fill) #define CNCLASS_CHECK(t, f, o) CXCLASS_CHECK(sizeof (t), t, f, o, nfill) if (ST_INDEX(s, *p - 8) == '[' && ST_INDEX(s, *p - 7) == ':') { switch (ST_INDEX(s, *p - 6)) { case 'a': if (ST_INDEX(s, *p - 5) != 'l') { goto normal_char; } switch (ST_INDEX(s, *p - 4)) { case 'n': CCLASS_CHECK("um", isalnum, 8); case 'p': CCLASS_CHECK("ha", isalpha, 8); default: goto normal_char; } break; case 'c': CCLASS_CHECK("ntrl", iscntrl, 8); case 'd': CCLASS_CHECK("igit", isdigit, 8); case 'g': CCLASS_CHECK("raph", isgraph, 8); case 'l': CCLASS_CHECK("ower", islower, 8); case 'p': switch (ST_INDEX(s, *p - 5)) { case 'r': CCLASS_CHECK("int", isprint, 8); case 'u': CCLASS_CHECK("nct", ispunct, 8); default: goto normal_char; } break; case 's': CCLASS_CHECK("pace", isspace, 8); case 'u': CCLASS_CHECK("pper", isupper, 8); default: goto normal_char; } } else if (*p >= 9 && ST_INDEX(s, *p - 9) == '[' && ST_INDEX(s, *p - 8) == ':') { switch (ST_INDEX(s, *p - 7)) { case 'x': CCLASS_CHECK("digit", isxdigit, 9); case '^': switch (ST_INDEX(s, *p - 6)) { case 'a': if (ST_INDEX(s, *p - 5) != 'l') { goto normal_char; } switch (ST_INDEX(s, *p - 4)) { case 'n': CNCLASS_CHECK("um", isalnum, 9); case 'p': CNCLASS_CHECK("ha", isalpha, 9); default: goto normal_char; } break; case 'c': CNCLASS_CHECK("ntrl", iscntrl, 9); case 'd': CNCLASS_CHECK("igit", isdigit, 9); case 'g': CNCLASS_CHECK("raph", isgraph, 9); case 'l': CNCLASS_CHECK("ower", islower, 9); case 'p': switch (ST_INDEX(s, *p - 5)) { case 'r': CNCLASS_CHECK("int", isprint, 9); case 'u': CNCLASS_CHECK("nct", ispunct, 9); default: goto normal_char; } break; case 's': CNCLASS_CHECK("pace", isspace, 9); case 'u': CNCLASS_CHECK("pper", isupper, 9); default: goto normal_char; } break; default: goto normal_char; } } else if (*p >= 10 && memcmp(St_ptr(s) + *p - 10, "[:^xdigit", 9) == 0) { nfill(cur->class.vec, isxdigit); *p -= 10; } else { goto normal_char; } } else if (ST_INDEX(s, *p) == '^' && ((b = ST_INDEX(s, *p - 1) == '\'') || b == EOF)) { cur->class.type = RE_N_CLASS; } else normal_char: { cur->class.vec[c / CHAR_BIT] |= 1u << c % CHAR_BIT; } } } if (c == '\'') { --*p; } if (cur->class.type == RE_CLASS) { size_t i; for (i = 0; i < CLASS_SIZE; ++i) { if (cur->class.vec[i] != (unsigned char)~0u) { return cur; } } cur->x.type = RE_ANYCH; } else { size_t i; assert(cur->class.type == RE_N_CLASS); for (i = 0; i < CLASS_SIZE; ++i) { if (cur->class.vec[i] != 0) { return cur; } } cur->x.type = RE_ANYCH; } return cur; } std_atom: default: { re_node *cur = bl_node(con->block); cur->literal.type = RE_LITERAL; /*XXX?*/ if (next && next->x.type == RE_LITERAL) { if (next->literal.len) { cur->literal.buf = bl_string(con->block, cur->literal.len = 1u + next->literal.len); memcpy(cur->literal.buf + 1, next->literal.buf, next->literal.len); } cur->literal.next = next->literal.next; } else { cur->literal.next = next; cur->literal.buf = bl_string(con->block, cur->literal.len = 1); } cur->literal.buf[0] = ST_INDEX(s, *p); --*p; return cur; } } } PRAZ(literal) { SNORK; int c; String tmp; re_node *cur; skipspace(s, p); if ((ST_INDEX(s, *p) != ESCAPE || isalnum(ST_INDEX(s, *p - 1))) && MEATCHARP(ST_INDEX(s, *p))) { TAILC(atom); } cur = bl_node(con->block); cur->literal.type = RE_LITERAL; cur->literal.next = next; St_init(&tmp); while ( (c = ST_INDEX(s, *p)) != EOF && ( !MEATCHARP(c) || (c == ESCAPE && !isalnum(ST_INDEX(s, *p - 1))) ) ) { --*p; if (c == ESCAPE && ST_INDEX(s, *p) != EOF) { c = ST_INDEX(s, *p); --*p; } St_cat_c(&tmp, c); skipspace(s, p); } St_reverse(&tmp); if (next && next->x.type == RE_LITERAL) { if (next->literal.len) { St_cat_m(&tmp, next->literal.buf, next->literal.len); } cur->literal.next = next->literal.next; } cur->literal.buf = bl_string(con->block, cur->literal.len = St_len(&tmp)); memcpy(cur->literal.buf, St_ptr(&tmp), St_len(&tmp)); St_clear(&tmp); return cur; } #define RETURN_REP(lo, hi, grdy) \ do { \ re_node *cur = bl_node(con->block); \ cur->rep.type = (grdy); \ cur->rep.next = next; \ cur->rep.min = (lo); \ cur->rep.max = (hi); \ cur->rep.n = con->base->repets++; \ cur->rep.arg = CALL(atom, cur, NULL); \ return cur; \ } while (0) #define STD_REP_CASE(lo, hi, grdy) \ do { \ --*p; \ RETURN_REP(lo, hi, grdy); \ } while (0) #define CUSTOM_REP_CASE(off, grdy, mismatch) \ do { \ size_t from, to; \ to = *p - 1 - (off); \ while (isdigit(ST_INDEX(s, to))) { \ --to; \ } \ if (ST_INDEX(s, to) == ':') { \ *p = to - 1; \ to = ST_INDEX(s, to + 1) == ':' ? (size_t)-1 : strtoul(St_ptr(s) + to + 1, NULL, 10); \ from = to; \ } else { \ if (ST_INDEX(s, to) != ',') { \ mismatch; \ } \ from = to - 1; \ if (!isdigit(ST_INDEX(s, from))) { \ mismatch; \ } \ do { \ --from; \ } while (isdigit(ST_INDEX(s, from))); \ if (ST_INDEX(s, from) != ':') { \ mismatch; \ } \ *p = from - 1; \ from = strtoul(St_ptr(s) + from + 1, NULL, 10); \ to = ST_INDEX(s, to + 1) == ':' ? (size_t)-1 : strtoul(St_ptr(s) + to + 1, NULL, 10); \ } \ if (to == 1) { \ if (from == 1) { \ TAILC(atom); \ } \ if (from == 0) { \ re_node *cur = bl_node(con->block); \ cur->alter.type = RE_ALTER; \ if ((grdy) == RE_REP_GREEDY) { \ cur->alter.next = next; \ cur->alter.arg = CALL(atom, next, NULL); \ } else { \ cur->alter.arg = next; \ cur->alter.next = CALL(atom, next, NULL); \ } \ return cur; \ } \ } \ RETURN_REP(from, to, grdy); \ } while (0) PRAZ(quantifire) { SNORK; skipspace(s, p); switch (ST_INDEX(s, *p)) { case EOF: return next; case '?': switch (ST_INDEX(s, *p - 1)) { case ':': CUSTOM_REP_CASE(1, RE_REP_FRUGAL, goto hook); case '*': --*p; STD_REP_CASE(0, -1, RE_REP_FRUGAL); case '+': --*p; STD_REP_CASE(1, -1, RE_REP_FRUGAL); case '?': { re_node *cur = bl_node(con->block); *p -= 2; cur->alter.type = RE_ALTER; cur->alter.next = CALL(atom, next, NULL); cur->alter.arg = next; return cur; } hook: default: { re_node *cur = bl_node(con->block); --*p; cur->alter.type = RE_ALTER; cur->alter.next = next; cur->alter.arg = CALL(atom, next, NULL); return cur; } } case ':': CUSTOM_REP_CASE(0, RE_REP_GREEDY, goto noquanti); case '*': STD_REP_CASE(0, -1, RE_REP_GREEDY); case '+': STD_REP_CASE(1, -1, RE_REP_GREEDY); noquanti: default: TAILC(literal); } } PRAZ(branch) { SNORK; skipspace(s, p); while (ST_INDEX(s, *p) != EOF && !ENDBRANCHP(ST_INDEX(s, *p))) { next = CALL(quantifire, next, curolan_zebeth); skipspace(s, p); } return next; } PRAZ(alternation) { SNORK; re_node *cur; skipspace(s, p); if (ST_INDEX(s, *p) == EOF) { return next; } next = CALL(branch, next, curolan_zebeth); skipspace(s, p); if (ST_INDEX(s, *p) != '|') { return next; } --*p; cur = bl_node(con->block); cur->alter.type = RE_ALTER; cur->alter.next = next; cur->alter.arg = CALL(alternation, curolan_zebeth, curolan_zebeth); return cur; } ATTR_CONST static size_t minimum(const size_t a, const size_t b) { return a < b ? a : b; } ATTR_CONST static size_t maximum(const size_t a, const size_t b) { return a > b ? a : b; } ATTR_PURE static size_t subminlen(const re_node *node, const re_node *end, size_t start) { if (!node || node == end) { return start; } switch (node->x.type) { case RE_ALTER: return minimum( subminlen(node->alter.arg, end, start), subminlen(node->alter.next, end, start) ); case RE_ANYCH: case RE_CLASS: case RE_DEFCLASS: case RE_N_CLASS: case RE_N_DEFCLASS: return subminlen(node->x.next, end, start + 1u); case RE_AT_BOL: case RE_AT_BOS: case RE_AT_EOL: case RE_AT_EOS: case RE_AT_NWBOUND: case RE_AT_WBOUND: case RE_BACKCHECK: case RE_BACKREF: case RE_BEGIN_CAPTURE: case RE_END_CAPTURE: case RE_PARTIAL: case RE_STUFF_FRUGAL: case RE_STUFF_GREEDY: return subminlen(node->x.next, end, start); case RE_AT_MATCH: return maximum( subminlen(node->indep.arg, end, start), subminlen(node->indep.next, end, start) ); case RE_AT_NOMATCH: return subminlen(node->indep.next, end, start); case RE_INDEP: return subminlen(node->indep.next, end, start + subminlen(node->indep.arg, NULL, 0)); case RE_LITERAL: return subminlen(node->literal.next, end, start + node->literal.len); case RE_SELECT: return minimum( subminlen(node->select.cond, end, start) + subminlen(node->select.arg, end, start), subminlen(node->select.next, end, start) ); case RE_REP_FRUGAL: case RE_REP_GREEDY: if (node->rep.min == 0) { return subminlen(node->rep.next, end, start); } return subminlen(node->rep.next, end, start + node->rep.min * subminlen(node->rep.arg, node, 0)); case RE_XREP_FRUGAL: case RE_XREP_GREEDY: if (node->rep.min == 0) { return subminlen(node->rep.next, end, start); } return subminlen(node->rep.next, end, start + node->rep.min * subminlen(node->rep.arg, NULL, 0)); } NOTREACHED; } ATTR_PURE static size_t minlen(const re_node *node) { return subminlen(node, NULL, 0); } static void reinit(t_regex *re) { const struct cap_state null = { -1, -1, -1 }; size_t i; for (i = 0; i < re->captures; ++i) { re->cap_ptr[i] = null; } } static re_node **notcomplex(re_node **node, const re_node *end) { assert(*node != NULL); if (*node == end) { return node; } switch ((*node)->x.type) { case RE_ALTER: case RE_AT_MATCH: case RE_AT_NOMATCH: case RE_BACKCHECK: case RE_BACKREF: case RE_BEGIN_CAPTURE: case RE_END_CAPTURE: case RE_INDEP: case RE_PARTIAL: case RE_REP_FRUGAL: case RE_REP_GREEDY: case RE_SELECT: case RE_STUFF_FRUGAL: case RE_STUFF_GREEDY: return NULL; case RE_XREP_FRUGAL: case RE_XREP_GREEDY: if ((*node)->rep.min == (*node)->rep.max) { return notcomplex(&(*node)->rep.next, end); } return NULL; case RE_ANYCH: case RE_AT_BOL: case RE_AT_BOS: case RE_AT_EOL: case RE_AT_EOS: case RE_AT_NWBOUND: case RE_AT_WBOUND: case RE_CLASS: case RE_DEFCLASS: case RE_LITERAL: case RE_N_CLASS: case RE_N_DEFCLASS: return notcomplex(&(*node)->x.next, end); } NOTREACHED; } static void trysimpl(re_node *node) { assert(node->x.type == RE_REP_FRUGAL || node->x.type == RE_REP_GREEDY); { re_node **const tmp = notcomplex(&node->rep.arg, node); if (!tmp) { return; } *tmp = NULL; } switch (node->rep.type) { case RE_REP_FRUGAL: if ( node->rep.arg->x.type == RE_ANYCH && node->rep.arg->x.next == NULL && node->rep.max == (size_t)-1 ) { if (node->rep.min == 0) { node->x.type = RE_STUFF_FRUGAL; break; } if (node->rep.min == 1) { node->rep.arg->x.next = node->x.next; node->rep.arg->x.type = RE_STUFF_FRUGAL; node->x.next = node->rep.arg; node->x.type = RE_ANYCH; break; } } node->rep.n = minlen(node->rep.arg); node->rep.type = RE_XREP_FRUGAL; break; case RE_REP_GREEDY: if ( node->rep.arg->x.type == RE_ANYCH && node->rep.arg->x.next == NULL && node->rep.max == (size_t)-1 ) { if (node->rep.min == 0) { node->x.type = RE_STUFF_GREEDY; break; } if (node->rep.min == 1) { node->rep.arg->x.next = node->x.next; node->rep.arg->x.type = RE_STUFF_GREEDY; node->x.next = node->rep.arg; node->x.type = RE_ANYCH; break; } } node->rep.n = minlen(node->rep.arg); node->rep.type = RE_XREP_GREEDY; break; default: NOTREACHED; } } static void simplerep(re_node *node, const re_node *const end) { if (!node || node == end) { return; } switch (node->x.type) { case RE_ALTER: simplerep(node->alter.arg, end); simplerep(node->alter.next, end); return; case RE_AT_MATCH: case RE_AT_NOMATCH: case RE_INDEP: simplerep(node->indep.arg, end); simplerep(node->indep.next, end); return; case RE_ANYCH: case RE_AT_BOL: case RE_AT_BOS: case RE_AT_EOL: case RE_AT_EOS: case RE_AT_NWBOUND: case RE_AT_WBOUND: case RE_BACKCHECK: case RE_BACKREF: case RE_BEGIN_CAPTURE: case RE_CLASS: case RE_DEFCLASS: case RE_END_CAPTURE: case RE_LITERAL: case RE_N_CLASS: case RE_N_DEFCLASS: case RE_PARTIAL: simplerep(node->x.next, end); return; case RE_REP_FRUGAL: case RE_REP_GREEDY: simplerep(node->rep.next, end); simplerep(node->rep.arg, node); trysimpl(node); return; case RE_SELECT: simplerep(node->select.cond, end); simplerep(node->select.arg, end); simplerep(node->select.next, end); return; case RE_STUFF_FRUGAL: case RE_STUFF_GREEDY: case RE_XREP_FRUGAL: case RE_XREP_GREEDY: return; } NOTREACHED; } ATTR_PURE static int anchorbegp(const re_node *node) { return node && ( node->x.type == RE_AT_BOS || node->x.type == RE_STUFF_FRUGAL || node->x.type == RE_STUFF_GREEDY || ( node->x.type == RE_ALTER && anchorbegp(node->alter.arg) && anchorbegp(node->alter.next) ) || ( node->x.type == RE_INDEP && anchorbegp(node->indep.arg) ) || ( node->x.type == RE_AT_MATCH && ( anchorbegp(node->indep.arg) || anchorbegp(node->indep.next) ) ) || ( ( node->x.type == RE_AT_NOMATCH || node->x.type == RE_AT_BOL || node->x.type == RE_AT_EOL || node->x.type == RE_AT_EOS || node->x.type == RE_AT_NWBOUND || node->x.type == RE_AT_WBOUND || node->x.type == RE_BEGIN_CAPTURE || node->x.type == RE_BACKCHECK ) && anchorbegp(node->x.next) ) ) ; } static void dostuff(t_regex *re) { simplerep(re->start, NULL); if (!re->start || anchorbegp(re->start)) { re->flags |= FL_ANCHOR_START; } } t_regex *re_compile(const String *s) { size_t p; struct parse_context dcon; t_regex *re; void *pre; if (h_get(&rcache, s, &pre) == H_OK) { re = pre; ++re->refs; return re; } re = xmalloc(1, sizeof *re); re->flags = FL_NONE; re->refs = 0; re->repets = 0; bl_init(&re->alloc); re->cap_ptr = NULL; re->captures = 0; dcon.base = re; dcon.block = &re->alloc; dcon.s = s; p = St_len(s) - 1; dcon.p = &p; re->start = alternation(&dcon, NULL, NULL); if (re->captures) { re->cap_ptr = xmalloc(re->captures, sizeof *re->cap_ptr); reinit(re); } dostuff(re); re->minlen = minlen(re->start); re->repbuf = xmalloc(re->repets, sizeof *re->repbuf); if (cachfill < sizeof cached / sizeof cached[0]) { St_init(&cached[cachfill]); St_cpy(&cached[cachfill], s); ++re->refs; h_put(&rcache, &cached[cachfill], re, 1); ++cachfill; } else { h_del(&rcache, &cached[cachlast]); St_cpy(&cached[cachlast], s); ++re->refs; h_put(&rcache, &cached[cachlast], re, 1); cachlast = (cachlast + 1u) % (sizeof cached / sizeof cached[0]); } if (Opt.debug & DBG_REGEX) { io_write_s(Err, "Compiled "); re_dump(re, io_fp(Err)); } return re; } t_regex *re_dup(t_regex *re) { ++re->refs; return re; } void re_free(t_regex *re) { if (re->refs) { --re->refs; return; } bl_free(&re->alloc); xfree(re->repbuf); xfree(re->cap_ptr); xfree(re); } ATTR_PURE static int match_class(unsigned char c, const unsigned char *vec) { return vec[c / CHAR_BIT] & 1u << c % CHAR_BIT; } static void *mem_dup(const void *p, size_t n, size_t m) { void *q = xmalloc(n, m); memcpy(q, p, n * m); return q; } #define NO_MATCH ((size_t)-1) #define Mmatch0(Xmatch, Xindex) \ size_t tmp; \ \ if (!node) { \ return o; \ } \ \ switch (node->x.type) { \ case RE_ALTER: \ if ((tmp = Xmatch(base, node->alter.arg, s, o)) == NO_MATCH) { \ return Xmatch(base, node->alter.next, s, o); \ } \ return tmp; \ \ case RE_ANYCH: \ if (Xindex(s, o) != EOF) { \ return Xmatch(base, node->x.next, s, o + 1u); \ } \ return NO_MATCH; \ \ case RE_AT_BOL: \ if (o == 0 || Xindex(s, o - 1u) == '\n') { \ return Xmatch(base, node->x.next, s, o); \ } \ return NO_MATCH; \ \ case RE_AT_BOS: \ if (o == 0) { \ return Xmatch(base, node->x.next, s, o); \ } \ return NO_MATCH; \ \ case RE_AT_EOL: \ if ( \ Xindex(s, o) == '\n' || \ ( \ o && Xindex(s, o) == EOF && Xindex(s, o - 1u) != '\n' \ ) \ ) { \ return Xmatch(base, node->x.next, s, o); \ } \ return NO_MATCH; \ \ case RE_AT_EOS: \ if (Xindex(s, o) == EOF) { \ return Xmatch(base, node->x.next, s, o); \ } \ return NO_MATCH; \ \ case RE_AT_MATCH: { \ struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \ tmp = Xmatch(base, node->indep.arg, s, o); \ if (tmp != NO_MATCH) { \ tmp = Xmatch(base, node->indep.next, s, o); \ if (tmp == NO_MATCH) { \ memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \ } \ } \ xfree(prev); \ return tmp; \ } \ \ case RE_AT_NOMATCH: { \ struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \ tmp = Xmatch(base, node->indep.arg, s, o); \ if (tmp == NO_MATCH) { \ tmp = Xmatch(base, node->indep.next, s, o); \ } else { \ memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \ tmp = NO_MATCH; \ } \ xfree(prev); \ return tmp; \ } \ \ case RE_AT_NWBOUND: \ if (siword(Xindex(s, o - 1u)) == siword(Xindex(s, o))) { \ return Xmatch(base, node->x.next, s, o); \ } \ return NO_MATCH; \ \ case RE_AT_WBOUND: \ if (siword(Xindex(s, o - 1u)) != siword(Xindex(s, o))) { \ return Xmatch(base, node->x.next, s, o); \ } \ return NO_MATCH; \ \ case RE_BACKCHECK: \ if ( \ node->backref.n < base->captures && \ base->cap_ptr[node->backref.n].start + 1u && \ base->cap_ptr[node->backref.n].end + 1u && \ base->cap_ptr[node->backref.n].end >= base->cap_ptr[node->backref.n].start \ ) { \ return Xmatch(base, node->backref.next, s, o); \ } \ return NO_MATCH #define Mmatch1(Xmatch, Xindex) \ case RE_BEGIN_CAPTURE: { \ const size_t prev = base->cap_ptr[node->capture.n].pending; \ base->cap_ptr[node->capture.n].pending = o; \ if ((tmp = Xmatch(base, node->capture.next, s, o)) == NO_MATCH) { \ base->cap_ptr[node->capture.n].pending = prev; \ } \ return tmp; \ } \ \ case RE_CLASS: \ if ( \ Xindex(s, o) != EOF && \ match_class(Xindex(s, o), node->class.vec) \ ) { \ return Xmatch(base, node->class.next, s, o + 1u); \ } \ return NO_MATCH; \ \ case RE_DEFCLASS: \ if ( \ Xindex(s, o) != EOF && \ match_class(Xindex(s, o), node->defclass.vec) \ ) { \ return Xmatch(base, node->defclass.next, s, o + 1u); \ } \ return NO_MATCH; \ \ case RE_END_CAPTURE: { \ const size_t prevbeg = base->cap_ptr[node->capture.n].start; \ const size_t prevend = base->cap_ptr[node->capture.n].end; \ if (base->cap_ptr[node->capture.n].pending + 1u == 0u) { \ return NO_MATCH; \ } \ \ base->cap_ptr[node->capture.n].end = o; \ base->cap_ptr[node->capture.n].start = base->cap_ptr[node->capture.n].pending; \ if ((tmp = Xmatch(base, node->capture.next, s, o)) == NO_MATCH) { \ base->cap_ptr[node->capture.n].end = prevend; \ base->cap_ptr[node->capture.n].start = prevbeg; \ } \ return tmp; \ } \ \ case RE_INDEP: { \ struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \ if ((tmp = Xmatch(base, node->indep.arg, s, o)) != NO_MATCH) { \ tmp = Xmatch(base, node->indep.next, s, tmp); \ if (tmp == NO_MATCH) { \ memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \ } \ } \ xfree(prev); \ return tmp; \ } \ \ case RE_N_CLASS: \ if ( \ Xindex(s, o) != EOF && \ !match_class(Xindex(s, o), node->class.vec) \ ) { \ return Xmatch(base, node->class.next, s, o + 1u); \ } \ return NO_MATCH; \ \ case RE_N_DEFCLASS: \ if ( \ Xindex(s, o) != EOF && \ !match_class(Xindex(s, o), node->defclass.vec) \ ) { \ return Xmatch(base, node->defclass.next, s, o + 1u); \ } \ return NO_MATCH; \ \ case RE_PARTIAL: { \ size_t n; \ for (n = 0; n < node->literal.len; ++n) { \ if (node->literal.buf[n] != Xindex(s, o + n)) { \ break; \ } \ } \ for (; n; --n) { \ if ((tmp = Xmatch(base, node->literal.next, s, o + n)) != NO_MATCH) { \ return tmp; \ } \ } \ return Xmatch(base, node->literal.next, s, o); \ } \ \ case RE_SELECT: { \ struct cap_state *prev = mem_dup(base->cap_ptr, base->captures, sizeof *base->cap_ptr); \ if ((tmp = Xmatch(base, node->select.cond, s, o)) != NO_MATCH) { \ tmp = Xmatch(base, node->select.arg, s, tmp); \ if (tmp == NO_MATCH) { \ memcpy(base->cap_ptr, prev, base->captures * sizeof *base->cap_ptr); \ } \ } else { \ tmp = Xmatch(base, node->select.next, s, o); \ } \ xfree(prev); \ return tmp; \ } \ \ case RE_REP_FRUGAL: \ if (base->repbuf[node->rep.n] >= node->rep.max) { \ const size_t n = base->repbuf[node->rep.n]; \ \ base->repbuf[node->rep.n] = 0; \ tmp = Xmatch(base, node->rep.next, s, o); \ base->repbuf[node->rep.n] = n; \ return tmp; \ } else if (base->repbuf[node->rep.n] >= node->rep.min) { \ const size_t n = base->repbuf[node->rep.n]; \ \ base->repbuf[node->rep.n] = 0; \ if ((tmp = Xmatch(base, node->rep.next, s, o)) != NO_MATCH) { \ base->repbuf[node->rep.n] = n; \ return tmp; \ } \ base->repbuf[node->rep.n] = n + 1u; \ tmp = Xmatch(base, node->rep.arg, s, o); \ --base->repbuf[node->rep.n]; \ return tmp; \ } else { \ ++base->repbuf[node->rep.n]; \ tmp = Xmatch(base, node->rep.arg, s, o); \ --base->repbuf[node->rep.n]; \ return tmp; \ } \ \ case RE_REP_GREEDY: \ if (base->repbuf[node->rep.n] >= node->rep.max) { \ const size_t n = base->repbuf[node->rep.n]; \ \ base->repbuf[node->rep.n] = 0; \ tmp = Xmatch(base, node->rep.next, s, o); \ base->repbuf[node->rep.n] = n; \ return tmp; \ } else if (base->repbuf[node->rep.n] >= node->rep.min) { \ const size_t n = base->repbuf[node->rep.n]; \ \ ++base->repbuf[node->rep.n]; \ if ((tmp = Xmatch(base, node->rep.arg, s, o)) != NO_MATCH) { \ --base->repbuf[node->rep.n]; \ return tmp; \ } \ base->repbuf[node->rep.n] = 0; \ tmp = Xmatch(base, node->rep.next, s, o); \ base->repbuf[node->rep.n] = n; \ return tmp; \ } else { \ ++base->repbuf[node->rep.n]; \ tmp = Xmatch(base, node->rep.arg, s, o); \ --base->repbuf[node->rep.n]; \ return tmp; \ } \ \ case RE_XREP_FRUGAL: { \ size_t n; \ \ for (n = 0; n < node->rep.min; ++n) { \ if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \ return NO_MATCH; \ } \ o = tmp; \ } \ \ for (; n < node->rep.max; ++n) { \ if ((tmp = Xmatch(base, node->rep.next, s, o)) != NO_MATCH) { \ return tmp; \ } \ if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \ return NO_MATCH; \ } \ o = tmp; \ } \ return Xmatch(base, node->rep.next, s, o); \ } \ \ case RE_XREP_GREEDY: { \ size_t n; \ \ for (n = 0; n < node->rep.min; ++n) { \ if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \ return NO_MATCH; \ } \ o = tmp; \ } \ \ for (; n < node->rep.max; ++n) { \ if ((tmp = Xmatch(base, node->rep.arg, s, o)) == NO_MATCH) { \ break; \ } \ o = tmp; \ } \ \ for (; n > node->rep.min; --n) { \ if ((tmp = Xmatch(base, node->rep.next, s, o)) != NO_MATCH) { \ return tmp; \ } \ o -= node->rep.n; \ } \ \ return Xmatch(base, node->rep.next, s, o); \ } \ } \ \ NOTREACHED static size_t match(const t_regex *base, const re_node *node, const String *s, size_t o) { Mmatch0(match, ST_INDEX); case RE_BACKREF: if ( node->backref.n < base->captures && base->cap_ptr[node->backref.n].start + 1u && base->cap_ptr[node->backref.n].end + 1u && base->cap_ptr[node->backref.n].end >= base->cap_ptr[node->backref.n].start && ( base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start <= St_len(s) - o && memcmp( St_ptr(s) + base->cap_ptr[node->backref.n].start, St_ptr(s) + o, base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start ) == 0 ) ) { return match(base, node->backref.next, s, o + base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start); } return NO_MATCH; case RE_LITERAL: if ( node->literal.len == 0 || ( node->literal.len <= St_len(s) - o && memcmp(node->literal.buf, St_ptr(s) + o, node->literal.len) == 0 ) ) { return match(base, node->literal.next, s, o + node->literal.len); } return NO_MATCH; case RE_STUFF_FRUGAL: if (!node->x.next) { return o; } if (node->x.next->x.type == RE_LITERAL && node->x.next->literal.len) { for ( o = St_stro_m(s, o, node->x.next->literal.buf, node->x.next->literal.len); o + 1u; o = St_stro_m(s, o + 1u, node->x.next->literal.buf, node->x.next->literal.len) ) { if ((tmp = match(base, node->x.next->literal.next, s, o + node->x.next->literal.len)) != NO_MATCH) { return tmp; } } return NO_MATCH; } for (; o <= St_len(s); ++o) { if ((tmp = match(base, node->x.next, s, o)) != NO_MATCH) { return tmp; } } return NO_MATCH; case RE_STUFF_GREEDY: { size_t n; if (!node->x.next) { return St_len(s); } if (node->x.next->x.type == RE_LITERAL && node->x.next->literal.len) { for ( n = St_rstr_m(s, node->x.next->literal.buf, node->x.next->literal.len); n > o && n + 1u; n = St_rstro_m(s, n - 1u, node->x.next->literal.buf, node->x.next->literal.len) ) { if ((tmp = match(base, node->x.next->literal.next, s, n + node->x.next->literal.len)) != NO_MATCH) { return tmp; } if (!n) { break; } } if (n == o) { return match(base, node->x.next->literal.next, s, n + node->x.next->literal.len); } return NO_MATCH; } for (n = St_len(s); n > o; --n) { if ((tmp = match(base, node->x.next, s, n)) != NO_MATCH) { return tmp; } } return match(base, node->x.next, s, o); } Mmatch1(match, ST_INDEX); } static size_t iomatch(const t_regex *base, const re_node *node, IO *s, size_t o) { Mmatch0(iomatch, io_peek); case RE_BACKREF: if ( node->backref.n < base->captures && base->cap_ptr[node->backref.n].start + 1u && base->cap_ptr[node->backref.n].end + 1u && base->cap_ptr[node->backref.n].end >= base->cap_ptr[node->backref.n].start && ( io_xcmp( s, base->cap_ptr[node->backref.n].start, o, base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start ) == 0 ) ) { return iomatch(base, node->backref.next, s, o + base->cap_ptr[node->backref.n].end - base->cap_ptr[node->backref.n].start); } return NO_MATCH; case RE_LITERAL: if ( node->literal.len == 0 || ( io_cmppeek(s, o, node->literal.buf, node->literal.len) == 0 ) ) { return iomatch(base, node->literal.next, s, o + node->literal.len); } return NO_MATCH; case RE_STUFF_FRUGAL: if (!node->x.next) { return o; } for (; io_peek(s, o) != EOF; ++o) { if ((tmp = iomatch(base, node->x.next, s, o)) != NO_MATCH) { return tmp; } } return iomatch(base, node->x.next, s, o); case RE_STUFF_GREEDY: { size_t n; for (n = o; io_peek(s, n) != EOF; ++n) ; if (!node->x.next) { return n; } for (; n > o; --n) { if ((tmp = iomatch(base, node->x.next, s, n)) != NO_MATCH) { return tmp; } } return iomatch(base, node->x.next, s, o); } Mmatch1(iomatch, io_peek); } #define DEFRBUF ((void)0) #define MRETURN(x) return (x) #define INITRBUF(re) memset((re)->repbuf, '\0', (re)->repets * sizeof *(re)->repbuf) int re_iomatch(t_regex *re, IO *io, size_t *ms, size_t *me) { size_t i; DEFRBUF; INITRBUF(re); assert(re != NULL); assert(io != NULL); assert(io_bufred(io)); assert(ms != NULL); assert(me != NULL); reinit(re); if (re->flags & FL_ANCHOR_START) { if ((*me = iomatch(re, re->start, io, 0)) == NO_MATCH) { MRETURN(0); } *ms = 0; MRETURN(1); } for (i = 0; io_peek(io, i) != EOF; ++i) { if ((*me = iomatch(re, re->start, io, i)) != NO_MATCH) { *ms = i; MRETURN(1); } } if ((*me = iomatch(re, re->start, io, i)) != NO_MATCH) { *ms = i; MRETURN(1); } MRETURN(0); } int re_match(t_regex *re, const String *s, size_t *ms, size_t *me) { size_t i; DEFRBUF; INITRBUF(re); assert(re != NULL); assert(s != NULL); assert(ms != NULL); assert(me != NULL); if (re->minlen > St_len(s)) { MRETURN(0); } reinit(re); if (re->flags & FL_ANCHOR_START) { if ((*me = match(re, re->start, s, 0)) == NO_MATCH) { MRETURN(0); } *ms = 0; MRETURN(1); } if (re->start->x.type == RE_LITERAL && re->start->literal.len) { for ( i = St_str_m(s, re->start->literal.buf, re->start->literal.len); i + 1u; i = St_stro_m(s, i + 1u, re->start->literal.buf, re->start->literal.len) ) { if ((*me = match(re, re->start->literal.next, s, i + re->start->literal.len)) != NO_MATCH) { *ms = i; MRETURN(1); } } MRETURN(0); } for (i = 0; i <= St_len(s) - re->minlen; ++i) { if ((*me = match(re, re->start, s, i)) != NO_MATCH) { *ms = i; MRETURN(1); } } MRETURN(0); } size_t re_cabra(const t_regex *re) { return re->captures; } int re_backref(const t_regex *re, size_t i, size_t *a, size_t *z) { if (i >= re->captures) { return -1; } if (!(re->cap_ptr[i].start + 1u && re->cap_ptr[i].end + 1u)) { return 1; } *a = re->cap_ptr[i].start; *z = re->cap_ptr[i].end; return 0; } ATTR_PURE static const re_node *branchend(const re_node *a, const re_node *b) { for (; a; a = a->x.next) { const re_node *q = b; for (; q; q = q->x.next) { if (a == q) { return q; } } } return NULL; } static void decom_class(String *s, const unsigned char vec[HAVE_C99(static CLASS_SIZE)]) { unsigned char i; int flag = 0; char big_enough[100]; for (i = 0; i < UCHAR_MAX; ++i) { if (vec[i / CHAR_BIT] & 1u << i % CHAR_BIT) { if (vec[(i + 1u) / CHAR_BIT] & 1u << (i + 1u) % CHAR_BIT) { if (!flag) { flag = 1; goto mess; } flag = 1; } else { if (flag) { if (i > 1u && vec[(i - 2u) / CHAR_BIT] & 1u << (i - 2u) % CHAR_BIT) { St_cat_c(s, '-'); } flag = 0; } mess: if (!isprint(i)) { St_cat_c(s, '\\'); switch (i) { case '\a': St_cat_c(s, 'a'); break; case '\b': St_cat_c(s, 'b'); break; case '\f': St_cat_c(s, 'f'); break; case '\n': St_cat_c(s, 'n'); break; case '\r': St_cat_c(s, 'r'); break; case '\t': St_cat_c(s, 't'); break; case '\v': St_cat_c(s, 'v'); break; default: sprintf(big_enough, "%03o", (unsigned)i); St_cat_s(s, big_enough); break; } } else { if (strchr("\\\"", (char)i)) { St_cat_c(s, '\\'); } St_cat_c(s, i); if (strchr("!'-", (char)i)) { St_cat_c(s, ESCAPE); } } } } } if (vec[i / CHAR_BIT] & 1u << i % CHAR_BIT) { if (flag) { if (vec[(i - 2u) / CHAR_BIT] & 1u << (i - 2u) % CHAR_BIT) { St_cat_c(s, '-'); } flag = 0; } if (!isprint(i)) { St_cat_c(s, '\\'); switch (i) { case '\a': St_cat_c(s, 'a'); break; case '\b': St_cat_c(s, 'b'); break; case '\f': St_cat_c(s, 'f'); break; case '\n': St_cat_c(s, 'n'); break; case '\r': St_cat_c(s, 'r'); break; case '\t': St_cat_c(s, 't'); break; case '\v': St_cat_c(s, 'v'); break; default: sprintf(big_enough, "%03o", (unsigned)i); St_cat_s(s, big_enough); break; } } else { if (strchr("\\\"", (char)i)) { St_cat_c(s, '\\'); } St_cat_c(s, i); if (strchr("!'-", (char)i)) { St_cat_c(s, ESCAPE); } } } } static void dumpclass(FILE *fp, const unsigned char vec[HAVE_C99(static CLASS_SIZE)]) { String tmp; St_init(&tmp); decom_class(&tmp, vec); ST_WRITE(&tmp, fp); St_clear(&tmp); } ATTR_CONST static int cdefclass(const unsigned char *ptr) { if (ptr == dc_word) { return 'w'; } if (ptr == dc_alpha) { return 'q'; } if (ptr == dc_cntrl) { return 'c'; } if (ptr == dc_digit) { return 'd'; } if (ptr == dc_lower) { return 'l'; } if (ptr == dc_print) { return 'p'; } if (ptr == dc_space) { return 's'; } if (ptr == dc_upper) { return 'u'; } if (ptr == dc_xdigit) { return 'x'; } NOTREACHED; } static void decom_liter(String *s, const re_node *node) { size_t i; assert(node->x.type == RE_LITERAL || node->x.type == RE_PARTIAL); for (i = 0; i < node->literal.len; ++i) { if (!isprint((unsigned char)node->literal.buf[i])) { St_cat_c(s, '\\'); switch (node->literal.buf[i]) { case '\a': St_cat_c(s, 'a'); break; case '\b': St_cat_c(s, 'b'); break; case '\f': St_cat_c(s, 'f'); break; case '\n': St_cat_c(s, 'n'); break; case '\r': St_cat_c(s, 'r'); break; case '\t': St_cat_c(s, 't'); break; case '\v': St_cat_c(s, 'v'); break; default: { char big_enough[100]; sprintf(big_enough, "%03o", (unsigned)(unsigned char)node->literal.buf[i]); St_cat_s(s, big_enough); break; } } } else { if (node->x.type == RE_LITERAL && strchr("\"\\", node->literal.buf[i])) { St_cat_c(s, '\\'); } St_cat_c(s, node->literal.buf[i]); if (MEATCHARP(node->literal.buf[i])) { St_cat_c(s, ESCAPE); } } } } static void dumpliter(FILE *fp, const re_node *node) { String tmp; St_init(&tmp); decom_liter(&tmp, node); ST_WRITE(&tmp, fp); St_clear(&tmp); } static void do_indent(FILE *fp, size_t n) { for (; n; --n) { fputs(" ", fp); } } static void dumpnode(FILE *fp, const re_node *node, const re_node *end, const size_t indent) { if (node == end) { return; } assert(node != NULL); fprintf(fp, "[%p] ", (const void *)node); do_indent(fp, indent); switch (node->x.type) { case RE_ALTER: { const re_node *const be = branchend(node->alter.arg, node->alter.next); fputs("branch {\n", fp); dumpnode(fp, node->alter.arg, be, indent + 1); fprintf(fp, "[%p] ", (const void *)node); do_indent(fp, indent); fputs("} or {\n", fp); dumpnode(fp, node->alter.next, be, indent + 1); fprintf(fp, "[%p] ", (const void *)node); do_indent(fp, indent); fputs("} /branch\n", fp); dumpnode(fp, be, end, indent); return; } case RE_ANYCH: fputs("anychar\n", fp); break; case RE_AT_BOL: fputs("at beginning of line\n", fp); break; case RE_AT_BOS: fputs("at beginning of string\n", fp); break; case RE_AT_EOL: fputs("at end of line\n", fp); break; case RE_AT_EOS: fputs("at end of string\n", fp); break; case RE_AT_MATCH: fputs("at match {\n", fp); dumpnode(fp, node->indep.arg, NULL, indent + 1); fprintf(fp, "[%p] ", (const void *)node); do_indent(fp, indent); fputs("} /at match\n", fp); break; case RE_AT_NOMATCH: fputs("not at match {\n", fp); dumpnode(fp, node->indep.arg, NULL, indent + 1); fprintf(fp, "[%p] ", (const void *)node); do_indent(fp, indent); fputs("} /not at match\n", fp); break; case RE_AT_NWBOUND: fputs("not at word boundary\n", fp); break; case RE_AT_WBOUND: fputs("at word boundary\n", fp); break; case RE_BACKCHECK: fprintf(fp, "check capture (%lu)\n", (unsigned long)node->backref.n); break; case RE_BACKREF: fprintf(fp, "backref (%lu)\n", (unsigned long)node->backref.n); break; case RE_BEGIN_CAPTURE: fprintf(fp, "begin capture (%lu)\n", (unsigned long)node->backref.n); break; case RE_CLASS: fputs("character class '", fp); dumpclass(fp, node->class.vec); fputs("'\n", fp); break; case RE_DEFCLASS: fprintf(fp, "character class %c!\n", cdefclass(node->defclass.vec)); break; case RE_END_CAPTURE: fprintf(fp, "end capture (%lu)\n", (unsigned long)node->backref.n); break; case RE_INDEP: fputs("independent submatch {\n", fp); dumpnode(fp, node->indep.arg, NULL, indent + 1); fprintf(fp, "[%p] ", (const void *)node); do_indent(fp, indent); fputs("} /independent submatch\n", fp); break; case RE_LITERAL: fputs("literal \"", fp); dumpliter(fp, node); fputs("\"\n", fp); break; case RE_N_CLASS: fputs("negated character class '", fp); dumpclass(fp, node->class.vec); fputs("'\n", fp); break; case RE_N_DEFCLASS: fprintf(fp, "character class %c!\n", toupper(cdefclass(node->defclass.vec))); break; case RE_PARTIAL: fputs("abbreviation `", fp); dumpliter(fp, node); fputs("`\n", fp); break; case RE_REP_FRUGAL: fputs("repetition ", fp); if (node->rep.min == node->rep.max) { fprintf(fp, ":%lu:?", (unsigned long)node->rep.min); } else if (node->rep.max != (size_t)-1) { fprintf(fp, ":%lu,%lu:?", (unsigned long)node->rep.min, (unsigned long)node->rep.max); } else { switch (node->rep.min) { case 0: fputs("*?", fp); break; case 1: fputs("+?", fp); break; default: fprintf(fp, ":%lu,:?", (unsigned long)node->rep.min); break; } } fputs(" {\n", fp); dumpnode(fp, node->rep.arg, node, indent + 1); fprintf(fp, "[%p] ", (const void *)node); do_indent(fp, indent); fputs("} /repetition\n", fp); break; case RE_REP_GREEDY: fputs("repetition ", fp); if (node->rep.min == node->rep.max) { fprintf(fp, ":%lu:", (unsigned long)node->rep.min); } else if (node->rep.max != (size_t)-1) { fprintf(fp, ":%lu,%lu:", (unsigned long)node->rep.min, (unsigned long)node->rep.max); } else { switch (node->rep.min) { case 0: fputs("*", fp); break; case 1: fputs("+", fp); break; default: fprintf(fp, ":%lu,:", (unsigned long)node->rep.min); break; } } fputs(" {\n", fp); dumpnode(fp, node->rep.arg, node, indent + 1); fprintf(fp, "[%p] ", (const void *)node); do_indent(fp, indent); fputs("} /repetition\n", fp); break; case RE_SELECT: { const re_node *const be = branchend(node->select.arg, node->select.next); fputs("if {\n", fp); dumpnode(fp, node->select.cond, NULL, indent + 1); fprintf(fp, "[%p] ", (const void *)node); do_indent(fp, indent); fputs("} then {\n", fp); dumpnode(fp, node->select.arg, be, indent + 1); if (node->select.next != be) { fprintf(fp, "[%p] ", (const void *)node); do_indent(fp, indent); fputs("} else {\n", fp); dumpnode(fp, node->select.next, be, indent + 1); } fprintf(fp, "[%p] ", (const void *)node); do_indent(fp, indent); fputs("} /if\n", fp); dumpnode(fp, be, end, indent); return; } case RE_STUFF_FRUGAL: fputs(".*?\n", fp); break; case RE_STUFF_GREEDY: fputs(".*\n", fp); break; case RE_XREP_FRUGAL: fputs("simple repetition ", fp); if (node->rep.min == node->rep.max) { fprintf(fp, ":%lu:?", (unsigned long)node->rep.min); } else if (node->rep.max != (size_t)-1) { fprintf(fp, ":%lu,%lu:?", (unsigned long)node->rep.min, (unsigned long)node->rep.max); } else { switch (node->rep.min) { case 0: fputs("*?", fp); break; case 1: fputs("+?", fp); break; default: fprintf(fp, ":%lu,:?", (unsigned long)node->rep.min); break; } } fputs(" {\n", fp); dumpnode(fp, node->rep.arg, NULL, indent + 1); fprintf(fp, "[%p] ", (const void *)node); do_indent(fp, indent); fputs("} /simple repetition\n", fp); break; case RE_XREP_GREEDY: fputs("simple repetition ", fp); if (node->rep.min == node->rep.max) { fprintf(fp, ":%lu:", (unsigned long)node->rep.min); } else if (node->rep.max != (size_t)-1) { fprintf(fp, ":%lu,%lu:", (unsigned long)node->rep.min, (unsigned long)node->rep.max); } else { switch (node->rep.min) { case 0: fputs("*", fp); break; case 1: fputs("+", fp); break; default: fprintf(fp, ":%lu,:", (unsigned long)node->rep.min); break; } } fputs(" {\n", fp); dumpnode(fp, node->rep.arg, NULL, indent + 1); fprintf(fp, "[%p] ", (const void *)node); do_indent(fp, indent); fputs("} /simple repetition\n", fp); break; } dumpnode(fp, node->x.next, end, indent); } void re_dump(const t_regex *re, FILE *fp) { fprintf(fp, "RE: minlen=%lu, anchor=%s\n", (unsigned long)re->minlen, re->flags & FL_ANCHOR_START ? "start" : "none"); dumpnode(fp, re->start, NULL, 0); } static void cat_n(String *s, unsigned long n) { char big_enough[100]; sprintf(big_enough, "%lu", n); St_cat_s(s, big_enough); } static void decom_repeat(String *s, const re_node *node) { assert( node->x.type == RE_REP_FRUGAL || node->x.type == RE_REP_GREEDY || node->x.type == RE_XREP_FRUGAL || node->x.type == RE_XREP_GREEDY ); if (node->rep.min == 0 && node->rep.max == (size_t)-1) { St_cat_c(s, '*'); } else if (node->rep.min == 0 && node->rep.max == 1) { St_cat_c(s, '?'); } else if (node->rep.min == 1 && node->rep.max == (size_t)-1) { St_cat_c(s, '+'); } else if (node->rep.min == node->rep.max) { St_cat_c(s, ':'); cat_n(s, node->rep.min); St_cat_c(s, ':'); } else if (node->rep.max == (size_t)-1) { St_cat_c(s, ':'); cat_n(s, node->rep.min); St_cat_m(s, ",:", 2); } else { St_cat_c(s, ':'); cat_n(s, node->rep.min); St_cat_c(s, ','); cat_n(s, node->rep.max); St_cat_c(s, ':'); } if (node->rep.type == RE_REP_FRUGAL || node->rep.type == RE_XREP_FRUGAL) { St_cat_c(s, '?'); } } static void decom_node(String *s, const re_node *node, const re_node *end) { if (node == end) { return; } assert(node != NULL); switch (node->x.type) { case RE_ALTER: { const re_node *const be = branchend(node->alter.arg, node->alter.next); St_cat_c(s, '('); decom_node(s, node->alter.arg, be); St_cat_c(s, '|'); decom_node(s, node->alter.next, be); St_cat_c(s, ')'); decom_node(s, be, end); break; } case RE_ANYCH: St_cat_c(s, '.'); decom_node(s, node->x.next, end); break; case RE_AT_BOL: St_cat_m(s, "A!", 2); decom_node(s, node->x.next, end); break; case RE_AT_BOS: St_cat_c(s, '^'); decom_node(s, node->x.next, end); break; case RE_AT_EOL: St_cat_m(s, "Z!", 2); decom_node(s, node->x.next, end); break; case RE_AT_EOS: St_cat_c(s, '$'); decom_node(s, node->x.next, end); break; case RE_AT_MATCH: St_cat_c(s, '['); decom_node(s, node->indep.arg, NULL); St_cat_m(s, "&]", 2); decom_node(s, node->indep.next, end); break; case RE_AT_NOMATCH: St_cat_c(s, '['); decom_node(s, node->indep.arg, NULL); St_cat_m(s, "^]", 2); decom_node(s, node->indep.next, end); break; case RE_AT_NWBOUND: St_cat_m(s, "B!", 2); decom_node(s, node->x.next, end); break; case RE_AT_WBOUND: St_cat_m(s, "b!", 2); decom_node(s, node->x.next, end); break; case RE_BACKREF: case RE_BACKCHECK: { String tmp; if (node->x.type == RE_BACKCHECK) { St_cat_c(s, '['); } St_init(&tmp); #if HAVE_VSNPRINTF_P St_xprintf(&tmp, "%lu", (unsigned long)node->backref.n); #else St_num(&tmp, node->backref.n); #endif St_cat(s, &tmp); St_clear(&tmp); if (node->x.type == RE_BACKCHECK) { St_cat_c(s, ']'); } else { St_cat_c(s, ESCAPE); } decom_node(s, node->backref.next, end); break; } case RE_BEGIN_CAPTURE: St_cat_c(s, '{'); decom_node(s, node->x.next, end); break; case RE_CLASS: St_cat_c(s, '\''); decom_class(s, node->class.vec); St_cat_c(s, '\''); decom_node(s, node->class.next, end); break; case RE_DEFCLASS: St_cat_c(s, cdefclass(node->defclass.vec)); St_cat_c(s, ESCAPE); decom_node(s, node->defclass.next, end); break; case RE_END_CAPTURE: St_cat_c(s, '}'); decom_node(s, node->x.next, end); break; case RE_INDEP: St_cat_c(s, '<'); decom_node(s, node->indep.arg, NULL); St_cat_c(s, '>'); decom_node(s, node->indep.next, end); break; case RE_LITERAL: decom_liter(s, node); decom_node(s, node->literal.next, end); break; case RE_N_CLASS: St_cat_m(s, "'^", 2); decom_class(s, node->class.vec); St_cat_c(s, '\''); decom_node(s, node->class.next, end); break; case RE_N_DEFCLASS: St_cat_c(s, toupper(cdefclass(node->defclass.vec))); St_cat_c(s, ESCAPE); decom_node(s, node->defclass.next, end); break; case RE_PARTIAL: St_cat_c(s, '`'); decom_liter(s, node); St_cat_c(s, '`'); break; case RE_REP_FRUGAL: case RE_REP_GREEDY: St_cat_c(s, '('); decom_node(s, node->rep.arg, node); St_cat_c(s, ')'); decom_repeat(s, node); decom_node(s, node->rep.next, end); break; case RE_SELECT: { const re_node *const be = branchend(node->select.arg, node->select.next); St_cat_c(s, '['); if (be != node->select.next) { decom_node(s, node->select.next, be); St_cat_c(s, '|'); } decom_node(s, node->select.arg, be); St_cat_c(s, '('); decom_node(s, node->select.cond, NULL); St_cat_c(s, ')'); St_cat_m(s, "?]", 2); decom_node(s, be, end); break; } case RE_STUFF_FRUGAL: St_cat_m(s, ".*?", 3); decom_node(s, node->x.next, end); break; case RE_STUFF_GREEDY: St_cat_m(s, ".*", 2); decom_node(s, node->x.next, end); break; case RE_XREP_FRUGAL: case RE_XREP_GREEDY: St_cat_c(s, '('); decom_node(s, node->rep.arg, NULL); St_cat_c(s, ')'); decom_repeat(s, node); decom_node(s, node->rep.next, end); break; } } void re_decompile(const t_regex *re, String *s) { St_zero(s); decom_node(s, re->start, NULL); }