#include #include #include #include #include "cregex.h" typedef struct { const char *sp; cregex_node_t *stack, *output; } regex_parse_context; /* Shunting-yard algorithm * See https://en.wikipedia.org/wiki/Shunting-yard_algorithm */ STATIC INLINE cregex_node_t *push(regex_parse_context *context, const cregex_node_t *node) { //assert(context->stack <= context->output); *context->stack = *node; return context->stack++; } STATIC INLINE cregex_node_t *drop(regex_parse_context *context) { return --context->stack; } STATIC INLINE cregex_node_t *consume(regex_parse_context *context) { *--context->output = *--context->stack; return context->output; } STATIC INLINE cregex_node_t *concatenate(regex_parse_context *context, const cregex_node_t *bottom) { cregex_node_t newNode; memset(&newNode, 0, sizeof(cregex_node_t)); if (context->stack == bottom) { newNode.type = REGEX_NODE_TYPE_EPSILON; push(context, &newNode); } else { newNode.type = REGEX_NODE_TYPE_CONCATENATION; while (context->stack - 1 > bottom) { cregex_node_t *right = consume(context); cregex_node_t *left = consume(context); newNode.u.d.left = left; newNode.u.d.right = right; push(context, &newNode); } } return context->stack - 1; } STATIC cregex_node_t *parse_char_class(regex_parse_context *context) { cregex_node_t newNode; cregex_node_type type = (*context->sp == '^') ? (++context->sp, REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED) : REGEX_NODE_TYPE_CHARACTER_CLASS; const char *from = context->sp; for (;;) { LONG ch = *context->sp++; memset(&newNode, 0, sizeof(cregex_node_t)); switch (ch) { case '\0': /* premature end of character class */ return NULL; case ']': if (context->sp - 1 == from) { goto CHARACTER; } else { newNode.type = type; newNode.u.b.from = from; newNode.u.b.to = context->sp - 1; return push(context, &newNode); } case '\\': ch = *context->sp++; /* fall-through */ default: CHARACTER: if (*context->sp == '-' && context->sp[1] != ']') { if (context->sp[1] < ch) /* empty range in character class */ return NULL; context->sp += 2; } break; } } } STATIC cregex_node_t *parse_interval(regex_parse_context *context) { const char *from = context->sp; LONG nmin, nmax; cregex_node_t newNode; memset(&newNode, 0, sizeof(cregex_node_t)); for (nmin = 0; *context->sp >= '0' && *context->sp <= '9'; ++context->sp) nmin = (nmin * 10) + (*context->sp - '0'); if (*context->sp == ',') { ++context->sp; if (*from != ',' && *context->sp == '}') nmax = -1; else { for (nmax = 0; *context->sp >= '0' && *context->sp <= '9'; ++context->sp) nmax = (nmax * 10) + (*context->sp - '0'); if (*(context->sp - 1) == ',' || *context->sp != '}' || nmax < nmin) { context->sp = from; return NULL; } } } else if (*from != '}' && *context->sp == '}') { nmax = nmin; } else { context->sp = from; return NULL; } ++context->sp; newNode.type = REGEX_NODE_TYPE_QUANTIFIER; newNode.u.c.nmin = nmin; newNode.u.c.nmax = nmax; newNode.u.c.greedy = (*context->sp == '?') ? (++context->sp, 0) : 1; newNode.u.c.quantified = consume(context); return push(context, &newNode); } STATIC cregex_node_t *parse_context(regex_parse_context *context, LONG depth) { cregex_node_t *bottom = context->stack; cregex_node_t newNode; for (;;) { LONG ch = *context->sp++; memset(&newNode, 0, sizeof(cregex_node_t)); switch (ch) { /* Characters */ case '\\': ch = *context->sp++; /* fall-through */ default: CHARACTER: newNode.type = REGEX_NODE_TYPE_CHARACTER; newNode.u.a.ch = ch; push(context, &newNode); break; case '.': newNode.type = REGEX_NODE_TYPE_ANY_CHARACTER; push(context, &newNode); break; case '[': if (!parse_char_class(context)) return NULL; break; /* Composites */ case '|': { cregex_node_t *left = concatenate(context, bottom), *right; if (!(right = parse_context(context, depth))) return NULL; if (left->type == REGEX_NODE_TYPE_EPSILON && right->type == left->type) { drop(context); } else if (left->type == REGEX_NODE_TYPE_EPSILON) { right = consume(context); drop(context); newNode.type = REGEX_NODE_TYPE_QUANTIFIER; newNode.u.c.nmin = 0; newNode.u.c.nmax = 1; newNode.u.c.greedy = 1; newNode.u.c.quantified = right; push(context, &newNode); } else if (right->type == REGEX_NODE_TYPE_EPSILON) { drop(context); left = consume(context); newNode.type = REGEX_NODE_TYPE_QUANTIFIER; newNode.u.c.nmin = 0; newNode.u.c.nmax = 1; newNode.u.c.greedy = 1; newNode.u.c.quantified = left; push(context, &newNode); } else { right = consume(context); left = consume(context); newNode.type = REGEX_NODE_TYPE_ALTERNATION; newNode.u.d.left = left; newNode.u.d.right = right; push(context, &newNode); } return bottom; } #define QUANTIFIER(ch, min, max) \ case ch: \ if (context->stack == bottom) { \ goto CHARACTER; \ } else { \ newNode.type = REGEX_NODE_TYPE_QUANTIFIER; \ newNode.u.c.nmin = min; \ newNode.u.c.nmax = max; \ newNode.u.c.greedy = (*context->sp == '?') ? (++context->sp, 0) : 1; \ newNode.u.c.quantified = consume(context); \ push(context, &newNode); \ } \ break // END-QUANTIFIER /* clang-format off */ /* Quantifiers */ QUANTIFIER('?', 0, 1); QUANTIFIER('*', 0, -1); QUANTIFIER('+', 1, -1); /* clang-format on */ #undef QUANTIFIER case '{': if ((context->stack == bottom) || !parse_interval(context)) goto CHARACTER; break; /* Anchors */ case '^': newNode.type = REGEX_NODE_TYPE_ANCHOR_BEGIN; push(context, &newNode); break; case '$': newNode.type = REGEX_NODE_TYPE_ANCHOR_END; push(context, &newNode); break; /* Captures */ case '(': if (!parse_context(context, depth + 1)) { return NULL; } else { newNode.type = REGEX_NODE_TYPE_CAPTURE; newNode.u.e.captured = consume(context); push(context, &newNode); } break; case ')': if (depth > 0) return concatenate(context, bottom); /* unmatched close parenthesis */ return NULL; /* End of string */ case '\0': if (depth == 0) return concatenate(context, bottom); /* unmatched open parenthesis */ return NULL; } } } STATIC INLINE LONG estimate_nodes(const char *pattern) { return (LONG)(strlen(pattern) * 2); } /* Parse a pattern (using a previously allocated buffer of at least * estimate_nodes(pattern) nodes). */ STATIC cregex_node_t *parse_with_nodes(const char *pattern, cregex_node_t *nodes) { regex_parse_context context; context.sp = pattern; context.stack = nodes, context.output = nodes + estimate_nodes(pattern); return parse_context(&context, 0); } cregex_node_t *cregex_parse(const char *pattern) { size_t size = sizeof(cregex_node_t) * estimate_nodes(pattern); cregex_node_t *nodes = AllocVec(size, MEMF_CLEAR); // Printf("mallocing %ld bytes for parse\n", size); if (!nodes) return NULL; if (!parse_with_nodes(pattern, nodes)) { free(nodes); return NULL; } return nodes; } VOID cregex_parse_free(cregex_node_t *root) { FreeVec(root); }