You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
308 lines
9.0 KiB
308 lines
9.0 KiB
#include <proto/exec.h>
|
|
#include <proto/dos.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include "cregex.h"
|
|
|
|
typedef struct {
|
|
const char *sp;
|
|
cregex_node_t *stack, *output;
|
|
} regex_parse_context;
|
|
|
|
/* Shunting-yard algorithm
|
|
* See https://en.wikipedia.org/wiki/Shunting-yard_algorithm
|
|
*/
|
|
|
|
STATIC INLINE cregex_node_t *push(regex_parse_context *context,
|
|
const cregex_node_t *node)
|
|
{
|
|
//assert(context->stack <= context->output);
|
|
*context->stack = *node;
|
|
return context->stack++;
|
|
}
|
|
|
|
STATIC INLINE cregex_node_t *drop(regex_parse_context *context)
|
|
{
|
|
return --context->stack;
|
|
}
|
|
|
|
STATIC INLINE cregex_node_t *consume(regex_parse_context *context)
|
|
{
|
|
*--context->output = *--context->stack;
|
|
return context->output;
|
|
}
|
|
|
|
STATIC INLINE cregex_node_t *concatenate(regex_parse_context *context,
|
|
const cregex_node_t *bottom)
|
|
{
|
|
cregex_node_t newNode;
|
|
memset(&newNode, 0, sizeof(cregex_node_t));
|
|
|
|
if (context->stack == bottom) {
|
|
newNode.type = REGEX_NODE_TYPE_EPSILON;
|
|
push(context, &newNode);
|
|
}
|
|
else {
|
|
newNode.type = REGEX_NODE_TYPE_CONCATENATION;
|
|
while (context->stack - 1 > bottom) {
|
|
cregex_node_t *right = consume(context);
|
|
cregex_node_t *left = consume(context);
|
|
newNode.u.d.left = left;
|
|
newNode.u.d.right = right;
|
|
push(context, &newNode);
|
|
}
|
|
}
|
|
return context->stack - 1;
|
|
}
|
|
|
|
STATIC cregex_node_t *parse_char_class(regex_parse_context *context)
|
|
{
|
|
cregex_node_t newNode;
|
|
cregex_node_type type =
|
|
(*context->sp == '^')
|
|
? (++context->sp, REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED)
|
|
: REGEX_NODE_TYPE_CHARACTER_CLASS;
|
|
const char *from = context->sp;
|
|
|
|
for (;;) {
|
|
LONG ch = *context->sp++;
|
|
memset(&newNode, 0, sizeof(cregex_node_t));
|
|
switch (ch) {
|
|
case '\0':
|
|
/* premature end of character class */
|
|
return NULL;
|
|
case ']':
|
|
if (context->sp - 1 == from) {
|
|
goto CHARACTER;
|
|
} else {
|
|
newNode.type = type;
|
|
newNode.u.b.from = from;
|
|
newNode.u.b.to = context->sp - 1;
|
|
return push(context, &newNode);
|
|
}
|
|
case '\\':
|
|
ch = *context->sp++;
|
|
/* fall-through */
|
|
default:
|
|
CHARACTER:
|
|
if (*context->sp == '-' && context->sp[1] != ']') {
|
|
if (context->sp[1] < ch)
|
|
/* empty range in character class */
|
|
return NULL;
|
|
context->sp += 2;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
STATIC cregex_node_t *parse_interval(regex_parse_context *context)
|
|
{
|
|
const char *from = context->sp;
|
|
LONG nmin, nmax;
|
|
cregex_node_t newNode;
|
|
|
|
memset(&newNode, 0, sizeof(cregex_node_t));
|
|
|
|
for (nmin = 0; *context->sp >= '0' && *context->sp <= '9'; ++context->sp)
|
|
nmin = (nmin * 10) + (*context->sp - '0');
|
|
|
|
if (*context->sp == ',') {
|
|
++context->sp;
|
|
if (*from != ',' && *context->sp == '}')
|
|
nmax = -1;
|
|
else {
|
|
for (nmax = 0; *context->sp >= '0' && *context->sp <= '9';
|
|
++context->sp)
|
|
nmax = (nmax * 10) + (*context->sp - '0');
|
|
if (*(context->sp - 1) == ',' || *context->sp != '}' ||
|
|
nmax < nmin) {
|
|
context->sp = from;
|
|
return NULL;
|
|
}
|
|
}
|
|
} else if (*from != '}' && *context->sp == '}') {
|
|
nmax = nmin;
|
|
} else {
|
|
context->sp = from;
|
|
return NULL;
|
|
}
|
|
|
|
++context->sp;
|
|
newNode.type = REGEX_NODE_TYPE_QUANTIFIER;
|
|
newNode.u.c.nmin = nmin;
|
|
newNode.u.c.nmax = nmax;
|
|
newNode.u.c.greedy = (*context->sp == '?') ? (++context->sp, 0) : 1;
|
|
newNode.u.c.quantified = consume(context);
|
|
return push(context, &newNode);
|
|
}
|
|
|
|
STATIC cregex_node_t *parse_context(regex_parse_context *context, LONG depth)
|
|
{
|
|
cregex_node_t *bottom = context->stack;
|
|
cregex_node_t newNode;
|
|
|
|
for (;;) {
|
|
LONG ch = *context->sp++;
|
|
memset(&newNode, 0, sizeof(cregex_node_t));
|
|
switch (ch) {
|
|
/* Characters */
|
|
case '\\':
|
|
ch = *context->sp++;
|
|
/* fall-through */
|
|
default:
|
|
CHARACTER:
|
|
newNode.type = REGEX_NODE_TYPE_CHARACTER;
|
|
newNode.u.a.ch = ch;
|
|
push(context, &newNode);
|
|
break;
|
|
case '.':
|
|
newNode.type = REGEX_NODE_TYPE_ANY_CHARACTER;
|
|
push(context, &newNode);
|
|
break;
|
|
case '[':
|
|
if (!parse_char_class(context))
|
|
return NULL;
|
|
break;
|
|
|
|
/* Composites */
|
|
case '|': {
|
|
cregex_node_t *left = concatenate(context, bottom), *right;
|
|
if (!(right = parse_context(context, depth)))
|
|
return NULL;
|
|
if (left->type == REGEX_NODE_TYPE_EPSILON &&
|
|
right->type == left->type) {
|
|
drop(context);
|
|
} else if (left->type == REGEX_NODE_TYPE_EPSILON) {
|
|
right = consume(context);
|
|
drop(context);
|
|
newNode.type = REGEX_NODE_TYPE_QUANTIFIER;
|
|
newNode.u.c.nmin = 0;
|
|
newNode.u.c.nmax = 1;
|
|
newNode.u.c.greedy = 1;
|
|
newNode.u.c.quantified = right;
|
|
push(context, &newNode);
|
|
} else if (right->type == REGEX_NODE_TYPE_EPSILON) {
|
|
drop(context);
|
|
left = consume(context);
|
|
newNode.type = REGEX_NODE_TYPE_QUANTIFIER;
|
|
newNode.u.c.nmin = 0;
|
|
newNode.u.c.nmax = 1;
|
|
newNode.u.c.greedy = 1;
|
|
newNode.u.c.quantified = left;
|
|
push(context, &newNode);
|
|
} else {
|
|
right = consume(context);
|
|
left = consume(context);
|
|
newNode.type = REGEX_NODE_TYPE_ALTERNATION;
|
|
newNode.u.d.left = left;
|
|
newNode.u.d.right = right;
|
|
push(context, &newNode);
|
|
}
|
|
return bottom;
|
|
}
|
|
|
|
#define QUANTIFIER(ch, min, max) \
|
|
case ch: \
|
|
if (context->stack == bottom) { \
|
|
goto CHARACTER; \
|
|
} else { \
|
|
newNode.type = REGEX_NODE_TYPE_QUANTIFIER; \
|
|
newNode.u.c.nmin = min; \
|
|
newNode.u.c.nmax = max; \
|
|
newNode.u.c.greedy = (*context->sp == '?') ? (++context->sp, 0) : 1; \
|
|
newNode.u.c.quantified = consume(context); \
|
|
push(context, &newNode); \
|
|
} \
|
|
break
|
|
// END-QUANTIFIER
|
|
/* clang-format off */
|
|
/* Quantifiers */
|
|
QUANTIFIER('?', 0, 1);
|
|
QUANTIFIER('*', 0, -1);
|
|
QUANTIFIER('+', 1, -1);
|
|
/* clang-format on */
|
|
#undef QUANTIFIER
|
|
|
|
case '{':
|
|
if ((context->stack == bottom) || !parse_interval(context))
|
|
goto CHARACTER;
|
|
break;
|
|
|
|
/* Anchors */
|
|
case '^':
|
|
newNode.type = REGEX_NODE_TYPE_ANCHOR_BEGIN;
|
|
push(context,
|
|
&newNode);
|
|
break;
|
|
case '$':
|
|
newNode.type = REGEX_NODE_TYPE_ANCHOR_END;
|
|
push(context,
|
|
&newNode);
|
|
break;
|
|
|
|
/* Captures */
|
|
case '(':
|
|
if (!parse_context(context, depth + 1)) {
|
|
return NULL;
|
|
} else {
|
|
newNode.type = REGEX_NODE_TYPE_CAPTURE;
|
|
newNode.u.e.captured = consume(context);
|
|
push(context, &newNode);
|
|
}
|
|
break;
|
|
case ')':
|
|
if (depth > 0)
|
|
return concatenate(context, bottom);
|
|
/* unmatched close parenthesis */
|
|
return NULL;
|
|
|
|
/* End of string */
|
|
case '\0':
|
|
if (depth == 0)
|
|
return concatenate(context, bottom);
|
|
/* unmatched open parenthesis */
|
|
return NULL;
|
|
}
|
|
}
|
|
}
|
|
|
|
STATIC INLINE LONG estimate_nodes(const char *pattern)
|
|
{
|
|
return (LONG)(strlen(pattern) * 2);
|
|
}
|
|
|
|
/* Parse a pattern (using a previously allocated buffer of at least
|
|
* estimate_nodes(pattern) nodes).
|
|
*/
|
|
STATIC cregex_node_t *parse_with_nodes(const char *pattern,
|
|
cregex_node_t *nodes)
|
|
{
|
|
regex_parse_context context;
|
|
context.sp = pattern;
|
|
context.stack = nodes,
|
|
context.output = nodes + estimate_nodes(pattern);
|
|
return parse_context(&context, 0);
|
|
}
|
|
|
|
cregex_node_t *cregex_parse(const char *pattern)
|
|
{
|
|
size_t size = sizeof(cregex_node_t) * estimate_nodes(pattern);
|
|
cregex_node_t *nodes = AllocVec(size, MEMF_CLEAR);
|
|
// Printf("mallocing %ld bytes for parse\n", size);
|
|
if (!nodes)
|
|
return NULL;
|
|
|
|
if (!parse_with_nodes(pattern, nodes)) {
|
|
free(nodes);
|
|
return NULL;
|
|
}
|
|
|
|
return nodes;
|
|
}
|
|
|
|
VOID cregex_parse_free(cregex_node_t *root)
|
|
{
|
|
FreeVec(root);
|
|
}
|
|
|