You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
configreader/cregex/cregex_parse.c

308 lines
9.0 KiB

#include <proto/exec.h>
#include <proto/dos.h>
#include <stdlib.h>
#include <string.h>
#include "cregex.h"
typedef struct {
const char *sp;
cregex_node_t *stack, *output;
} regex_parse_context;
/* Shunting-yard algorithm
* See https://en.wikipedia.org/wiki/Shunting-yard_algorithm
*/
STATIC INLINE cregex_node_t *push(regex_parse_context *context,
const cregex_node_t *node)
{
//assert(context->stack <= context->output);
*context->stack = *node;
return context->stack++;
}
STATIC INLINE cregex_node_t *drop(regex_parse_context *context)
{
return --context->stack;
}
STATIC INLINE cregex_node_t *consume(regex_parse_context *context)
{
*--context->output = *--context->stack;
return context->output;
}
STATIC INLINE cregex_node_t *concatenate(regex_parse_context *context,
const cregex_node_t *bottom)
{
cregex_node_t newNode;
memset(&newNode, 0, sizeof(cregex_node_t));
if (context->stack == bottom) {
newNode.type = REGEX_NODE_TYPE_EPSILON;
push(context, &newNode);
}
else {
newNode.type = REGEX_NODE_TYPE_CONCATENATION;
while (context->stack - 1 > bottom) {
cregex_node_t *right = consume(context);
cregex_node_t *left = consume(context);
newNode.u.d.left = left;
newNode.u.d.right = right;
push(context, &newNode);
}
}
return context->stack - 1;
}
STATIC cregex_node_t *parse_char_class(regex_parse_context *context)
{
cregex_node_t newNode;
cregex_node_type type =
(*context->sp == '^')
? (++context->sp, REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED)
: REGEX_NODE_TYPE_CHARACTER_CLASS;
const char *from = context->sp;
for (;;) {
LONG ch = *context->sp++;
memset(&newNode, 0, sizeof(cregex_node_t));
switch (ch) {
case '\0':
/* premature end of character class */
return NULL;
case ']':
if (context->sp - 1 == from) {
goto CHARACTER;
} else {
newNode.type = type;
newNode.u.b.from = from;
newNode.u.b.to = context->sp - 1;
return push(context, &newNode);
}
case '\\':
ch = *context->sp++;
/* fall-through */
default:
CHARACTER:
if (*context->sp == '-' && context->sp[1] != ']') {
if (context->sp[1] < ch)
/* empty range in character class */
return NULL;
context->sp += 2;
}
break;
}
}
}
STATIC cregex_node_t *parse_interval(regex_parse_context *context)
{
const char *from = context->sp;
LONG nmin, nmax;
cregex_node_t newNode;
memset(&newNode, 0, sizeof(cregex_node_t));
for (nmin = 0; *context->sp >= '0' && *context->sp <= '9'; ++context->sp)
nmin = (nmin * 10) + (*context->sp - '0');
if (*context->sp == ',') {
++context->sp;
if (*from != ',' && *context->sp == '}')
nmax = -1;
else {
for (nmax = 0; *context->sp >= '0' && *context->sp <= '9';
++context->sp)
nmax = (nmax * 10) + (*context->sp - '0');
if (*(context->sp - 1) == ',' || *context->sp != '}' ||
nmax < nmin) {
context->sp = from;
return NULL;
}
}
} else if (*from != '}' && *context->sp == '}') {
nmax = nmin;
} else {
context->sp = from;
return NULL;
}
++context->sp;
newNode.type = REGEX_NODE_TYPE_QUANTIFIER;
newNode.u.c.nmin = nmin;
newNode.u.c.nmax = nmax;
newNode.u.c.greedy = (*context->sp == '?') ? (++context->sp, 0) : 1;
newNode.u.c.quantified = consume(context);
return push(context, &newNode);
}
STATIC cregex_node_t *parse_context(regex_parse_context *context, LONG depth)
{
cregex_node_t *bottom = context->stack;
cregex_node_t newNode;
for (;;) {
LONG ch = *context->sp++;
memset(&newNode, 0, sizeof(cregex_node_t));
switch (ch) {
/* Characters */
case '\\':
ch = *context->sp++;
/* fall-through */
default:
CHARACTER:
newNode.type = REGEX_NODE_TYPE_CHARACTER;
newNode.u.a.ch = ch;
push(context, &newNode);
break;
case '.':
newNode.type = REGEX_NODE_TYPE_ANY_CHARACTER;
push(context, &newNode);
break;
case '[':
if (!parse_char_class(context))
return NULL;
break;
/* Composites */
case '|': {
cregex_node_t *left = concatenate(context, bottom), *right;
if (!(right = parse_context(context, depth)))
return NULL;
if (left->type == REGEX_NODE_TYPE_EPSILON &&
right->type == left->type) {
drop(context);
} else if (left->type == REGEX_NODE_TYPE_EPSILON) {
right = consume(context);
drop(context);
newNode.type = REGEX_NODE_TYPE_QUANTIFIER;
newNode.u.c.nmin = 0;
newNode.u.c.nmax = 1;
newNode.u.c.greedy = 1;
newNode.u.c.quantified = right;
push(context, &newNode);
} else if (right->type == REGEX_NODE_TYPE_EPSILON) {
drop(context);
left = consume(context);
newNode.type = REGEX_NODE_TYPE_QUANTIFIER;
newNode.u.c.nmin = 0;
newNode.u.c.nmax = 1;
newNode.u.c.greedy = 1;
newNode.u.c.quantified = left;
push(context, &newNode);
} else {
right = consume(context);
left = consume(context);
newNode.type = REGEX_NODE_TYPE_ALTERNATION;
newNode.u.d.left = left;
newNode.u.d.right = right;
push(context, &newNode);
}
return bottom;
}
#define QUANTIFIER(ch, min, max) \
case ch: \
if (context->stack == bottom) { \
goto CHARACTER; \
} else { \
newNode.type = REGEX_NODE_TYPE_QUANTIFIER; \
newNode.u.c.nmin = min; \
newNode.u.c.nmax = max; \
newNode.u.c.greedy = (*context->sp == '?') ? (++context->sp, 0) : 1; \
newNode.u.c.quantified = consume(context); \
push(context, &newNode); \
} \
break
// END-QUANTIFIER
/* clang-format off */
/* Quantifiers */
QUANTIFIER('?', 0, 1);
QUANTIFIER('*', 0, -1);
QUANTIFIER('+', 1, -1);
/* clang-format on */
#undef QUANTIFIER
case '{':
if ((context->stack == bottom) || !parse_interval(context))
goto CHARACTER;
break;
/* Anchors */
case '^':
newNode.type = REGEX_NODE_TYPE_ANCHOR_BEGIN;
push(context,
&newNode);
break;
case '$':
newNode.type = REGEX_NODE_TYPE_ANCHOR_END;
push(context,
&newNode);
break;
/* Captures */
case '(':
if (!parse_context(context, depth + 1)) {
return NULL;
} else {
newNode.type = REGEX_NODE_TYPE_CAPTURE;
newNode.u.e.captured = consume(context);
push(context, &newNode);
}
break;
case ')':
if (depth > 0)
return concatenate(context, bottom);
/* unmatched close parenthesis */
return NULL;
/* End of string */
case '\0':
if (depth == 0)
return concatenate(context, bottom);
/* unmatched open parenthesis */
return NULL;
}
}
}
STATIC INLINE LONG estimate_nodes(const char *pattern)
{
return (LONG)(strlen(pattern) * 2);
}
/* Parse a pattern (using a previously allocated buffer of at least
* estimate_nodes(pattern) nodes).
*/
STATIC cregex_node_t *parse_with_nodes(const char *pattern,
cregex_node_t *nodes)
{
regex_parse_context context;
context.sp = pattern;
context.stack = nodes,
context.output = nodes + estimate_nodes(pattern);
return parse_context(&context, 0);
}
cregex_node_t *cregex_parse(const char *pattern)
{
size_t size = sizeof(cregex_node_t) * estimate_nodes(pattern);
cregex_node_t *nodes = AllocVec(size, MEMF_CLEAR);
// Printf("mallocing %ld bytes for parse\n", size);
if (!nodes)
return NULL;
if (!parse_with_nodes(pattern, nodes)) {
free(nodes);
return NULL;
}
return nodes;
}
VOID cregex_parse_free(cregex_node_t *root)
{
FreeVec(root);
}