configreader/cregex/cregex_parse.c

#include <proto/exec.h>
#include <proto/dos.h>
#include <stdlib.h>
#include <string.h>

#include "cregex.h"

typedef struct {
    const char *sp;
    cregex_node_t *stack, *output;
} regex_parse_context;

/* Shunting-yard algorithm
 * See https://en.wikipedia.org/wiki/Shunting-yard_algorithm
 */

STATIC INLINE cregex_node_t *push(regex_parse_context *context,
                                  const cregex_node_t *node)
{
    //assert(context->stack <= context->output);
    *context->stack = *node;
    return context->stack++;
}

STATIC INLINE cregex_node_t *drop(regex_parse_context *context)
{
    return --context->stack;
}

STATIC INLINE cregex_node_t *consume(regex_parse_context *context)
{
    *--context->output = *--context->stack;
    return context->output;
}

STATIC INLINE cregex_node_t *concatenate(regex_parse_context *context,
                                         const cregex_node_t *bottom)
{
	cregex_node_t newNode;
	memset(&newNode, 0, sizeof(cregex_node_t));

    if (context->stack == bottom) {
    	newNode.type = REGEX_NODE_TYPE_EPSILON;
        push(context, &newNode);
    }
    else {
        newNode.type = REGEX_NODE_TYPE_CONCATENATION;
        while (context->stack - 1 > bottom) {
            cregex_node_t *right = consume(context);
            cregex_node_t *left = consume(context);
            newNode.u.d.left = left;
            newNode.u.d.right = right;
            push(context, &newNode);
        }
    }
    return context->stack - 1;
}

STATIC cregex_node_t *parse_char_class(regex_parse_context *context)
{
	cregex_node_t newNode;
    cregex_node_type type =
        (*context->sp == '^')
            ? (++context->sp, REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED)
            : REGEX_NODE_TYPE_CHARACTER_CLASS;
    const char *from = context->sp;

    for (;;) {
        LONG ch = *context->sp++;
    	memset(&newNode, 0, sizeof(cregex_node_t));
        switch (ch) {
        case '\0':
            /* premature end of character class */
            return NULL;
        case ']':
            if (context->sp - 1 == from) {
                goto CHARACTER;
            } else {
            	newNode.type = type;
            	newNode.u.b.from = from;
            	newNode.u.b.to = context->sp - 1;
	            return push(context, &newNode);
            }
        case '\\':
            ch = *context->sp++;
            /* fall-through */
        default:
        CHARACTER:
            if (*context->sp == '-' && context->sp[1] != ']') {
                if (context->sp[1] < ch)
                    /* empty range in character class */
                    return NULL;
                context->sp += 2;
            }
            break;
        }
    }
}

STATIC cregex_node_t *parse_interval(regex_parse_context *context)
{
    const char *from = context->sp;
    LONG nmin, nmax;
    cregex_node_t newNode;

    memset(&newNode, 0, sizeof(cregex_node_t));

    for (nmin = 0; *context->sp >= '0' && *context->sp <= '9'; ++context->sp)
        nmin = (nmin * 10) + (*context->sp - '0');

    if (*context->sp == ',') {
        ++context->sp;
        if (*from != ',' && *context->sp == '}')
            nmax = -1;
        else {
            for (nmax = 0; *context->sp >= '0' && *context->sp <= '9';
                 ++context->sp)
                nmax = (nmax * 10) + (*context->sp - '0');
            if (*(context->sp - 1) == ',' || *context->sp != '}' ||
                nmax < nmin) {
                context->sp = from;
                return NULL;
            }
        }
    } else if (*from != '}' && *context->sp == '}') {
        nmax = nmin;
    } else {
        context->sp = from;
        return NULL;
    }

    ++context->sp;
    newNode.type = REGEX_NODE_TYPE_QUANTIFIER;
    newNode.u.c.nmin = nmin;
    newNode.u.c.nmax = nmax;
    newNode.u.c.greedy = (*context->sp == '?') ? (++context->sp, 0) : 1;
    newNode.u.c.quantified = consume(context);
    return push(context, &newNode);
}

STATIC cregex_node_t *parse_context(regex_parse_context *context, LONG depth)
{
    cregex_node_t *bottom = context->stack;
    cregex_node_t newNode;

    for (;;) {
        LONG ch = *context->sp++;
        memset(&newNode, 0, sizeof(cregex_node_t));
        switch (ch) {
        /* Characters */
        case '\\':
            ch = *context->sp++;
            /* fall-through */
        default:
        CHARACTER:
        	newNode.type = REGEX_NODE_TYPE_CHARACTER;
        	newNode.u.a.ch = ch;
            push(context, &newNode);
            break;
        case '.':
        	newNode.type = REGEX_NODE_TYPE_ANY_CHARACTER;
            push(context, &newNode);
            break;
        case '[':
            if (!parse_char_class(context))
                return NULL;
            break;

        /* Composites */
        case '|': {
            cregex_node_t *left = concatenate(context, bottom), *right;
            if (!(right = parse_context(context, depth)))
                return NULL;
            if (left->type == REGEX_NODE_TYPE_EPSILON &&
                right->type == left->type) {
                drop(context);
            } else if (left->type == REGEX_NODE_TYPE_EPSILON) {
                right = consume(context);
                drop(context);
                newNode.type = REGEX_NODE_TYPE_QUANTIFIER;
                newNode.u.c.nmin = 0;
                newNode.u.c.nmax = 1;
                newNode.u.c.greedy = 1;
                newNode.u.c.quantified = right;
                push(context, &newNode);
            } else if (right->type == REGEX_NODE_TYPE_EPSILON) {
                drop(context);
                left = consume(context);
                newNode.type = REGEX_NODE_TYPE_QUANTIFIER;
                newNode.u.c.nmin = 0;
                newNode.u.c.nmax = 1;
                newNode.u.c.greedy = 1;
                newNode.u.c.quantified = left;
                push(context, &newNode);
            } else {
                right = consume(context);
                left = consume(context);
                newNode.type = REGEX_NODE_TYPE_ALTERNATION;
                newNode.u.d.left = left;
                newNode.u.d.right = right;
                push(context, &newNode);
            }
            return bottom;
        }

#define QUANTIFIER(ch, min, max)						\
    case ch:                                            \
        if (context->stack == bottom)  {                \
            goto CHARACTER;                     	    \
        } else {										\
        	newNode.type = REGEX_NODE_TYPE_QUANTIFIER;	\
            newNode.u.c.nmin = min;                     \
            newNode.u.c.nmax = max;                     \
            newNode.u.c.greedy = (*context->sp == '?') ? (++context->sp, 0) : 1; \
            newNode.u.c.quantified = consume(context);  \
        	push(context,  &newNode);                   \
        }												\
        break
// END-QUANTIFIER
       /* clang-format off */
        /* Quantifiers */
        QUANTIFIER('?', 0, 1);
        QUANTIFIER('*', 0, -1);
        QUANTIFIER('+', 1, -1);
       /* clang-format on */
#undef QUANTIFIER

        case '{':
            if ((context->stack == bottom) || !parse_interval(context))
                goto CHARACTER;
            break;

        /* Anchors */
        case '^':
        	newNode.type = REGEX_NODE_TYPE_ANCHOR_BEGIN;
            push(context,
                 &newNode);
            break;
        case '$':
        	newNode.type = REGEX_NODE_TYPE_ANCHOR_END;
            push(context,
                 &newNode);
            break;

        /* Captures */
        case '(':
            if (!parse_context(context, depth + 1)) {
                return NULL;
            } else {
	          	newNode.type = REGEX_NODE_TYPE_CAPTURE;
    	    	newNode.u.e.captured = consume(context);
        	    push(context, &newNode);
            }
            break;
        case ')':
            if (depth > 0)
                return concatenate(context, bottom);
            /* unmatched close parenthesis */
            return NULL;

        /* End of string */
        case '\0':
            if (depth == 0)
                return concatenate(context, bottom);
            /* unmatched open parenthesis */
            return NULL;
        }
    }
}

STATIC INLINE LONG estimate_nodes(const char *pattern)
{
    return (LONG)(strlen(pattern) * 2);
}

/* Parse a pattern (using a previously allocated buffer of at least
 * estimate_nodes(pattern) nodes).
 */
STATIC cregex_node_t *parse_with_nodes(const char *pattern,
                                       cregex_node_t *nodes)
{
    regex_parse_context context;
    context.sp = pattern;
    context.stack = nodes,
    context.output = nodes + estimate_nodes(pattern);
    return parse_context(&context, 0);
}

cregex_node_t *cregex_parse(const char *pattern)
{
    size_t size = sizeof(cregex_node_t) * estimate_nodes(pattern);
    cregex_node_t *nodes = AllocVec(size, MEMF_CLEAR);
//    Printf("mallocing %ld bytes for parse\n", size);
    if (!nodes)
        return NULL;

    if (!parse_with_nodes(pattern, nodes)) {
        free(nodes);
        return NULL;
    }

    return nodes;
}

VOID cregex_parse_free(cregex_node_t *root)
{
    FreeVec(root);
}