You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
configreader/cregex/cregex_compile.c

351 lines
11 KiB

#include <proto/exec.h>
#include <stdlib.h>
#include "cregex.h"
#include <string.h>
typedef struct {
cregex_program_instr_t *pc;
LONG ncaptures;
} regex_compile_context;
STATIC LONG count_instructions(const cregex_node_t *node)
{
switch (node->type) {
case REGEX_NODE_TYPE_EPSILON:
return 0;
/* Characters */
case REGEX_NODE_TYPE_CHARACTER:
case REGEX_NODE_TYPE_ANY_CHARACTER:
case REGEX_NODE_TYPE_CHARACTER_CLASS:
case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
return 1;
/* Composites */
case REGEX_NODE_TYPE_CONCATENATION:
return count_instructions(node->u.d.left) + count_instructions(node->u.d.right);
case REGEX_NODE_TYPE_ALTERNATION:
return 2 +
count_instructions(node->u.d.left) +
count_instructions(node->u.d.right);
/* Quantifiers */
case REGEX_NODE_TYPE_QUANTIFIER: {
LONG num = count_instructions(node->u.c.quantified);
if (node->u.c.nmax >= node->u.c.nmin)
{
return node->u.c.nmin * num + (node->u.c.nmax - node->u.c.nmin) * (num + 1);
}
else
{
return 1 + (node->u.c.nmin ? node->u.c.nmin * num : num + 1);
}
}
/* Anchors */
case REGEX_NODE_TYPE_ANCHOR_BEGIN:
case REGEX_NODE_TYPE_ANCHOR_END:
return 1;
/* Captures */
case REGEX_NODE_TYPE_CAPTURE:
return 2 + count_instructions(node->u.e.captured);
}
/* should not reach here */
return 0;
}
STATIC BOOL node_is_anchored(const cregex_node_t *node)
{
switch (node->type) {
case REGEX_NODE_TYPE_EPSILON:
return FALSE;
/* Characters */
case REGEX_NODE_TYPE_CHARACTER:
case REGEX_NODE_TYPE_ANY_CHARACTER:
case REGEX_NODE_TYPE_CHARACTER_CLASS:
case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
return FALSE;
/* Composites */
case REGEX_NODE_TYPE_CONCATENATION:
return node_is_anchored(node->u.d.left);
case REGEX_NODE_TYPE_ALTERNATION:
return (BOOL)(node_is_anchored(node->u.d.left) && node_is_anchored(node->u.d.right));
/* Quantifiers */
case REGEX_NODE_TYPE_QUANTIFIER:
return node_is_anchored(node->u.c.quantified);
/* Anchors */
case REGEX_NODE_TYPE_ANCHOR_BEGIN:
return TRUE;
case REGEX_NODE_TYPE_ANCHOR_END:
return FALSE;
/* Captures */
case REGEX_NODE_TYPE_CAPTURE:
return node_is_anchored(node->u.e.captured);
}
/* should not reach here */
return FALSE;
}
STATIC INLINE cregex_program_instr_t *emit(
regex_compile_context *context,
const cregex_program_instr_t *instruction)
{
*context->pc = *instruction;
return context->pc++;
}
STATIC cregex_program_instr_t *compile_char_class(
const cregex_node_t *node,
cregex_program_instr_t *instruction)
{
const char *sp = node->u.b.from;
for (;;) {
LONG ch = *sp++;
switch (ch) {
case ']':
if (sp - 1 == node->u.b.from)
goto CHARACTER;
return instruction;
case '\\':
ch = *sp++;
/* fall-through */
default:
CHARACTER:
if (*sp == '-' && sp[1] != ']') {
for (; ch <= sp[1]; ++ch)
cregex_char_class_add(instruction->u.b.klass, ch);
sp += 2;
} else {
cregex_char_class_add(instruction->u.b.klass, ch);
}
break;
}
}
}
STATIC cregex_program_instr_t *compile_context(regex_compile_context *context,
const cregex_node_t *node)
{
cregex_program_instr_t *bottom = context->pc, *split, *jump;
LONG ncaptures = context->ncaptures, capture;
cregex_program_instr_t newInstr;
memset(&newInstr, 0, sizeof(cregex_program_instr_t));
switch (node->type) {
case REGEX_NODE_TYPE_EPSILON:
break;
/* Characters */
case REGEX_NODE_TYPE_CHARACTER:
newInstr.opcode = REGEX_PROGRAM_OPCODE_CHARACTER;
newInstr.u.a.ch = node->u.a.ch;
emit(context, &newInstr);
break;
case REGEX_NODE_TYPE_ANY_CHARACTER:
newInstr.opcode = REGEX_PROGRAM_OPCODE_ANY_CHARACTER;
emit(context, &newInstr);
break;
case REGEX_NODE_TYPE_CHARACTER_CLASS:
newInstr.opcode = REGEX_PROGRAM_OPCODE_CHCLS;
compile_char_class( node, emit(context, &newInstr));
break;
case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
newInstr.opcode = REGEX_PROGRAM_OPCODE_CHCLS_NEGATED;
compile_char_class( node, emit(context, &newInstr));
break;
/* Composites */
case REGEX_NODE_TYPE_CONCATENATION:
compile_context(context, node->u.d.left);
compile_context(context, node->u.d.right);
break;
case REGEX_NODE_TYPE_ALTERNATION: {
cregex_program_instr_t splitInstr;
cregex_program_instr_t jumpInstr;
memset(&splitInstr, 0, sizeof(cregex_program_instr_t));
memset(&jumpInstr, 0, sizeof(cregex_program_instr_t));
splitInstr.opcode = REGEX_PROGRAM_OPCODE_SPLIT;
jumpInstr.opcode = REGEX_PROGRAM_OPCODE_JUMP;
split = emit(context, &splitInstr);
split->u.c.first = compile_context(context, node->u.d.left);
jump = emit(context, &jumpInstr);
split->u.c.second = compile_context(context, node->u.d.right);
jump->u.d.target = context->pc;
}
break;
/* Quantifiers */
case REGEX_NODE_TYPE_QUANTIFIER: {
cregex_program_instr_t *last = NULL;
LONG i = 0;
for (i = 0; i < node->u.c.nmin; ++i) {
context->ncaptures = ncaptures;
last = compile_context(context, node->u.c.quantified);
}
if (node->u.c.nmax > node->u.c.nmin) {
for (i = 0; i < node->u.c.nmax - node->u.c.nmin; ++i) {
memset(&newInstr, 0, sizeof(cregex_program_instr_t));
newInstr.opcode = REGEX_PROGRAM_OPCODE_SPLIT;
context->ncaptures = ncaptures;
split = emit(context, &newInstr);
split->u.c.first = compile_context(context, node->u.c.quantified);
split->u.c.second = context->pc;
if (!node->u.c.greedy) {
cregex_program_instr_t *swap = split->u.c.first;
split->u.c.first = split->u.c.second;
split->u.c.second = swap;
}
}
} else if (node->u.c.nmax == -1) {
newInstr.opcode = REGEX_PROGRAM_OPCODE_SPLIT;
split = emit(context, &newInstr);
if (node->u.c.nmin == 0) {
split->u.c.first = compile_context(context, node->u.c.quantified);
newInstr.opcode = REGEX_PROGRAM_OPCODE_JUMP;
jump = emit(context, &newInstr);
split->u.c.second = context->pc;
jump->u.d.target = split;
} else {
split->u.c.first = last;
split->u.c.second = context->pc;
}
if (!node->u.c.greedy) {
cregex_program_instr_t *swap = split->u.c.first;
split->u.c.first = split->u.c.second;
split->u.c.second = swap;
}
}
break;
}
/* Anchors */
case REGEX_NODE_TYPE_ANCHOR_BEGIN:
newInstr.opcode = REGEX_PROGRAM_OPCODE_ASSERT_BEGIN;
emit(context, &newInstr);
break;
case REGEX_NODE_TYPE_ANCHOR_END:
newInstr.opcode = REGEX_PROGRAM_OPCODE_ASSERT_END;
emit(context, &newInstr);
break;
/* Captures */
case REGEX_NODE_TYPE_CAPTURE:
capture = context->ncaptures++ * 2;
newInstr.opcode = REGEX_PROGRAM_OPCODE_SAVE;
newInstr.u.e.save = capture;
emit(context,&newInstr);
compile_context(context, node->u.e.captured);
newInstr.u.e.save = capture + 1;
emit(context, &newInstr);
break;
}
return bottom;
}
/* Compile a parsed pattern (using a previously allocated program with at least
* estimate_instructions(root) instructions).
*/
STATIC cregex_program_t *compile_node_with_program(const cregex_node_t *root,
cregex_program_t *program)
{
regex_compile_context context;
cregex_node_t rootNode;
cregex_program_instr_t finalInstr;
memset(&rootNode, 0, sizeof(cregex_node_t));
rootNode.type = REGEX_NODE_TYPE_CAPTURE;
rootNode.u.e.captured = (cregex_node_t *)root;
/* add capture node for entire match */
root = &rootNode;
/* add .*? unless pattern starts with ^ */
if (!node_is_anchored(root))
{
cregex_node_t concatNode;
cregex_node_t quantifierNode;
cregex_node_t anyCharNode;
memset(&anyCharNode, 0, sizeof(cregex_node_t));
anyCharNode.type = REGEX_NODE_TYPE_ANY_CHARACTER;
memset(&quantifierNode, 0, sizeof(cregex_node_t));
quantifierNode.type = REGEX_NODE_TYPE_QUANTIFIER;
quantifierNode.u.c.nmin = 0;
quantifierNode.u.c.nmax = -1;
quantifierNode.u.c.greedy = 0;
quantifierNode.u.c.quantified = &anyCharNode;
memset(&concatNode, 0, sizeof(cregex_node_t));
concatNode.type = REGEX_NODE_TYPE_CONCATENATION;
concatNode.u.d.left = &quantifierNode;
concatNode.u.d.right = (cregex_node_t*)root;
root = &concatNode;
}
/* compile */
memset(&context, 0, sizeof(regex_compile_context));
context.pc = program->instructions;
context.ncaptures = 0;
compile_context(&context, root);
/* emit final match instruction */
memset(&finalInstr, 0, sizeof(cregex_program_instr_t));
finalInstr.opcode = REGEX_PROGRAM_OPCODE_MATCH;
emit(&context, &finalInstr);
/* set total number of instructions */
program->ninstructions = context.pc - program->instructions;
return program;
}
/* Upper bound of number of instructions required to compile parsed pattern. */
STATIC LONG estimate_instructions(const cregex_node_t *root)
{
return count_instructions(root)
/* .*? is added unless pattern starts with ^,
* save instructions are added for beginning and end of match,
* a final match instruction is added to the end of the program
*/
+ !node_is_anchored(root) * 3 + 2 + 1;
}
cregex_program_t *cregex_compile_node(const cregex_node_t *root)
{
size_t size = sizeof(cregex_program_t) +
sizeof(cregex_program_instr_t) * (estimate_instructions(root) - 1);
cregex_program_t *program;
if (!(program = AllocVec(size, MEMF_CLEAR)))
return NULL;
if (!compile_node_with_program(root, program)) {
free(program);
return NULL;
}
return program;
}
/* Free a compiled program */
VOID cregex_compile_free(cregex_program_t *program)
{
FreeVec(program);
}