You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
351 lines
11 KiB
351 lines
11 KiB
#include <proto/exec.h>
|
|
#include <stdlib.h>
|
|
|
|
#include "cregex.h"
|
|
#include <string.h>
|
|
|
|
typedef struct {
|
|
cregex_program_instr_t *pc;
|
|
LONG ncaptures;
|
|
} regex_compile_context;
|
|
|
|
STATIC LONG count_instructions(const cregex_node_t *node)
|
|
{
|
|
switch (node->type) {
|
|
case REGEX_NODE_TYPE_EPSILON:
|
|
return 0;
|
|
|
|
/* Characters */
|
|
case REGEX_NODE_TYPE_CHARACTER:
|
|
case REGEX_NODE_TYPE_ANY_CHARACTER:
|
|
case REGEX_NODE_TYPE_CHARACTER_CLASS:
|
|
case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
|
|
return 1;
|
|
|
|
/* Composites */
|
|
case REGEX_NODE_TYPE_CONCATENATION:
|
|
return count_instructions(node->u.d.left) + count_instructions(node->u.d.right);
|
|
case REGEX_NODE_TYPE_ALTERNATION:
|
|
return 2 +
|
|
count_instructions(node->u.d.left) +
|
|
count_instructions(node->u.d.right);
|
|
|
|
/* Quantifiers */
|
|
case REGEX_NODE_TYPE_QUANTIFIER: {
|
|
LONG num = count_instructions(node->u.c.quantified);
|
|
if (node->u.c.nmax >= node->u.c.nmin)
|
|
{
|
|
return node->u.c.nmin * num + (node->u.c.nmax - node->u.c.nmin) * (num + 1);
|
|
}
|
|
else
|
|
{
|
|
return 1 + (node->u.c.nmin ? node->u.c.nmin * num : num + 1);
|
|
}
|
|
}
|
|
|
|
/* Anchors */
|
|
case REGEX_NODE_TYPE_ANCHOR_BEGIN:
|
|
case REGEX_NODE_TYPE_ANCHOR_END:
|
|
return 1;
|
|
|
|
/* Captures */
|
|
case REGEX_NODE_TYPE_CAPTURE:
|
|
return 2 + count_instructions(node->u.e.captured);
|
|
}
|
|
|
|
/* should not reach here */
|
|
return 0;
|
|
}
|
|
|
|
STATIC BOOL node_is_anchored(const cregex_node_t *node)
|
|
{
|
|
switch (node->type) {
|
|
case REGEX_NODE_TYPE_EPSILON:
|
|
return FALSE;
|
|
|
|
/* Characters */
|
|
case REGEX_NODE_TYPE_CHARACTER:
|
|
case REGEX_NODE_TYPE_ANY_CHARACTER:
|
|
case REGEX_NODE_TYPE_CHARACTER_CLASS:
|
|
case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
|
|
return FALSE;
|
|
|
|
/* Composites */
|
|
case REGEX_NODE_TYPE_CONCATENATION:
|
|
return node_is_anchored(node->u.d.left);
|
|
case REGEX_NODE_TYPE_ALTERNATION:
|
|
return (BOOL)(node_is_anchored(node->u.d.left) && node_is_anchored(node->u.d.right));
|
|
|
|
/* Quantifiers */
|
|
case REGEX_NODE_TYPE_QUANTIFIER:
|
|
return node_is_anchored(node->u.c.quantified);
|
|
|
|
/* Anchors */
|
|
case REGEX_NODE_TYPE_ANCHOR_BEGIN:
|
|
return TRUE;
|
|
case REGEX_NODE_TYPE_ANCHOR_END:
|
|
return FALSE;
|
|
|
|
/* Captures */
|
|
case REGEX_NODE_TYPE_CAPTURE:
|
|
return node_is_anchored(node->u.e.captured);
|
|
}
|
|
|
|
/* should not reach here */
|
|
return FALSE;
|
|
}
|
|
|
|
STATIC INLINE cregex_program_instr_t *emit(
|
|
regex_compile_context *context,
|
|
const cregex_program_instr_t *instruction)
|
|
{
|
|
*context->pc = *instruction;
|
|
return context->pc++;
|
|
}
|
|
|
|
STATIC cregex_program_instr_t *compile_char_class(
|
|
const cregex_node_t *node,
|
|
cregex_program_instr_t *instruction)
|
|
{
|
|
const char *sp = node->u.b.from;
|
|
|
|
for (;;) {
|
|
LONG ch = *sp++;
|
|
switch (ch) {
|
|
case ']':
|
|
if (sp - 1 == node->u.b.from)
|
|
goto CHARACTER;
|
|
return instruction;
|
|
case '\\':
|
|
ch = *sp++;
|
|
/* fall-through */
|
|
default:
|
|
CHARACTER:
|
|
if (*sp == '-' && sp[1] != ']') {
|
|
for (; ch <= sp[1]; ++ch)
|
|
cregex_char_class_add(instruction->u.b.klass, ch);
|
|
sp += 2;
|
|
} else {
|
|
cregex_char_class_add(instruction->u.b.klass, ch);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
STATIC cregex_program_instr_t *compile_context(regex_compile_context *context,
|
|
const cregex_node_t *node)
|
|
{
|
|
cregex_program_instr_t *bottom = context->pc, *split, *jump;
|
|
LONG ncaptures = context->ncaptures, capture;
|
|
cregex_program_instr_t newInstr;
|
|
memset(&newInstr, 0, sizeof(cregex_program_instr_t));
|
|
|
|
|
|
switch (node->type) {
|
|
case REGEX_NODE_TYPE_EPSILON:
|
|
break;
|
|
|
|
/* Characters */
|
|
case REGEX_NODE_TYPE_CHARACTER:
|
|
newInstr.opcode = REGEX_PROGRAM_OPCODE_CHARACTER;
|
|
newInstr.u.a.ch = node->u.a.ch;
|
|
emit(context, &newInstr);
|
|
break;
|
|
case REGEX_NODE_TYPE_ANY_CHARACTER:
|
|
newInstr.opcode = REGEX_PROGRAM_OPCODE_ANY_CHARACTER;
|
|
emit(context, &newInstr);
|
|
break;
|
|
case REGEX_NODE_TYPE_CHARACTER_CLASS:
|
|
newInstr.opcode = REGEX_PROGRAM_OPCODE_CHCLS;
|
|
compile_char_class( node, emit(context, &newInstr));
|
|
break;
|
|
case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
|
|
newInstr.opcode = REGEX_PROGRAM_OPCODE_CHCLS_NEGATED;
|
|
compile_char_class( node, emit(context, &newInstr));
|
|
break;
|
|
|
|
/* Composites */
|
|
case REGEX_NODE_TYPE_CONCATENATION:
|
|
compile_context(context, node->u.d.left);
|
|
compile_context(context, node->u.d.right);
|
|
break;
|
|
case REGEX_NODE_TYPE_ALTERNATION: {
|
|
cregex_program_instr_t splitInstr;
|
|
cregex_program_instr_t jumpInstr;
|
|
memset(&splitInstr, 0, sizeof(cregex_program_instr_t));
|
|
memset(&jumpInstr, 0, sizeof(cregex_program_instr_t));
|
|
splitInstr.opcode = REGEX_PROGRAM_OPCODE_SPLIT;
|
|
jumpInstr.opcode = REGEX_PROGRAM_OPCODE_JUMP;
|
|
|
|
split = emit(context, &splitInstr);
|
|
split->u.c.first = compile_context(context, node->u.d.left);
|
|
jump = emit(context, &jumpInstr);
|
|
split->u.c.second = compile_context(context, node->u.d.right);
|
|
jump->u.d.target = context->pc;
|
|
}
|
|
break;
|
|
|
|
/* Quantifiers */
|
|
case REGEX_NODE_TYPE_QUANTIFIER: {
|
|
cregex_program_instr_t *last = NULL;
|
|
LONG i = 0;
|
|
for (i = 0; i < node->u.c.nmin; ++i) {
|
|
context->ncaptures = ncaptures;
|
|
last = compile_context(context, node->u.c.quantified);
|
|
}
|
|
if (node->u.c.nmax > node->u.c.nmin) {
|
|
for (i = 0; i < node->u.c.nmax - node->u.c.nmin; ++i) {
|
|
memset(&newInstr, 0, sizeof(cregex_program_instr_t));
|
|
newInstr.opcode = REGEX_PROGRAM_OPCODE_SPLIT;
|
|
context->ncaptures = ncaptures;
|
|
split = emit(context, &newInstr);
|
|
split->u.c.first = compile_context(context, node->u.c.quantified);
|
|
split->u.c.second = context->pc;
|
|
if (!node->u.c.greedy) {
|
|
cregex_program_instr_t *swap = split->u.c.first;
|
|
split->u.c.first = split->u.c.second;
|
|
split->u.c.second = swap;
|
|
}
|
|
}
|
|
} else if (node->u.c.nmax == -1) {
|
|
newInstr.opcode = REGEX_PROGRAM_OPCODE_SPLIT;
|
|
split = emit(context, &newInstr);
|
|
if (node->u.c.nmin == 0) {
|
|
split->u.c.first = compile_context(context, node->u.c.quantified);
|
|
newInstr.opcode = REGEX_PROGRAM_OPCODE_JUMP;
|
|
jump = emit(context, &newInstr);
|
|
split->u.c.second = context->pc;
|
|
jump->u.d.target = split;
|
|
} else {
|
|
split->u.c.first = last;
|
|
split->u.c.second = context->pc;
|
|
}
|
|
if (!node->u.c.greedy) {
|
|
cregex_program_instr_t *swap = split->u.c.first;
|
|
split->u.c.first = split->u.c.second;
|
|
split->u.c.second = swap;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
|
|
/* Anchors */
|
|
case REGEX_NODE_TYPE_ANCHOR_BEGIN:
|
|
newInstr.opcode = REGEX_PROGRAM_OPCODE_ASSERT_BEGIN;
|
|
emit(context, &newInstr);
|
|
break;
|
|
case REGEX_NODE_TYPE_ANCHOR_END:
|
|
newInstr.opcode = REGEX_PROGRAM_OPCODE_ASSERT_END;
|
|
emit(context, &newInstr);
|
|
break;
|
|
|
|
/* Captures */
|
|
case REGEX_NODE_TYPE_CAPTURE:
|
|
capture = context->ncaptures++ * 2;
|
|
newInstr.opcode = REGEX_PROGRAM_OPCODE_SAVE;
|
|
|
|
newInstr.u.e.save = capture;
|
|
emit(context,&newInstr);
|
|
|
|
compile_context(context, node->u.e.captured);
|
|
|
|
newInstr.u.e.save = capture + 1;
|
|
emit(context, &newInstr);
|
|
break;
|
|
}
|
|
|
|
return bottom;
|
|
}
|
|
|
|
/* Compile a parsed pattern (using a previously allocated program with at least
|
|
* estimate_instructions(root) instructions).
|
|
*/
|
|
STATIC cregex_program_t *compile_node_with_program(const cregex_node_t *root,
|
|
cregex_program_t *program)
|
|
{
|
|
regex_compile_context context;
|
|
cregex_node_t rootNode;
|
|
cregex_program_instr_t finalInstr;
|
|
|
|
memset(&rootNode, 0, sizeof(cregex_node_t));
|
|
rootNode.type = REGEX_NODE_TYPE_CAPTURE;
|
|
rootNode.u.e.captured = (cregex_node_t *)root;
|
|
/* add capture node for entire match */
|
|
root = &rootNode;
|
|
|
|
/* add .*? unless pattern starts with ^ */
|
|
if (!node_is_anchored(root))
|
|
{
|
|
cregex_node_t concatNode;
|
|
cregex_node_t quantifierNode;
|
|
cregex_node_t anyCharNode;
|
|
|
|
memset(&anyCharNode, 0, sizeof(cregex_node_t));
|
|
anyCharNode.type = REGEX_NODE_TYPE_ANY_CHARACTER;
|
|
|
|
memset(&quantifierNode, 0, sizeof(cregex_node_t));
|
|
quantifierNode.type = REGEX_NODE_TYPE_QUANTIFIER;
|
|
quantifierNode.u.c.nmin = 0;
|
|
quantifierNode.u.c.nmax = -1;
|
|
quantifierNode.u.c.greedy = 0;
|
|
quantifierNode.u.c.quantified = &anyCharNode;
|
|
|
|
memset(&concatNode, 0, sizeof(cregex_node_t));
|
|
concatNode.type = REGEX_NODE_TYPE_CONCATENATION;
|
|
concatNode.u.d.left = &quantifierNode;
|
|
concatNode.u.d.right = (cregex_node_t*)root;
|
|
|
|
root = &concatNode;
|
|
}
|
|
|
|
/* compile */
|
|
memset(&context, 0, sizeof(regex_compile_context));
|
|
context.pc = program->instructions;
|
|
context.ncaptures = 0;
|
|
compile_context(&context, root);
|
|
|
|
/* emit final match instruction */
|
|
memset(&finalInstr, 0, sizeof(cregex_program_instr_t));
|
|
finalInstr.opcode = REGEX_PROGRAM_OPCODE_MATCH;
|
|
emit(&context, &finalInstr);
|
|
|
|
/* set total number of instructions */
|
|
program->ninstructions = context.pc - program->instructions;
|
|
|
|
return program;
|
|
}
|
|
|
|
/* Upper bound of number of instructions required to compile parsed pattern. */
|
|
STATIC LONG estimate_instructions(const cregex_node_t *root)
|
|
{
|
|
return count_instructions(root)
|
|
/* .*? is added unless pattern starts with ^,
|
|
* save instructions are added for beginning and end of match,
|
|
* a final match instruction is added to the end of the program
|
|
*/
|
|
+ !node_is_anchored(root) * 3 + 2 + 1;
|
|
}
|
|
|
|
cregex_program_t *cregex_compile_node(const cregex_node_t *root)
|
|
{
|
|
size_t size = sizeof(cregex_program_t) +
|
|
sizeof(cregex_program_instr_t) * (estimate_instructions(root) - 1);
|
|
cregex_program_t *program;
|
|
|
|
if (!(program = AllocVec(size, MEMF_CLEAR)))
|
|
return NULL;
|
|
|
|
if (!compile_node_with_program(root, program)) {
|
|
free(program);
|
|
return NULL;
|
|
}
|
|
|
|
return program;
|
|
}
|
|
|
|
/* Free a compiled program */
|
|
VOID cregex_compile_free(cregex_program_t *program)
|
|
{
|
|
FreeVec(program);
|
|
}
|
|
|