#include #include #include "cregex.h" #include typedef struct { cregex_program_instr_t *pc; LONG ncaptures; } regex_compile_context; STATIC LONG count_instructions(const cregex_node_t *node) { switch (node->type) { case REGEX_NODE_TYPE_EPSILON: return 0; /* Characters */ case REGEX_NODE_TYPE_CHARACTER: case REGEX_NODE_TYPE_ANY_CHARACTER: case REGEX_NODE_TYPE_CHARACTER_CLASS: case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED: return 1; /* Composites */ case REGEX_NODE_TYPE_CONCATENATION: return count_instructions(node->u.d.left) + count_instructions(node->u.d.right); case REGEX_NODE_TYPE_ALTERNATION: return 2 + count_instructions(node->u.d.left) + count_instructions(node->u.d.right); /* Quantifiers */ case REGEX_NODE_TYPE_QUANTIFIER: { LONG num = count_instructions(node->u.c.quantified); if (node->u.c.nmax >= node->u.c.nmin) { return node->u.c.nmin * num + (node->u.c.nmax - node->u.c.nmin) * (num + 1); } else { return 1 + (node->u.c.nmin ? node->u.c.nmin * num : num + 1); } } /* Anchors */ case REGEX_NODE_TYPE_ANCHOR_BEGIN: case REGEX_NODE_TYPE_ANCHOR_END: return 1; /* Captures */ case REGEX_NODE_TYPE_CAPTURE: return 2 + count_instructions(node->u.e.captured); } /* should not reach here */ return 0; } STATIC BOOL node_is_anchored(const cregex_node_t *node) { switch (node->type) { case REGEX_NODE_TYPE_EPSILON: return FALSE; /* Characters */ case REGEX_NODE_TYPE_CHARACTER: case REGEX_NODE_TYPE_ANY_CHARACTER: case REGEX_NODE_TYPE_CHARACTER_CLASS: case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED: return FALSE; /* Composites */ case REGEX_NODE_TYPE_CONCATENATION: return node_is_anchored(node->u.d.left); case REGEX_NODE_TYPE_ALTERNATION: return (BOOL)(node_is_anchored(node->u.d.left) && node_is_anchored(node->u.d.right)); /* Quantifiers */ case REGEX_NODE_TYPE_QUANTIFIER: return node_is_anchored(node->u.c.quantified); /* Anchors */ case REGEX_NODE_TYPE_ANCHOR_BEGIN: return TRUE; case REGEX_NODE_TYPE_ANCHOR_END: return FALSE; /* Captures */ case REGEX_NODE_TYPE_CAPTURE: return node_is_anchored(node->u.e.captured); } /* should not reach here */ return FALSE; } STATIC INLINE cregex_program_instr_t *emit( regex_compile_context *context, const cregex_program_instr_t *instruction) { *context->pc = *instruction; return context->pc++; } STATIC cregex_program_instr_t *compile_char_class( const cregex_node_t *node, cregex_program_instr_t *instruction) { const char *sp = node->u.b.from; for (;;) { LONG ch = *sp++; switch (ch) { case ']': if (sp - 1 == node->u.b.from) goto CHARACTER; return instruction; case '\\': ch = *sp++; /* fall-through */ default: CHARACTER: if (*sp == '-' && sp[1] != ']') { for (; ch <= sp[1]; ++ch) cregex_char_class_add(instruction->u.b.klass, ch); sp += 2; } else { cregex_char_class_add(instruction->u.b.klass, ch); } break; } } } STATIC cregex_program_instr_t *compile_context(regex_compile_context *context, const cregex_node_t *node) { cregex_program_instr_t *bottom = context->pc, *split, *jump; LONG ncaptures = context->ncaptures, capture; cregex_program_instr_t newInstr; memset(&newInstr, 0, sizeof(cregex_program_instr_t)); switch (node->type) { case REGEX_NODE_TYPE_EPSILON: break; /* Characters */ case REGEX_NODE_TYPE_CHARACTER: newInstr.opcode = REGEX_PROGRAM_OPCODE_CHARACTER; newInstr.u.a.ch = node->u.a.ch; emit(context, &newInstr); break; case REGEX_NODE_TYPE_ANY_CHARACTER: newInstr.opcode = REGEX_PROGRAM_OPCODE_ANY_CHARACTER; emit(context, &newInstr); break; case REGEX_NODE_TYPE_CHARACTER_CLASS: newInstr.opcode = REGEX_PROGRAM_OPCODE_CHCLS; compile_char_class( node, emit(context, &newInstr)); break; case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED: newInstr.opcode = REGEX_PROGRAM_OPCODE_CHCLS_NEGATED; compile_char_class( node, emit(context, &newInstr)); break; /* Composites */ case REGEX_NODE_TYPE_CONCATENATION: compile_context(context, node->u.d.left); compile_context(context, node->u.d.right); break; case REGEX_NODE_TYPE_ALTERNATION: { cregex_program_instr_t splitInstr; cregex_program_instr_t jumpInstr; memset(&splitInstr, 0, sizeof(cregex_program_instr_t)); memset(&jumpInstr, 0, sizeof(cregex_program_instr_t)); splitInstr.opcode = REGEX_PROGRAM_OPCODE_SPLIT; jumpInstr.opcode = REGEX_PROGRAM_OPCODE_JUMP; split = emit(context, &splitInstr); split->u.c.first = compile_context(context, node->u.d.left); jump = emit(context, &jumpInstr); split->u.c.second = compile_context(context, node->u.d.right); jump->u.d.target = context->pc; } break; /* Quantifiers */ case REGEX_NODE_TYPE_QUANTIFIER: { cregex_program_instr_t *last = NULL; LONG i = 0; for (i = 0; i < node->u.c.nmin; ++i) { context->ncaptures = ncaptures; last = compile_context(context, node->u.c.quantified); } if (node->u.c.nmax > node->u.c.nmin) { for (i = 0; i < node->u.c.nmax - node->u.c.nmin; ++i) { memset(&newInstr, 0, sizeof(cregex_program_instr_t)); newInstr.opcode = REGEX_PROGRAM_OPCODE_SPLIT; context->ncaptures = ncaptures; split = emit(context, &newInstr); split->u.c.first = compile_context(context, node->u.c.quantified); split->u.c.second = context->pc; if (!node->u.c.greedy) { cregex_program_instr_t *swap = split->u.c.first; split->u.c.first = split->u.c.second; split->u.c.second = swap; } } } else if (node->u.c.nmax == -1) { newInstr.opcode = REGEX_PROGRAM_OPCODE_SPLIT; split = emit(context, &newInstr); if (node->u.c.nmin == 0) { split->u.c.first = compile_context(context, node->u.c.quantified); newInstr.opcode = REGEX_PROGRAM_OPCODE_JUMP; jump = emit(context, &newInstr); split->u.c.second = context->pc; jump->u.d.target = split; } else { split->u.c.first = last; split->u.c.second = context->pc; } if (!node->u.c.greedy) { cregex_program_instr_t *swap = split->u.c.first; split->u.c.first = split->u.c.second; split->u.c.second = swap; } } break; } /* Anchors */ case REGEX_NODE_TYPE_ANCHOR_BEGIN: newInstr.opcode = REGEX_PROGRAM_OPCODE_ASSERT_BEGIN; emit(context, &newInstr); break; case REGEX_NODE_TYPE_ANCHOR_END: newInstr.opcode = REGEX_PROGRAM_OPCODE_ASSERT_END; emit(context, &newInstr); break; /* Captures */ case REGEX_NODE_TYPE_CAPTURE: capture = context->ncaptures++ * 2; newInstr.opcode = REGEX_PROGRAM_OPCODE_SAVE; newInstr.u.e.save = capture; emit(context,&newInstr); compile_context(context, node->u.e.captured); newInstr.u.e.save = capture + 1; emit(context, &newInstr); break; } return bottom; } /* Compile a parsed pattern (using a previously allocated program with at least * estimate_instructions(root) instructions). */ STATIC cregex_program_t *compile_node_with_program(const cregex_node_t *root, cregex_program_t *program) { regex_compile_context context; cregex_node_t rootNode; cregex_program_instr_t finalInstr; memset(&rootNode, 0, sizeof(cregex_node_t)); rootNode.type = REGEX_NODE_TYPE_CAPTURE; rootNode.u.e.captured = (cregex_node_t *)root; /* add capture node for entire match */ root = &rootNode; /* add .*? unless pattern starts with ^ */ if (!node_is_anchored(root)) { cregex_node_t concatNode; cregex_node_t quantifierNode; cregex_node_t anyCharNode; memset(&anyCharNode, 0, sizeof(cregex_node_t)); anyCharNode.type = REGEX_NODE_TYPE_ANY_CHARACTER; memset(&quantifierNode, 0, sizeof(cregex_node_t)); quantifierNode.type = REGEX_NODE_TYPE_QUANTIFIER; quantifierNode.u.c.nmin = 0; quantifierNode.u.c.nmax = -1; quantifierNode.u.c.greedy = 0; quantifierNode.u.c.quantified = &anyCharNode; memset(&concatNode, 0, sizeof(cregex_node_t)); concatNode.type = REGEX_NODE_TYPE_CONCATENATION; concatNode.u.d.left = &quantifierNode; concatNode.u.d.right = (cregex_node_t*)root; root = &concatNode; } /* compile */ memset(&context, 0, sizeof(regex_compile_context)); context.pc = program->instructions; context.ncaptures = 0; compile_context(&context, root); /* emit final match instruction */ memset(&finalInstr, 0, sizeof(cregex_program_instr_t)); finalInstr.opcode = REGEX_PROGRAM_OPCODE_MATCH; emit(&context, &finalInstr); /* set total number of instructions */ program->ninstructions = context.pc - program->instructions; return program; } /* Upper bound of number of instructions required to compile parsed pattern. */ STATIC LONG estimate_instructions(const cregex_node_t *root) { return count_instructions(root) /* .*? is added unless pattern starts with ^, * save instructions are added for beginning and end of match, * a final match instruction is added to the end of the program */ + !node_is_anchored(root) * 3 + 2 + 1; } cregex_program_t *cregex_compile_node(const cregex_node_t *root) { size_t size = sizeof(cregex_program_t) + sizeof(cregex_program_instr_t) * (estimate_instructions(root) - 1); cregex_program_t *program; if (!(program = AllocVec(size, MEMF_CLEAR))) return NULL; if (!compile_node_with_program(root, program)) { free(program); return NULL; } return program; } /* Free a compiled program */ VOID cregex_compile_free(cregex_program_t *program) { FreeVec(program); }