#include #include #include #include "cregex.h" #define REGEX_VM_MAX_MATCHES 20 /* The VM executes one or more threads, each running a regular expression * program, which is just a list of regular expression instructions. Each * thread maintains two registers while it runs: a program counter (PC) and * a string pointer (SP). */ typedef struct { LONG visited; const cregex_program_instr_t *pc; const char *matches[REGEX_VM_MAX_MATCHES]; } vm_thread; /* Run program on string */ STATIC LONG vm_run(const cregex_program_t *program, const char *string, const char **matches, LONG nmatches); /* Run program on string (using a previously allocated buffer of at least * vm_estimate_threads(program) threads) */ STATIC LONG vm_run_with_threads(const cregex_program_t *program, const char *string, const char **matches, LONG nmatches, vm_thread *threads); typedef struct { LONG nthreads; vm_thread *threads; } vm_thread_list; STATIC VOID vm_add_thread(vm_thread_list *list, const cregex_program_t *program, const cregex_program_instr_t *pc, const char *string, const char *sp, const char **matches, LONG nmatches) { if (list->threads[pc - program->instructions].visited == sp - string + 1) return; list->threads[pc - program->instructions].visited = sp - string + 1; switch (pc->opcode) { case REGEX_PROGRAM_OPCODE_MATCH: /* fall-through */ /* Characters */ case REGEX_PROGRAM_OPCODE_CHARACTER: case REGEX_PROGRAM_OPCODE_ANY_CHARACTER: case REGEX_PROGRAM_OPCODE_CHCLS: case REGEX_PROGRAM_OPCODE_CHCLS_NEGATED: list->threads[list->nthreads].pc = pc; memcpy((char*)list->threads[list->nthreads].matches, (char*)matches, sizeof(matches[0]) * ((nmatches <= REGEX_VM_MAX_MATCHES) ? nmatches : REGEX_VM_MAX_MATCHES)); ++list->nthreads; break; /* Control-flow */ case REGEX_PROGRAM_OPCODE_SPLIT: vm_add_thread(list, program, pc->u.c.first, string, sp, matches, nmatches); vm_add_thread(list, program, pc->u.c.second, string, sp, matches, nmatches); break; case REGEX_PROGRAM_OPCODE_JUMP: vm_add_thread(list, program, pc->u.d.target, string, sp, matches, nmatches); break; /* Assertions */ case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN: if (sp == string) vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches); break; case REGEX_PROGRAM_OPCODE_ASSERT_END: if (!*sp) vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches); break; /* Saving */ case REGEX_PROGRAM_OPCODE_SAVE: if (pc->u.e.save < nmatches && pc->u.e.save < REGEX_VM_MAX_MATCHES) { const char *saved = matches[pc->u.e.save]; matches[pc->u.e.save] = sp; vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches); matches[pc->u.e.save] = saved; } else { vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches); } break; } } /* Upper bound of number of threads required to run program */ STATIC LONG vm_estimate_threads(const cregex_program_t *program) { return program->ninstructions * 2; } STATIC LONG vm_run(const cregex_program_t *program, const char *string, const char **matches, LONG nmatches) { size_t size = sizeof(vm_thread) * vm_estimate_threads(program); vm_thread *threads; LONG matched; if (!(threads = AllocVec(size, MEMF_CLEAR))) { return -1; } matched = vm_run_with_threads(program, string, matches, nmatches, threads); FreeVec(threads); return matched; } STATIC LONG vm_run_with_threads(const cregex_program_t *program, const char *string, const char **matches, LONG nmatches, vm_thread *threads) { vm_thread_list currentList; vm_thread_list* current; vm_thread_list nextList; vm_thread_list* next; vm_thread_list* swap = NULL; LONG matched = 0; const char *sp = NULL; LONG i = 0; memset(¤tList, 0, sizeof(vm_thread_list)); currentList.nthreads = 0; currentList.threads = threads; current = ¤tList; memset(&nextList, 0, sizeof(vm_thread_list)); nextList.nthreads = 0; nextList.threads = threads + program->ninstructions; next = &nextList; memset(threads, 0, sizeof(vm_thread) * program->ninstructions * 2); vm_add_thread(current, program, program->instructions, string, string, matches, nmatches); for (sp = string;; ++sp) { for (i = 0; i < current->nthreads; ++i) { vm_thread *thread = current->threads + i; switch (thread->pc->opcode) { case REGEX_PROGRAM_OPCODE_MATCH: matched = 1; current->nthreads = 0; memcpy((char*)matches, (char*)thread->matches, sizeof(matches[0]) * ((nmatches <= REGEX_VM_MAX_MATCHES) ? nmatches : REGEX_VM_MAX_MATCHES)); continue; /* Characters */ case REGEX_PROGRAM_OPCODE_CHARACTER: if (*sp == thread->pc->u.a.ch) break; continue; case REGEX_PROGRAM_OPCODE_ANY_CHARACTER: if (*sp) break; continue; case REGEX_PROGRAM_OPCODE_CHCLS: if (cregex_char_class_contains(thread->pc->u.b.klass, *sp)) break; continue; case REGEX_PROGRAM_OPCODE_CHCLS_NEGATED: if (!cregex_char_class_contains(thread->pc->u.b.klass, *sp)) break; continue; /* Control-flow */ case REGEX_PROGRAM_OPCODE_SPLIT: case REGEX_PROGRAM_OPCODE_JUMP: /* fall-through */ /* Assertions */ case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN: case REGEX_PROGRAM_OPCODE_ASSERT_END: /* fall-through */ /* Saving */ case REGEX_PROGRAM_OPCODE_SAVE: /* handled in vm_add_thread() */ abort(); } vm_add_thread(next, program, thread->pc + 1, string, sp + 1, thread->matches, nmatches); } /* swap current and next thread list */ swap = current; current = next; next = swap; next->nthreads = 0; /* done if no more threads are running or end of string reached */ if (current->nthreads == 0 || !*sp) break; } return matched; } LONG cregex_program_run(const cregex_program_t *program, const char *string, const char **matches, LONG nmatches) { return vm_run(program, string, matches, nmatches); }