commit 4efeabf70b3c1b7bcd4a79947166abfd52322aa5 Author: Alan Francis Date: Sat Nov 18 13:04:51 2023 +0000 Initial Commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bfc48da --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +*.o +*.lib +*.uaem +configreader +.DS_Store + diff --git a/configmodel.c b/configmodel.c new file mode 100644 index 0000000..476100a --- /dev/null +++ b/configmodel.c @@ -0,0 +1,234 @@ +#include "configmodel.h" + +#include "cregex.h" +#include "stringarray.h" + +#include +#include +#include +#include + +cregex_program_t* InitialisePattern(CONST_STRPTR pattern); +Array RunPattern(CONST_STRPTR text, cregex_program_t* patternProgram); + +#define WHITESPACE "[ \\t\\n\\r\\f\\v]" +#define RX_BLANK_LINE "^[ \t\n\r\f\v]*($|#|;)" + +#define RX_SECTION_LINE "^[ \t\n\r\f\v]*\\[([a-z0-9]+)([ \t\n\r\f\v]*\"(.+)\")*\\][ \t\n\r\f\v]*($|#|;)" +#define RX_VARIABLE_LINE "^[ \t\n\r\f\v]*([a-z][a-z0-9]+)[ \t\n\r\f\v]*=[ \t\n\r\f\v]*(.+)[ \t\n\r\f\v]*($|#|;)" +#define RX_INTEGER "^-?[1-9][0-9]*$" + +STATIC cregex_program_t* sectionPatternProgram = NULL; +STATIC cregex_program_t* variablePatternProgram = NULL; +STATIC cregex_program_t* blankPatternProgram = NULL; +STATIC cregex_program_t* integerPatternProgram = NULL; + +struct Section +{ + StringArray names; +}; + +enum VariableType +{ + TypeBool=0, + TypeInteger=1, + TypeString=2, +}; + +struct Variable +{ + enum VariableType type; + CONST_STRPTR name; + union + { + CONST_STRPTR stringValue; + BOOL boolValue; + LONG longValue; + } value; +}; + +struct Line +{ + STRPTR rawText; + union + { + struct Variable* variable; + struct Section* section; + } + object; +}; + +VOID InitialisePatterns(VOID) +{ + sectionPatternProgram = InitialisePattern(RX_SECTION_LINE); + variablePatternProgram = InitialisePattern(RX_VARIABLE_LINE); + blankPatternProgram = InitialisePattern(RX_BLANK_LINE); + integerPatternProgram = InitialisePattern(RX_INTEGER); +} + +VOID ReleasePatterns(VOID) +{ + if( 
sectionPatternProgram != NULL ) cregex_compile_free( sectionPatternProgram );
+    if( variablePatternProgram != NULL ) cregex_compile_free( variablePatternProgram );
+    if( blankPatternProgram != NULL ) cregex_compile_free( blankPatternProgram );
+    if( integerPatternProgram != NULL ) cregex_compile_free( integerPatternProgram );
+}
+
+/*
+ * Parse and compile one regular expression.
+ * Returns the compiled program, or NULL (after printing a diagnostic) when
+ * the pattern cannot be parsed or compiled. The intermediate parse tree is
+ * always released before returning.
+ */
+cregex_program_t* InitialisePattern(CONST_STRPTR pattern)
+{
+    cregex_program_t* result = NULL;
+    cregex_node_t* patternNode = cregex_parse(pattern);
+    if( patternNode )
+    {
+        result = cregex_compile_node( patternNode );
+        if( result != NULL )
+        {
+            //Printf("successfully compiled %s\n", pattern);
+        }
+        else
+        {
+            Printf("failed to compile %s\n", pattern);
+        }
+        cregex_parse_free( patternNode );
+    }
+    else
+    {
+        Printf("could not parse %s\n", pattern);
+    }
+    return result;
+}
+
+/*
+ * Run a compiled pattern over text and return the captured substrings.
+ *
+ * The result holds the whole match first, followed by every capture group
+ * that took part in the match, in pattern order. Groups that did not take
+ * part are left out, so later entries move down. A trailing '\n' is removed
+ * from each entry. Every entry is a fresh AllocVec'd copy owned by the array.
+ *
+ * Returns NULL when there is no match, when text or patternProgram is NULL,
+ * or when memory runs out.
+ */
+Array RunPattern(CONST_STRPTR text, cregex_program_t* patternProgram)
+{
+    Array result = NULL;
+    const char* localMatches[20] = {0};
+    int j = 0;
+    int nmatches = 0;
+
+    // InitialisePattern leaves the program NULL when a pattern fails to compile
+    if( text == NULL || patternProgram == NULL )
+    {
+        return NULL;
+    }
+
+    if (cregex_program_run(patternProgram, text, localMatches, 20) <= 0) {
+        //Printf("\"%s\": no match\n", text);
+        return NULL;
+    }
+
+    // find the last slot the matcher filled in
+    for (j = 0; j < 20; ++j)
+        if (localMatches[j])
+            nmatches = j;
+
+    if( nmatches <= 0 )
+    {
+        return NULL;
+    }
+
+    result = StringArrayNew();
+    if( result == NULL )
+    {
+        return NULL;
+    }
+
+    // each capture is a (start, end) pair of pointers into text
+    for (j = 0; j <= nmatches; j += 2) {
+        if (localMatches[j] && localMatches[j + 1]) {
+            int len = (int)(localMatches[j + 1] - localMatches[j]);
+            STRPTR buffer = AllocVec(len+1, MEMF_CLEAR); // freed in the array
+            if( buffer == NULL )
+            {
+                // a partial result would shift the indices callers rely on
+                StringArrayFree(result);
+                return NULL;
+            }
+            // an empty capture (e.g. "$") has len 0: nothing to copy, and
+            // buffer[len-1] would be outside the allocation
+            if( len > 0 )
+            {
+                sprintf(buffer, "%.*s", len, localMatches[j]);
+                if( buffer[len-1] == '\n' )
+                {
+                    buffer[len-1] = '\0';
+                }
+            }
+            StringArrayAppend(result, buffer);
+        } else {
+            //Printf("(NULL,NULL)\n");
+        }
+    }
+    return result;
+}
+
+/*
+ * Read one logical line from file and classify it.
+ *
+ * A physical line that ends in backslash-newline is joined with the line
+ * that follows it; the backslash and newline stay in the stored text. One
+ * call reads at most what fits in the 512-byte work buffer.
+ *
+ * The text is stored without its final newline and is printed as a section,
+ * a variable, a blank/comment line ("YY") or an unrecognised line ("XX").
+ *
+ * Returns a new line object (release it with LineFree), or NULL when nothing
+ * more could be read or memory ran out.
+ */
+LINEPTR LineReadIncludingContinuation(BPTR file)
+{
+    ULONG bufLength = 512;
+    UBYTE* buffer = AllocVec(bufLength, MEMF_CLEAR);
+    ULONG bytesReadTotal = 0;
+    UBYTE* read = NULL;
+    struct Line* result = NULL;
+
+    if( buffer == NULL )
+    {
+        return NULL;
+    }
+
+    // keep appending while the text read so far ends in "\\\n" and the
+    // buffer still has room for at least one more character
+    do
+    {
+        read = FGets(file, &(buffer[bytesReadTotal]), bufLength-bytesReadTotal);
+        bytesReadTotal = strlen(buffer);
+    }
+    while( read != NULL && bytesReadTotal >= 2 && bytesReadTotal + 1 < bufLength && buffer[bytesReadTotal-1] == '\n' && buffer[bytesReadTotal-2] == '\\' );
+
+
+    if( bytesReadTotal > 0 )
+    {
+        StringArray stringArray = NULL;
+        ULONG textLength = bytesReadTotal;
+
+        // drop the terminating newline only when there is one; the last
+        // line of a file may end without it
+        if( buffer[textLength-1] == '\n' )
+        {
+            textLength--;
+        }
+
+        result = AllocVec(sizeof(struct Line), MEMF_CLEAR);
+        if( result != NULL )
+        {
+            result->rawText = AllocVec(textLength+1, MEMF_CLEAR);
+            if( result->rawText == NULL )
+            {
+                FreeVec(result);
+                result = NULL;
+            }
+        }
+
+        if( result != NULL )
+        {
+            CopyMem(buffer, result->rawText, textLength);
+
+//          Printf("\n\nraw text = {{{%s}}}\n", result->rawText); //
+            stringArray = RunPattern(result->rawText, sectionPatternProgram);
+            if( stringArray )
+            {
+                if( SizeOfArray(stringArray) == 3 )
+                {
+                    Printf("\nsection {%s}\n", StringArrayValues(stringArray)[1]);
+                }
+                else if( SizeOfArray(stringArray) == 5 )
+                {
+                    Printf("\nsection {%s.%s}\n", StringArrayValues(stringArray)[1], StringArrayValues(stringArray)[3]);
+                }
+//              Printf("\nsection size=%ld\n", SizeOfArray(stringArray));
+//              StringArrayForEach(stringArray, Printf("{{%s}}",aString););
+                StringArrayFree(stringArray);
+            }
+            else
+            {
+                stringArray = RunPattern(result->rawText, variablePatternProgram);
+                if( stringArray )
+                {
+                    Printf("\nvariable {%s} = {%s}\n", StringArrayValues(stringArray)[1], StringArrayValues(stringArray)[2]);
+//                  StringArrayForEach(stringArray, Printf("{{%s}}",aString););
+                    StringArrayFree(stringArray);
+                }
+                else
+                {
+                    stringArray = RunPattern(result->rawText, blankPatternProgram);
+                    if( stringArray )
+                    {
+                        Printf("\nYY %s\n", result->rawText);
+                        StringArrayFree(stringArray);
+                    }
+                    else
+                    {
+                        Printf("\nXX %s\n", result->rawText);
+                    }
+                }
+            }
+        }
+
+
+    }
+    FreeVec(buffer);
+    return result;
+}
+
+VOID LineFree(LINEPTR abstractLine)
+{
+    struct Line* line = (struct Line*)abstractLine;
+    if( line != NULL )
+    {
+        if( line->rawText != NULL )
+        {
+            FreeVec(line->rawText);
+        }
+        FreeVec(line);
+    }
+}
+
+CONST_STRPTR LineGetRawText(LINEPTR abstractLine)
+{
+    struct Line* line = (struct Line*)abstractLine;
+    if( line != NULL )
+    {
+        
return (CONST_STRPTR)line->rawText; + } + else + { + return NULL; + } +} + diff --git a/configmodel.h b/configmodel.h new file mode 100644 index 0000000..6e27560 --- /dev/null +++ b/configmodel.h @@ -0,0 +1,16 @@ +#ifndef __CONFIGMODEL_H +#define __CONFIGMODEL_H +#include +#include + +typedef APTR LINEPTR; +typedef APTR SECTIONPTR; +typedef APTR VARIABLEPTR; + +VOID InitialisePatterns(VOID); +VOID ReleasePatterns(VOID); + +LINEPTR LineReadIncludingContinuation(BPTR file); +VOID LineFree(LINEPTR abstractLine); +CONST_STRPTR LineGetRawText(LINEPTR line); +#endif \ No newline at end of file diff --git a/cregex.h b/cregex.h new file mode 100644 index 0000000..b0acfaa --- /dev/null +++ b/cregex.h @@ -0,0 +1,147 @@ +#ifndef CREGEX_H +#define CREGEX_H + +#include +#if defined(__SASC) +#define INLINE __inline +#elif defined(__VBCC__) +#define INLINE inline +#else +#define INLINE +#endif + +typedef enum { + REGEX_NODE_TYPE_EPSILON = 0, + /* Characters */ + REGEX_NODE_TYPE_CHARACTER, + REGEX_NODE_TYPE_ANY_CHARACTER, + REGEX_NODE_TYPE_CHARACTER_CLASS, + REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED, + /* Composites */ + REGEX_NODE_TYPE_CONCATENATION, + REGEX_NODE_TYPE_ALTERNATION, + /* Quantifiers */ + REGEX_NODE_TYPE_QUANTIFIER, + /* Anchors */ + REGEX_NODE_TYPE_ANCHOR_BEGIN, + REGEX_NODE_TYPE_ANCHOR_END, + /* Captures */ + REGEX_NODE_TYPE_CAPTURE +} cregex_node_type; + +typedef struct cregex_node { + cregex_node_type type; + union { + /* REGEX_NODE_TYPE_CHARACTER */ + struct { + LONG ch; + } a; + /* REGEX_NODE_TYPE_CHARACTER_CLASS, + * REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED + */ + struct { + const char *from, *to; + } b; + /* REGEX_NODE_TYPE_QUANTIFIER */ + struct { + LONG nmin, nmax, greedy; + struct cregex_node *quantified; + } c; + /* REGEX_NODE_TYPE_CONCATENATION, + * REGEX_NODE_TYPE_ALTERNATION + */ + struct { + struct cregex_node *left, *right; + } d; + /* REGEX_NODE_TYPE_CAPTURE */ + struct { + struct cregex_node *captured; + } e; + } u; +} cregex_node_t; + +typedef enum 
{
+    REGEX_PROGRAM_OPCODE_MATCH = 0,
+    /* Characters */
+    REGEX_PROGRAM_OPCODE_CHARACTER,
+    REGEX_PROGRAM_OPCODE_ANY_CHARACTER,
+    REGEX_PROGRAM_OPCODE_CHCLS,
+    REGEX_PROGRAM_OPCODE_CHCLS_NEGATED,
+    /* Control-flow */
+    REGEX_PROGRAM_OPCODE_SPLIT,
+    REGEX_PROGRAM_OPCODE_JUMP,
+    /* Assertions */
+    REGEX_PROGRAM_OPCODE_ASSERT_BEGIN,
+    REGEX_PROGRAM_OPCODE_ASSERT_END,
+    /* Saving */
+    REGEX_PROGRAM_OPCODE_SAVE
+} cregex_program_opcode_t;
+
+#include <limits.h>
+
+/* Bitmap with one bit per possible byte value (UCHAR_MAX + 1 bits). */
+typedef char cregex_char_class[(UCHAR_MAX + CHAR_BIT - 1) / CHAR_BIT];
+
+/*
+ * Test whether ch is a member of klass; returns non-zero when it is.
+ * ch usually comes from a plain char, which may be signed, so it is reduced
+ * to its byte value first: a negative ch would otherwise index in front of
+ * the bitmap.
+ */
+STATIC INLINE LONG cregex_char_class_contains(const cregex_char_class klass,
+                                              LONG ch)
+{
+    ULONG byteValue = (ULONG)ch & UCHAR_MAX;
+    return klass[byteValue / CHAR_BIT] & (1 << (byteValue % CHAR_BIT));
+}
+
+/*
+ * Add ch to klass and return ch unchanged.
+ * ch is reduced to its byte value in the same way as in
+ * cregex_char_class_contains(), so both functions address the same bit.
+ */
+STATIC INLINE LONG cregex_char_class_add(cregex_char_class klass, LONG ch)
+{
+    ULONG byteValue = (ULONG)ch & UCHAR_MAX;
+    klass[byteValue / CHAR_BIT] |= 1 << (byteValue % CHAR_BIT);
+    return ch;
+}
+
+typedef struct cregex_program_instr {
+    cregex_program_opcode_t opcode;
+    union {
+        /* REGEX_PROGRAM_OPCODE_CHARACTER */
+        struct {
+            LONG ch;
+        } a;
+        /* REGEX_PROGRAM_OPCODE_CHCLS,
+         * REGEX_PROGRAM_OPCODE_CHCLS_NEGATED
+         */
+        struct {
+            cregex_char_class klass;
+        } b;
+        /* REGEX_PROGRAM_OPCODE_SPLIT */
+        struct {
+            struct cregex_program_instr *first, *second;
+        } c;
+        /* REGEX_PROGRAM_OPCODE_JUMP */
+        struct {
+            struct cregex_program_instr *target;
+        } d;
+        /* REGEX_PROGRAM_OPCODE_SAVE */
+        struct {
+            LONG save;
+        } e;
+    } u;
+} cregex_program_instr_t;
+
+typedef struct {
+    LONG ninstructions;
+    cregex_program_instr_t instructions[1];
+} cregex_program_t;
+
+/* Run program on string */
+LONG cregex_program_run(const cregex_program_t *program,
+                        const char *string,
+                        const char **matches,
+                        LONG nmatches);
+
+/* Compile a parsed pattern */
+cregex_program_t *cregex_compile_node(const cregex_node_t *root);
+
+/* Free a compiled program */
+VOID cregex_compile_free(cregex_program_t *program);
+
+/* Parse a pattern */
+cregex_node_t *cregex_parse(const char *pattern);
+
+/* Free a parsed pattern */
+VOID cregex_parse_free(cregex_node_t *root);
+
+#endif
diff 
--git a/cregex_compile.c b/cregex_compile.c new file mode 100644 index 0000000..6037700 --- /dev/null +++ b/cregex_compile.c @@ -0,0 +1,351 @@ +#include +#include + +#include "cregex.h" +#include + +typedef struct { + cregex_program_instr_t *pc; + LONG ncaptures; +} regex_compile_context; + +STATIC LONG count_instructions(const cregex_node_t *node) +{ + switch (node->type) { + case REGEX_NODE_TYPE_EPSILON: + return 0; + + /* Characters */ + case REGEX_NODE_TYPE_CHARACTER: + case REGEX_NODE_TYPE_ANY_CHARACTER: + case REGEX_NODE_TYPE_CHARACTER_CLASS: + case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED: + return 1; + + /* Composites */ + case REGEX_NODE_TYPE_CONCATENATION: + return count_instructions(node->u.d.left) + count_instructions(node->u.d.right); + case REGEX_NODE_TYPE_ALTERNATION: + return 2 + + count_instructions(node->u.d.left) + + count_instructions(node->u.d.right); + + /* Quantifiers */ + case REGEX_NODE_TYPE_QUANTIFIER: { + LONG num = count_instructions(node->u.c.quantified); + if (node->u.c.nmax >= node->u.c.nmin) + { + return node->u.c.nmin * num + (node->u.c.nmax - node->u.c.nmin) * (num + 1); + } + else + { + return 1 + (node->u.c.nmin ? 
node->u.c.nmin * num : num + 1); + } + } + + /* Anchors */ + case REGEX_NODE_TYPE_ANCHOR_BEGIN: + case REGEX_NODE_TYPE_ANCHOR_END: + return 1; + + /* Captures */ + case REGEX_NODE_TYPE_CAPTURE: + return 2 + count_instructions(node->u.e.captured); + } + + /* should not reach here */ + return 0; +} + +STATIC BOOL node_is_anchored(const cregex_node_t *node) +{ + switch (node->type) { + case REGEX_NODE_TYPE_EPSILON: + return FALSE; + + /* Characters */ + case REGEX_NODE_TYPE_CHARACTER: + case REGEX_NODE_TYPE_ANY_CHARACTER: + case REGEX_NODE_TYPE_CHARACTER_CLASS: + case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED: + return FALSE; + + /* Composites */ + case REGEX_NODE_TYPE_CONCATENATION: + return node_is_anchored(node->u.d.left); + case REGEX_NODE_TYPE_ALTERNATION: + return (BOOL)(node_is_anchored(node->u.d.left) && node_is_anchored(node->u.d.right)); + + /* Quantifiers */ + case REGEX_NODE_TYPE_QUANTIFIER: + return node_is_anchored(node->u.c.quantified); + + /* Anchors */ + case REGEX_NODE_TYPE_ANCHOR_BEGIN: + return TRUE; + case REGEX_NODE_TYPE_ANCHOR_END: + return FALSE; + + /* Captures */ + case REGEX_NODE_TYPE_CAPTURE: + return node_is_anchored(node->u.e.captured); + } + + /* should not reach here */ + return FALSE; +} + +STATIC INLINE cregex_program_instr_t *emit( + regex_compile_context *context, + const cregex_program_instr_t *instruction) +{ + *context->pc = *instruction; + return context->pc++; +} + +STATIC cregex_program_instr_t *compile_char_class( + const cregex_node_t *node, + cregex_program_instr_t *instruction) +{ + const char *sp = node->u.b.from; + + for (;;) { + LONG ch = *sp++; + switch (ch) { + case ']': + if (sp - 1 == node->u.b.from) + goto CHARACTER; + return instruction; + case '\\': + ch = *sp++; + /* fall-through */ + default: + CHARACTER: + if (*sp == '-' && sp[1] != ']') { + for (; ch <= sp[1]; ++ch) + cregex_char_class_add(instruction->u.b.klass, ch); + sp += 2; + } else { + cregex_char_class_add(instruction->u.b.klass, ch); + } + break; + } + 
} +} + +STATIC cregex_program_instr_t *compile_context(regex_compile_context *context, + const cregex_node_t *node) +{ + cregex_program_instr_t *bottom = context->pc, *split, *jump; + LONG ncaptures = context->ncaptures, capture; + cregex_program_instr_t newInstr; + memset(&newInstr, 0, sizeof(cregex_program_instr_t)); + + + switch (node->type) { + case REGEX_NODE_TYPE_EPSILON: + break; + + /* Characters */ + case REGEX_NODE_TYPE_CHARACTER: + newInstr.opcode = REGEX_PROGRAM_OPCODE_CHARACTER; + newInstr.u.a.ch = node->u.a.ch; + emit(context, &newInstr); + break; + case REGEX_NODE_TYPE_ANY_CHARACTER: + newInstr.opcode = REGEX_PROGRAM_OPCODE_ANY_CHARACTER; + emit(context, &newInstr); + break; + case REGEX_NODE_TYPE_CHARACTER_CLASS: + newInstr.opcode = REGEX_PROGRAM_OPCODE_CHCLS; + compile_char_class( node, emit(context, &newInstr)); + break; + case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED: + newInstr.opcode = REGEX_PROGRAM_OPCODE_CHCLS_NEGATED; + compile_char_class( node, emit(context, &newInstr)); + break; + + /* Composites */ + case REGEX_NODE_TYPE_CONCATENATION: + compile_context(context, node->u.d.left); + compile_context(context, node->u.d.right); + break; + case REGEX_NODE_TYPE_ALTERNATION: { + cregex_program_instr_t splitInstr; + cregex_program_instr_t jumpInstr; + memset(&splitInstr, 0, sizeof(cregex_program_instr_t)); + memset(&jumpInstr, 0, sizeof(cregex_program_instr_t)); + splitInstr.opcode = REGEX_PROGRAM_OPCODE_SPLIT; + jumpInstr.opcode = REGEX_PROGRAM_OPCODE_JUMP; + + split = emit(context, &splitInstr); + split->u.c.first = compile_context(context, node->u.d.left); + jump = emit(context, &jumpInstr); + split->u.c.second = compile_context(context, node->u.d.right); + jump->u.d.target = context->pc; + } + break; + + /* Quantifiers */ + case REGEX_NODE_TYPE_QUANTIFIER: { + cregex_program_instr_t *last = NULL; + LONG i = 0; + for (i = 0; i < node->u.c.nmin; ++i) { + context->ncaptures = ncaptures; + last = compile_context(context, node->u.c.quantified); + } 
+ if (node->u.c.nmax > node->u.c.nmin) { + for (i = 0; i < node->u.c.nmax - node->u.c.nmin; ++i) { + memset(&newInstr, 0, sizeof(cregex_program_instr_t)); + newInstr.opcode = REGEX_PROGRAM_OPCODE_SPLIT; + context->ncaptures = ncaptures; + split = emit(context, &newInstr); + split->u.c.first = compile_context(context, node->u.c.quantified); + split->u.c.second = context->pc; + if (!node->u.c.greedy) { + cregex_program_instr_t *swap = split->u.c.first; + split->u.c.first = split->u.c.second; + split->u.c.second = swap; + } + } + } else if (node->u.c.nmax == -1) { + newInstr.opcode = REGEX_PROGRAM_OPCODE_SPLIT; + split = emit(context, &newInstr); + if (node->u.c.nmin == 0) { + split->u.c.first = compile_context(context, node->u.c.quantified); + newInstr.opcode = REGEX_PROGRAM_OPCODE_JUMP; + jump = emit(context, &newInstr); + split->u.c.second = context->pc; + jump->u.d.target = split; + } else { + split->u.c.first = last; + split->u.c.second = context->pc; + } + if (!node->u.c.greedy) { + cregex_program_instr_t *swap = split->u.c.first; + split->u.c.first = split->u.c.second; + split->u.c.second = swap; + } + } + break; + } + + /* Anchors */ + case REGEX_NODE_TYPE_ANCHOR_BEGIN: + newInstr.opcode = REGEX_PROGRAM_OPCODE_ASSERT_BEGIN; + emit(context, &newInstr); + break; + case REGEX_NODE_TYPE_ANCHOR_END: + newInstr.opcode = REGEX_PROGRAM_OPCODE_ASSERT_END; + emit(context, &newInstr); + break; + + /* Captures */ + case REGEX_NODE_TYPE_CAPTURE: + capture = context->ncaptures++ * 2; + newInstr.opcode = REGEX_PROGRAM_OPCODE_SAVE; + + newInstr.u.e.save = capture; + emit(context,&newInstr); + + compile_context(context, node->u.e.captured); + + newInstr.u.e.save = capture + 1; + emit(context, &newInstr); + break; + } + + return bottom; +} + +/* Compile a parsed pattern (using a previously allocated program with at least + * estimate_instructions(root) instructions). 
+ */ +STATIC cregex_program_t *compile_node_with_program(const cregex_node_t *root, + cregex_program_t *program) +{ + regex_compile_context context; + cregex_node_t rootNode; + cregex_program_instr_t finalInstr; + + memset(&rootNode, 0, sizeof(cregex_node_t)); + rootNode.type = REGEX_NODE_TYPE_CAPTURE; + rootNode.u.e.captured = (cregex_node_t *)root; + /* add capture node for entire match */ + root = &rootNode; + + /* add .*? unless pattern starts with ^ */ + if (!node_is_anchored(root)) + { + cregex_node_t concatNode; + cregex_node_t quantifierNode; + cregex_node_t anyCharNode; + + memset(&anyCharNode, 0, sizeof(cregex_node_t)); + anyCharNode.type = REGEX_NODE_TYPE_ANY_CHARACTER; + + memset(&quantifierNode, 0, sizeof(cregex_node_t)); + quantifierNode.type = REGEX_NODE_TYPE_QUANTIFIER; + quantifierNode.u.c.nmin = 0; + quantifierNode.u.c.nmax = -1; + quantifierNode.u.c.greedy = 0; + quantifierNode.u.c.quantified = &anyCharNode; + + memset(&concatNode, 0, sizeof(cregex_node_t)); + concatNode.type = REGEX_NODE_TYPE_CONCATENATION; + concatNode.u.d.left = &quantifierNode; + concatNode.u.d.right = (cregex_node_t*)root; + + root = &concatNode; + } + + /* compile */ + memset(&context, 0, sizeof(regex_compile_context)); + context.pc = program->instructions; + context.ncaptures = 0; + compile_context(&context, root); + + /* emit final match instruction */ + memset(&finalInstr, 0, sizeof(cregex_program_instr_t)); + finalInstr.opcode = REGEX_PROGRAM_OPCODE_MATCH; + emit(&context, &finalInstr); + + /* set total number of instructions */ + program->ninstructions = context.pc - program->instructions; + + return program; +} + +/* Upper bound of number of instructions required to compile parsed pattern. */ +STATIC LONG estimate_instructions(const cregex_node_t *root) +{ + return count_instructions(root) + /* .*? 
is added unless pattern starts with ^,
+            * save instructions are added for beginning and end of match,
+            * a final match instruction is added to the end of the program
+            */
+           + !node_is_anchored(root) * 3 + 2 + 1;
+}
+
+/*
+ * Compile a parsed pattern into a newly allocated program.
+ * The allocation is sized from estimate_instructions(root); one instruction
+ * is already part of cregex_program_t, hence the "- 1".
+ * Returns NULL when memory cannot be allocated or compilation fails.
+ * Release the result with cregex_compile_free().
+ */
+cregex_program_t *cregex_compile_node(const cregex_node_t *root)
+{
+    size_t size = sizeof(cregex_program_t) +
+                  sizeof(cregex_program_instr_t) * (estimate_instructions(root) - 1);
+    cregex_program_t *program;
+
+    if (!(program = AllocVec(size, MEMF_CLEAR)))
+        return NULL;
+
+    if (!compile_node_with_program(root, program)) {
+        /* the block came from AllocVec, so it must go back through FreeVec */
+        FreeVec(program);
+        return NULL;
+    }
+
+    return program;
+}
+
+/* Free a compiled program */
+VOID cregex_compile_free(cregex_program_t *program)
+{
+    FreeVec(program);
+}
diff --git a/cregex_parse.c b/cregex_parse.c
new file mode 100644
index 0000000..3a4e109
--- /dev/null
+++ b/cregex_parse.c
@@ -0,0 +1,308 @@
+#include
+#include
+#include
+#include
+
+#include "cregex.h"
+
+typedef struct {
+    const char *sp;
+    cregex_node_t *stack, *output;
+} regex_parse_context;
+
+/* Shunting-yard algorithm
+ * See https://en.wikipedia.org/wiki/Shunting-yard_algorithm
+ */
+
+STATIC INLINE cregex_node_t *push(regex_parse_context *context,
+                                  const cregex_node_t *node)
+{
+    //assert(context->stack <= context->output);
+    *context->stack = *node;
+    return context->stack++;
+}
+
+STATIC INLINE cregex_node_t *drop(regex_parse_context *context)
+{
+    return --context->stack;
+}
+
+STATIC INLINE cregex_node_t *consume(regex_parse_context *context)
+{
+    *--context->output = *--context->stack;
+    return context->output;
+}
+
+STATIC INLINE cregex_node_t *concatenate(regex_parse_context *context,
+                                         const cregex_node_t *bottom)
+{
+    cregex_node_t newNode;
+    memset(&newNode, 0, sizeof(cregex_node_t));
+
+    if (context->stack == bottom) {
+        newNode.type = REGEX_NODE_TYPE_EPSILON;
+        push(context, &newNode);
+    }
+    else {
+        newNode.type = REGEX_NODE_TYPE_CONCATENATION;
+        while (context->stack - 1 > bottom) {
+            cregex_node_t *right = consume(context);
+            
cregex_node_t *left = consume(context); + newNode.u.d.left = left; + newNode.u.d.right = right; + push(context, &newNode); + } + } + return context->stack - 1; +} + +STATIC cregex_node_t *parse_char_class(regex_parse_context *context) +{ + cregex_node_t newNode; + cregex_node_type type = + (*context->sp == '^') + ? (++context->sp, REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED) + : REGEX_NODE_TYPE_CHARACTER_CLASS; + const char *from = context->sp; + + for (;;) { + LONG ch = *context->sp++; + memset(&newNode, 0, sizeof(cregex_node_t)); + switch (ch) { + case '\0': + /* premature end of character class */ + return NULL; + case ']': + if (context->sp - 1 == from) { + goto CHARACTER; + } else { + newNode.type = type; + newNode.u.b.from = from; + newNode.u.b.to = context->sp - 1; + return push(context, &newNode); + } + case '\\': + ch = *context->sp++; + /* fall-through */ + default: + CHARACTER: + if (*context->sp == '-' && context->sp[1] != ']') { + if (context->sp[1] < ch) + /* empty range in character class */ + return NULL; + context->sp += 2; + } + break; + } + } +} + +STATIC cregex_node_t *parse_interval(regex_parse_context *context) +{ + const char *from = context->sp; + LONG nmin, nmax; + cregex_node_t newNode; + + memset(&newNode, 0, sizeof(cregex_node_t)); + + for (nmin = 0; *context->sp >= '0' && *context->sp <= '9'; ++context->sp) + nmin = (nmin * 10) + (*context->sp - '0'); + + if (*context->sp == ',') { + ++context->sp; + if (*from != ',' && *context->sp == '}') + nmax = -1; + else { + for (nmax = 0; *context->sp >= '0' && *context->sp <= '9'; + ++context->sp) + nmax = (nmax * 10) + (*context->sp - '0'); + if (*(context->sp - 1) == ',' || *context->sp != '}' || + nmax < nmin) { + context->sp = from; + return NULL; + } + } + } else if (*from != '}' && *context->sp == '}') { + nmax = nmin; + } else { + context->sp = from; + return NULL; + } + + ++context->sp; + newNode.type = REGEX_NODE_TYPE_QUANTIFIER; + newNode.u.c.nmin = nmin; + newNode.u.c.nmax = nmax; + 
newNode.u.c.greedy = (*context->sp == '?') ? (++context->sp, 0) : 1; + newNode.u.c.quantified = consume(context); + return push(context, &newNode); +} + +STATIC cregex_node_t *parse_context(regex_parse_context *context, LONG depth) +{ + cregex_node_t *bottom = context->stack; + cregex_node_t newNode; + + for (;;) { + LONG ch = *context->sp++; + memset(&newNode, 0, sizeof(cregex_node_t)); + switch (ch) { + /* Characters */ + case '\\': + ch = *context->sp++; + /* fall-through */ + default: + CHARACTER: + newNode.type = REGEX_NODE_TYPE_CHARACTER; + newNode.u.a.ch = ch; + push(context, &newNode); + break; + case '.': + newNode.type = REGEX_NODE_TYPE_ANY_CHARACTER; + push(context, &newNode); + break; + case '[': + if (!parse_char_class(context)) + return NULL; + break; + + /* Composites */ + case '|': { + cregex_node_t *left = concatenate(context, bottom), *right; + if (!(right = parse_context(context, depth))) + return NULL; + if (left->type == REGEX_NODE_TYPE_EPSILON && + right->type == left->type) { + drop(context); + } else if (left->type == REGEX_NODE_TYPE_EPSILON) { + right = consume(context); + drop(context); + newNode.type = REGEX_NODE_TYPE_QUANTIFIER; + newNode.u.c.nmin = 0; + newNode.u.c.nmax = 1; + newNode.u.c.greedy = 1; + newNode.u.c.quantified = right; + push(context, &newNode); + } else if (right->type == REGEX_NODE_TYPE_EPSILON) { + drop(context); + left = consume(context); + newNode.type = REGEX_NODE_TYPE_QUANTIFIER; + newNode.u.c.nmin = 0; + newNode.u.c.nmax = 1; + newNode.u.c.greedy = 1; + newNode.u.c.quantified = left; + push(context, &newNode); + } else { + right = consume(context); + left = consume(context); + newNode.type = REGEX_NODE_TYPE_ALTERNATION; + newNode.u.d.left = left; + newNode.u.d.right = right; + push(context, &newNode); + } + return bottom; + } + +#define QUANTIFIER(ch, min, max) \ + case ch: \ + if (context->stack == bottom) { \ + goto CHARACTER; \ + } else { \ + newNode.type = REGEX_NODE_TYPE_QUANTIFIER; \ + newNode.u.c.nmin = 
min; \ + newNode.u.c.nmax = max; \ + newNode.u.c.greedy = (*context->sp == '?') ? (++context->sp, 0) : 1; \ + newNode.u.c.quantified = consume(context); \ + push(context, &newNode); \ + } \ + break +// END-QUANTIFIER + /* clang-format off */ + /* Quantifiers */ + QUANTIFIER('?', 0, 1); + QUANTIFIER('*', 0, -1); + QUANTIFIER('+', 1, -1); + /* clang-format on */ +#undef QUANTIFIER + + case '{': + if ((context->stack == bottom) || !parse_interval(context)) + goto CHARACTER; + break; + + /* Anchors */ + case '^': + newNode.type = REGEX_NODE_TYPE_ANCHOR_BEGIN; + push(context, + &newNode); + break; + case '$': + newNode.type = REGEX_NODE_TYPE_ANCHOR_END; + push(context, + &newNode); + break; + + /* Captures */ + case '(': + if (!parse_context(context, depth + 1)) { + return NULL; + } else { + newNode.type = REGEX_NODE_TYPE_CAPTURE; + newNode.u.e.captured = consume(context); + push(context, &newNode); + } + break; + case ')': + if (depth > 0) + return concatenate(context, bottom); + /* unmatched close parenthesis */ + return NULL; + + /* End of string */ + case '\0': + if (depth == 0) + return concatenate(context, bottom); + /* unmatched open parenthesis */ + return NULL; + } + } +} + +STATIC INLINE LONG estimate_nodes(const char *pattern) +{ + return (LONG)(strlen(pattern) * 2); +} + +/* Parse a pattern (using a previously allocated buffer of at least + * estimate_nodes(pattern) nodes). 
+ */
+STATIC cregex_node_t *parse_with_nodes(const char *pattern,
+                                       cregex_node_t *nodes)
+{
+    regex_parse_context context;
+    context.sp = pattern;
+    /* the operand stack grows up from the start of the buffer, the output
+     * area grows down from its end */
+    context.stack = nodes;
+    context.output = nodes + estimate_nodes(pattern);
+    return parse_context(&context, 0);
+}
+
+/*
+ * Parse a pattern into a node buffer allocated with AllocVec.
+ * Returns the buffer, or NULL when memory cannot be allocated or the
+ * pattern does not parse. Release the result with cregex_parse_free().
+ */
+cregex_node_t *cregex_parse(const char *pattern)
+{
+    size_t size = sizeof(cregex_node_t) * estimate_nodes(pattern);
+    cregex_node_t *nodes = AllocVec(size, MEMF_CLEAR);
+//  Printf("mallocing %ld bytes for parse\n", size);
+    if (!nodes)
+        return NULL;
+
+    if (!parse_with_nodes(pattern, nodes)) {
+        /* the buffer came from AllocVec, so it must go back through FreeVec */
+        FreeVec(nodes);
+        return NULL;
+    }
+
+    return nodes;
+}
+
+VOID cregex_parse_free(cregex_node_t *root)
+{
+    FreeVec(root);
+}
diff --git a/cregex_vm.c b/cregex_vm.c
new file mode 100644
index 0000000..1a876ae
--- /dev/null
+++ b/cregex_vm.c
@@ -0,0 +1,229 @@
+#include
+#include
+#include
+
+#include "cregex.h"
+
+#define REGEX_VM_MAX_MATCHES 20
+
+/* The VM executes one or more threads, each running a regular expression
+ * program, which is just a list of regular expression instructions. Each
+ * thread maintains two registers while it runs: a program counter (PC) and
+ * a string pointer (SP).
+ */ +typedef struct { + LONG visited; + const cregex_program_instr_t *pc; + const char *matches[REGEX_VM_MAX_MATCHES]; +} vm_thread; + +/* Run program on string */ +STATIC LONG vm_run(const cregex_program_t *program, + const char *string, + const char **matches, + LONG nmatches); + +/* Run program on string (using a previously allocated buffer of at least + * vm_estimate_threads(program) threads) + */ +STATIC LONG vm_run_with_threads(const cregex_program_t *program, + const char *string, + const char **matches, + LONG nmatches, + vm_thread *threads); + +typedef struct { + LONG nthreads; + vm_thread *threads; +} vm_thread_list; + +STATIC VOID vm_add_thread(vm_thread_list *list, + const cregex_program_t *program, + const cregex_program_instr_t *pc, + const char *string, + const char *sp, + const char **matches, + LONG nmatches) +{ + if (list->threads[pc - program->instructions].visited == sp - string + 1) + return; + list->threads[pc - program->instructions].visited = sp - string + 1; + + switch (pc->opcode) { + case REGEX_PROGRAM_OPCODE_MATCH: + /* fall-through */ + + /* Characters */ + case REGEX_PROGRAM_OPCODE_CHARACTER: + case REGEX_PROGRAM_OPCODE_ANY_CHARACTER: + case REGEX_PROGRAM_OPCODE_CHCLS: + case REGEX_PROGRAM_OPCODE_CHCLS_NEGATED: + list->threads[list->nthreads].pc = pc; + memcpy((char*)list->threads[list->nthreads].matches, + (char*)matches, + sizeof(matches[0]) * ((nmatches <= REGEX_VM_MAX_MATCHES) + ? 
nmatches + : REGEX_VM_MAX_MATCHES)); + ++list->nthreads; + break; + + /* Control-flow */ + case REGEX_PROGRAM_OPCODE_SPLIT: + vm_add_thread(list, program, pc->u.c.first, string, sp, matches, nmatches); + vm_add_thread(list, program, pc->u.c.second, string, sp, matches, nmatches); + break; + case REGEX_PROGRAM_OPCODE_JUMP: + vm_add_thread(list, program, pc->u.d.target, string, sp, matches, nmatches); + break; + + /* Assertions */ + case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN: + if (sp == string) + vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches); + break; + case REGEX_PROGRAM_OPCODE_ASSERT_END: + if (!*sp) + vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches); + break; + + /* Saving */ + case REGEX_PROGRAM_OPCODE_SAVE: + if (pc->u.e.save < nmatches && pc->u.e.save < REGEX_VM_MAX_MATCHES) { + const char *saved = matches[pc->u.e.save]; + matches[pc->u.e.save] = sp; + vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches); + matches[pc->u.e.save] = saved; + } else { + vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches); + } + break; + } +} + +/* Upper bound of number of threads required to run program */ +STATIC LONG vm_estimate_threads(const cregex_program_t *program) +{ + return program->ninstructions * 2; +} + +STATIC LONG vm_run(const cregex_program_t *program, + const char *string, + const char **matches, + LONG nmatches) +{ + size_t size = sizeof(vm_thread) * vm_estimate_threads(program); + vm_thread *threads; + LONG matched; + + if (!(threads = AllocVec(size, MEMF_CLEAR))) + { + return -1; + } + + matched = vm_run_with_threads(program, string, matches, nmatches, threads); + FreeVec(threads); + return matched; +} + +STATIC LONG vm_run_with_threads(const cregex_program_t *program, + const char *string, + const char **matches, + LONG nmatches, + vm_thread *threads) +{ + vm_thread_list currentList; + vm_thread_list* current; + vm_thread_list nextList; + vm_thread_list* next; + vm_thread_list* swap = NULL; + 
LONG matched = 0;
+    const char *sp = NULL;
+    LONG i = 0;
+
+    /* the caller's buffer is split in two: the first ninstructions entries
+     * back the current list, the next ninstructions entries the next list */
+    memset(&currentList, 0, sizeof(vm_thread_list));
+    currentList.nthreads = 0;
+    currentList.threads = threads;
+    current = &currentList;
+
+    memset(&nextList, 0, sizeof(vm_thread_list));
+    nextList.nthreads = 0;
+    nextList.threads = threads + program->ninstructions;
+    next = &nextList;
+
+
+    memset(threads, 0, sizeof(vm_thread) * program->ninstructions * 2);
+
+    vm_add_thread(current, program, program->instructions, string, string,
+                  matches, nmatches);
+    /* advance one input character per pass; every thread in the current
+     * list is stepped once and the survivors are queued on the next list */
+    for (sp = string;; ++sp) {
+        for (i = 0; i < current->nthreads; ++i) {
+            vm_thread *thread = current->threads + i;
+            switch (thread->pc->opcode) {
+            case REGEX_PROGRAM_OPCODE_MATCH:
+                /* record the captures and cut off the threads queued
+                 * after this one in the current list */
+                matched = 1;
+                current->nthreads = 0;
+                memcpy((char*)matches, (char*)thread->matches,
+                       sizeof(matches[0]) * ((nmatches <= REGEX_VM_MAX_MATCHES)
+                                                 ? nmatches
+                                                 : REGEX_VM_MAX_MATCHES));
+                continue;
+
+            /* Characters */
+            case REGEX_PROGRAM_OPCODE_CHARACTER:
+                if (*sp == thread->pc->u.a.ch)
+                    break;
+                continue;
+            case REGEX_PROGRAM_OPCODE_ANY_CHARACTER:
+                if (*sp)
+                    break;
+                continue;
+            case REGEX_PROGRAM_OPCODE_CHCLS:
+                if (cregex_char_class_contains(thread->pc->u.b.klass, *sp))
+                    break;
+                continue;
+            case REGEX_PROGRAM_OPCODE_CHCLS_NEGATED:
+                if (!cregex_char_class_contains(thread->pc->u.b.klass, *sp))
+                    break;
+                continue;
+
+            /* Control-flow */
+            case REGEX_PROGRAM_OPCODE_SPLIT:
+            case REGEX_PROGRAM_OPCODE_JUMP:
+                /* fall-through */
+
+            /* Assertions */
+            case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN:
+            case REGEX_PROGRAM_OPCODE_ASSERT_END:
+                /* fall-through */
+
+            /* Saving */
+            case REGEX_PROGRAM_OPCODE_SAVE:
+                /* handled in vm_add_thread() */
+                abort();
+            }
+
+            /* the character was accepted: continue this thread at the next
+             * instruction and the next input position */
+            vm_add_thread(next, program, thread->pc + 1, string, sp + 1,
+                          thread->matches, nmatches);
+        }
+
+        /* swap current and next thread list */
+        swap = current;
+        current = next;
+        next = swap;
+        next->nthreads = 0;
+
+        /* done if no more threads are running or end of string reached */
+        if (current->nthreads == 0 || !*sp)
+            break;
+    
} + + return matched; +} + +LONG cregex_program_run(const cregex_program_t *program, + const char *string, + const char **matches, + LONG nmatches) +{ + return vm_run(program, string, matches, nmatches); +} diff --git a/linearray.c b/linearray.c new file mode 100644 index 0000000..1c28467 --- /dev/null +++ b/linearray.c @@ -0,0 +1,39 @@ +#include +#include +#include +#include "linearray.h" + +// ----------------------------- + + +// ----------------------------- + + +LineArray LineArrayNew(VOID) +{ + #define SIZE_LINEPTR 2 + return NewArray(SIZE_LINEPTR); +} + +VOID LineArrayAppend(LineArray array, LINEPTR value) +{ + AppendToArray(LINEPTR, array, value); +} + +VOID LineArrayFree(LineArray array) +{ + ArrayForEach(LINEPTR, aLine, array, LineFree(aLine);); + DeleteArray(array); +} + +LINEPTR* LineArrayValues(LineArray array) +{ + return ArrayValues(LINEPTR, array); +} + + +// ----------------------------------------------- +// ----------------------------------------------- + + + diff --git a/linearray.h b/linearray.h new file mode 100644 index 0000000..7a6b4c3 --- /dev/null +++ b/linearray.h @@ -0,0 +1,17 @@ +#ifndef __LINEARRAY_H +#define __LINEARRAY_H + +#include +#include +#include +#include "configmodel.h" + +#define LineArray Array + +LineArray LineArrayNew(VOID); +VOID LineArrayAppend(LineArray array, LINEPTR value); +VOID LineArrayFree(LineArray array); +LINEPTR* LineArrayValues(LineArray array); + +#endif + diff --git a/main.c b/main.c new file mode 100644 index 0000000..d763d50 --- /dev/null +++ b/main.c @@ -0,0 +1,85 @@ +// Printf("running (%ld bytes avail)\n", AvailMem(0)); +#define __CLIB_PRAGMA_LIBCALL +#include +#include +#define __NOLIBBASE__ +#include "stringarray.h" +#include "linearray.h" +#include "configmodel.h" +#include + +#include "cregex.h" + +#define ZERO ((BPTR)0) + +WORD DoTheWork(STRPTR filename); +VOID ProcessFile(BPTR configFile); + +char *vers="\0$VER: ConfigReader (dd.mm.yyyy)"; +char *stacksize = "$STACK:8192"; // only works when 
started from CLI + +struct Library *ContainerkitBase; + +VOID ProcessFile(BPTR configFile) +{ + Array lineArray = NULL; + LINEPTR line = NULL; + + InitialisePatterns(); + lineArray = LineArrayNew(); + line = LineReadIncludingContinuation(configFile); + while( line != NULL ) + { +// Printf("successfully read line {%s}\n", LineGetRawText(line)); + LineArrayAppend(lineArray, line); + line = LineReadIncludingContinuation(configFile); + } + LineArrayFree(lineArray); + ReleasePatterns(); +} + +WORD DoTheWork(STRPTR filename) +{ + WORD result = RETURN_OK; + if (ContainerkitBase) + { + BPTR configFile = ZERO; + configFile = Open(filename, MODE_OLDFILE); + if( configFile != ZERO ) + { + ProcessFile(configFile); + Close(configFile); + } + else + { + Printf("file open failed!\n"); + result = RETURN_ERROR; + } + } + return result; +} + +WORD main(WORD argc, STRPTR *argv) +{ + WORD result = RETURN_OK; + + // this does nothing but the first call to Print drops a bunch of memory, + // I assume because of opening some resource so this means my start and + // end markers are "clean" and I can ensure I'm not leaking. 
#
# :ts=8
#

###############################################################################

NAME = configreader

LFLAGS = addsym smallcode smalldata noicons batch
LIBS = lib:sc.lib lib:amiga.lib lib:debug.lib

###############################################################################

# Final executable: startup code + objects, linked against the two local libs.
$(NAME) : main.o configmodel.o cregex.lib arraytypes.lib
	slink lib:c.o main.o configmodel.o to $(NAME) noicons lib $(LIBS) cregex.lib arraytypes.lib $(LFLAGS)

cregex.lib : cregex_compile.o cregex_parse.o cregex_vm.o
	JOIN cregex_compile.o cregex_parse.o cregex_vm.o AS cregex.lib

arraytypes.lib : stringarray.o linearray.o
	JOIN stringarray.o linearray.o AS arraytypes.lib

clean:
	delete \#?.o \#?.lib $(NAME) ALL

###############################################################################

# Header dependencies. Every project header a source file pulls in (directly,
# or through linearray.h which includes configmodel.h) is listed so that
# editing a header rebuilds all objects that depend on it.
main.o : main.c stringarray.h linearray.h configmodel.h cregex.h
stringarray.o : stringarray.c stringarray.h
linearray.o : linearray.c linearray.h configmodel.h
configmodel.o : configmodel.c configmodel.h cregex.h stringarray.h
cregex_compile.o : cregex_compile.c cregex.h
cregex_parse.o : cregex_parse.c cregex.h
cregex_vm.o : cregex_vm.c cregex.h
value); +} + +VOID StringArrayAppendAndRetain(Array array, CONST_STRPTR value) +{ + STRPTR localCopy = AllocVec(strlen(value)+1, MEMF_CLEAR); + CopyMem(value, localCopy, strlen(value)); + StringArrayAppend(array, localCopy); +} + +VOID StringArrayFree(Array array) +{ + if( array != NULL ) + { + StringArrayForEach(array, FreeVec(aString);); + DeleteArray(array); + } +} + +CONST_STRPTR* StringArrayValues(Array array) +{ + return ArrayValues(CONST_STRPTR, array); +} \ No newline at end of file diff --git a/stringarray.h b/stringarray.h new file mode 100644 index 0000000..b60726e --- /dev/null +++ b/stringarray.h @@ -0,0 +1,18 @@ +#ifndef __STRINGARRAY_H +#define __STRINGARRAY_H + +#include +#include + +#define StringArray Array + +StringArray StringArrayNew(VOID); +VOID StringArrayAppend(StringArray array, CONST_STRPTR value); +VOID StringArrayAppendAndRetain(StringArray array, CONST_STRPTR value); +VOID StringArrayFree(StringArray array); +CONST_STRPTR* StringArrayValues(StringArray array); + +#define StringArrayForEach(array, block) do {STRPTR *afe_123_p = (*(STRPTR **)array); STRPTR aString = *afe_123_p; int afe_123_c = (((ULONG *)array)[1]);\ + for (; afe_123_c--; aString = *(++afe_123_p)) block} while (0); + +#endif \ No newline at end of file diff --git a/testconfig.cfg b/testconfig.cfg new file mode 100644 index 0000000..8cdc8fe --- /dev/null +++ b/testconfig.cfg @@ -0,0 +1,22 @@ +#this is a comment +[core] + repositoryformatversion = 0 + filemode = true + bare = false + logallrefupdates = true + ignorecase = true + precomposeunicode = true + +[remote "origin.foo"] #this is also a coment + url = git@git.alancfrancis.com:acf/AmigaGit2.git + fetch = +refs/heads/*:refs/remotes/origin/* + +[branch "main"] + remote = origin + merge = refs/heads/main + +[branch "config-file-parsing-from-book"] + remote = origin + merge = refs/heads/config-file-parsing-from-book + somekey = Alan Francis +