diff options
author | Jason Linehan <patientulysses@gmail.com> | 2012-07-22 19:52:43 -0400 |
---|---|---|
committer | Jason Linehan <patientulysses@gmail.com> | 2012-07-22 19:52:43 -0400 |
commit | de8cb439d5b75e4923f8da34f2f8a45767ac1038 (patch) | |
tree | 4959fe65490bed0c4b705f0e3a4f88ac275e2244 | |
parent | c32c15e9f4229abea608b15177824edffeb0440c (diff) | |
download | plex-de8cb439d5b75e4923f8da34f2f8a45767ac1038.tar.gz plex-de8cb439d5b75e4923f8da34f2f8a45767ac1038.tar.bz2 plex-de8cb439d5b75e4923f8da34f2f8a45767ac1038.zip |
Restructuring.
-rw-r--r-- | Makefile | 4 | ||||
-rw-r--r-- | dfa.c | 204 | ||||
-rw-r--r-- | dfa.h | 36 | ||||
-rw-r--r-- | gen.c (renamed from tlex.h) | 0 | ||||
-rw-r--r-- | gen.h | 0 | ||||
-rw-r--r-- | lex.c | 486 | ||||
-rw-r--r-- | lex.h | 0 | ||||
-rw-r--r-- | main.c | 0 | ||||
-rw-r--r-- | nfa.c | 358 | ||||
-rw-r--r-- | nfa.h | 43 | ||||
-rw-r--r-- | stack.h | 0 | ||||
-rw-r--r-- | tlex.c | 251 | ||||
-rw-r--r-- | tok.h | 58 | ||||
-rw-r--r-- | tom.c | 988 | ||||
-rw-r--r-- | tom.h | 45 |
15 files changed, 1187 insertions, 1286 deletions
@@ -9,10 +9,10 @@ LDFLAGS= # # -SOURCES=test.c bloom.c +SOURCES=main.c lex.c nfa.c dfa.c gen.c OBJECTS=$(SOURCES:.c=.o) -EXECUTABLE=test +EXECUTABLE=plex all: $(SOURCES) $(CC) $(CFLAGS) $(LDFLAGS) $(SOURCES) -o $(EXECUTABLE) @@ -0,0 +1,204 @@ +/* + * Make a DFA transition table from an NFA created with + * Thompson's construction. + */ + +/* + * DTAB is the deterministic transition table. It is indexed by state number + * along the major axis and by input character along the minor axis. Dstates + * is a list of deterministic states represented as sets of NFA states. + * NSTATES is the number of valid entries in DTAB. + */ +struct dfa_state { + unsigned group:8; // Group id used by minimize() + unsigned mark:1; // Mark used by make_dtran() + char *accept; // accept action if accept state + int anchor // Anchor point if an accept state. + SET *set; // Set of NFA states represented by this DFA state +}; + + +struct dfa_state *DSTATES; // DFA states table +struct dfa_state *PREV_STATE; // Most-recently marked DFA state in DTAB + +ROW *DTAB; // DFA transition table +int NSTATES // Number of DFA states + + + +/** + * dfa + * ``` + * Turn an NFA with indicated start state (sstate) into a DFA and + * return the number of states in the DFA transition table. + * + * dfap is modified to point at that transition table + * acceptp is modified to point at an array of accepting states + * (indexed by state number). + * dfa() discards all the memory used for the initial NFA. + */ +int dfa(FILE *input, ROW *(dfap[]), ACCEPT *(*acceptp)) +{ + ACCEPT *accept_states; + int start + int i; + + /* Build the NFA */ + start = nfa(input); + + NSTATES = 0; + DSTATES = calloc(DFA_MAX, sizeof(DFA_STATE)); + DTAB = calloc(DFA_MAX, sizeof(ROW)); + PREV_STATE = DSTATES; + + if (!DSTATES || !DTAB) + halt(SIGABRT, "No memory for DFA transition matrix!"); + + /* Convert NFA to DFA */ + make_dtran(start); + + /* Free the NFA memory, but not the accept strings. */ + free_nfa(); + + /* Build the DFA */ + DTAB = realloc(DTAB, NSTATES * sizeof(ROW)); + accept_states = malloc(NSTATES * sizeof(ACCEPT)); + + if (!accept_states || !DTAB) + halt(SIGABRT, "Out of memory!!"); + + for (i=NSTATES; i-->0;) { + accept_states[i].string = DSTATES[i].accept; + accept_states[i].anchor = DSTATES[i].anchor; + } + + free(DSTATES); + *dfap = DTAB; + *acceptp = accept_states; + + return NSTATES; +} + + + +int add_to_dstates(SET *NFA_set, char *accept_string, int anchor) +{ + int nextstate; + + if (NSTATES > (DFA_MAX-1)) + halt(SIGABRT, "Too many DFA states\n"); + + nextstate = NSTATES++; + DSTATES[nextstate].set = NFA_set; + DSTATES[nextstate].accept = accept_string; + DSTATES[nextstate].anchor = anchor; + + return nextstate; +} + + +/** + * in_dstates + * `````````` + * If there's a set in Dstates that is identical to NFA_set, return the + * index of the Dstate entry, else return -1. + */ +int in_dstates(SET *NFA_set) +{ + struct dfa_state *p; + struct dfa_state *end = &DSTATE[NSTATES]; + + for (p=DSTATES; p<end; ++p) { + if (IS_EQUIVALENT(NFA_set, p->set)) + return(p - DSTATES); + } + + return -1; +} + + +/* + * get_unmarked + * ```````````` + * Return a pointer to an unmarked state in Dstates. If no such state + * exists, return NULL. Print an asterisk for each state to tell the + * user that the program hasn't died while the table is being constructed. + */ +DFA_STATE *get_unmarked(void) +{ + for (; Last_marked<&DSTATES[NSTATES]; ++Last_marked) { + if (!Last_marked->mark) { + putc('*', stderr); + fflush(stderr); + return Last_marked; + } + } + return NULL; +} + + +/* Free the memory used for the NFA sets in all Dstate entries. */ +void free_sets(void) +{ + struct dfa_state *p; + struct dfa_state *end = &DSTATES[NSTATES]; + + for (p=DSTATES; p<end; ++p) + delset(p->set); +} + + +/** + * make_dtran + * `````````` + * @sstate: starting NFA state + */ +void make_dtran(int sstate) +{ + struct dfa_state *current; // state currently being expanded + SET *NFA_set // set of NFA states that define next DFA state + int nextstate; // goto DFA state for current char + char *isaccept; // current DFA state is accept + int anchor; // anchor, if any + int c; //input char + + /* + * Initially Dstates contains a single, unmarked, start state + * formed by taking the epsilon closure of the NFA start state. + * So, Dstates[0] (and DTAB[0]) is the DFA start state. + */ + NFA_set = newset() ; + ADD(NFA_set, sstate); + + NSTATES = 1; + DSTATES[0].set = e_closure(NFA_set,&DSTATES[0].accept,&DSTATES[0].anchor); + DSTATES[0].mark = 0; + + /* Make the table */ + while (current = get_unmarked()) { + + current->mark = 1; + + for (c=MAX_CHARS; --c>=0;) { + if (NFA_set = move(current->set, c)) + NFA_set = e_closure(NFA_set, &isaccept, &anchor); + + if (!NFA_set) // No outgoing transitions + nextstate = F; + + else if((nextstate = in_dstates(NFA_set)) != -1) + delset(NFA_set); + + else + nextstate = add_to_dstates(NFA_set, isaccept, anchor); + + DTAB[current-DSTATES][c] = nextstate; + } + } + /* Terminate string of *'s printed in get_unmarked(); */ + putc('\n', stderr); + + /* Free the memory used for the DFA_STATE sets */ + free_sets(); +} + @@ -0,0 +1,36 @@ +#ifndef __DFA_H +#define __DFA_H + +/* + * Representations of Deterministic finite automata. + */ + +/* Maximum number of DFA states. There will be problems if this is >= 255. */ +#define DFA_MAX 254 + +/* + * The type of the output DFA transition table (the internal one is an + * array of int). It is used to figure the various table sizes. + */ +typedef unsigned char TTYPE; + + +#define F -1 // Marks a failure state in the table. +#define MAX_CHARS 128 // Maximum width of DFA transition table. + + +/* + * One full row of DTAB, which is itself + * an array of DFA_MAX ROWS. + */ +typedef int ROW[MAX_CHARS]; + + +typedef struct accept_t { + char *string; // An accepting string. NULL if non-accepting. + int anchor; // Anchor point, if any. See NFA.h +} ACCEPT; + + + +#endif /* __DFA_H */ @@ -0,0 +1,486 @@ +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include "lib/error.h" + +/* + * Lexical analyzer. + */ +enum token current_tok; +char *input_line; +char *input_start; +char *lexeme; + + +enum token lex_advance(void) +{ + static bool inquote = false; // When processing a quoted string. + bool saw_escape; // Saw a backslash escape + + static char *stack[SSIZE]; // Import stack + static char **sp = NULL; // Stack pointer + + /* Initialize the stack pointer. */ + if (!sp) + sp = stack - 1; // Necessary for a large model. + + /* Get the next line */ + if (current_tok == EOS) { + if (inquote) + halt(SIGABRT, "No newline."); + /* + * Sit in this loop until a non-blank line is read + * into the "Input" array. + */ + do { + /* Then at the end of file. */ + if (!(input = (getline)())) { + current_tok = END_OF_INPUT; + goto exit; + } + /* Ignore leading whitespace. */ + while (isspace(*input)) + input++; + + } while (!*input); /* Ignore blank lines. */ + + input_start = input; // Remember start of line for errors. + } + + while (*input == '\0') { + /* Restore previous input source. */ + if (INBOUNDS(stack, sp)) { + input = *sp--; + continue; + } + + /* + * No more input sources to restore; you're at the real end + * of the input string. + */ + current_tok = EOS; + lexeme = '\0'; + goto exit; + } + + if (!inquote) { + /* Macro expansion required. */ + while (*input == '{') { + /* Stack current input string. */ + *++sp = input; + /* Use macro body as input string. */ + input = get_macro(sp); + + if (TOOHIGH(stack,sp)) + halt(SIGABRT, "Stack overflow"); + } + } + + /* + * At either start or end of a quoted string. All characters are + * treated as literals while inquote is true. + */ + if (*input == '"') { + inquote = ~inquote; + if (!*++input) { + current_tok = EOS; + lexeme = '\0'; + goto exit; + } + } + + saw_escape = (*input == '\\'); + + if (!inquote) { + if (isspace(*input)) { + current_tok = EOS; + lexeme = '\0'; + goto exit; + } + lexeme = esc(&input); + } else { + if (saw_escape && input[1] == '"') { + /* Skip the escaped character. */ + input += 2; + lexeme = '"'; + } else { + lexeme = *input++; + } + } + + current_tok = (inquote || saw_escape) ? L : [lexeme]; + + exit: + return current_tok; +} + + +/* + * The parser + * `````````` + * The following components implement a recursive descent parser which + * produces a non-finite automaton for a regular expression using + * Thompson's construction. + * + * The NFA is created as a directed graph, which each node containing + * pointers to the next node. The machine can also be considered as an + * array where the state number is the array index. + * + * The parser descends through + * + * machine() + * rule() + * expr() + * factor() + * term() + * dodash() + * + */ + + +/* + * machine + * ``````` + * Build the NFA + */ +NFA *machine(void) +{ + NFA *start; + NFA *p; + + ENTER("machine"); + + p = start = new_nfa(); + p->next = rule(); + + while (!MATCH(END_OF_INPUT)) { + p->next2 = new_nfa(); + p = p->next2; + p->next = rule(); + } + + LEAVE("machine"); + + return start; +} + + +/** + * rule + * ```` + * Return an NFA for a rule in the grammar. + * + * rule --> expr EOS action + * ^expr EOS action + * expr$ EOS action + * + * action --> <tabs> <string of characters> + * epsilon + */ +NFA *rule(void) +{ + NFA *start = NULL; + NFA *end = NULL; + int anchor = NONE; + + ENTER("rule"); + + if (MATCH(AT_BOL)) { + start = new_nfa(); + start->edge = '\n'; + anchor |= START; + advance(); + expr(&start->next, &end); + } else { + expr(&start, &end); + } + + /* + * Pattern followed by a carriage-return or linefeed + * (use a character class). + */ + if (MATCH(AT_EOL)) { + advance(); + end->next = new_nfa(); + end->edge = CCL; + + if (!(end->bitset = newset())) + halt(SIGABRT, "Out of memory."); + + ADD(end->bitset, '\n'); + + end = end->next; + anchor |= END; + } + + while (isspace(*input)) + input++; + + end->accept = save(input); + end->anchor = anchor; + advance(); // Skip past EOS + + LEAVE("rule"); + + return start; +} + + +/* + * expr + * ```` + * NOTE + * Because a recursive descent compiler can't handle left recursion, + * certain productions such as + * + * expr -> expr OR cat_expr + * | cat_expr + * + * must be translated into + * + * expr -> cat_expr expr' + * expr' -> OR cat_expr expr' + * epsilon + * + * This translation can be implemented with this loop: + * + * cat_expr + * while( match(OR) ) + * cat_expr + * do the OR + */ +void expr(NFA **startp, NFA **endp) +{ + NFA *e2_start = NULL; /* expression to right of | */ + NFA *e2_end = NULL; + NFA *p; + + ENTER("expr"); + + cat_expr(startp, endp); + + while(MATCH(OR)) { + advance(); + cat_expr(&e2_start, &e2_end); + + p = new_nfa(); + p->next2 = e2_start; + p->next = *startp; + *startp = p; + + p = new_nfa(); + (*endp)->next = p; + e2_end ->next = p; + *endp = p; + } + LEAVE("expr"); +} + + +/* + * cat_expr + * ```````` + * The same translations that were needed in the expr rules are needed again + * here: + * + * cat_expr -> cat_expr | factor + * factor + * + * is translated to: + * + * cat_expr -> factor cat_expr' + * cat_expr' -> | factor cat_expr' + * epsilon + */ +void cat_expr(NFA **startp, NFA **endp ) +{ + NFA *e2_start; + NFA *e2_end; + + ENTER("cat_expr"); + + if (first_in_cat(current_tok)) + factor(startp, endp); + + while (first_in_cat(current_tok)) { + factor(&e2_start, &e2_end); + + memcpy(*endp, e2_start, sizeof(NFA)); + discard(e2_start); + + *endp = e2_end; + } + + LEAVE("cat_expr"); +} + + + +int first_in_cat(TOKEN tok) +{ + switch(tok) + { + case CLOSE_PAREN: + case AT_EOL: + case OR: + case EOS: + return 0; + + case CLOSURE: + case PLUS_CLOSE: + case OPTIONAL: + halt(SIGABRT, "Bad closure?"); + return 0; + + case CCL_END: + halt(SIGABRT, "Bracket problems."); + return 0; + + case AT_BOL: + halt(SIGABRT, "Beginning of line."); + return 0; + } + + return 1; +} + + + +/* + * factor --> term* | term+ | term? + */ +void factor(NFA **startp, NFA **endp) +{ + NFA *start; + NFA *end; + + ENTER("factor"); + + term(startp, endp); + + if (MATCH(CLOSURE) || MATCH(PLUS_CLOSE) || MATCH(OPTIONAL)) { + start = new_nfa(); + end = new_nfa(); + start->next = *startp; + (*endp)->next = end; + + // * or ? + if (MATCH(CLOSURE) || MATCH(OPTIONAL)) + start->next2 = end; + + // * or + + if (MATCH(CLOSURE) || MATCH(PLUS_CLOSE)) + (*endp)->next2 = *startp; + + *startp = start; + *endp = end; + advance(); + } + + LEAVE("factor"); +} + + +/* Process the term productions: + * + * term --> [...] | [^...] | [] | [^] | . | (expr) | <character> + * + * The [] is nonstandard. It matches a space, tab, formfeed, or newline, + * but not a carriage return (\r). All of these are single nodes in the + * NFA. + */ +void term(NFA **startp, NFA **endp) +{ + NFA *start; + int c; + + ENTER("term"); + + if (MATCH(OPEN_PAREN)) { + advance(); + expr(startp, endp); + if (MATCH(CLOSE_PAREN)) + advance(); + else + halt(SIGABRT, "Parenthesis are rotten."); + } else { + *startp = start = new_nfa(); + *endp = start->next = new_nfa(); + + if (!(MATCH(ANY) || MATCH(CCL_START))) { + start->edge = Lexeme; + advance(); + } else { + start->edge = CCL; + + if (!(start->bitset = newset())) + halt(SIGABRT, "Out of memory."); + + /* dot (.) */ + if (MATCH(ANY)) { + ADD(start->bitset, '\n'); + + if(!Unix) + ADD(start->bitset, '\r'); + + COMPLEMENT(start->bitset); + } else { + advance(); + /* Negative character class */ + if (MATCH(AT_BOL)) { + advance(); + + /* Don't include \n in class */ + ADD(start->bitset, '\n'); + + if (!Unix) + ADD(start->bitset, '\r'); + + COMPLEMENT(start->bitset); + } + + if (!MATCH(CCL_END)) { + dodash(start->bitset); + } else { // [] or [^] + for (c=0; c<=' '; ++c) + ADD(start->bitset, c); + } + } + advance(); + } + } + LEAVE("term"); +} + + + +void dodash(SET *set) +{ + register int first; + + /* Treat [-...] as a literal dash, but print a warning. */ + if (MATCH(DASH)) { + warning(W_STARTDASH); + ADD(set, lexeme); + advance(); + } + + for (; !MATCH(EOS) && !MATCH(CCL_END); advance()) { + if (!MATCH(DASH)) { + first = lexeme; + ADD(set, lexeme); + /* Looking at a dash */ + } else { + advance(); + /* Treat [...-] as literal */ + if (MATCH(CCL_END)) { + warning(W_ENDDASH); + ADD(set, '-'); + } else { + for (; first<=lexeme; first++) + ADD(set, first); + } + } + } +} + @@ -0,0 +1,358 @@ +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include "common/error.h" + +#include "nfa.h" + + +/* True if stack isn't full or empty. */ +#define STACK_OK() (INBOUNDS(nfa_stack, nfa_stack_ptr)) +#define STACK_USED() ((int)(nfa_stack_ptr-nfa_stack) + 1) // Number of slots used +#define CLEAR_STACK() (nfa_stack_ptr = nfa_stack - 1) // Reset the stack +#define PUSH(x) (*++nfa_stack_ptr = (x)) // push x to the stack +#define POP() (*nfa_stack_ptr--) // pop x from stack + +#define MATCH(t) (cur_tok == (t)) +#define SSIZE 32 // State size + + +char *cur_input; // Current position in input string. +char *beg_input; // Beginning of input string +enum token cur_tok; // Current token +int lexeme; // Value associated with LITERAL + + +struct nfa_t *nfa_stack[SSIZE]; // Stack used by new() +struct nfa_t **nfa_stack_ptr = &nfa_stack[-1]; // Stack pointer + + +struct nfa_t *nfa_base; // Base address of nfa_states array +struct nfa_t *nfa_states; // State machine array +int nfa_nstates=0; // Number of states in array. +int nfa_next; // Index of next element in the array. + + +/***************************************************************************** + * Allocate, create, and destroy NFA blocks in the NFA stack array. + * + *****************************************************************************/ + +/** + * new_nfa + * ``````` + * Get a new NFA state. + * + * USAGE + * The first call to new_nfa allocates a big block of memory that can + * contain the maximum number of states. Subsequent calls simply return + * a pointer to a region of this memory block. + */ +struct nfa_t *new_nfa(void) +{ + static bool init = false; + struct nfa_t *new; + + /* Allocate memory for the maximum number of states at once. */ + if (!init) { + nfa_states = calloc(NFA_MAX, sizeof(struct nfa_t)); + + if (!nfa_states) + halt(SIGABRT, "Could not allocate NFA.\n"); + + nfa_stack_ptr = &nfa_state_stack[-1]; + init = true; + } + + if (++nfa_nstates >= NFA_MAX) + halt(SIGABRT, "Too long\n"); + + /* If the stack is not ok, it's empty */ + new = !STACK_OK() ? &nfa_states[nfa_next++] : POP(); + new->edge = EPSILON; + + return new; +} + + +/** + * del_nfa + * ``````` + * Delete an allocated NFA. + * + * USAGE + * Actually decrements the number of states and clears the region + * of memory in the allocated NFA stack block. + */ +void del_nfa(NFA *nfa) +{ + --nfa_nstates; + + memset(nfa, 0, sizeof(NFA)); + nfa->edge = EMPTY ; + PUSH(nfa); + + if (!STACK_OK()) + halt(SIGABRT, "Stack overflow!"); +} + + +/***************************************************************************** + * Save one of the strings from the parsing in a big string array. + * + *****************************************************************************/ + +/** + * save + * ```` + * A single large array will hold all the strings in the NFA. + */ +char *save(char *str) +{ + static bool init = false; + static char size[8]; // Query-mode size + static int *strings; // Where to save accepting strings. + static int *savep; // Current position in strings array. + char *startp; + char *textp; + int len; + + if (!init) { + if (!(savep = strings = (int *)malloc(STR_MAX))) + halt(SIGABRT, "Out of memory"); + init = true; + } + + /* Query mode. Returns number of bytes in use. */ + if (!str) { + sprintf(size, "%ld", (long)(savep - strings)); + return size; + } + + /* + * Lines starting with a | say that the action for the + * next line should be used for the current rule. + * No strings are copied in this situation. + */ + if (*str == '|') + return (char *)(savep + 1); + + *savep++ = LINENO; + + for (textp=(char *)savep; *str; *textp++ = *str++) { + if (textp >= (char *)strings + (STR_MAX-1)) + halt(SIGABRT, "String problems."); + } + + *textp++ = '\0' ; + + /* + * Increment savep past the text. "len" is initialized + * to the string length. The "len/sizeof(int)" truncates + * the size down to an even multiple of the current int size. + * The "+(len % sizeof(int) != 0)" adds 1 to the truncated + * size if the string length isn't an even multiple of the int + * size (the != operator evaluates to 1 or 0). Return a pointer + * to the string itself. + */ + startp = (char *)savep; + len = textp - startp; + savep += (len / sizeof(int)) + (len % sizeof(int) != 0); + + return startp; +} + + +/** + * thompson + * ```````` + * The main access routine. Creates an NFA using Thompson's construction. + * + * @max_state: the maximum number of states, modified to reflect largest used. + * @start_state: modified to be the start state + */ +struct nfa_t *thompson(int *max_state, NFA **start_state) +{ + CLEAR_STACK(); + + /* Load first token */ + cur_token = EOS; + advance(); + + nfa_nstates = 0; + nfa_next = 0; + + *start_state = machine(); // Manufacture the NFA + *max_state = nfa_next; // Max state # in NFA + + return nfa_base; +} + + +/** + * nfa + * ``` + * Compile the NFA and initialize the various global variables used by + * move() and e_closure(). Return the state number (index) of the NFA start + * state. This routine must be called before either e_closure() or move() + * are called. The memory used for the nfa can be freed with free_nfa() + * (in thompson.c). + */ +int nfa(FILE *input) +{ + struct nfa_t *sstate; + Nfa = thompson(&nfa_states, &sstate); + return (sstate - nfa_base); +} + + +void free_nfa(void) +{ + free(nfa_base); +} + + +/***************************************************************************** + * Operations on NFA. + * + * Epsilon enclosure and traversal. + * + *****************************************************************************/ + + +/** + * e_closure + * ````````` + * Perform an epsilon closure on an input set. + * + * @input: the set of start states to examine. + * @accept: modified to point at the string associated with an accepting state. + * @anchor: is modified to hold the anchor point, if any. + * + * Computes the epsilon closure set for the input states. The output set + * will contain all states that can be reached by making epsilon transitions + * from all NFA states in the input set. Returns an empty set if the input + * set or the closure set is empty, modifies *accept to point at the + * accepting string if one of the elements of the output state is an + * accepting state. + */ +SET *e_closure(SET *input, char **accept, int *anchor) +{ + int stack[NFA_MAX]; // Stack of untested states + int *tos; // Stack pointer + NFA *p; // NFA state being examined + int i; // State number of " + int accept_num = LARGEST_INT ; + + if (!input) + goto abort; + + /* + * The procedure: + * + * 0. Push all states in input set onto the stack. + * 1. while the stack is not empty... + * 2. pop the top element i and, if it's an accept state, + * *accept = the accept string. + * 3. If there is an epsilon transition from i to u + * 4. If u isn't in the closure set + * 5. Add u to the closure set + * 6. Push u onto the stack + */ + + /* 0 */ + *accept = NULL; + tos = &stack[-1]; + + for (next_member(NULL); (i=next_member(input)) >= 0;) + *++tos = i; + + /* 1 */ + while (INBOUNDS(stack,tos)) { + /* 2 */ + i = *tos--; + p = &nfa_base[i]; + + if (p->accept && (i < accept_num)) { + accept_num = i; + *accept = p->accept; + *anchor = p->anchor; + } + + /* 3 */ + if (p->edge == EPSILON) { + if (p->next) { + /* 4 */ + i = p->next - nfa; + if (!MEMBER(input, i)) { + /* 5 */ + ADD(input, i); + /* 6 */ + *++tos = i; + } + } + if (p->next2) { + /* 4 */ + i = p->next2 - nfa; + if (!MEMBER(input, i)) { + /* 5 */ + ADD( input, i); + /* 6 */ + *++tos = i; + } + } + } + } + abort: + return input; +} + + +/** + * move + * ```` + * Return a set that contains all NFA states that can be reached by making + * transitions on "c" from any NFA state in "inp_set". + * + * @inp_set: input set + * @c : transition on this character. + */ +SET *move(SET *inp_set, int c) +{ + SET *outset = NULL; // Output set + NFA *p; // Current NFA state + int i; + + for (i = nfa_states; --i >= 0;) { + if (MEMBER(inp_set, i)) { + p = &nfa_base[i]; + + if (p->edge==c || (p->edge==CCL && TEST(p->bitset, c))) { + if(!outset) + outset = newset(); + + ADD(outset, p->next - nfa_base); + } + } + } + return(outset); +} + + + + + +#ifdef MAIN + +main(int argc, char **argv ) +{ + struct nfa_t *nfa; + struct nfa_t *start_state; + int max_state; + + nfa = thompson(getline, &max_state, &start_state); +} + +#endif + + @@ -0,0 +1,43 @@ +#ifndef _NFA_H +#define _NFA_H + +/* + * An nfa_t is the non-deterministic finite automaton (NFA) produced + * in the Thompson construction. + */ + +struct nfa_t { + int edge; // Edge label: char, CCL, EMPTY, or EPSILON. + char *bitset; // Set to store character class. + struct nfa_t *next; // Next state (NULL if no next state). + struct nfa_t *next2; // Another next state if edge == EPSILON. + char *accept; // NULL if !accepting state, else the action. + int anchor; // Says whether pattern is anchored and where. +}; + + +/* Non-character edge values */ +#define EPSILON -1 +#define CCL -2 +#define EMPTY -3 + +/* Anchor field values (e.g. regex $, ^) */ +#define NONE 0 +#define START 1 +#define END 2 +#define BOTH (START | END) + +/* + * Maximum number of NFA states in a single machine. + * NFA_MAX * sizeof(struct nfa_t *) can't exceed 64k. + */ +#define NFA_MAX 512 + +/* Total space that can be used by the accept strings. */ +#define STR_MAX (10 * 1024) + + +struct nfa_t *thompson(int *max_state, struct nfa_t **start_state); + +#endif + @@ -1,251 +0,0 @@ -#include <stdio.h> -#include <stdlib.h> -#include <ctype.h> -#include <stdarg.h> -#include "nfa.h" -#include "dfa.h" - -void cmd_line_error P(( int usage, char *fmt, ... )); -void do_file P(( void )); -void head P(( int suppress_output )); -void tail P(( void )); -void strip_comments P((char *string )); - - -/* - * Name of DFA transition table. Up to 3 characters are appended - * to the end of this name in the row-compressed tables. - */ -#define DTRAN_NAME "Yy_nxt" -#define MAXINP 2048 // Max rule size -#define E(x) fprintf(stderr,"%s\n", x) - -/* - * Assorted options. - */ -int VERBOSE = 0; // Print stats -int NOLINE = 0; // Suppress #line directives -int UNIX = 1; // Use UNIX newlines. -int Public = 1; // Make static symbols public. -char *TEMPLATE = "lex.par"; // Driver template for the state machine. -int LINENO = 1; // Current input line number. -int LINEMARK = 1; // Line number of the first line in a multi-line rule. - -/* - * Command line switches. - */ -int COL_COMPRESS = 1; -int NO_COMPRESS = 0; -int THRESHOLD = 4; -int NO_HEAD = 0; -int HEAD_ONLY = 0; - -/* - * Input and output files. - */ -char IBUF[MAXINP]; // Line buffer for input. -char *IFILENAME; // Input filename -FILE *INPUT; // Input file stream -FILE *OUTPUT; // Output file stream - - - -/** - * MAIN - */ -void main(int argc, char argv[]) -{ - static char *p; - - if (INPUT = fopen(*argv,"r")) - IFILENAME = *argv; - else - halt(SIGABRT,"Can't open input file %s", *argv); - - do_file (); - fclose (INPUT); - exit (0); -} - - -/** - * do_file - * ??? - */ -void do_file(void) -{ - int nstates; // Number of DFA states - ROW *dtran; // Transition table. - ACCEPT *accept; // Set of accept states. - int i; - - /* Process the input file */ - - /* Print everything up to first %% */ - head(HEAD_ONLY); - - /* Make the DFA */ - nstates = min_dfa(get_expr, &dtran, &accept); - - printf("%d out of %d DFA states in min machine\n", nstates, DFA_MAX); - printf("%d bytes required for min tables\n\n", - nstates * MAX_CHARS * sizeof(TTYPE) /* dtran */ - + nstates * sizeof(TTYPE) ); /* accept */ - - /* First part of driver */ - - if (!driver_1(OUTPUT, !NOLINE, TEMPLATE)) { - perror(TEMPLATE); - exit(1); - } - - /* Compressed tables */ - if (NO_COMPRESS) { - fprintf (OUTPUT ,"YYPRIVATE YY_TTYPE %s[ %d ][ %d ] =\n", - DTRAN_NAME, nstates, MAX_CHARS); - - print_array(OUTPUT, (int *)dtran, nstates, MAX_CHARS); - defnext (OUTPUT, DTRAN_NAME); - /* Column compressed tables */ - } else if(COL_COMPRESS) { - i = squash (OUTPUT, dtran, nstates, MAX_CHARS, DTRAN_NAME); - cnext(OUTPUT, DTRAN_NAME); - - printf("%d bytes required for column-compressed tables\n\n", - i /* dtran */ - + (nstates * sizeof(int)) ); /* Yy_accept */ - /* Pair-compressed tables */ - } else { - i = pairs(OUTPUT, (int *)dtran, nstates,MAX_CHARS, DTRAN_NAME, THRESHOLD, 0); - - if (VERBOSE) { - /* Figure the space occupied for the various tables. The - * Microsoft compiler uses roughly 100 bytes for the yy_next() - * subroutine. Column compression does yy_next in line with a - * macro so the overhead is negligible. - */ - i = (i * sizeof(TTYPE )) /* YysDD arrays */ - + (nstates * sizeof(TTYPE*)) /* Dtran[] */ - + (nstates * sizeof(TTYPE )) /* Yy_accept[] */ - + 100 ; /* yy_next() */ - - printf("%d bytes required for pair-compressed tables\n", i ); - } - pnext( OUTPUT, DTRAN_NAME ); - } - - /* Print the rest of the driver and everyting after the second %% */ - pdriver(OUTPUT, nstates, accept); - tail(); -} - - -/* - * Head processes everything up to the first %%. Any lines that begin - * with white space or are surrounded by %{ and %} are passed to the - * output. All other lines are assumed to be macro definitions. - * A %% can not be concealed in a %{ %} but it must be anchored at start - * of line so a %% in a printf statement (for example) is passed to the - * output correctly. Similarly, a %{ and %} must be the first two characters - * on the line. - */ -void head(int suppress_output) -{ - int transparent = 0; // True if in a %{ %} block - - if (!suppress_output && Public) - fputs("#define YYPRIVATE\n\n", OUTPUT); - - if (!NOLINE) - fprintf(OUTPUT, "#line 1 \"%s\"\n", IFILENAME); - - while (fgets( IBUF, MAXINP, INPUT)) { - ++ LINENO; - - /* Don't strip comments. */ - if (!transparent) - strip_comments(IBUF); - - if (VERBOSE > 1) - printf("h%d: %s", LINENO, IBUF); - - if (IBUF[0] == '%') { - if (IBUF[1] == '%') { - if (!suppress_output) - fputs("\n", OUTPUT); - break; - } else { - /* }{ */ - if (IBUF[1] == '{') - transparent = 1; - - else if (IBUF[1] == '}') - transparent = 0; - - else - lerror(0, "Ignoring illegal %%%c directive\n", IBUF[1]); - } - } else if (transparent || isspace(IBUF[0])) { - if (!suppress_output) - fputs(IBUF, OUTPUT); - } else { - new_macro(IBUF); - /* - * Replace macro def with a blank line so that the - * line numbers won't get messed up. - */ - if (!suppress_output) - fputs("\n", OUTPUT); - } - } - if (VERBOSE > 1) - printmacs(); -} - - -/* - * Scan through the string, replacing C-like comments with space - * characters. Multiple-line comments are supported. - */ -void strip_comments(char *string) -{ - static int incomment = 0; - - for (; *string ; ++string) { - if (incomment) { - if (string[0]=='*' && string[1]=='/') { - incomment = 0; - *string++ = ' '; - } - - if (!isspace(*string)) { - *string = ' '; - } - } else { - if (string[0]=='/' && string[1]=='*') { - incomment = 1; - *string++ = ' '; - *string = ' '; - } - } - } -} - -/** - * Throw away the line that had the %% on it. - */ -void tail(void) -{ - fgets(IBUF, MAXINP, INPUT); - - if (!NOLINE) - fprintf(OUTPUT, "#line %d \"%s\"\n", LINENO + 1, IFILENAME); - - while (fgets(IBUF, MAXINP, INPUT)) { - if (VERBOSE > 1) - printf("t%d: %s", LINENO++, IBUF); - - fputs(IBUF, OUTPUT); - } -} - @@ -0,0 +1,58 @@ +#ifndef __TOKEN_H +#define __TOKEN_H + +/**************************************************************************** + * TOKENS + ****************************************************************************/ + +enum token { + EOS = 1, // end of string + ANY, // . + AT_BOL, // ^ + AT_EOL, // $ + CCL_END, // ] + CCL_START, // [ + CLOSE_CURLY, // } + CLOSE_PAREN, // ) + CLOSURE, // * + DASH, // - + END_OF_INPUT, // EOF + L, // literal character + OPEN_CURLY, // { + OPEN_PAREN, // ( + OPTIONAL, // ? + OR, // | + PLUS_CLOSE // + +}; + + +enum token TOKEN_MAP[] = { +// ^@ ^A ^B ^C ^D ^E ^F ^G ^H ^I ^J ^K ^L ^M ^N + L, L, L, L, L, L, L, L, L, L, L, L, L, L, L, +// ^O ^P ^Q ^R ^S ^T ^U ^V ^W ^X ^Y ^Z ^[ ^\ ^] + L, L, L, L, L, L, L, L, L, L, L, L, L, L, L, +// ^^ ^_ SPACE ! " # $ % & ' + L, L, L, L, L, L, AT_EOL, L, L, L, +// ( ) * + , - . + OPEN_PAREN, CLOSE_PAREN, CLOSURE, PLUS_CLOSE, L, DASH, ANY, +// / 0 1 2 3 4 5 6 7 8 9 : ; < = + L, L, L, L, L, L, L, L, L, L, L, L, L, L, L, +// > ? + L, OPTIONAL, +// @ A B C D E F G H I J K L M N + L, L, L, L, L, L, L, L, L, L, L, L, L, L, L, +// O P Q R S T U V W X Y Z + L, L, L, L, L, L, L, L, L, L, L, L, +// [ \ ] ^ + CCL_START, L, CCL_END, AT_BOL, +// _ ` a b c d e f g h i j k l m + L, L, L, L, L, L, L, L, L, L, L, L, L, L, L, +// n o p q r s t u v w x y z + L, L, L, L, L, L, L, L, L, L, L, L, L, +// { | } DEL + OPEN_CURLY, OR, CLOSE_CURLY, L +}; + + +#endif + @@ -1,988 +0,0 @@ -#include <stdlib.h> -#include <string.h> -#include <stdio.h> -#include "common/textutils.h" -#include "common/set.h" -#include "common/error.h" -#include "common/hash.h" -#include "common/textutils.h" - -#include "nfa.h" - -#include "dfa.h" /* prototype for lerror(). */ -#include "globals.h" /* externs for Verbose, etc. */ - -#ifdef DEBUG - int Lev = 0; -# define ENTER(f) printf("%*senter %s [%c][%1.10s] \n", \ - Lev++ * 4, "", f, Lexeme, Input) -# define LEAVE(f) printf("%*sleave %s [%c][%1.10s]\n", \ - --Lev * 4, "", f, Lexeme, Input) -#else -# define ENTER(f) -# define LEAVE(f) -#endif - -/* - * Error processing stuff. Note that all errors are fatal. - */ -typedef enum ERR_NUM -{ - E_MEM, /* Out of memory */ - E_BADEXPR, /* Malformed regular expression */ - E_PAREN, /* Missing close parenthesis */ - E_STACK, /* Internal error: Discard stack full */ - E_LENGTH, /* Too many regular expressions */ - E_BRACKET, /* Missing [ in character class */ - E_BOL, /* ^ must be at start of expr or ccl */ - E_CLOSE, /* + ? or * must follow expression */ - E_STRINGS, /* Too many characters in accept actions */ - E_NEWLINE, /* Newline in quoted string */ - E_BADMAC, /* Missing } in macro expansion */ - E_NOMAC, /* Macro doesn't exist */ - E_MACDEPTH /* Macro expansions nested too deeply. */ - -} ERR_NUM; - - -char *ERROR[] = /* Indexed by ERR_NUM */ -{ - "Not enough memory for NFA", - "Malformed regular expression", - "Missing close parenthesis", - "Internal error: Discard stack full", - "Too many regular expressions or expression too long", - "Missing [ in character class", - "^ must be at start of expression or after [", - "+ ? or * must follow an expression or subexpression", - "Too many characters in accept actions", - "Newline in quoted string, use \\n to get newline into expression", - "Missing } in macro expansion", - "Macro doesn't exist", - "Macro expansions nested too deeply" -}; - -typedef enum WARN_NUM -{ - W_STARTDASH, /* Dash at start of character class */ - W_ENDDASH /* Dash at end of character class */ - -} WARN_NUM; - - -PRIVATE char *WARNING[] = /* Indexed by WARN_NUM */ -{ - "Treating dash in [-...] as a literal dash", - "Treating dash in [...-] as a literal dash" -}; - - -NFA *Nfa_states ; /* State-machine array */ -int Nstates = 0 ; /* # of NFA states in machine */ -int Next_alloc; /* Index of next element of the array */ - -#define SSIZE 32 - -PRIVATE NFA *Sstack[ SSIZE ]; /* Stack used by new() */ -PRIVATE NFA **Sp = &Sstack[ -1 ]; /* Stack pointer */ - -#define STACK_OK() ( INBOUNDS(Sstack, Sp) ) /* true if stack not */ - /* full or empty */ -#define STACK_USED() ( (int)(Sp-Sstack) + 1 ) /* slots used */ -#define CLEAR_STACK() ( Sp = Sstack - 1 ) /* reset the stack */ -#define PUSH(x) ( *++Sp = (x) ) /* put x on stack */ -#define POP() ( *Sp-- ) /* get x from stack */ - - -/*-------------------------------------------------------------- - * MACRO support: - */ - -#define MAC_NAME_MAX 34 /* Maximum name length */ -#define MAC_TEXT_MAX 80 /* Maximum amount of expansion text */ - -typedef struct MACRO -{ - char name[ MAC_NAME_MAX ]; - char text[ MAC_TEXT_MAX ]; - -} MACRO; - -HASH_TAB *Macros; /* Symbol table for macro definitions */ - -typedef enum TOKEN -{ - EOS = 1, /* end of string */ - ANY, /* . */ - AT_BOL, /* ^ */ - AT_EOL, /* $ */ - CCL_END, /* ] */ - CCL_START, /* [ */ - CLOSE_CURLY, /* } */ - CLOSE_PAREN, /* ) */ - CLOSURE, /* * */ - DASH, /* - */ - END_OF_INPUT, /* EOF */ - L, /* literal character */ - OPEN_CURLY, /* { */ - OPEN_PAREN, /* ( */ - OPTIONAL, /* ? */ - OR, /* | */ - PLUS_CLOSE /* + */ - -} TOKEN; - - -TOKEN Tokmap[] = -{ -/* ^@ ^A ^B ^C ^D ^E ^F ^G ^H ^I ^J ^K ^L ^M ^N */ - L, L, L, L, L, L, L, L, L, L, L, L, L, L, L, - -/* ^O ^P ^Q ^R ^S ^T ^U ^V ^W ^X ^Y ^Z ^[ ^\ ^] */ - L, L, L, L, L, L, L, L, L, L, L, L, L, L, L, - -/* ^^ ^_ SPACE ! " # $ % & ' */ - L, L, L, L, L, L, AT_EOL, L, L, L, - -/* ( ) * + , - . */ - OPEN_PAREN, CLOSE_PAREN, CLOSURE, PLUS_CLOSE, L, DASH, ANY, - -/* / 0 1 2 3 4 5 6 7 8 9 : ; < = */ - L, L, L, L, L, L, L, L, L, L, L, L, L, L, L, - -/* > ? */ - L, OPTIONAL, - -/* @ A B C D E F G H I J K L M N */ - L, L, L, L, L, L, L, L, L, L, L, L, L, L, L, - -/* O P Q R S T U V W X Y Z */ - L, L, L, L, L, L, L, L, L, L, L, L, - -/* [ \ ] ^ */ - CCL_START, L, CCL_END, AT_BOL, - -/* _ ` a b c d e f g h i j k l m */ - L, L, L, L, L, L, L, L, L, L, L, L, L, L, L, - -/* n o p q r s t u v w x y z */ - L, L, L, L, L, L, L, L, L, L, L, L, L, - -/* { | } DEL */ - OPEN_CURLY, OR, CLOSE_CURLY, L -}; - -char *(*Ifunct) P((void)) ; /* Input function pointer */ -char *Input = "" ; /* Current position in input string */ -char *S_input ; /* Beginning of input string */ -TOKEN Current_tok ; /* Current token */ -int Lexeme ; /* Value associated with LITERAL */ - -#define MATCH(t) (Current_tok == (t)) - -void parse_err P(( ERR_NUM type )); -void warning P(( WARN_NUM type )); -void errmsg P(( int type, char **table, char *msgtype )); -NFA *new P(( void )); -void discard P(( NFA *nfa_to_discard )); -char *save P(( char *str )); -char *get_macro P(( char **namep )); -void print_a_macro P(( MACRO *mac )); -TOKEN advance P(( void )); -NFA *machine P(( void )); -NFA *rule P(( void )); -void expr P(( NFA **startp, NFA **endp )); -void cat_expr P(( NFA **startp, NFA **endp )); -int first_in_cat P(( TOKEN tok )); -void factor P(( NFA **startp, NFA **endp )); -void term P(( NFA **startp, NFA **endp )); -void dodash P(( SET *set )); - - - -void warning(WARN_NUM type) -{ - errmsg( (int)type, WARNING, "WARNING" ); -} - - -void parse_err(ERR_NUM type) -{ - errmsg( (int)type, ERROR, "ERROR" ); - exit(1); -} - - -/* Highlight point of error */ -void errmsg(int type, char **table, char *msgtype ) -{ - char *p; - fprintf(stderr, "%s (line %d) %s\n", msgtype, Actual_lineno, table[type] ); - - for( p = S_input; ++p <= Input; putc('-', stderr) ) - ; - fprintf( stderr, "v\n%s\n", S_input ); - exit( 1 ); -} - -/*--------------------------------------------------------------*/ - -NFA *new(void) /* NFA management functions */ -{ - static int first = 1; - NFA *new; - - if (first) { - if (!(Nfa_states = (NFA *) calloc(NFA_MAX, sizeof(NFA)))) - parse_err(E_MEM); - - first = 0; - Sp = &Sstack[ -1 ]; - } - - if (++Nstates >= NFA_MAX) - parse_err(E_LENGTH); - - /* If the stack is not ok, it's empty */ - - p = !STACK_OK() ? &Nfa_states[Next_alloc++] : POP(); - p->edge = EPSILON; - - return p; -} - -/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ - -void discard(NFA *nfa) -{ - --Nstates; - - memset(nfa, 0, sizeof(NFA)); - nfa->edge = EMPTY ; - PUSH(nfa); - - if(!STACK_OK()) - parse_err(E_STACK); -} - -/*----------------------------------------------------------------------*/ - -/* String management */ -char *save(char *str) -{ - char *textp, *startp; - int len; - static int first = 1; - static char size[8]; /* Query-mode size */ - static int *strings; /* Place to save accepting strings */ - static int *savep; /* Current position in strings array. */ - - if (first) { - if (!(savep = strings = (int *)malloc(STR_MAX))) - parse_err(E_MEM); - first_time = 0; - } - - /* Query mode. Returns number of bytes in use. */ - if (!str) { - sprintf( size, "%ld", (long)(savep - strings) ); - return size; - } - - if (*str == '|') - return (char *)(savep + 1); - - *savep++ = Lineno; - - for (textp = (char *)savep; *str; *textp++ = *str++) { - if (textp >= (char *)strings + (STR_MAX-1)) - parse_err(E_STRINGS); - } - - *textp++ = '\0' ; - - /* - * Increment savep past the text. "len" is initialized - * to the string length. The "len/sizeof(int)" truncates - * the size down to an even multiple of the current int size. - * The "+(len % sizeof(int) != 0)" adds 1 to the truncated - * size if the string length isn't an even multiple of the int - * size (the != operator evaluates to 1 or 0). Return a pointer - * to the string itself. - */ - - startp = (char *)savep; - len = textp - startp; - savep += (len / sizeof(int)) + (len % sizeof(int) != 0); - - return startp; -} - -/*------------------------------------------------------------*/ - -/* - * Add a new macro to the table. If two macros have the same name, the - * second one takes precedence. A definition takes the form: - * name <whitespace> text [<whitespace>] - * whitespace at the end of the line is ignored. - */ -void new_macro(char *def) -{ - unsigned hash_add(); - - char *name; /* Name component of macro definition */ - char *text; /* text part of macro definition */ - char *edef; /* pointer to end of text part */ - MACRO *p; - static int first = 1; - - if (first) { - first = 0; - Macros = maketab(31, hash_add, strcmp); - } - - /* Isolate name */ - for (name = def; *def && !isspace(*def); def++) - ; - - if (*def) - *def++ = '\0' ; - - /* - * Isolate the definition text. This process is complicated - * because you need to discard any trailing whitespace on the - * line. The first while loop skips the preceding whitespace. - * The for loop is looking for end of string. If you find a - * white character (and the \n at the end of string is white), - * remember the position as a potential end of string. - */ - while (isspace(*def)) - ++def; - - text = def; - - edef = NULL; - - while (*def) { - if (!isspace(*def)) - ++def; - else - for (edef = def++; isspace(*def); ++def) - ; - } - - if (edef) - *edef = '\0'; - - /* Add the macro to the symbol table */ - p = (MACRO *) newsym(sizeof(MACRO)); - slcpy(p->name, name, MAC_NAME_MAX); - slcpy(p->text, text, MAC_TEXT_MAX); - addsym(Macros, p); - - D(printf("Added macro definition, macro table now is:\n");) - D(print_macs();) -} - -/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/ - -/* - * Return a pointer to the contents of a macro having the indicated - * name. Abort with a message if no macro exists. The macro name includes - * the brackets. *namep is modified to point past the close brace. - */ -char *get_macro(char **namep) -{ - char *p; - MACRO *mac; - - /* Hit a '{', skip it and find '}'. */ - if (!(p = strchr(++(*namep), '}'))) - parse_err(E_BADMAC); - else { - /* Overwrite the close brace. */ - *p = '\0'; - - if (!(mac = (MACRO *)findsym(Macros, *namep))) - parse_err(E_NOMAC); - - /* Replace the brace. */ - *p++ = '}'; - - /* Update name pointer */ - *namep = p; - - return mac->text; - } - - /* You shouldn't be here. */ - return "ERROR"; -} - -/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/ - -/* Workhorse function */ -void print_a_macro(MACRO *mac) -{ - printf("%-16s--[%s]--\n", mac->name, mac->text); -} - -/* Print all macros to stdout. */ -void printmacs(void) -{ - if (!Macros) - printf("\tThere are no macros\n"); - else { - printf("\nMACROS:\n"); - ptab(Macros, (void(*)P((void*,...))) print_a_macro, NULL, 1); - } -} - - -/*---------------------------------------------------------------- - * Lexical analyzer: - * - * Lexical analysis is trivial because all lexemes are single-character values. - * The only complications are escape sequences and quoted strings, both - * of which are handled by advance(), below. This routine advances past the - * current token, putting the new token into Current_tok and the equivalent - * lexeme into Lexeme. If the character was escaped, Lexeme holds the actual - * value. For example, if a "\s" is encountered, Lexeme will hold a space - * character. The MATCH(x) macro returns true if x matches the current token. - * Advance both modifies Current_tok to the current token and returns it. - * - * Macro expansion is handled by means of a stack (declared at the top - * of the subroutine). When an expansion request is encountered, the - * current input buffer is stacked, and input is read from the macro - * text. This process repeats with nested macros, so SSIZE controls - * the maximum macro-nesting depth. - */ - -TOKEN advance(void) -{ - static int inquote = 0; // When processing quoted string - int saw_esc; // Saw a backslash escape - static char *stack[SSIZE] // Import stack - static char **sp = NULL; // Stack pointer - - /* Initialize the stack pointer. */ - if (!sp) - sp = stack - 1; // Necessary for a large model. - - /* Get another line */ - if (Current_tok == EOS) { - if(inquote) - parse_err(E_NEWLINE); - /* - * Sit in this loop until a non-blank line is read - * into the "Input" array. - */ - do { - - /* Then at the end of file. */ - if (!(Input = (*Ifunct)())) { - Current_tok = END_OF_INPUT; - goto exit; - } - /* Ignore leading whitespace. */ - while (isspace(*Input)) - Input++; - - } while (!*Input); /* Ignore blank lines. */ - - S_input = Input; // Remember start of line for errors. - } - - while (*Input == '\0') { - /* Restore previous input source. */ - if (INBOUNDS(stack, sp)) { - Input = *sp--; - continue; - } - - /* - * No more input sources to restore; you're at the real end - * of the input string. - */ - Current_tok = EOS; - Lexeme = '\0'; - goto exit; - } - - if (!inquote) { - /* Macro expansion required. */ - while(*Input == '{') { - /* Stack current input string. */ - *++sp = Input; - /* Use macro body as input string. */ - Input = get_macro(sp); - - if(TOOHIGH(stack,sp)) - parse_err(E_MACDEPTH); // Stack overflow! - } - } - - /* - * At either start or end of a quoted string. All characters are - * treated as literals while inquote is true. - */ - if (*Input == '"') { - inquote = ~inquote; - if (!*++Input) { - Current_tok = EOS; - Lexeme = '\0'; - goto exit; - } - } - - saw_esc = (*Input == '\\'); - - if(!inquote) { - if(isspace(*Input)) { - Current_tok = EOS; - Lexeme = '\0'; - goto exit; - } - Lexeme = esc(&Input); - } else { - if (saw_esc && Input[1] == '"') { - /* Skip the escaped character. */ - Input += 2; - Lexeme = '"'; - } else { - Lexeme = *Input++; - } - } - - Current_tok = (inquote || saw_esc) ? L : Tokmap[Lexeme]; - - exit: - return Current_tok; -} - - -/*-------------------------------------------------------------- - * The Parser: - * A simple recursive descent parser that creates a Thompson NFA for - * a regular expression. The access routine [thompson()] is at the - * bottom. The NFA is created as a directed graph, with each node - * containing pointer's to the next node. Since the structures are - * allocated from an array, the machine can also be considered - * as an array where the state number is the array index. - */ - -NFA *machine() -{ - NFA *start; - NFA *p; - - ENTER("machine"); - - p = start = new(); - p->next = rule(); - - while(!MATCH(END_OF_INPUT)) { - p->next2 = new(); - p = p->next2; - p->next = rule(); - } - - LEAVE("machine"); - - return start; -} - -/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/ - -/* rule --> expr EOS action -* ^expr EOS action -* expr$ EOS action -* -* action --> <tabs> <string of characters> -* epsilon -*/ -NFA *rule(void) -{ - NFA *start = NULL; - NFA *end = NULL; - int anchor = NONE; - - ENTER("rule"); - - if (MATCH(AT_BOL)) { - start = new(); - start->edge = '\n'; - anchor |= START; - advance(); - expr(&start->next, &end); - } else { - expr(&start, &end); - } - - /* - * Pattern followed by a carriage-return or linefeed - * (use a character class). - */ - if (MATCH(AT_EOL)) { - advance(); - end->next = new(); - end->edge = CCL; - - if (!(end->bitset = newset())) - parse_err(E_MEM); - - ADD(end->bitset, '\n'); - - if (!Unix) - ADD( end->bitset, '\r' ); - - end = end->next; - anchor |= END; - } - - while (isspace(*Input)) - Input++; - - end->accept = save(Input); - end->anchor = anchor; - advance(); // Skip past EOS - - LEAVE("rule"); - - return start; -} - - -/* - * Because a recursive descent compiler can't handle left recursion, - * the productions: - * - * expr -> expr OR cat_expr - * | cat_expr - * - * must be translated into: - * - * expr -> cat_expr expr' - * expr' -> OR cat_expr expr' - * epsilon - * - * which can be implemented with this loop: - * - * cat_expr - * while( match(OR) ) - * cat_expr - * do the OR - */ -void expr(NFA **startp, NFA **endp) -{ - NFA *e2_start = NULL; /* expression to right of | */ - NFA *e2_end = NULL; - NFA *p; - - ENTER("expr"); - - cat_expr(startp, endp); - - while(MATCH(OR)) { - advance(); - cat_expr(&e2_start, &e2_end); - - p = new(); - p->next2 = e2_start; - p->next = *startp; - *startp = p; - - p = new(); - (*endp)->next = p; - e2_end ->next = p; - *endp = p; - } - LEAVE("expr"); -} - -/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/ - -/* The same translations that were needed in the expr rules are needed again - * here: - * - * cat_expr -> cat_expr | factor - * factor - * - * is translated to: - * - * cat_expr -> factor cat_expr' - * cat_expr' -> | factor cat_expr' - * epsilon - */ -void cat_expr(NFA **startp, NFA **endp ) -{ - NFA *e2_start; - NFA *e2_end; - - ENTER("cat_expr"); - - if (first_in_cat(Current_tok)) - factor(startp, endp); - - while(first_in_cat(Current_tok)) { - factor(&e2_start, &e2_end); - - memcpy(*endp, e2_start, sizeof(NFA)); - discard(e2_start); - - *endp = e2_end; - } - - LEAVE("cat_expr"); -} - -/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/ - -int first_in_cat(TOKEN tok) -{ - switch(tok) { - case CLOSE_PAREN: - case AT_EOL: - case OR: - case EOS: return 0; - - case CLOSURE: - case PLUS_CLOSE: - case OPTIONAL: parse_err( E_CLOSE ); return 0; - - case CCL_END: parse_err( E_BRACKET ); return 0; - case AT_BOL: parse_err( E_BOL ); return 0; - } - - return 1; -} - -/* - * factor --> term* | term+ | term? - */ -void factor(NFA **startp, NFA **endp) -{ - NFA *start; - NFA *end; - - ENTER("factor"); - - term(startp, endp); - - if (MATCH(CLOSURE) || MATCH(PLUS_CLOSE) || MATCH(OPTIONAL)) { - start = new(); - end = new(); - start->next = *startp; - (*endp)->next = end; - - // * or ? - if (MATCH(CLOSURE) || MATCH(OPTIONAL)) - start->next2 = end; - - // * or + - if (MATCH(CLOSURE) || MATCH(PLUS_CLOSE)) - (*endp)->next2 = *startp; - - *startp = start; - *endp = end; - advance(); - } - - LEAVE("factor"); -} - - -/* Process the term productions: - * - * term --> [...] | [^...] | [] | [^] | . | (expr) | <character> - * - * The [] is nonstandard. It matches a space, tab, formfeed, or newline, - * but not a carriage return (\r). All of these are single nodes in the - * NFA. - */ -void term(NFA **startp, NFA **endp) -{ - NFA *start; - int c; - - ENTER("term"); - - if (MATCH(OPEN_PAREN)) { - advance(); - expr(startp, endp); - if (MATCH(CLOSE_PAREN)) - advance(); - else - parse_err(E_PAREN); - } else { - *startp = start = new(); - *endp = start->next = new(); - - if (!(MATCH(ANY) || MATCH(CCL_START))) { - start->edge = Lexeme; - advance(); - } else { - start->edge = CCL; - - if (!(start->bitset = newset())) - parse_err(E_MEM); - - /* dot (.) */ - if (MATCH(ANY)) { - ADD(start->bitset, '\n'); - - if(!Unix) - ADD(start->bitset, '\r'); - - COMPLEMENT(start->bitset); - } else { - advance(); - /* Negative character class */ - if (MATCH(AT_BOL)) { - advance(); - - /* Don't include \n in class */ - ADD(start->bitset, '\n'); - - if (!Unix) - ADD(start->bitset, '\r'); - - COMPLEMENT(start->bitset); - } - - if (!MATCH(CCL_END)) { - dodash(start->bitset); - } else { // [] or [^] - for (c=0; c<=' '; ++c) - ADD(start->bitset, c); - } - } - advance(); - } - } - LEAVE("term"); -} - -/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ - -void dodash(SET *set) -{ - register int first; - - /* Treat [-...] as a literal dash, but print a warning. */ - if (MATCH(DASH)) { - warning(W_STARTDASH); - ADD(set, Lexeme); - advance(); - } - - for (; !MATCH(EOS) && !MATCH(CCL_END); advance()) { - if (!MATCH(DASH)) { - first = Lexeme; - ADD(set, Lexeme); - /* Looking at a dash */ - } else { - advance(); - /* Treat [...-] as literal */ - if (MATCH(CCL_END)) { - warning(W_ENDDASH); - ADD(set, '-'); - } else { - for (; first<=Lexeme; first++) - ADD(set, first); - } - } - } -} -} - - -/* Access routine to this module. Return a pointer to a NFA transition - * table that represents the regular expression pointed to by expr or - * NULL if there's not enough memory. Modify *max_state to reflect the - * largest state number used. This number will probably be a larger - * number than the total number of states. Modify *start_state to point - * to the start state. This pointer is garbage if thompson() returned 0. - * The memory for the table is fetched from malloc(); use free() to - * discard it. - */ -NFA *thompson(char *(*input_function) P((void)), int *max_state, NFA **start_state) -{ - CLEAR_STACK(); - - Ifunct = input_function; - - /* Load first token */ - Current_tok = EOS; - advance(); - - Nstates = 0; - Next_alloc = 0; - - *start_state = machine(); // Manufacture the NFA - *max_state = Next_alloc ; // Max state # in NFA - - if (Verbose > 1) - print_nfa(Nfa_states, *max_state, *start_state); - - if(Verbose) { - printf("%d/%d NFA states used.\n", *max_state, NFA_MAX); - printf("%s/%d bytes used for accept strings.\n\n", save(NULL), STR_MAX); - } - - return Nfa_states; -} - - -#ifdef MAIN - -#define ALLOC -#include "globals.h" - -/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ -/* For debugging, print a single NFA structure. */ -void pnode(NFA * nfa) -{ - if (!nfa) - printf("(NULL)"); - else { - printf("+--Structure at 0x%04x-------------------\n", nfa); - printf("|edge = 0x%x (%c)\n", nfa->edge, nfa->edge); - printf("|next = 0x%x\n", nfa->next); - printf("|next2 = 0x%x\n", nfa->next2); - printf("|accept = <%s>\n", nfa->accept); - printf("|bitset = "); - printccl(nfa->bitset); - printf("\n"); - printf("+----------------------------------------\n"); - } -} - -/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ - -char *getline() -{ - static char buf[80]; - - printf("%d: ", Lineno++); - gets(buf); - - return *buf ? buf : NULL; -} - -/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ - -main(int argc, char **argv ) -{ - NFA *nfa; - NFA *start_state; - int max_state; - - Verbose = 2; - - nfa = thompson(getline, &max_state, &start_state); -} - -#endif - - @@ -1,45 +0,0 @@ -#ifndef _TOMLEXER_H -#define _TOMLEXER_H - -/* - * An nfa_t is the non-deterministic finite automaton (NFA) produced - * in the Thompson construction. - */ - -typedef struct nfa_t { - int edge; // Edge label: char, CCL, EMPTY, or EPSILON. - char *bitset; // Set to store character class. - NFA *next; // Next state (NULL if no next state). - NFA *next2; // Another next state if edge == EPSILON. - char *accept; // NULL if !accepting state, else the action. - int anchor; // Says whether pattern is anchored and where. -} NFA; - -/* Non-character edge values */ -#define EPSILON -1 -#define CCL -2 -#define EMPTY -3 - -/* Anchor field values (e.g. regex $, ^) */ -#define NONE 0 -#define START 1 -#define END 2 -#define BOTH (START | END) - -/* - * Maximum number of NFA states in a single machine. - * NFA_MAX * sizeof(struct nfa_t *) can't exceed 64k. - */ -#define NFA_MAX 512 - -/* - * Total space that can be used by the accept strings. - */ -#define STR_MAX (10 * 1024) - -void new_macro(char *definition); -void printmacs(void); -NFA *thompson(char *(*input_func)(), int *max_state, NFA **start_state); -void print_nfa(NFA *nfa, int len, NFA *start); - - |