summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJason Linehan <patientulysses@gmail.com>2012-07-22 19:52:43 -0400
committerJason Linehan <patientulysses@gmail.com>2012-07-22 19:52:43 -0400
commitde8cb439d5b75e4923f8da34f2f8a45767ac1038 (patch)
tree4959fe65490bed0c4b705f0e3a4f88ac275e2244
parentc32c15e9f4229abea608b15177824edffeb0440c (diff)
downloadplex-de8cb439d5b75e4923f8da34f2f8a45767ac1038.tar.gz
plex-de8cb439d5b75e4923f8da34f2f8a45767ac1038.tar.bz2
plex-de8cb439d5b75e4923f8da34f2f8a45767ac1038.zip
Restructuring.
-rw-r--r--Makefile4
-rw-r--r--dfa.c204
-rw-r--r--dfa.h36
-rw-r--r--gen.c (renamed from tlex.h)0
-rw-r--r--gen.h0
-rw-r--r--lex.c486
-rw-r--r--lex.h0
-rw-r--r--main.c0
-rw-r--r--nfa.c358
-rw-r--r--nfa.h43
-rw-r--r--stack.h0
-rw-r--r--tlex.c251
-rw-r--r--tok.h58
-rw-r--r--tom.c988
-rw-r--r--tom.h45
15 files changed, 1187 insertions, 1286 deletions
diff --git a/Makefile b/Makefile
index 415fb26..b85df51 100644
--- a/Makefile
+++ b/Makefile
@@ -9,10 +9,10 @@ LDFLAGS=
#
#
-SOURCES=test.c bloom.c
+SOURCES=main.c lex.c nfa.c dfa.c gen.c
OBJECTS=$(SOURCES:.c=.o)
-EXECUTABLE=test
+EXECUTABLE=plex
all: $(SOURCES)
$(CC) $(CFLAGS) $(LDFLAGS) $(SOURCES) -o $(EXECUTABLE)
diff --git a/dfa.c b/dfa.c
new file mode 100644
index 0000000..c4fe116
--- /dev/null
+++ b/dfa.c
@@ -0,0 +1,204 @@
+/*
+ * Make a DFA transition table from an NFA created with
+ * Thompson's construction.
+ */
+
+/*
+ * DTAB is the deterministic transition table. It is indexed by state number
+ * along the major axis and by input character along the minor axis. Dstates
+ * is a list of deterministic states represented as sets of NFA states.
+ * NSTATES is the number of valid entries in DTAB.
+ */
+struct dfa_state {
+ unsigned group:8; // Group id used by minimize()
+ unsigned mark:1; // Mark used by make_dtran()
+ char *accept; // accept action if accept state
+ int anchor // Anchor point if an accept state.
+ SET *set; // Set of NFA states represented by this DFA state
+};
+
+
+struct dfa_state *DSTATES; // DFA states table
+struct dfa_state *PREV_STATE; // Most-recently marked DFA state in DTAB
+
+ROW *DTAB; // DFA transition table
+int NSTATES // Number of DFA states
+
+
+
+/**
+ * dfa
+ * ```
+ * Turn an NFA with indicated start state (sstate) into a DFA and
+ * return the number of states in the DFA transition table.
+ *
+ * dfap is modified to point at that transition table
+ * acceptp is modified to point at an array of accepting states
+ * (indexed by state number).
+ * dfa() discards all the memory used for the initial NFA.
+ */
+int dfa(FILE *input, ROW *(dfap[]), ACCEPT *(*acceptp))
+{
+ ACCEPT *accept_states;
+ int start
+ int i;
+
+ /* Build the NFA */
+ start = nfa(input);
+
+ NSTATES = 0;
+ DSTATES = calloc(DFA_MAX, sizeof(DFA_STATE));
+ DTAB = calloc(DFA_MAX, sizeof(ROW));
+ PREV_STATE = DSTATES;
+
+ if (!DSTATES || !DTAB)
+ halt(SIGABRT, "No memory for DFA transition matrix!");
+
+ /* Convert NFA to DFA */
+ make_dtran(start);
+
+ /* Free the NFA memory, but not the accept strings. */
+ free_nfa();
+
+ /* Build the DFA */
+ DTAB = realloc(DTAB, NSTATES * sizeof(ROW));
+ accept_states = malloc(NSTATES * sizeof(ACCEPT));
+
+ if (!accept_states || !DTAB)
+ halt(SIGABRT, "Out of memory!!");
+
+ for (i=NSTATES; i-->0;) {
+ accept_states[i].string = DSTATES[i].accept;
+ accept_states[i].anchor = DSTATES[i].anchor;
+ }
+
+ free(DSTATES);
+ *dfap = DTAB;
+ *acceptp = accept_states;
+
+ return NSTATES;
+}
+
+
+
+int add_to_dstates(SET *NFA_set, char *accept_string, int anchor)
+{
+ int nextstate;
+
+ if (NSTATES > (DFA_MAX-1))
+ halt(SIGABRT, "Too many DFA states\n");
+
+ nextstate = NSTATES++;
+ DSTATES[nextstate].set = NFA_set;
+ DSTATES[nextstate].accept = accept_string;
+ DSTATES[nextstate].anchor = anchor;
+
+ return nextstate;
+}
+
+
+/**
+ * in_dstates
+ * ``````````
+ * If there's a set in Dstates that is identical to NFA_set, return the
+ * index of the Dstate entry, else return -1.
+ */
+int in_dstates(SET *NFA_set)
+{
+ struct dfa_state *p;
+ struct dfa_state *end = &DSTATE[NSTATES];
+
+ for (p=DSTATES; p<end; ++p) {
+ if (IS_EQUIVALENT(NFA_set, p->set))
+ return(p - DSTATES);
+ }
+
+ return -1;
+}
+
+
+/*
+ * get_unmarked
+ * ````````````
+ * Return a pointer to an unmarked state in Dstates. If no such state
+ * exists, return NULL. Print an asterisk for each state to tell the
+ * user that the program hasn't died while the table is being constructed.
+ */
+DFA_STATE *get_unmarked(void)
+{
+ for (; Last_marked<&DSTATES[NSTATES]; ++Last_marked) {
+ if (!Last_marked->mark) {
+ putc('*', stderr);
+ fflush(stderr);
+ return Last_marked;
+ }
+ }
+ return NULL;
+}
+
+
+/* Free the memory used for the NFA sets in all Dstate entries. */
+void free_sets(void)
+{
+ struct dfa_state *p;
+ struct dfa_state *end = &DSTATES[NSTATES];
+
+ for (p=DSTATES; p<end; ++p)
+ delset(p->set);
+}
+
+
+/**
+ * make_dtran
+ * ``````````
+ * @sstate: starting NFA state
+ */
+void make_dtran(int sstate)
+{
+ struct dfa_state *current; // state currently being expanded
+ SET *NFA_set // set of NFA states that define next DFA state
+ int nextstate; // goto DFA state for current char
+ char *isaccept; // current DFA state is accept
+ int anchor; // anchor, if any
+ int c; //input char
+
+ /*
+ * Initially Dstates contains a single, unmarked, start state
+ * formed by taking the epsilon closure of the NFA start state.
+ * So, Dstates[0] (and DTAB[0]) is the DFA start state.
+ */
+ NFA_set = newset() ;
+ ADD(NFA_set, sstate);
+
+ NSTATES = 1;
+ DSTATES[0].set = e_closure(NFA_set,&DSTATES[0].accept,&DSTATES[0].anchor);
+ DSTATES[0].mark = 0;
+
+ /* Make the table */
+ while (current = get_unmarked()) {
+
+ current->mark = 1;
+
+ for (c=MAX_CHARS; --c>=0;) {
+ if (NFA_set = move(current->set, c))
+ NFA_set = e_closure(NFA_set, &isaccept, &anchor);
+
+ if (!NFA_set) // No outgoing transitions
+ nextstate = F;
+
+ else if((nextstate = in_dstates(NFA_set)) != -1)
+ delset(NFA_set);
+
+ else
+ nextstate = add_to_dstates(NFA_set, isaccept, anchor);
+
+ DTAB[current-DSTATES][c] = nextstate;
+ }
+ }
+ /* Terminate string of *'s printed in get_unmarked(); */
+ putc('\n', stderr);
+
+ /* Free the memory used for the DFA_STATE sets */
+ free_sets();
+}
+
diff --git a/dfa.h b/dfa.h
new file mode 100644
index 0000000..5e50d47
--- /dev/null
+++ b/dfa.h
@@ -0,0 +1,36 @@
+#ifndef __DFA_H
+#define __DFA_H
+
+/*
+ * Representations of Deterministic finite automata.
+ */
+
+/* Maximum number of DFA states. There will be problems if this is >= 255. */
+#define DFA_MAX 254
+
+/*
+ * The type of the output DFA transition table (the internal one is an
+ * array of int). It is used to figure the various table sizes.
+ */
+typedef unsigned char TTYPE;
+
+
+#define F -1 // Marks a failure state in the table.
+#define MAX_CHARS 128 // Maximum width of DFA transition table.
+
+
+/*
+ * One full row of DTAB, which is itself
+ * an array of DFA_MAX ROWS.
+ */
+typedef int ROW[MAX_CHARS];
+
+
+typedef struct accept_t {
+ char *string; // An accepting string. NULL if non-accepting.
+ int anchor; // Anchor point, if any. See NFA.h
+} ACCEPT;
+
+
+
+#endif /* __DFA_H */
diff --git a/tlex.h b/gen.c
index e69de29..e69de29 100644
--- a/tlex.h
+++ b/gen.c
diff --git a/gen.h b/gen.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/gen.h
diff --git a/lex.c b/lex.c
new file mode 100644
index 0000000..1821a85
--- /dev/null
+++ b/lex.c
@@ -0,0 +1,486 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "lib/error.h"
+
+/*
+ * Lexical analyzer.
+ */
+enum token current_tok;
+char *input_line;
+char *input_start;
+char *lexeme;
+
+
+enum token lex_advance(void)
+{
+ static bool inquote = false; // When processing a quoted string.
+ bool saw_escape; // Saw a backslash escape
+
+ static char *stack[SSIZE]; // Import stack
+ static char **sp = NULL; // Stack pointer
+
+ /* Initialize the stack pointer. */
+ if (!sp)
+ sp = stack - 1; // Necessary for a large model.
+
+ /* Get the next line */
+ if (current_tok == EOS) {
+ if (inquote)
+ halt(SIGABRT, "No newline.");
+ /*
+ * Sit in this loop until a non-blank line is read
+ * into the "Input" array.
+ */
+ do {
+ /* Then at the end of file. */
+ if (!(input = (getline)())) {
+ current_tok = END_OF_INPUT;
+ goto exit;
+ }
+ /* Ignore leading whitespace. */
+ while (isspace(*input))
+ input++;
+
+ } while (!*input); /* Ignore blank lines. */
+
+ input_start = input; // Remember start of line for errors.
+ }
+
+ while (*input == '\0') {
+ /* Restore previous input source. */
+ if (INBOUNDS(stack, sp)) {
+ input = *sp--;
+ continue;
+ }
+
+ /*
+ * No more input sources to restore; you're at the real end
+ * of the input string.
+ */
+ current_tok = EOS;
+ lexeme = '\0';
+ goto exit;
+ }
+
+ if (!inquote) {
+ /* Macro expansion required. */
+ while (*input == '{') {
+ /* Stack current input string. */
+ *++sp = input;
+ /* Use macro body as input string. */
+ input = get_macro(sp);
+
+ if (TOOHIGH(stack,sp))
+ halt(SIGABRT, "Stack overflow");
+ }
+ }
+
+ /*
+ * At either start or end of a quoted string. All characters are
+ * treated as literals while inquote is true.
+ */
+ if (*input == '"') {
+ inquote = ~inquote;
+ if (!*++input) {
+ current_tok = EOS;
+ lexeme = '\0';
+ goto exit;
+ }
+ }
+
+ saw_escape = (*input == '\\');
+
+ if (!inquote) {
+ if (isspace(*input)) {
+ current_tok = EOS;
+ lexeme = '\0';
+ goto exit;
+ }
+ lexeme = esc(&input);
+ } else {
+ if (saw_escape && input[1] == '"') {
+ /* Skip the escaped character. */
+ input += 2;
+ lexeme = '"';
+ } else {
+ lexeme = *input++;
+ }
+ }
+
+ current_tok = (inquote || saw_escape) ? L : [lexeme];
+
+ exit:
+ return current_tok;
+}
+
+
+/*
+ * The parser
+ * ``````````
+ * The following components implement a recursive descent parser which
+ * produces a non-finite automaton for a regular expression using
+ * Thompson's construction.
+ *
+ * The NFA is created as a directed graph, which each node containing
+ * pointers to the next node. The machine can also be considered as an
+ * array where the state number is the array index.
+ *
+ * The parser descends through
+ *
+ * machine()
+ * rule()
+ * expr()
+ * factor()
+ * term()
+ * dodash()
+ *
+ */
+
+
+/*
+ * machine
+ * ```````
+ * Build the NFA
+ */
+NFA *machine(void)
+{
+ NFA *start;
+ NFA *p;
+
+ ENTER("machine");
+
+ p = start = new_nfa();
+ p->next = rule();
+
+ while (!MATCH(END_OF_INPUT)) {
+ p->next2 = new_nfa();
+ p = p->next2;
+ p->next = rule();
+ }
+
+ LEAVE("machine");
+
+ return start;
+}
+
+
+/**
+ * rule
+ * ````
+ * Return an NFA for a rule in the grammar.
+ *
+ * rule --> expr EOS action
+ * ^expr EOS action
+ * expr$ EOS action
+ *
+ * action --> <tabs> <string of characters>
+ * epsilon
+ */
+NFA *rule(void)
+{
+ NFA *start = NULL;
+ NFA *end = NULL;
+ int anchor = NONE;
+
+ ENTER("rule");
+
+ if (MATCH(AT_BOL)) {
+ start = new_nfa();
+ start->edge = '\n';
+ anchor |= START;
+ advance();
+ expr(&start->next, &end);
+ } else {
+ expr(&start, &end);
+ }
+
+ /*
+ * Pattern followed by a carriage-return or linefeed
+ * (use a character class).
+ */
+ if (MATCH(AT_EOL)) {
+ advance();
+ end->next = new_nfa();
+ end->edge = CCL;
+
+ if (!(end->bitset = newset()))
+ halt(SIGABRT, "Out of memory.");
+
+ ADD(end->bitset, '\n');
+
+ end = end->next;
+ anchor |= END;
+ }
+
+ while (isspace(*input))
+ input++;
+
+ end->accept = save(input);
+ end->anchor = anchor;
+ advance(); // Skip past EOS
+
+ LEAVE("rule");
+
+ return start;
+}
+
+
+/*
+ * expr
+ * ````
+ * NOTE
+ * Because a recursive descent compiler can't handle left recursion,
+ * certain productions such as
+ *
+ * expr -> expr OR cat_expr
+ * | cat_expr
+ *
+ * must be translated into
+ *
+ * expr -> cat_expr expr'
+ * expr' -> OR cat_expr expr'
+ * epsilon
+ *
+ * This translation can be implemented with this loop:
+ *
+ * cat_expr
+ * while( match(OR) )
+ * cat_expr
+ * do the OR
+ */
+void expr(NFA **startp, NFA **endp)
+{
+ NFA *e2_start = NULL; /* expression to right of | */
+ NFA *e2_end = NULL;
+ NFA *p;
+
+ ENTER("expr");
+
+ cat_expr(startp, endp);
+
+ while(MATCH(OR)) {
+ advance();
+ cat_expr(&e2_start, &e2_end);
+
+ p = new_nfa();
+ p->next2 = e2_start;
+ p->next = *startp;
+ *startp = p;
+
+ p = new_nfa();
+ (*endp)->next = p;
+ e2_end ->next = p;
+ *endp = p;
+ }
+ LEAVE("expr");
+}
+
+
+/*
+ * cat_expr
+ * ````````
+ * The same translations that were needed in the expr rules are needed again
+ * here:
+ *
+ * cat_expr -> cat_expr | factor
+ * factor
+ *
+ * is translated to:
+ *
+ * cat_expr -> factor cat_expr'
+ * cat_expr' -> | factor cat_expr'
+ * epsilon
+ */
+void cat_expr(NFA **startp, NFA **endp )
+{
+ NFA *e2_start;
+ NFA *e2_end;
+
+ ENTER("cat_expr");
+
+ if (first_in_cat(current_tok))
+ factor(startp, endp);
+
+ while (first_in_cat(current_tok)) {
+ factor(&e2_start, &e2_end);
+
+ memcpy(*endp, e2_start, sizeof(NFA));
+ discard(e2_start);
+
+ *endp = e2_end;
+ }
+
+ LEAVE("cat_expr");
+}
+
+
+
+int first_in_cat(TOKEN tok)
+{
+ switch(tok)
+ {
+ case CLOSE_PAREN:
+ case AT_EOL:
+ case OR:
+ case EOS:
+ return 0;
+
+ case CLOSURE:
+ case PLUS_CLOSE:
+ case OPTIONAL:
+ halt(SIGABRT, "Bad closure?");
+ return 0;
+
+ case CCL_END:
+ halt(SIGABRT, "Bracket problems.");
+ return 0;
+
+ case AT_BOL:
+ halt(SIGABRT, "Beginning of line.");
+ return 0;
+ }
+
+ return 1;
+}
+
+
+
+/*
+ * factor --> term* | term+ | term?
+ */
+void factor(NFA **startp, NFA **endp)
+{
+ NFA *start;
+ NFA *end;
+
+ ENTER("factor");
+
+ term(startp, endp);
+
+ if (MATCH(CLOSURE) || MATCH(PLUS_CLOSE) || MATCH(OPTIONAL)) {
+ start = new_nfa();
+ end = new_nfa();
+ start->next = *startp;
+ (*endp)->next = end;
+
+ // * or ?
+ if (MATCH(CLOSURE) || MATCH(OPTIONAL))
+ start->next2 = end;
+
+ // * or +
+ if (MATCH(CLOSURE) || MATCH(PLUS_CLOSE))
+ (*endp)->next2 = *startp;
+
+ *startp = start;
+ *endp = end;
+ advance();
+ }
+
+ LEAVE("factor");
+}
+
+
+/* Process the term productions:
+ *
+ * term --> [...] | [^...] | [] | [^] | . | (expr) | <character>
+ *
+ * The [] is nonstandard. It matches a space, tab, formfeed, or newline,
+ * but not a carriage return (\r). All of these are single nodes in the
+ * NFA.
+ */
+void term(NFA **startp, NFA **endp)
+{
+ NFA *start;
+ int c;
+
+ ENTER("term");
+
+ if (MATCH(OPEN_PAREN)) {
+ advance();
+ expr(startp, endp);
+ if (MATCH(CLOSE_PAREN))
+ advance();
+ else
+ halt(SIGABRT, "Parenthesis are rotten.");
+ } else {
+ *startp = start = new_nfa();
+ *endp = start->next = new_nfa();
+
+ if (!(MATCH(ANY) || MATCH(CCL_START))) {
+ start->edge = Lexeme;
+ advance();
+ } else {
+ start->edge = CCL;
+
+ if (!(start->bitset = newset()))
+ halt(SIGABRT, "Out of memory.");
+
+ /* dot (.) */
+ if (MATCH(ANY)) {
+ ADD(start->bitset, '\n');
+
+ if(!Unix)
+ ADD(start->bitset, '\r');
+
+ COMPLEMENT(start->bitset);
+ } else {
+ advance();
+ /* Negative character class */
+ if (MATCH(AT_BOL)) {
+ advance();
+
+ /* Don't include \n in class */
+ ADD(start->bitset, '\n');
+
+ if (!Unix)
+ ADD(start->bitset, '\r');
+
+ COMPLEMENT(start->bitset);
+ }
+
+ if (!MATCH(CCL_END)) {
+ dodash(start->bitset);
+ } else { // [] or [^]
+ for (c=0; c<=' '; ++c)
+ ADD(start->bitset, c);
+ }
+ }
+ advance();
+ }
+ }
+ LEAVE("term");
+}
+
+
+
+void dodash(SET *set)
+{
+ register int first;
+
+ /* Treat [-...] as a literal dash, but print a warning. */
+ if (MATCH(DASH)) {
+ warning(W_STARTDASH);
+ ADD(set, lexeme);
+ advance();
+ }
+
+ for (; !MATCH(EOS) && !MATCH(CCL_END); advance()) {
+ if (!MATCH(DASH)) {
+ first = lexeme;
+ ADD(set, lexeme);
+ /* Looking at a dash */
+ } else {
+ advance();
+ /* Treat [...-] as literal */
+ if (MATCH(CCL_END)) {
+ warning(W_ENDDASH);
+ ADD(set, '-');
+ } else {
+ for (; first<=lexeme; first++)
+ ADD(set, first);
+ }
+ }
+ }
+}
+
diff --git a/lex.h b/lex.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/lex.h
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/main.c
diff --git a/nfa.c b/nfa.c
new file mode 100644
index 0000000..3f5ac7f
--- /dev/null
+++ b/nfa.c
@@ -0,0 +1,358 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "common/error.h"
+
+#include "nfa.h"
+
+
+/* True if stack isn't full or empty. */
+#define STACK_OK() (INBOUNDS(nfa_stack, nfa_stack_ptr))
+#define STACK_USED() ((int)(nfa_stack_ptr-nfa_stack) + 1) // Number of slots used
+#define CLEAR_STACK() (nfa_stack_ptr = nfa_stack - 1) // Reset the stack
+#define PUSH(x) (*++nfa_stack_ptr = (x)) // push x to the stack
+#define POP() (*nfa_stack_ptr--) // pop x from stack
+
+#define MATCH(t) (cur_tok == (t))
+#define SSIZE 32 // State size
+
+
+char *cur_input; // Current position in input string.
+char *beg_input; // Beginning of input string
+enum token cur_tok; // Current token
+int lexeme; // Value associated with LITERAL
+
+
+struct nfa_t *nfa_stack[SSIZE]; // Stack used by new()
+struct nfa_t **nfa_stack_ptr = &nfa_stack[-1]; // Stack pointer
+
+
+struct nfa_t *nfa_base; // Base address of nfa_states array
+struct nfa_t *nfa_states; // State machine array
+int nfa_nstates=0; // Number of states in array.
+int nfa_next; // Index of next element in the array.
+
+
+/*****************************************************************************
+ * Allocate, create, and destroy NFA blocks in the NFA stack array.
+ *
+ *****************************************************************************/
+
+/**
+ * new_nfa
+ * ```````
+ * Get a new NFA state.
+ *
+ * USAGE
+ * The first call to new_nfa allocates a big block of memory that can
+ * contain the maximum number of states. Subsequent calls simply return
+ * a pointer to a region of this memory block.
+ */
+struct nfa_t *new_nfa(void)
+{
+ static bool init = false;
+ struct nfa_t *new;
+
+ /* Allocate memory for the maximum number of states at once. */
+ if (!init) {
+ nfa_states = calloc(NFA_MAX, sizeof(struct nfa_t));
+
+ if (!nfa_states)
+ halt(SIGABRT, "Could not allocate NFA.\n");
+
+ nfa_stack_ptr = &nfa_state_stack[-1];
+ init = true;
+ }
+
+ if (++nfa_nstates >= NFA_MAX)
+ halt(SIGABRT, "Too long\n");
+
+ /* If the stack is not ok, it's empty */
+ new = !STACK_OK() ? &nfa_states[nfa_next++] : POP();
+ new->edge = EPSILON;
+
+ return new;
+}
+
+
+/**
+ * del_nfa
+ * ```````
+ * Delete an allocated NFA.
+ *
+ * USAGE
+ * Actually decrements the number of states and clears the region
+ * of memory in the allocated NFA stack block.
+ */
+void del_nfa(NFA *nfa)
+{
+ --nfa_nstates;
+
+ memset(nfa, 0, sizeof(NFA));
+ nfa->edge = EMPTY ;
+ PUSH(nfa);
+
+ if (!STACK_OK())
+ halt(SIGABRT, "Stack overflow!");
+}
+
+
+/*****************************************************************************
+ * Save one of the strings from the parsing in a big string array.
+ *
+ *****************************************************************************/
+
+/**
+ * save
+ * ````
+ * A single large array will hold all the strings in the NFA.
+ */
+char *save(char *str)
+{
+ static bool init = false;
+ static char size[8]; // Query-mode size
+ static int *strings; // Where to save accepting strings.
+ static int *savep; // Current position in strings array.
+ char *startp;
+ char *textp;
+ int len;
+
+ if (!init) {
+ if (!(savep = strings = (int *)malloc(STR_MAX)))
+ halt(SIGABRT, "Out of memory");
+ init = true;
+ }
+
+ /* Query mode. Returns number of bytes in use. */
+ if (!str) {
+ sprintf(size, "%ld", (long)(savep - strings));
+ return size;
+ }
+
+ /*
+ * Lines starting with a | say that the action for the
+ * next line should be used for the current rule.
+ * No strings are copied in this situation.
+ */
+ if (*str == '|')
+ return (char *)(savep + 1);
+
+ *savep++ = LINENO;
+
+ for (textp=(char *)savep; *str; *textp++ = *str++) {
+ if (textp >= (char *)strings + (STR_MAX-1))
+ halt(SIGABRT, "String problems.");
+ }
+
+ *textp++ = '\0' ;
+
+ /*
+ * Increment savep past the text. "len" is initialized
+ * to the string length. The "len/sizeof(int)" truncates
+ * the size down to an even multiple of the current int size.
+ * The "+(len % sizeof(int) != 0)" adds 1 to the truncated
+ * size if the string length isn't an even multiple of the int
+ * size (the != operator evaluates to 1 or 0). Return a pointer
+ * to the string itself.
+ */
+ startp = (char *)savep;
+ len = textp - startp;
+ savep += (len / sizeof(int)) + (len % sizeof(int) != 0);
+
+ return startp;
+}
+
+
+/**
+ * thompson
+ * ````````
+ * The main access routine. Creates an NFA using Thompson's construction.
+ *
+ * @max_state: the maximum number of states, modified to reflect largest used.
+ * @start_state: modified to be the start state
+ */
+struct nfa_t *thompson(int *max_state, NFA **start_state)
+{
+ CLEAR_STACK();
+
+ /* Load first token */
+ cur_token = EOS;
+ advance();
+
+ nfa_nstates = 0;
+ nfa_next = 0;
+
+ *start_state = machine(); // Manufacture the NFA
+ *max_state = nfa_next; // Max state # in NFA
+
+ return nfa_base;
+}
+
+
+/**
+ * nfa
+ * ```
+ * Compile the NFA and initialize the various global variables used by
+ * move() and e_closure(). Return the state number (index) of the NFA start
+ * state. This routine must be called before either e_closure() or move()
+ * are called. The memory used for the nfa can be freed with free_nfa()
+ * (in thompson.c).
+ */
+int nfa(FILE *input)
+{
+ struct nfa_t *sstate;
+ Nfa = thompson(&nfa_states, &sstate);
+ return (sstate - nfa_base);
+}
+
+
+void free_nfa(void)
+{
+ free(nfa_base);
+}
+
+
+/*****************************************************************************
+ * Operations on NFA.
+ *
+ * Epsilon enclosure and traversal.
+ *
+ *****************************************************************************/
+
+
+/**
+ * e_closure
+ * `````````
+ * Perform an epsilon closure on an input set.
+ *
+ * @input: the set of start states to examine.
+ * @accept: modified to point at the string associated with an accepting state.
+ * @anchor: is modified to hold the anchor point, if any.
+ *
+ * Computes the epsilon closure set for the input states. The output set
+ * will contain all states that can be reached by making epsilon transitions
+ * from all NFA states in the input set. Returns an empty set if the input
+ * set or the closure set is empty, modifies *accept to point at the
+ * accepting string if one of the elements of the output state is an
+ * accepting state.
+ */
+SET *e_closure(SET *input, char **accept, int *anchor)
+{
+ int stack[NFA_MAX]; // Stack of untested states
+ int *tos; // Stack pointer
+ NFA *p; // NFA state being examined
+ int i; // State number of "
+ int accept_num = LARGEST_INT ;
+
+ if (!input)
+ goto abort;
+
+ /*
+ * The procedure:
+ *
+ * 0. Push all states in input set onto the stack.
+ * 1. while the stack is not empty...
+ * 2. pop the top element i and, if it's an accept state,
+ * *accept = the accept string.
+ * 3. If there is an epsilon transition from i to u
+ * 4. If u isn't in the closure set
+ * 5. Add u to the closure set
+ * 6. Push u onto the stack
+ */
+
+ /* 0 */
+ *accept = NULL;
+ tos = &stack[-1];
+
+ for (next_member(NULL); (i=next_member(input)) >= 0;)
+ *++tos = i;
+
+ /* 1 */
+ while (INBOUNDS(stack,tos)) {
+ /* 2 */
+ i = *tos--;
+ p = &nfa_base[i];
+
+ if (p->accept && (i < accept_num)) {
+ accept_num = i;
+ *accept = p->accept;
+ *anchor = p->anchor;
+ }
+
+ /* 3 */
+ if (p->edge == EPSILON) {
+ if (p->next) {
+ /* 4 */
+ i = p->next - nfa;
+ if (!MEMBER(input, i)) {
+ /* 5 */
+ ADD(input, i);
+ /* 6 */
+ *++tos = i;
+ }
+ }
+ if (p->next2) {
+ /* 4 */
+ i = p->next2 - nfa;
+ if (!MEMBER(input, i)) {
+ /* 5 */
+ ADD( input, i);
+ /* 6 */
+ *++tos = i;
+ }
+ }
+ }
+ }
+ abort:
+ return input;
+}
+
+
+/**
+ * move
+ * ````
+ * Return a set that contains all NFA states that can be reached by making
+ * transitions on "c" from any NFA state in "inp_set".
+ *
+ * @inp_set: input set
+ * @c : transition on this character.
+ */
+SET *move(SET *inp_set, int c)
+{
+ SET *outset = NULL; // Output set
+ NFA *p; // Current NFA state
+ int i;
+
+ for (i = nfa_states; --i >= 0;) {
+ if (MEMBER(inp_set, i)) {
+ p = &nfa_base[i];
+
+ if (p->edge==c || (p->edge==CCL && TEST(p->bitset, c))) {
+ if(!outset)
+ outset = newset();
+
+ ADD(outset, p->next - nfa_base);
+ }
+ }
+ }
+ return(outset);
+}
+
+
+
+
+
+#ifdef MAIN
+
+main(int argc, char **argv )
+{
+ struct nfa_t *nfa;
+ struct nfa_t *start_state;
+ int max_state;
+
+ nfa = thompson(getline, &max_state, &start_state);
+}
+
+#endif
+
+
diff --git a/nfa.h b/nfa.h
new file mode 100644
index 0000000..32a269e
--- /dev/null
+++ b/nfa.h
@@ -0,0 +1,43 @@
+#ifndef _NFA_H
+#define _NFA_H
+
+/*
+ * An nfa_t is the non-deterministic finite automaton (NFA) produced
+ * in the Thompson construction.
+ */
+
+struct nfa_t {
+ int edge; // Edge label: char, CCL, EMPTY, or EPSILON.
+ char *bitset; // Set to store character class.
+ struct nfa_t *next; // Next state (NULL if no next state).
+ struct nfa_t *next2; // Another next state if edge == EPSILON.
+ char *accept; // NULL if !accepting state, else the action.
+ int anchor; // Says whether pattern is anchored and where.
+};
+
+
+/* Non-character edge values */
+#define EPSILON -1
+#define CCL -2
+#define EMPTY -3
+
+/* Anchor field values (e.g. regex $, ^) */
+#define NONE 0
+#define START 1
+#define END 2
+#define BOTH (START | END)
+
+/*
+ * Maximum number of NFA states in a single machine.
+ * NFA_MAX * sizeof(struct nfa_t *) can't exceed 64k.
+ */
+#define NFA_MAX 512
+
+/* Total space that can be used by the accept strings. */
+#define STR_MAX (10 * 1024)
+
+
+struct nfa_t *thompson(int *max_state, struct nfa_t **start_state);
+
+#endif
+
diff --git a/stack.h b/stack.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/stack.h
diff --git a/tlex.c b/tlex.c
deleted file mode 100644
index 6f451eb..0000000
--- a/tlex.c
+++ /dev/null
@@ -1,251 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <stdarg.h>
-#include "nfa.h"
-#include "dfa.h"
-
-void cmd_line_error P(( int usage, char *fmt, ... ));
-void do_file P(( void ));
-void head P(( int suppress_output ));
-void tail P(( void ));
-void strip_comments P((char *string ));
-
-
-/*
- * Name of DFA transition table. Up to 3 characters are appended
- * to the end of this name in the row-compressed tables.
- */
-#define DTRAN_NAME "Yy_nxt"
-#define MAXINP 2048 // Max rule size
-#define E(x) fprintf(stderr,"%s\n", x)
-
-/*
- * Assorted options.
- */
-int VERBOSE = 0; // Print stats
-int NOLINE = 0; // Suppress #line directives
-int UNIX = 1; // Use UNIX newlines.
-int Public = 1; // Make static symbols public.
-char *TEMPLATE = "lex.par"; // Driver template for the state machine.
-int LINENO = 1; // Current input line number.
-int LINEMARK = 1; // Line number of the first line in a multi-line rule.
-
-/*
- * Command line switches.
- */
-int COL_COMPRESS = 1;
-int NO_COMPRESS = 0;
-int THRESHOLD = 4;
-int NO_HEAD = 0;
-int HEAD_ONLY = 0;
-
-/*
- * Input and output files.
- */
-char IBUF[MAXINP]; // Line buffer for input.
-char *IFILENAME; // Input filename
-FILE *INPUT; // Input file stream
-FILE *OUTPUT; // Output file stream
-
-
-
-/**
- * MAIN
- */
-void main(int argc, char argv[])
-{
- static char *p;
-
- if (INPUT = fopen(*argv,"r"))
- IFILENAME = *argv;
- else
- halt(SIGABRT,"Can't open input file %s", *argv);
-
- do_file ();
- fclose (INPUT);
- exit (0);
-}
-
-
-/**
- * do_file
- * ???
- */
-void do_file(void)
-{
- int nstates; // Number of DFA states
- ROW *dtran; // Transition table.
- ACCEPT *accept; // Set of accept states.
- int i;
-
- /* Process the input file */
-
- /* Print everything up to first %% */
- head(HEAD_ONLY);
-
- /* Make the DFA */
- nstates = min_dfa(get_expr, &dtran, &accept);
-
- printf("%d out of %d DFA states in min machine\n", nstates, DFA_MAX);
- printf("%d bytes required for min tables\n\n",
- nstates * MAX_CHARS * sizeof(TTYPE) /* dtran */
- + nstates * sizeof(TTYPE) ); /* accept */
-
- /* First part of driver */
-
- if (!driver_1(OUTPUT, !NOLINE, TEMPLATE)) {
- perror(TEMPLATE);
- exit(1);
- }
-
- /* Compressed tables */
- if (NO_COMPRESS) {
- fprintf (OUTPUT ,"YYPRIVATE YY_TTYPE %s[ %d ][ %d ] =\n",
- DTRAN_NAME, nstates, MAX_CHARS);
-
- print_array(OUTPUT, (int *)dtran, nstates, MAX_CHARS);
- defnext (OUTPUT, DTRAN_NAME);
- /* Column compressed tables */
- } else if(COL_COMPRESS) {
- i = squash (OUTPUT, dtran, nstates, MAX_CHARS, DTRAN_NAME);
- cnext(OUTPUT, DTRAN_NAME);
-
- printf("%d bytes required for column-compressed tables\n\n",
- i /* dtran */
- + (nstates * sizeof(int)) ); /* Yy_accept */
- /* Pair-compressed tables */
- } else {
- i = pairs(OUTPUT, (int *)dtran, nstates,MAX_CHARS, DTRAN_NAME, THRESHOLD, 0);
-
- if (VERBOSE) {
- /* Figure the space occupied for the various tables. The
- * Microsoft compiler uses roughly 100 bytes for the yy_next()
- * subroutine. Column compression does yy_next in line with a
- * macro so the overhead is negligible.
- */
- i = (i * sizeof(TTYPE )) /* YysDD arrays */
- + (nstates * sizeof(TTYPE*)) /* Dtran[] */
- + (nstates * sizeof(TTYPE )) /* Yy_accept[] */
- + 100 ; /* yy_next() */
-
- printf("%d bytes required for pair-compressed tables\n", i );
- }
- pnext( OUTPUT, DTRAN_NAME );
- }
-
- /* Print the rest of the driver and everyting after the second %% */
- pdriver(OUTPUT, nstates, accept);
- tail();
-}
-
-
-/*
- * Head processes everything up to the first %%. Any lines that begin
- * with white space or are surrounded by %{ and %} are passed to the
- * output. All other lines are assumed to be macro definitions.
- * A %% can not be concealed in a %{ %} but it must be anchored at start
- * of line so a %% in a printf statement (for example) is passed to the
- * output correctly. Similarly, a %{ and %} must be the first two characters
- * on the line.
- */
-void head(int suppress_output)
-{
- int transparent = 0; // True if in a %{ %} block
-
- if (!suppress_output && Public)
- fputs("#define YYPRIVATE\n\n", OUTPUT);
-
- if (!NOLINE)
- fprintf(OUTPUT, "#line 1 \"%s\"\n", IFILENAME);
-
- while (fgets( IBUF, MAXINP, INPUT)) {
- ++ LINENO;
-
- /* Don't strip comments. */
- if (!transparent)
- strip_comments(IBUF);
-
- if (VERBOSE > 1)
- printf("h%d: %s", LINENO, IBUF);
-
- if (IBUF[0] == '%') {
- if (IBUF[1] == '%') {
- if (!suppress_output)
- fputs("\n", OUTPUT);
- break;
- } else {
- /* }{ */
- if (IBUF[1] == '{')
- transparent = 1;
-
- else if (IBUF[1] == '}')
- transparent = 0;
-
- else
- lerror(0, "Ignoring illegal %%%c directive\n", IBUF[1]);
- }
- } else if (transparent || isspace(IBUF[0])) {
- if (!suppress_output)
- fputs(IBUF, OUTPUT);
- } else {
- new_macro(IBUF);
- /*
- * Replace macro def with a blank line so that the
- * line numbers won't get messed up.
- */
- if (!suppress_output)
- fputs("\n", OUTPUT);
- }
- }
- if (VERBOSE > 1)
- printmacs();
-}
-
-
-/*
- * Scan through the string, replacing C-like comments with space
- * characters. Multiple-line comments are supported.
- */
-void strip_comments(char *string)
-{
- static int incomment = 0;
-
- for (; *string ; ++string) {
- if (incomment) {
- if (string[0]=='*' && string[1]=='/') {
- incomment = 0;
- *string++ = ' ';
- }
-
- if (!isspace(*string)) {
- *string = ' ';
- }
- } else {
- if (string[0]=='/' && string[1]=='*') {
- incomment = 1;
- *string++ = ' ';
- *string = ' ';
- }
- }
- }
-}
-
-/**
- * Throw away the line that had the %% on it.
- */
-void tail(void)
-{
- fgets(IBUF, MAXINP, INPUT);
-
- if (!NOLINE)
- fprintf(OUTPUT, "#line %d \"%s\"\n", LINENO + 1, IFILENAME);
-
- while (fgets(IBUF, MAXINP, INPUT)) {
- if (VERBOSE > 1)
- printf("t%d: %s", LINENO++, IBUF);
-
- fputs(IBUF, OUTPUT);
- }
-}
-
diff --git a/tok.h b/tok.h
new file mode 100644
index 0000000..9188fcd
--- /dev/null
+++ b/tok.h
@@ -0,0 +1,58 @@
+#ifndef __TOKEN_H
+#define __TOKEN_H
+
+/****************************************************************************
+ * TOKENS
+ ****************************************************************************/
+
+enum token {
+ EOS = 1, // end of string
+ ANY, // .
+ AT_BOL, // ^
+ AT_EOL, // $
+ CCL_END, // ]
+ CCL_START, // [
+ CLOSE_CURLY, // }
+ CLOSE_PAREN, // )
+ CLOSURE, // *
+ DASH, // -
+ END_OF_INPUT, // EOF
+ L, // literal character
+ OPEN_CURLY, // {
+ OPEN_PAREN, // (
+ OPTIONAL, // ?
+ OR, // |
+ PLUS_CLOSE // +
+};
+
+
+enum token TOKEN_MAP[] = {
+// ^@ ^A ^B ^C ^D ^E ^F ^G ^H ^I ^J ^K ^L ^M ^N
+ L, L, L, L, L, L, L, L, L, L, L, L, L, L, L,
+// ^O ^P ^Q ^R ^S ^T ^U ^V ^W ^X ^Y ^Z ^[ ^\ ^]
+ L, L, L, L, L, L, L, L, L, L, L, L, L, L, L,
+// ^^ ^_ SPACE ! " # $ % & '
+ L, L, L, L, L, L, AT_EOL, L, L, L,
+// ( ) * + , - .
+ OPEN_PAREN, CLOSE_PAREN, CLOSURE, PLUS_CLOSE, L, DASH, ANY,
+// / 0 1 2 3 4 5 6 7 8 9 : ; < =
+ L, L, L, L, L, L, L, L, L, L, L, L, L, L, L,
+// > ?
+ L, OPTIONAL,
+// @ A B C D E F G H I J K L M N
+ L, L, L, L, L, L, L, L, L, L, L, L, L, L, L,
+// O P Q R S T U V W X Y Z
+ L, L, L, L, L, L, L, L, L, L, L, L,
+// [ \ ] ^
+ CCL_START, L, CCL_END, AT_BOL,
+// _ ` a b c d e f g h i j k l m
+ L, L, L, L, L, L, L, L, L, L, L, L, L, L, L,
+// n o p q r s t u v w x y z
+ L, L, L, L, L, L, L, L, L, L, L, L, L,
+// { | } DEL
+ OPEN_CURLY, OR, CLOSE_CURLY, L
+};
+
+
+#endif
+
diff --git a/tom.c b/tom.c
deleted file mode 100644
index 926af21..0000000
--- a/tom.c
+++ /dev/null
@@ -1,988 +0,0 @@
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include "common/textutils.h"
-#include "common/set.h"
-#include "common/error.h"
-#include "common/hash.h"
-#include "common/textutils.h"
-
-#include "nfa.h"
-
-#include "dfa.h" /* prototype for lerror(). */
-#include "globals.h" /* externs for Verbose, etc. */
-
-#ifdef DEBUG
- int Lev = 0;
-# define ENTER(f) printf("%*senter %s [%c][%1.10s] \n", \
- Lev++ * 4, "", f, Lexeme, Input)
-# define LEAVE(f) printf("%*sleave %s [%c][%1.10s]\n", \
- --Lev * 4, "", f, Lexeme, Input)
-#else
-# define ENTER(f)
-# define LEAVE(f)
-#endif
-
-/*
- * Error processing stuff. Note that all errors are fatal.
- */
-typedef enum ERR_NUM
-{
- E_MEM, /* Out of memory */
- E_BADEXPR, /* Malformed regular expression */
- E_PAREN, /* Missing close parenthesis */
- E_STACK, /* Internal error: Discard stack full */
- E_LENGTH, /* Too many regular expressions */
- E_BRACKET, /* Missing [ in character class */
- E_BOL, /* ^ must be at start of expr or ccl */
- E_CLOSE, /* + ? or * must follow expression */
- E_STRINGS, /* Too many characters in accept actions */
- E_NEWLINE, /* Newline in quoted string */
- E_BADMAC, /* Missing } in macro expansion */
- E_NOMAC, /* Macro doesn't exist */
- E_MACDEPTH /* Macro expansions nested too deeply. */
-
-} ERR_NUM;
-
-
-char *ERROR[] = /* Indexed by ERR_NUM */
-{
- "Not enough memory for NFA",
- "Malformed regular expression",
- "Missing close parenthesis",
- "Internal error: Discard stack full",
- "Too many regular expressions or expression too long",
- "Missing [ in character class",
- "^ must be at start of expression or after [",
- "+ ? or * must follow an expression or subexpression",
- "Too many characters in accept actions",
- "Newline in quoted string, use \\n to get newline into expression",
- "Missing } in macro expansion",
- "Macro doesn't exist",
- "Macro expansions nested too deeply"
-};
-
-typedef enum WARN_NUM
-{
- W_STARTDASH, /* Dash at start of character class */
- W_ENDDASH /* Dash at end of character class */
-
-} WARN_NUM;
-
-
-PRIVATE char *WARNING[] = /* Indexed by WARN_NUM */
-{
- "Treating dash in [-...] as a literal dash",
- "Treating dash in [...-] as a literal dash"
-};
-
-
-NFA *Nfa_states ; /* State-machine array */
-int Nstates = 0 ; /* # of NFA states in machine */
-int Next_alloc; /* Index of next element of the array */
-
-#define SSIZE 32
-
-PRIVATE NFA *Sstack[ SSIZE ]; /* Stack used by new() */
-PRIVATE NFA **Sp = &Sstack[ -1 ]; /* Stack pointer */
-
-#define STACK_OK() ( INBOUNDS(Sstack, Sp) ) /* true if stack not */
- /* full or empty */
-#define STACK_USED() ( (int)(Sp-Sstack) + 1 ) /* slots used */
-#define CLEAR_STACK() ( Sp = Sstack - 1 ) /* reset the stack */
-#define PUSH(x) ( *++Sp = (x) ) /* put x on stack */
-#define POP() ( *Sp-- ) /* get x from stack */
-
-
-/*--------------------------------------------------------------
- * MACRO support:
- */
-
-#define MAC_NAME_MAX 34 /* Maximum name length */
-#define MAC_TEXT_MAX 80 /* Maximum amount of expansion text */
-
-typedef struct MACRO
-{
- char name[ MAC_NAME_MAX ];
- char text[ MAC_TEXT_MAX ];
-
-} MACRO;
-
-HASH_TAB *Macros; /* Symbol table for macro definitions */
-
-typedef enum TOKEN
-{
- EOS = 1, /* end of string */
- ANY, /* . */
- AT_BOL, /* ^ */
- AT_EOL, /* $ */
- CCL_END, /* ] */
- CCL_START, /* [ */
- CLOSE_CURLY, /* } */
- CLOSE_PAREN, /* ) */
- CLOSURE, /* * */
- DASH, /* - */
- END_OF_INPUT, /* EOF */
- L, /* literal character */
- OPEN_CURLY, /* { */
- OPEN_PAREN, /* ( */
- OPTIONAL, /* ? */
- OR, /* | */
- PLUS_CLOSE /* + */
-
-} TOKEN;
-
-
-TOKEN Tokmap[] =
-{
-/* ^@ ^A ^B ^C ^D ^E ^F ^G ^H ^I ^J ^K ^L ^M ^N */
- L, L, L, L, L, L, L, L, L, L, L, L, L, L, L,
-
-/* ^O ^P ^Q ^R ^S ^T ^U ^V ^W ^X ^Y ^Z ^[ ^\ ^] */
- L, L, L, L, L, L, L, L, L, L, L, L, L, L, L,
-
-/* ^^ ^_ SPACE ! " # $ % & ' */
- L, L, L, L, L, L, AT_EOL, L, L, L,
-
-/* ( ) * + , - . */
- OPEN_PAREN, CLOSE_PAREN, CLOSURE, PLUS_CLOSE, L, DASH, ANY,
-
-/* / 0 1 2 3 4 5 6 7 8 9 : ; < = */
- L, L, L, L, L, L, L, L, L, L, L, L, L, L, L,
-
-/* > ? */
- L, OPTIONAL,
-
-/* @ A B C D E F G H I J K L M N */
- L, L, L, L, L, L, L, L, L, L, L, L, L, L, L,
-
-/* O P Q R S T U V W X Y Z */
- L, L, L, L, L, L, L, L, L, L, L, L,
-
-/* [ \ ] ^ */
- CCL_START, L, CCL_END, AT_BOL,
-
-/* _ ` a b c d e f g h i j k l m */
- L, L, L, L, L, L, L, L, L, L, L, L, L, L, L,
-
-/* n o p q r s t u v w x y z */
- L, L, L, L, L, L, L, L, L, L, L, L, L,
-
-/* { | } DEL */
- OPEN_CURLY, OR, CLOSE_CURLY, L
-};
-
-char *(*Ifunct) P((void)) ; /* Input function pointer */
-char *Input = "" ; /* Current position in input string */
-char *S_input ; /* Beginning of input string */
-TOKEN Current_tok ; /* Current token */
-int Lexeme ; /* Value associated with LITERAL */
-
-#define MATCH(t) (Current_tok == (t))
-
-void parse_err P(( ERR_NUM type ));
-void warning P(( WARN_NUM type ));
-void errmsg P(( int type, char **table, char *msgtype ));
-NFA *new P(( void ));
-void discard P(( NFA *nfa_to_discard ));
-char *save P(( char *str ));
-char *get_macro P(( char **namep ));
-void print_a_macro P(( MACRO *mac ));
-TOKEN advance P(( void ));
-NFA *machine P(( void ));
-NFA *rule P(( void ));
-void expr P(( NFA **startp, NFA **endp ));
-void cat_expr P(( NFA **startp, NFA **endp ));
-int first_in_cat P(( TOKEN tok ));
-void factor P(( NFA **startp, NFA **endp ));
-void term P(( NFA **startp, NFA **endp ));
-void dodash P(( SET *set ));
-
-
-
-void warning(WARN_NUM type)
-{
- errmsg( (int)type, WARNING, "WARNING" );
-}
-
-
-void parse_err(ERR_NUM type)
-{
- errmsg( (int)type, ERROR, "ERROR" );
- exit(1);
-}
-
-
-/* Highlight point of error */
-void errmsg(int type, char **table, char *msgtype )
-{
- char *p;
- fprintf(stderr, "%s (line %d) %s\n", msgtype, Actual_lineno, table[type] );
-
- for( p = S_input; ++p <= Input; putc('-', stderr) )
- ;
- fprintf( stderr, "v\n%s\n", S_input );
- exit( 1 );
-}
-
-/*--------------------------------------------------------------*/
-
-NFA *new(void) /* NFA management functions */
-{
- static int first = 1;
- NFA *new;
-
- if (first) {
- if (!(Nfa_states = (NFA *) calloc(NFA_MAX, sizeof(NFA))))
- parse_err(E_MEM);
-
- first = 0;
- Sp = &Sstack[ -1 ];
- }
-
- if (++Nstates >= NFA_MAX)
- parse_err(E_LENGTH);
-
- /* If the stack is not ok, it's empty */
-
- p = !STACK_OK() ? &Nfa_states[Next_alloc++] : POP();
- p->edge = EPSILON;
-
- return p;
-}
-
-/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
-
-void discard(NFA *nfa)
-{
- --Nstates;
-
- memset(nfa, 0, sizeof(NFA));
- nfa->edge = EMPTY ;
- PUSH(nfa);
-
- if(!STACK_OK())
- parse_err(E_STACK);
-}
-
-/*----------------------------------------------------------------------*/
-
-/* String management */
-char *save(char *str)
-{
- char *textp, *startp;
- int len;
- static int first = 1;
- static char size[8]; /* Query-mode size */
- static int *strings; /* Place to save accepting strings */
- static int *savep; /* Current position in strings array. */
-
- if (first) {
- if (!(savep = strings = (int *)malloc(STR_MAX)))
- parse_err(E_MEM);
- first_time = 0;
- }
-
- /* Query mode. Returns number of bytes in use. */
- if (!str) {
- sprintf( size, "%ld", (long)(savep - strings) );
- return size;
- }
-
- if (*str == '|')
- return (char *)(savep + 1);
-
- *savep++ = Lineno;
-
- for (textp = (char *)savep; *str; *textp++ = *str++) {
- if (textp >= (char *)strings + (STR_MAX-1))
- parse_err(E_STRINGS);
- }
-
- *textp++ = '\0' ;
-
- /*
- * Increment savep past the text. "len" is initialized
- * to the string length. The "len/sizeof(int)" truncates
- * the size down to an even multiple of the current int size.
- * The "+(len % sizeof(int) != 0)" adds 1 to the truncated
- * size if the string length isn't an even multiple of the int
- * size (the != operator evaluates to 1 or 0). Return a pointer
- * to the string itself.
- */
-
- startp = (char *)savep;
- len = textp - startp;
- savep += (len / sizeof(int)) + (len % sizeof(int) != 0);
-
- return startp;
-}
-
-/*------------------------------------------------------------*/
-
-/*
- * Add a new macro to the table. If two macros have the same name, the
- * second one takes precedence. A definition takes the form:
- * name <whitespace> text [<whitespace>]
- * whitespace at the end of the line is ignored.
- */
-void new_macro(char *def)
-{
- unsigned hash_add();
-
- char *name; /* Name component of macro definition */
- char *text; /* text part of macro definition */
- char *edef; /* pointer to end of text part */
- MACRO *p;
- static int first = 1;
-
- if (first) {
- first = 0;
- Macros = maketab(31, hash_add, strcmp);
- }
-
- /* Isolate name */
- for (name = def; *def && !isspace(*def); def++)
- ;
-
- if (*def)
- *def++ = '\0' ;
-
- /*
- * Isolate the definition text. This process is complicated
- * because you need to discard any trailing whitespace on the
- * line. The first while loop skips the preceding whitespace.
- * The for loop is looking for end of string. If you find a
- * white character (and the \n at the end of string is white),
- * remember the position as a potential end of string.
- */
- while (isspace(*def))
- ++def;
-
- text = def;
-
- edef = NULL;
-
- while (*def) {
- if (!isspace(*def))
- ++def;
- else
- for (edef = def++; isspace(*def); ++def)
- ;
- }
-
- if (edef)
- *edef = '\0';
-
- /* Add the macro to the symbol table */
- p = (MACRO *) newsym(sizeof(MACRO));
- slcpy(p->name, name, MAC_NAME_MAX);
- slcpy(p->text, text, MAC_TEXT_MAX);
- addsym(Macros, p);
-
- D(printf("Added macro definition, macro table now is:\n");)
- D(print_macs();)
-}
-
-/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/
-
-/*
- * Return a pointer to the contents of a macro having the indicated
- * name. Abort with a message if no macro exists. The macro name includes
- * the brackets. *namep is modified to point past the close brace.
- */
-char *get_macro(char **namep)
-{
- char *p;
- MACRO *mac;
-
- /* Hit a '{', skip it and find '}'. */
- if (!(p = strchr(++(*namep), '}')))
- parse_err(E_BADMAC);
- else {
- /* Overwrite the close brace. */
- *p = '\0';
-
- if (!(mac = (MACRO *)findsym(Macros, *namep)))
- parse_err(E_NOMAC);
-
- /* Replace the brace. */
- *p++ = '}';
-
- /* Update name pointer */
- *namep = p;
-
- return mac->text;
- }
-
- /* You shouldn't be here. */
- return "ERROR";
-}
-
-/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/
-
-/* Workhorse function */
-void print_a_macro(MACRO *mac)
-{
- printf("%-16s--[%s]--\n", mac->name, mac->text);
-}
-
-/* Print all macros to stdout. */
-void printmacs(void)
-{
- if (!Macros)
- printf("\tThere are no macros\n");
- else {
- printf("\nMACROS:\n");
- ptab(Macros, (void(*)P((void*,...))) print_a_macro, NULL, 1);
- }
-}
-
-
-/*----------------------------------------------------------------
- * Lexical analyzer:
- *
- * Lexical analysis is trivial because all lexemes are single-character values.
- * The only complications are escape sequences and quoted strings, both
- * of which are handled by advance(), below. This routine advances past the
- * current token, putting the new token into Current_tok and the equivalent
- * lexeme into Lexeme. If the character was escaped, Lexeme holds the actual
- * value. For example, if a "\s" is encountered, Lexeme will hold a space
- * character. The MATCH(x) macro returns true if x matches the current token.
- * Advance both modifies Current_tok to the current token and returns it.
- *
- * Macro expansion is handled by means of a stack (declared at the top
- * of the subroutine). When an expansion request is encountered, the
- * current input buffer is stacked, and input is read from the macro
- * text. This process repeats with nested macros, so SSIZE controls
- * the maximum macro-nesting depth.
- */
-
-TOKEN advance(void)
-{
- static int inquote = 0; // When processing quoted string
- int saw_esc; // Saw a backslash escape
- static char *stack[SSIZE] // Import stack
- static char **sp = NULL; // Stack pointer
-
- /* Initialize the stack pointer. */
- if (!sp)
- sp = stack - 1; // Necessary for a large model.
-
- /* Get another line */
- if (Current_tok == EOS) {
- if(inquote)
- parse_err(E_NEWLINE);
- /*
- * Sit in this loop until a non-blank line is read
- * into the "Input" array.
- */
- do {
-
- /* Then at the end of file. */
- if (!(Input = (*Ifunct)())) {
- Current_tok = END_OF_INPUT;
- goto exit;
- }
- /* Ignore leading whitespace. */
- while (isspace(*Input))
- Input++;
-
- } while (!*Input); /* Ignore blank lines. */
-
- S_input = Input; // Remember start of line for errors.
- }
-
- while (*Input == '\0') {
- /* Restore previous input source. */
- if (INBOUNDS(stack, sp)) {
- Input = *sp--;
- continue;
- }
-
- /*
- * No more input sources to restore; you're at the real end
- * of the input string.
- */
- Current_tok = EOS;
- Lexeme = '\0';
- goto exit;
- }
-
- if (!inquote) {
- /* Macro expansion required. */
- while(*Input == '{') {
- /* Stack current input string. */
- *++sp = Input;
- /* Use macro body as input string. */
- Input = get_macro(sp);
-
- if(TOOHIGH(stack,sp))
- parse_err(E_MACDEPTH); // Stack overflow!
- }
- }
-
- /*
- * At either start or end of a quoted string. All characters are
- * treated as literals while inquote is true.
- */
- if (*Input == '"') {
- inquote = ~inquote;
- if (!*++Input) {
- Current_tok = EOS;
- Lexeme = '\0';
- goto exit;
- }
- }
-
- saw_esc = (*Input == '\\');
-
- if(!inquote) {
- if(isspace(*Input)) {
- Current_tok = EOS;
- Lexeme = '\0';
- goto exit;
- }
- Lexeme = esc(&Input);
- } else {
- if (saw_esc && Input[1] == '"') {
- /* Skip the escaped character. */
- Input += 2;
- Lexeme = '"';
- } else {
- Lexeme = *Input++;
- }
- }
-
- Current_tok = (inquote || saw_esc) ? L : Tokmap[Lexeme];
-
- exit:
- return Current_tok;
-}
-
-
-/*--------------------------------------------------------------
- * The Parser:
- * A simple recursive descent parser that creates a Thompson NFA for
- * a regular expression. The access routine [thompson()] is at the
- * bottom. The NFA is created as a directed graph, with each node
- * containing pointer's to the next node. Since the structures are
- * allocated from an array, the machine can also be considered
- * as an array where the state number is the array index.
- */
-
-NFA *machine()
-{
- NFA *start;
- NFA *p;
-
- ENTER("machine");
-
- p = start = new();
- p->next = rule();
-
- while(!MATCH(END_OF_INPUT)) {
- p->next2 = new();
- p = p->next2;
- p->next = rule();
- }
-
- LEAVE("machine");
-
- return start;
-}
-
-/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/
-
-/* rule --> expr EOS action
-* ^expr EOS action
-* expr$ EOS action
-*
-* action --> <tabs> <string of characters>
-* epsilon
-*/
-NFA *rule(void)
-{
- NFA *start = NULL;
- NFA *end = NULL;
- int anchor = NONE;
-
- ENTER("rule");
-
- if (MATCH(AT_BOL)) {
- start = new();
- start->edge = '\n';
- anchor |= START;
- advance();
- expr(&start->next, &end);
- } else {
- expr(&start, &end);
- }
-
- /*
- * Pattern followed by a carriage-return or linefeed
- * (use a character class).
- */
- if (MATCH(AT_EOL)) {
- advance();
- end->next = new();
- end->edge = CCL;
-
- if (!(end->bitset = newset()))
- parse_err(E_MEM);
-
- ADD(end->bitset, '\n');
-
- if (!Unix)
- ADD( end->bitset, '\r' );
-
- end = end->next;
- anchor |= END;
- }
-
- while (isspace(*Input))
- Input++;
-
- end->accept = save(Input);
- end->anchor = anchor;
- advance(); // Skip past EOS
-
- LEAVE("rule");
-
- return start;
-}
-
-
-/*
- * Because a recursive descent compiler can't handle left recursion,
- * the productions:
- *
- * expr -> expr OR cat_expr
- * | cat_expr
- *
- * must be translated into:
- *
- * expr -> cat_expr expr'
- * expr' -> OR cat_expr expr'
- * epsilon
- *
- * which can be implemented with this loop:
- *
- * cat_expr
- * while( match(OR) )
- * cat_expr
- * do the OR
- */
-void expr(NFA **startp, NFA **endp)
-{
- NFA *e2_start = NULL; /* expression to right of | */
- NFA *e2_end = NULL;
- NFA *p;
-
- ENTER("expr");
-
- cat_expr(startp, endp);
-
- while(MATCH(OR)) {
- advance();
- cat_expr(&e2_start, &e2_end);
-
- p = new();
- p->next2 = e2_start;
- p->next = *startp;
- *startp = p;
-
- p = new();
- (*endp)->next = p;
- e2_end ->next = p;
- *endp = p;
- }
- LEAVE("expr");
-}
-
-/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/
-
-/* The same translations that were needed in the expr rules are needed again
- * here:
- *
- * cat_expr -> cat_expr | factor
- * factor
- *
- * is translated to:
- *
- * cat_expr -> factor cat_expr'
- * cat_expr' -> | factor cat_expr'
- * epsilon
- */
-void cat_expr(NFA **startp, NFA **endp )
-{
- NFA *e2_start;
- NFA *e2_end;
-
- ENTER("cat_expr");
-
- if (first_in_cat(Current_tok))
- factor(startp, endp);
-
- while(first_in_cat(Current_tok)) {
- factor(&e2_start, &e2_end);
-
- memcpy(*endp, e2_start, sizeof(NFA));
- discard(e2_start);
-
- *endp = e2_end;
- }
-
- LEAVE("cat_expr");
-}
-
-/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/
-
-int first_in_cat(TOKEN tok)
-{
- switch(tok) {
- case CLOSE_PAREN:
- case AT_EOL:
- case OR:
- case EOS: return 0;
-
- case CLOSURE:
- case PLUS_CLOSE:
- case OPTIONAL: parse_err( E_CLOSE ); return 0;
-
- case CCL_END: parse_err( E_BRACKET ); return 0;
- case AT_BOL: parse_err( E_BOL ); return 0;
- }
-
- return 1;
-}
-
-/*
- * factor --> term* | term+ | term?
- */
-void factor(NFA **startp, NFA **endp)
-{
- NFA *start;
- NFA *end;
-
- ENTER("factor");
-
- term(startp, endp);
-
- if (MATCH(CLOSURE) || MATCH(PLUS_CLOSE) || MATCH(OPTIONAL)) {
- start = new();
- end = new();
- start->next = *startp;
- (*endp)->next = end;
-
- // * or ?
- if (MATCH(CLOSURE) || MATCH(OPTIONAL))
- start->next2 = end;
-
- // * or +
- if (MATCH(CLOSURE) || MATCH(PLUS_CLOSE))
- (*endp)->next2 = *startp;
-
- *startp = start;
- *endp = end;
- advance();
- }
-
- LEAVE("factor");
-}
-
-
-/* Process the term productions:
- *
- * term --> [...] | [^...] | [] | [^] | . | (expr) | <character>
- *
- * The [] is nonstandard. It matches a space, tab, formfeed, or newline,
- * but not a carriage return (\r). All of these are single nodes in the
- * NFA.
- */
-void term(NFA **startp, NFA **endp)
-{
- NFA *start;
- int c;
-
- ENTER("term");
-
- if (MATCH(OPEN_PAREN)) {
- advance();
- expr(startp, endp);
- if (MATCH(CLOSE_PAREN))
- advance();
- else
- parse_err(E_PAREN);
- } else {
- *startp = start = new();
- *endp = start->next = new();
-
- if (!(MATCH(ANY) || MATCH(CCL_START))) {
- start->edge = Lexeme;
- advance();
- } else {
- start->edge = CCL;
-
- if (!(start->bitset = newset()))
- parse_err(E_MEM);
-
- /* dot (.) */
- if (MATCH(ANY)) {
- ADD(start->bitset, '\n');
-
- if(!Unix)
- ADD(start->bitset, '\r');
-
- COMPLEMENT(start->bitset);
- } else {
- advance();
- /* Negative character class */
- if (MATCH(AT_BOL)) {
- advance();
-
- /* Don't include \n in class */
- ADD(start->bitset, '\n');
-
- if (!Unix)
- ADD(start->bitset, '\r');
-
- COMPLEMENT(start->bitset);
- }
-
- if (!MATCH(CCL_END)) {
- dodash(start->bitset);
- } else { // [] or [^]
- for (c=0; c<=' '; ++c)
- ADD(start->bitset, c);
- }
- }
- advance();
- }
- }
- LEAVE("term");
-}
-
-/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
-
-void dodash(SET *set)
-{
- register int first;
-
- /* Treat [-...] as a literal dash, but print a warning. */
- if (MATCH(DASH)) {
- warning(W_STARTDASH);
- ADD(set, Lexeme);
- advance();
- }
-
- for (; !MATCH(EOS) && !MATCH(CCL_END); advance()) {
- if (!MATCH(DASH)) {
- first = Lexeme;
- ADD(set, Lexeme);
- /* Looking at a dash */
- } else {
- advance();
- /* Treat [...-] as literal */
- if (MATCH(CCL_END)) {
- warning(W_ENDDASH);
- ADD(set, '-');
- } else {
- for (; first<=Lexeme; first++)
- ADD(set, first);
- }
- }
- }
-}
-}
-
-
-/* Access routine to this module. Return a pointer to a NFA transition
- * table that represents the regular expression pointed to by expr or
- * NULL if there's not enough memory. Modify *max_state to reflect the
- * largest state number used. This number will probably be a larger
- * number than the total number of states. Modify *start_state to point
- * to the start state. This pointer is garbage if thompson() returned 0.
- * The memory for the table is fetched from malloc(); use free() to
- * discard it.
- */
-NFA *thompson(char *(*input_function) P((void)), int *max_state, NFA **start_state)
-{
- CLEAR_STACK();
-
- Ifunct = input_function;
-
- /* Load first token */
- Current_tok = EOS;
- advance();
-
- Nstates = 0;
- Next_alloc = 0;
-
- *start_state = machine(); // Manufacture the NFA
- *max_state = Next_alloc ; // Max state # in NFA
-
- if (Verbose > 1)
- print_nfa(Nfa_states, *max_state, *start_state);
-
- if(Verbose) {
- printf("%d/%d NFA states used.\n", *max_state, NFA_MAX);
- printf("%s/%d bytes used for accept strings.\n\n", save(NULL), STR_MAX);
- }
-
- return Nfa_states;
-}
-
-
-#ifdef MAIN
-
-#define ALLOC
-#include "globals.h"
-
-/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
-/* For debugging, print a single NFA structure. */
-void pnode(NFA * nfa)
-{
- if (!nfa)
- printf("(NULL)");
- else {
- printf("+--Structure at 0x%04x-------------------\n", nfa);
- printf("|edge = 0x%x (%c)\n", nfa->edge, nfa->edge);
- printf("|next = 0x%x\n", nfa->next);
- printf("|next2 = 0x%x\n", nfa->next2);
- printf("|accept = <%s>\n", nfa->accept);
- printf("|bitset = ");
- printccl(nfa->bitset);
- printf("\n");
- printf("+----------------------------------------\n");
- }
-}
-
-/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
-
-char *getline()
-{
- static char buf[80];
-
- printf("%d: ", Lineno++);
- gets(buf);
-
- return *buf ? buf : NULL;
-}
-
-/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
-
-main(int argc, char **argv )
-{
- NFA *nfa;
- NFA *start_state;
- int max_state;
-
- Verbose = 2;
-
- nfa = thompson(getline, &max_state, &start_state);
-}
-
-#endif
-
-
diff --git a/tom.h b/tom.h
deleted file mode 100644
index a12498b..0000000
--- a/tom.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef _TOMLEXER_H
-#define _TOMLEXER_H
-
-/*
- * An nfa_t is the non-deterministic finite automaton (NFA) produced
- * in the Thompson construction.
- */
-
-typedef struct nfa_t {
- int edge; // Edge label: char, CCL, EMPTY, or EPSILON.
- char *bitset; // Set to store character class.
- NFA *next; // Next state (NULL if no next state).
- NFA *next2; // Another next state if edge == EPSILON.
- char *accept; // NULL if !accepting state, else the action.
- int anchor; // Says whether pattern is anchored and where.
-} NFA;
-
-/* Non-character edge values */
-#define EPSILON -1
-#define CCL -2
-#define EMPTY -3
-
-/* Anchor field values (e.g. regex $, ^) */
-#define NONE 0
-#define START 1
-#define END 2
-#define BOTH (START | END)
-
-/*
- * Maximum number of NFA states in a single machine.
- * NFA_MAX * sizeof(struct nfa_t *) can't exceed 64k.
- */
-#define NFA_MAX 512
-
-/*
- * Total space that can be used by the accept strings.
- */
-#define STR_MAX (10 * 1024)
-
-void new_macro(char *definition);
-void printmacs(void);
-NFA *thompson(char *(*input_func)(), int *max_state, NFA **start_state);
-void print_nfa(NFA *nfa, int len, NFA *start);
-
-