summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJason Linehan <patientulysses@gmail.com>2012-07-22 13:15:39 -0400
committerJason Linehan <patientulysses@gmail.com>2012-07-22 13:15:39 -0400
commitc32c15e9f4229abea608b15177824edffeb0440c (patch)
treed7d3cfec3291961be42925e741a904192a7d4434
downloadplex-c32c15e9f4229abea608b15177824edffeb0440c.tar.gz
plex-c32c15e9f4229abea608b15177824edffeb0440c.tar.bz2
plex-c32c15e9f4229abea608b15177824edffeb0440c.zip
First commit.
-rw-r--r--Makefile21
-rw-r--r--tlex.c251
-rw-r--r--tlex.h0
-rw-r--r--tom.c988
-rw-r--r--tom.h45
5 files changed, 1305 insertions, 0 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..415fb26
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,21 @@
+CC=gcc
+#
+#
+# optimize warnings
+# lvl 3 \ /
+CFLAGS=-O3 -Wall
+LDFLAGS=
+#
+#
+#
+
+SOURCES=test.c bloom.c
+OBJECTS=$(SOURCES:.c=.o)
+
+EXECUTABLE=test
+
+all: $(SOURCES)
+ $(CC) $(CFLAGS) $(LDFLAGS) $(SOURCES) -o $(EXECUTABLE)
+
+clean:
+ rm -f $(OBJECTS) $(EXECUTABLE) gmon.out
diff --git a/tlex.c b/tlex.c
new file mode 100644
index 0000000..6f451eb
--- /dev/null
+++ b/tlex.c
@@ -0,0 +1,251 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <stdarg.h>
+#include "nfa.h"
+#include "dfa.h"
+
+void cmd_line_error P(( int usage, char *fmt, ... ));
+void do_file P(( void ));
+void head P(( int suppress_output ));
+void tail P(( void ));
+void strip_comments P((char *string ));
+
+
+/*
+ * Name of DFA transition table. Up to 3 characters are appended
+ * to the end of this name in the row-compressed tables.
+ */
+#define DTRAN_NAME "Yy_nxt"
+#define MAXINP 2048 // Max rule size
+#define E(x) fprintf(stderr,"%s\n", x)
+
+/*
+ * Assorted options.
+ */
+int VERBOSE = 0; // Print stats
+int NOLINE = 0; // Suppress #line directives
+int UNIX = 1; // Use UNIX newlines.
+int Public = 1; // Make static symbols public.
+char *TEMPLATE = "lex.par"; // Driver template for the state machine.
+int LINENO = 1; // Current input line number.
+int LINEMARK = 1; // Line number of the first line in a multi-line rule.
+
+/*
+ * Command line switches.
+ */
+int COL_COMPRESS = 1;
+int NO_COMPRESS = 0;
+int THRESHOLD = 4;
+int NO_HEAD = 0;
+int HEAD_ONLY = 0;
+
+/*
+ * Input and output files.
+ */
+char IBUF[MAXINP]; // Line buffer for input.
+char *IFILENAME; // Input filename
+FILE *INPUT; // Input file stream
+FILE *OUTPUT; // Output file stream
+
+
+
+/**
+ * MAIN
+ */
+void main(int argc, char argv[])
+{
+ static char *p;
+
+ if (INPUT = fopen(*argv,"r"))
+ IFILENAME = *argv;
+ else
+ halt(SIGABRT,"Can't open input file %s", *argv);
+
+ do_file ();
+ fclose (INPUT);
+ exit (0);
+}
+
+
+/**
+ * do_file
+ * ???
+ */
+void do_file(void)
+{
+ int nstates; // Number of DFA states
+ ROW *dtran; // Transition table.
+ ACCEPT *accept; // Set of accept states.
+ int i;
+
+ /* Process the input file */
+
+ /* Print everything up to first %% */
+ head(HEAD_ONLY);
+
+ /* Make the DFA */
+ nstates = min_dfa(get_expr, &dtran, &accept);
+
+ printf("%d out of %d DFA states in min machine\n", nstates, DFA_MAX);
+ printf("%d bytes required for min tables\n\n",
+ nstates * MAX_CHARS * sizeof(TTYPE) /* dtran */
+ + nstates * sizeof(TTYPE) ); /* accept */
+
+ /* First part of driver */
+
+ if (!driver_1(OUTPUT, !NOLINE, TEMPLATE)) {
+ perror(TEMPLATE);
+ exit(1);
+ }
+
+ /* Compressed tables */
+ if (NO_COMPRESS) {
+ fprintf (OUTPUT ,"YYPRIVATE YY_TTYPE %s[ %d ][ %d ] =\n",
+ DTRAN_NAME, nstates, MAX_CHARS);
+
+ print_array(OUTPUT, (int *)dtran, nstates, MAX_CHARS);
+ defnext (OUTPUT, DTRAN_NAME);
+ /* Column compressed tables */
+ } else if(COL_COMPRESS) {
+ i = squash (OUTPUT, dtran, nstates, MAX_CHARS, DTRAN_NAME);
+ cnext(OUTPUT, DTRAN_NAME);
+
+ printf("%d bytes required for column-compressed tables\n\n",
+ i /* dtran */
+ + (nstates * sizeof(int)) ); /* Yy_accept */
+ /* Pair-compressed tables */
+ } else {
+ i = pairs(OUTPUT, (int *)dtran, nstates,MAX_CHARS, DTRAN_NAME, THRESHOLD, 0);
+
+ if (VERBOSE) {
+ /* Figure the space occupied for the various tables. The
+ * Microsoft compiler uses roughly 100 bytes for the yy_next()
+ * subroutine. Column compression does yy_next in line with a
+ * macro so the overhead is negligible.
+ */
+ i = (i * sizeof(TTYPE )) /* YysDD arrays */
+ + (nstates * sizeof(TTYPE*)) /* Dtran[] */
+ + (nstates * sizeof(TTYPE )) /* Yy_accept[] */
+ + 100 ; /* yy_next() */
+
+ printf("%d bytes required for pair-compressed tables\n", i );
+ }
+ pnext( OUTPUT, DTRAN_NAME );
+ }
+
+ /* Print the rest of the driver and everyting after the second %% */
+ pdriver(OUTPUT, nstates, accept);
+ tail();
+}
+
+
+/*
+ * Head processes everything up to the first %%. Any lines that begin
+ * with white space or are surrounded by %{ and %} are passed to the
+ * output. All other lines are assumed to be macro definitions.
+ * A %% can not be concealed in a %{ %} but it must be anchored at start
+ * of line so a %% in a printf statement (for example) is passed to the
+ * output correctly. Similarly, a %{ and %} must be the first two characters
+ * on the line.
+ */
+void head(int suppress_output)
+{
+ int transparent = 0; // True if in a %{ %} block
+
+ if (!suppress_output && Public)
+ fputs("#define YYPRIVATE\n\n", OUTPUT);
+
+ if (!NOLINE)
+ fprintf(OUTPUT, "#line 1 \"%s\"\n", IFILENAME);
+
+ while (fgets( IBUF, MAXINP, INPUT)) {
+ ++ LINENO;
+
+ /* Don't strip comments. */
+ if (!transparent)
+ strip_comments(IBUF);
+
+ if (VERBOSE > 1)
+ printf("h%d: %s", LINENO, IBUF);
+
+ if (IBUF[0] == '%') {
+ if (IBUF[1] == '%') {
+ if (!suppress_output)
+ fputs("\n", OUTPUT);
+ break;
+ } else {
+ /* }{ */
+ if (IBUF[1] == '{')
+ transparent = 1;
+
+ else if (IBUF[1] == '}')
+ transparent = 0;
+
+ else
+ lerror(0, "Ignoring illegal %%%c directive\n", IBUF[1]);
+ }
+ } else if (transparent || isspace(IBUF[0])) {
+ if (!suppress_output)
+ fputs(IBUF, OUTPUT);
+ } else {
+ new_macro(IBUF);
+ /*
+ * Replace macro def with a blank line so that the
+ * line numbers won't get messed up.
+ */
+ if (!suppress_output)
+ fputs("\n", OUTPUT);
+ }
+ }
+ if (VERBOSE > 1)
+ printmacs();
+}
+
+
+/*
+ * Scan through the string, replacing C-like comments with space
+ * characters. Multiple-line comments are supported.
+ */
+void strip_comments(char *string)
+{
+ static int incomment = 0;
+
+ for (; *string ; ++string) {
+ if (incomment) {
+ if (string[0]=='*' && string[1]=='/') {
+ incomment = 0;
+ *string++ = ' ';
+ }
+
+ if (!isspace(*string)) {
+ *string = ' ';
+ }
+ } else {
+ if (string[0]=='/' && string[1]=='*') {
+ incomment = 1;
+ *string++ = ' ';
+ *string = ' ';
+ }
+ }
+ }
+}
+
+/**
+ * Throw away the line that had the %% on it.
+ */
+void tail(void)
+{
+ fgets(IBUF, MAXINP, INPUT);
+
+ if (!NOLINE)
+ fprintf(OUTPUT, "#line %d \"%s\"\n", LINENO + 1, IFILENAME);
+
+ while (fgets(IBUF, MAXINP, INPUT)) {
+ if (VERBOSE > 1)
+ printf("t%d: %s", LINENO++, IBUF);
+
+ fputs(IBUF, OUTPUT);
+ }
+}
+
diff --git a/tlex.h b/tlex.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tlex.h
diff --git a/tom.c b/tom.c
new file mode 100644
index 0000000..926af21
--- /dev/null
+++ b/tom.c
@@ -0,0 +1,988 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "common/textutils.h"
+#include "common/set.h"
+#include "common/error.h"
+#include "common/hash.h"
+#include "common/textutils.h"
+
+#include "nfa.h"
+
+#include "dfa.h" /* prototype for lerror(). */
+#include "globals.h" /* externs for Verbose, etc. */
+
+#ifdef DEBUG
+ int Lev = 0;
+# define ENTER(f) printf("%*senter %s [%c][%1.10s] \n", \
+ Lev++ * 4, "", f, Lexeme, Input)
+# define LEAVE(f) printf("%*sleave %s [%c][%1.10s]\n", \
+ --Lev * 4, "", f, Lexeme, Input)
+#else
+# define ENTER(f)
+# define LEAVE(f)
+#endif
+
+/*
+ * Error processing stuff. Note that all errors are fatal.
+ */
+typedef enum ERR_NUM
+{
+ E_MEM, /* Out of memory */
+ E_BADEXPR, /* Malformed regular expression */
+ E_PAREN, /* Missing close parenthesis */
+ E_STACK, /* Internal error: Discard stack full */
+ E_LENGTH, /* Too many regular expressions */
+ E_BRACKET, /* Missing [ in character class */
+ E_BOL, /* ^ must be at start of expr or ccl */
+ E_CLOSE, /* + ? or * must follow expression */
+ E_STRINGS, /* Too many characters in accept actions */
+ E_NEWLINE, /* Newline in quoted string */
+ E_BADMAC, /* Missing } in macro expansion */
+ E_NOMAC, /* Macro doesn't exist */
+ E_MACDEPTH /* Macro expansions nested too deeply. */
+
+} ERR_NUM;
+
+
+char *ERROR[] = /* Indexed by ERR_NUM */
+{
+ "Not enough memory for NFA",
+ "Malformed regular expression",
+ "Missing close parenthesis",
+ "Internal error: Discard stack full",
+ "Too many regular expressions or expression too long",
+ "Missing [ in character class",
+ "^ must be at start of expression or after [",
+ "+ ? or * must follow an expression or subexpression",
+ "Too many characters in accept actions",
+ "Newline in quoted string, use \\n to get newline into expression",
+ "Missing } in macro expansion",
+ "Macro doesn't exist",
+ "Macro expansions nested too deeply"
+};
+
+typedef enum WARN_NUM
+{
+ W_STARTDASH, /* Dash at start of character class */
+ W_ENDDASH /* Dash at end of character class */
+
+} WARN_NUM;
+
+
+PRIVATE char *WARNING[] = /* Indexed by WARN_NUM */
+{
+ "Treating dash in [-...] as a literal dash",
+ "Treating dash in [...-] as a literal dash"
+};
+
+
+NFA *Nfa_states ; /* State-machine array */
+int Nstates = 0 ; /* # of NFA states in machine */
+int Next_alloc; /* Index of next element of the array */
+
+#define SSIZE 32
+
+PRIVATE NFA *Sstack[ SSIZE ]; /* Stack used by new() */
+PRIVATE NFA **Sp = &Sstack[ -1 ]; /* Stack pointer */
+
+#define STACK_OK() ( INBOUNDS(Sstack, Sp) ) /* true if stack not */
+ /* full or empty */
+#define STACK_USED() ( (int)(Sp-Sstack) + 1 ) /* slots used */
+#define CLEAR_STACK() ( Sp = Sstack - 1 ) /* reset the stack */
+#define PUSH(x) ( *++Sp = (x) ) /* put x on stack */
+#define POP() ( *Sp-- ) /* get x from stack */
+
+
+/*--------------------------------------------------------------
+ * MACRO support:
+ */
+
+#define MAC_NAME_MAX 34 /* Maximum name length */
+#define MAC_TEXT_MAX 80 /* Maximum amount of expansion text */
+
+typedef struct MACRO
+{
+ char name[ MAC_NAME_MAX ];
+ char text[ MAC_TEXT_MAX ];
+
+} MACRO;
+
+HASH_TAB *Macros; /* Symbol table for macro definitions */
+
+typedef enum TOKEN
+{
+ EOS = 1, /* end of string */
+ ANY, /* . */
+ AT_BOL, /* ^ */
+ AT_EOL, /* $ */
+ CCL_END, /* ] */
+ CCL_START, /* [ */
+ CLOSE_CURLY, /* } */
+ CLOSE_PAREN, /* ) */
+ CLOSURE, /* * */
+ DASH, /* - */
+ END_OF_INPUT, /* EOF */
+ L, /* literal character */
+ OPEN_CURLY, /* { */
+ OPEN_PAREN, /* ( */
+ OPTIONAL, /* ? */
+ OR, /* | */
+ PLUS_CLOSE /* + */
+
+} TOKEN;
+
+
+TOKEN Tokmap[] =
+{
+/* ^@ ^A ^B ^C ^D ^E ^F ^G ^H ^I ^J ^K ^L ^M ^N */
+ L, L, L, L, L, L, L, L, L, L, L, L, L, L, L,
+
+/* ^O ^P ^Q ^R ^S ^T ^U ^V ^W ^X ^Y ^Z ^[ ^\ ^] */
+ L, L, L, L, L, L, L, L, L, L, L, L, L, L, L,
+
+/* ^^ ^_ SPACE ! " # $ % & ' */
+ L, L, L, L, L, L, AT_EOL, L, L, L,
+
+/* ( ) * + , - . */
+ OPEN_PAREN, CLOSE_PAREN, CLOSURE, PLUS_CLOSE, L, DASH, ANY,
+
+/* / 0 1 2 3 4 5 6 7 8 9 : ; < = */
+ L, L, L, L, L, L, L, L, L, L, L, L, L, L, L,
+
+/* > ? */
+ L, OPTIONAL,
+
+/* @ A B C D E F G H I J K L M N */
+ L, L, L, L, L, L, L, L, L, L, L, L, L, L, L,
+
+/* O P Q R S T U V W X Y Z */
+ L, L, L, L, L, L, L, L, L, L, L, L,
+
+/* [ \ ] ^ */
+ CCL_START, L, CCL_END, AT_BOL,
+
+/* _ ` a b c d e f g h i j k l m */
+ L, L, L, L, L, L, L, L, L, L, L, L, L, L, L,
+
+/* n o p q r s t u v w x y z */
+ L, L, L, L, L, L, L, L, L, L, L, L, L,
+
+/* { | } DEL */
+ OPEN_CURLY, OR, CLOSE_CURLY, L
+};
+
+char *(*Ifunct) P((void)) ; /* Input function pointer */
+char *Input = "" ; /* Current position in input string */
+char *S_input ; /* Beginning of input string */
+TOKEN Current_tok ; /* Current token */
+int Lexeme ; /* Value associated with LITERAL */
+
+#define MATCH(t) (Current_tok == (t))
+
+void parse_err P(( ERR_NUM type ));
+void warning P(( WARN_NUM type ));
+void errmsg P(( int type, char **table, char *msgtype ));
+NFA *new P(( void ));
+void discard P(( NFA *nfa_to_discard ));
+char *save P(( char *str ));
+char *get_macro P(( char **namep ));
+void print_a_macro P(( MACRO *mac ));
+TOKEN advance P(( void ));
+NFA *machine P(( void ));
+NFA *rule P(( void ));
+void expr P(( NFA **startp, NFA **endp ));
+void cat_expr P(( NFA **startp, NFA **endp ));
+int first_in_cat P(( TOKEN tok ));
+void factor P(( NFA **startp, NFA **endp ));
+void term P(( NFA **startp, NFA **endp ));
+void dodash P(( SET *set ));
+
+
+
+void warning(WARN_NUM type)
+{
+ errmsg( (int)type, WARNING, "WARNING" );
+}
+
+
+void parse_err(ERR_NUM type)
+{
+ errmsg( (int)type, ERROR, "ERROR" );
+ exit(1);
+}
+
+
+/* Highlight point of error */
+void errmsg(int type, char **table, char *msgtype )
+{
+ char *p;
+ fprintf(stderr, "%s (line %d) %s\n", msgtype, Actual_lineno, table[type] );
+
+ for( p = S_input; ++p <= Input; putc('-', stderr) )
+ ;
+ fprintf( stderr, "v\n%s\n", S_input );
+ exit( 1 );
+}
+
+/*--------------------------------------------------------------*/
+
+NFA *new(void) /* NFA management functions */
+{
+ static int first = 1;
+ NFA *new;
+
+ if (first) {
+ if (!(Nfa_states = (NFA *) calloc(NFA_MAX, sizeof(NFA))))
+ parse_err(E_MEM);
+
+ first = 0;
+ Sp = &Sstack[ -1 ];
+ }
+
+ if (++Nstates >= NFA_MAX)
+ parse_err(E_LENGTH);
+
+ /* If the stack is not ok, it's empty */
+
+ p = !STACK_OK() ? &Nfa_states[Next_alloc++] : POP();
+ p->edge = EPSILON;
+
+ return p;
+}
+
+/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+void discard(NFA *nfa)
+{
+ --Nstates;
+
+ memset(nfa, 0, sizeof(NFA));
+ nfa->edge = EMPTY ;
+ PUSH(nfa);
+
+ if(!STACK_OK())
+ parse_err(E_STACK);
+}
+
+/*----------------------------------------------------------------------*/
+
+/* String management */
+char *save(char *str)
+{
+ char *textp, *startp;
+ int len;
+ static int first = 1;
+ static char size[8]; /* Query-mode size */
+ static int *strings; /* Place to save accepting strings */
+ static int *savep; /* Current position in strings array. */
+
+ if (first) {
+ if (!(savep = strings = (int *)malloc(STR_MAX)))
+ parse_err(E_MEM);
+ first_time = 0;
+ }
+
+ /* Query mode. Returns number of bytes in use. */
+ if (!str) {
+ sprintf( size, "%ld", (long)(savep - strings) );
+ return size;
+ }
+
+ if (*str == '|')
+ return (char *)(savep + 1);
+
+ *savep++ = Lineno;
+
+ for (textp = (char *)savep; *str; *textp++ = *str++) {
+ if (textp >= (char *)strings + (STR_MAX-1))
+ parse_err(E_STRINGS);
+ }
+
+ *textp++ = '\0' ;
+
+ /*
+ * Increment savep past the text. "len" is initialized
+ * to the string length. The "len/sizeof(int)" truncates
+ * the size down to an even multiple of the current int size.
+ * The "+(len % sizeof(int) != 0)" adds 1 to the truncated
+ * size if the string length isn't an even multiple of the int
+ * size (the != operator evaluates to 1 or 0). Return a pointer
+ * to the string itself.
+ */
+
+ startp = (char *)savep;
+ len = textp - startp;
+ savep += (len / sizeof(int)) + (len % sizeof(int) != 0);
+
+ return startp;
+}
+
+/*------------------------------------------------------------*/
+
+/*
+ * Add a new macro to the table. If two macros have the same name, the
+ * second one takes precedence. A definition takes the form:
+ * name <whitespace> text [<whitespace>]
+ * whitespace at the end of the line is ignored.
+ */
+void new_macro(char *def)
+{
+ unsigned hash_add();
+
+ char *name; /* Name component of macro definition */
+ char *text; /* text part of macro definition */
+ char *edef; /* pointer to end of text part */
+ MACRO *p;
+ static int first = 1;
+
+ if (first) {
+ first = 0;
+ Macros = maketab(31, hash_add, strcmp);
+ }
+
+ /* Isolate name */
+ for (name = def; *def && !isspace(*def); def++)
+ ;
+
+ if (*def)
+ *def++ = '\0' ;
+
+ /*
+ * Isolate the definition text. This process is complicated
+ * because you need to discard any trailing whitespace on the
+ * line. The first while loop skips the preceding whitespace.
+ * The for loop is looking for end of string. If you find a
+ * white character (and the \n at the end of string is white),
+ * remember the position as a potential end of string.
+ */
+ while (isspace(*def))
+ ++def;
+
+ text = def;
+
+ edef = NULL;
+
+ while (*def) {
+ if (!isspace(*def))
+ ++def;
+ else
+ for (edef = def++; isspace(*def); ++def)
+ ;
+ }
+
+ if (edef)
+ *edef = '\0';
+
+ /* Add the macro to the symbol table */
+ p = (MACRO *) newsym(sizeof(MACRO));
+ slcpy(p->name, name, MAC_NAME_MAX);
+ slcpy(p->text, text, MAC_TEXT_MAX);
+ addsym(Macros, p);
+
+ D(printf("Added macro definition, macro table now is:\n");)
+ D(print_macs();)
+}
+
+/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/
+
+/*
+ * Return a pointer to the contents of a macro having the indicated
+ * name. Abort with a message if no macro exists. The macro name includes
+ * the brackets. *namep is modified to point past the close brace.
+ */
+char *get_macro(char **namep)
+{
+ char *p;
+ MACRO *mac;
+
+ /* Hit a '{', skip it and find '}'. */
+ if (!(p = strchr(++(*namep), '}')))
+ parse_err(E_BADMAC);
+ else {
+ /* Overwrite the close brace. */
+ *p = '\0';
+
+ if (!(mac = (MACRO *)findsym(Macros, *namep)))
+ parse_err(E_NOMAC);
+
+ /* Replace the brace. */
+ *p++ = '}';
+
+ /* Update name pointer */
+ *namep = p;
+
+ return mac->text;
+ }
+
+ /* You shouldn't be here. */
+ return "ERROR";
+}
+
+/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/
+
+/* Workhorse function */
+void print_a_macro(MACRO *mac)
+{
+ printf("%-16s--[%s]--\n", mac->name, mac->text);
+}
+
+/* Print all macros to stdout. */
+void printmacs(void)
+{
+ if (!Macros)
+ printf("\tThere are no macros\n");
+ else {
+ printf("\nMACROS:\n");
+ ptab(Macros, (void(*)P((void*,...))) print_a_macro, NULL, 1);
+ }
+}
+
+
+/*----------------------------------------------------------------
+ * Lexical analyzer:
+ *
+ * Lexical analysis is trivial because all lexemes are single-character values.
+ * The only complications are escape sequences and quoted strings, both
+ * of which are handled by advance(), below. This routine advances past the
+ * current token, putting the new token into Current_tok and the equivalent
+ * lexeme into Lexeme. If the character was escaped, Lexeme holds the actual
+ * value. For example, if a "\s" is encountered, Lexeme will hold a space
+ * character. The MATCH(x) macro returns true if x matches the current token.
+ * Advance both modifies Current_tok to the current token and returns it.
+ *
+ * Macro expansion is handled by means of a stack (declared at the top
+ * of the subroutine). When an expansion request is encountered, the
+ * current input buffer is stacked, and input is read from the macro
+ * text. This process repeats with nested macros, so SSIZE controls
+ * the maximum macro-nesting depth.
+ */
+
+TOKEN advance(void)
+{
+ static int inquote = 0; // When processing quoted string
+ int saw_esc; // Saw a backslash escape
+ static char *stack[SSIZE] // Import stack
+ static char **sp = NULL; // Stack pointer
+
+ /* Initialize the stack pointer. */
+ if (!sp)
+ sp = stack - 1; // Necessary for a large model.
+
+ /* Get another line */
+ if (Current_tok == EOS) {
+ if(inquote)
+ parse_err(E_NEWLINE);
+ /*
+ * Sit in this loop until a non-blank line is read
+ * into the "Input" array.
+ */
+ do {
+
+ /* Then at the end of file. */
+ if (!(Input = (*Ifunct)())) {
+ Current_tok = END_OF_INPUT;
+ goto exit;
+ }
+ /* Ignore leading whitespace. */
+ while (isspace(*Input))
+ Input++;
+
+ } while (!*Input); /* Ignore blank lines. */
+
+ S_input = Input; // Remember start of line for errors.
+ }
+
+ while (*Input == '\0') {
+ /* Restore previous input source. */
+ if (INBOUNDS(stack, sp)) {
+ Input = *sp--;
+ continue;
+ }
+
+ /*
+ * No more input sources to restore; you're at the real end
+ * of the input string.
+ */
+ Current_tok = EOS;
+ Lexeme = '\0';
+ goto exit;
+ }
+
+ if (!inquote) {
+ /* Macro expansion required. */
+ while(*Input == '{') {
+ /* Stack current input string. */
+ *++sp = Input;
+ /* Use macro body as input string. */
+ Input = get_macro(sp);
+
+ if(TOOHIGH(stack,sp))
+ parse_err(E_MACDEPTH); // Stack overflow!
+ }
+ }
+
+ /*
+ * At either start or end of a quoted string. All characters are
+ * treated as literals while inquote is true.
+ */
+ if (*Input == '"') {
+ inquote = ~inquote;
+ if (!*++Input) {
+ Current_tok = EOS;
+ Lexeme = '\0';
+ goto exit;
+ }
+ }
+
+ saw_esc = (*Input == '\\');
+
+ if(!inquote) {
+ if(isspace(*Input)) {
+ Current_tok = EOS;
+ Lexeme = '\0';
+ goto exit;
+ }
+ Lexeme = esc(&Input);
+ } else {
+ if (saw_esc && Input[1] == '"') {
+ /* Skip the escaped character. */
+ Input += 2;
+ Lexeme = '"';
+ } else {
+ Lexeme = *Input++;
+ }
+ }
+
+ Current_tok = (inquote || saw_esc) ? L : Tokmap[Lexeme];
+
+ exit:
+ return Current_tok;
+}
+
+
+/*--------------------------------------------------------------
+ * The Parser:
+ * A simple recursive descent parser that creates a Thompson NFA for
+ * a regular expression. The access routine [thompson()] is at the
+ * bottom. The NFA is created as a directed graph, with each node
+ * containing pointer's to the next node. Since the structures are
+ * allocated from an array, the machine can also be considered
+ * as an array where the state number is the array index.
+ */
+
+NFA *machine()
+{
+ NFA *start;
+ NFA *p;
+
+ ENTER("machine");
+
+ p = start = new();
+ p->next = rule();
+
+ while(!MATCH(END_OF_INPUT)) {
+ p->next2 = new();
+ p = p->next2;
+ p->next = rule();
+ }
+
+ LEAVE("machine");
+
+ return start;
+}
+
+/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/
+
+/* rule --> expr EOS action
+* ^expr EOS action
+* expr$ EOS action
+*
+* action --> <tabs> <string of characters>
+* epsilon
+*/
+NFA *rule(void)
+{
+ NFA *start = NULL;
+ NFA *end = NULL;
+ int anchor = NONE;
+
+ ENTER("rule");
+
+ if (MATCH(AT_BOL)) {
+ start = new();
+ start->edge = '\n';
+ anchor |= START;
+ advance();
+ expr(&start->next, &end);
+ } else {
+ expr(&start, &end);
+ }
+
+ /*
+ * Pattern followed by a carriage-return or linefeed
+ * (use a character class).
+ */
+ if (MATCH(AT_EOL)) {
+ advance();
+ end->next = new();
+ end->edge = CCL;
+
+ if (!(end->bitset = newset()))
+ parse_err(E_MEM);
+
+ ADD(end->bitset, '\n');
+
+ if (!Unix)
+ ADD( end->bitset, '\r' );
+
+ end = end->next;
+ anchor |= END;
+ }
+
+ while (isspace(*Input))
+ Input++;
+
+ end->accept = save(Input);
+ end->anchor = anchor;
+ advance(); // Skip past EOS
+
+ LEAVE("rule");
+
+ return start;
+}
+
+
+/*
+ * Because a recursive descent compiler can't handle left recursion,
+ * the productions:
+ *
+ * expr -> expr OR cat_expr
+ * | cat_expr
+ *
+ * must be translated into:
+ *
+ * expr -> cat_expr expr'
+ * expr' -> OR cat_expr expr'
+ * epsilon
+ *
+ * which can be implemented with this loop:
+ *
+ * cat_expr
+ * while( match(OR) )
+ * cat_expr
+ * do the OR
+ */
+void expr(NFA **startp, NFA **endp)
+{
+ NFA *e2_start = NULL; /* expression to right of | */
+ NFA *e2_end = NULL;
+ NFA *p;
+
+ ENTER("expr");
+
+ cat_expr(startp, endp);
+
+ while(MATCH(OR)) {
+ advance();
+ cat_expr(&e2_start, &e2_end);
+
+ p = new();
+ p->next2 = e2_start;
+ p->next = *startp;
+ *startp = p;
+
+ p = new();
+ (*endp)->next = p;
+ e2_end ->next = p;
+ *endp = p;
+ }
+ LEAVE("expr");
+}
+
+/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/
+
+/* The same translations that were needed in the expr rules are needed again
+ * here:
+ *
+ * cat_expr -> cat_expr | factor
+ * factor
+ *
+ * is translated to:
+ *
+ * cat_expr -> factor cat_expr'
+ * cat_expr' -> | factor cat_expr'
+ * epsilon
+ */
+void cat_expr(NFA **startp, NFA **endp )
+{
+ NFA *e2_start;
+ NFA *e2_end;
+
+ ENTER("cat_expr");
+
+ if (first_in_cat(Current_tok))
+ factor(startp, endp);
+
+ while(first_in_cat(Current_tok)) {
+ factor(&e2_start, &e2_end);
+
+ memcpy(*endp, e2_start, sizeof(NFA));
+ discard(e2_start);
+
+ *endp = e2_end;
+ }
+
+ LEAVE("cat_expr");
+}
+
+/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/
+
+int first_in_cat(TOKEN tok)
+{
+ switch(tok) {
+ case CLOSE_PAREN:
+ case AT_EOL:
+ case OR:
+ case EOS: return 0;
+
+ case CLOSURE:
+ case PLUS_CLOSE:
+ case OPTIONAL: parse_err( E_CLOSE ); return 0;
+
+ case CCL_END: parse_err( E_BRACKET ); return 0;
+ case AT_BOL: parse_err( E_BOL ); return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * factor --> term* | term+ | term?
+ */
+void factor(NFA **startp, NFA **endp)
+{
+ NFA *start;
+ NFA *end;
+
+ ENTER("factor");
+
+ term(startp, endp);
+
+ if (MATCH(CLOSURE) || MATCH(PLUS_CLOSE) || MATCH(OPTIONAL)) {
+ start = new();
+ end = new();
+ start->next = *startp;
+ (*endp)->next = end;
+
+ // * or ?
+ if (MATCH(CLOSURE) || MATCH(OPTIONAL))
+ start->next2 = end;
+
+ // * or +
+ if (MATCH(CLOSURE) || MATCH(PLUS_CLOSE))
+ (*endp)->next2 = *startp;
+
+ *startp = start;
+ *endp = end;
+ advance();
+ }
+
+ LEAVE("factor");
+}
+
+
+/* Process the term productions:
+ *
+ * term --> [...] | [^...] | [] | [^] | . | (expr) | <character>
+ *
+ * The [] is nonstandard. It matches a space, tab, formfeed, or newline,
+ * but not a carriage return (\r). All of these are single nodes in the
+ * NFA.
+ */
+void term(NFA **startp, NFA **endp)
+{
+ NFA *start;
+ int c;
+
+ ENTER("term");
+
+ if (MATCH(OPEN_PAREN)) {
+ advance();
+ expr(startp, endp);
+ if (MATCH(CLOSE_PAREN))
+ advance();
+ else
+ parse_err(E_PAREN);
+ } else {
+ *startp = start = new();
+ *endp = start->next = new();
+
+ if (!(MATCH(ANY) || MATCH(CCL_START))) {
+ start->edge = Lexeme;
+ advance();
+ } else {
+ start->edge = CCL;
+
+ if (!(start->bitset = newset()))
+ parse_err(E_MEM);
+
+ /* dot (.) */
+ if (MATCH(ANY)) {
+ ADD(start->bitset, '\n');
+
+ if(!Unix)
+ ADD(start->bitset, '\r');
+
+ COMPLEMENT(start->bitset);
+ } else {
+ advance();
+ /* Negative character class */
+ if (MATCH(AT_BOL)) {
+ advance();
+
+ /* Don't include \n in class */
+ ADD(start->bitset, '\n');
+
+ if (!Unix)
+ ADD(start->bitset, '\r');
+
+ COMPLEMENT(start->bitset);
+ }
+
+ if (!MATCH(CCL_END)) {
+ dodash(start->bitset);
+ } else { // [] or [^]
+ for (c=0; c<=' '; ++c)
+ ADD(start->bitset, c);
+ }
+ }
+ advance();
+ }
+ }
+ LEAVE("term");
+}
+
+/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+void dodash(SET *set)
+{
+ register int first;
+
+ /* Treat [-...] as a literal dash, but print a warning. */
+ if (MATCH(DASH)) {
+ warning(W_STARTDASH);
+ ADD(set, Lexeme);
+ advance();
+ }
+
+ for (; !MATCH(EOS) && !MATCH(CCL_END); advance()) {
+ if (!MATCH(DASH)) {
+ first = Lexeme;
+ ADD(set, Lexeme);
+ /* Looking at a dash */
+ } else {
+ advance();
+ /* Treat [...-] as literal */
+ if (MATCH(CCL_END)) {
+ warning(W_ENDDASH);
+ ADD(set, '-');
+ } else {
+ for (; first<=Lexeme; first++)
+ ADD(set, first);
+ }
+ }
+ }
+}
+}
+
+
+/* Access routine to this module. Return a pointer to a NFA transition
+ * table that represents the regular expression pointed to by expr or
+ * NULL if there's not enough memory. Modify *max_state to reflect the
+ * largest state number used. This number will probably be a larger
+ * number than the total number of states. Modify *start_state to point
+ * to the start state. This pointer is garbage if thompson() returned 0.
+ * The memory for the table is fetched from malloc(); use free() to
+ * discard it.
+ */
+NFA *thompson(char *(*input_function) P((void)), int *max_state, NFA **start_state)
+{
+ CLEAR_STACK();
+
+ Ifunct = input_function;
+
+ /* Load first token */
+ Current_tok = EOS;
+ advance();
+
+ Nstates = 0;
+ Next_alloc = 0;
+
+ *start_state = machine(); // Manufacture the NFA
+ *max_state = Next_alloc ; // Max state # in NFA
+
+ if (Verbose > 1)
+ print_nfa(Nfa_states, *max_state, *start_state);
+
+ if(Verbose) {
+ printf("%d/%d NFA states used.\n", *max_state, NFA_MAX);
+ printf("%s/%d bytes used for accept strings.\n\n", save(NULL), STR_MAX);
+ }
+
+ return Nfa_states;
+}
+
+
+#ifdef MAIN
+
+#define ALLOC
+#include "globals.h"
+
+/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+/* For debugging, print a single NFA structure. */
+void pnode(NFA * nfa)
+{
+ if (!nfa)
+ printf("(NULL)");
+ else {
+ printf("+--Structure at 0x%04x-------------------\n", nfa);
+ printf("|edge = 0x%x (%c)\n", nfa->edge, nfa->edge);
+ printf("|next = 0x%x\n", nfa->next);
+ printf("|next2 = 0x%x\n", nfa->next2);
+ printf("|accept = <%s>\n", nfa->accept);
+ printf("|bitset = ");
+ printccl(nfa->bitset);
+ printf("\n");
+ printf("+----------------------------------------\n");
+ }
+}
+
+/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+char *getline()
+{
+ static char buf[80];
+
+ printf("%d: ", Lineno++);
+ gets(buf);
+
+ return *buf ? buf : NULL;
+}
+
+/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+main(int argc, char **argv )
+{
+ NFA *nfa;
+ NFA *start_state;
+ int max_state;
+
+ Verbose = 2;
+
+ nfa = thompson(getline, &max_state, &start_state);
+}
+
+#endif
+
+
diff --git a/tom.h b/tom.h
new file mode 100644
index 0000000..a12498b
--- /dev/null
+++ b/tom.h
@@ -0,0 +1,45 @@
+#ifndef _TOMLEXER_H
+#define _TOMLEXER_H
+
+/*
+ * An nfa_t is the non-deterministic finite automaton (NFA) produced
+ * in the Thompson construction.
+ */
+
+typedef struct nfa_t {
+ int edge; // Edge label: char, CCL, EMPTY, or EPSILON.
+ char *bitset; // Set to store character class.
+ NFA *next; // Next state (NULL if no next state).
+ NFA *next2; // Another next state if edge == EPSILON.
+ char *accept; // NULL if !accepting state, else the action.
+ int anchor; // Says whether pattern is anchored and where.
+} NFA;
+
+/* Non-character edge values */
+#define EPSILON -1
+#define CCL -2
+#define EMPTY -3
+
+/* Anchor field values (e.g. regex $, ^) */
+#define NONE 0
+#define START 1
+#define END 2
+#define BOTH (START | END)
+
+/*
+ * Maximum number of NFA states in a single machine.
+ * NFA_MAX * sizeof(struct nfa_t *) can't exceed 64k.
+ */
+#define NFA_MAX 512
+
+/*
+ * Total space that can be used by the accept strings.
+ */
+#define STR_MAX (10 * 1024)
+
+void new_macro(char *definition);
+void printmacs(void);
+NFA *thompson(char *(*input_func)(), int *max_state, NFA **start_state);
+void print_nfa(NFA *nfa, int len, NFA *start);
+
+