From 8590ffb64632762797efca1d684bbe1c16c18c7a Mon Sep 17 00:00:00 2001 From: gdr Date: Mon, 17 Nov 1997 04:13:08 +0000 Subject: [PATCH] These are Dave Tribby's changes that were necessary to make sed (which uses these routines) work correctly. See the file GNO.notes for more details. --- lib/libc/regex/GNO.notes | 71 +++++++++++++ lib/libc/regex/Makefile | 4 +- lib/libc/regex/engine.c | 9 +- lib/libc/regex/make.cmds | 28 ++++++ lib/libc/regex/regcomp.c | 209 ++++++++++++++++++++++++++++++++++++++- lib/libc/regex/regexec.c | 30 +++++- 6 files changed, 346 insertions(+), 5 deletions(-) create mode 100644 lib/libc/regex/GNO.notes create mode 100644 lib/libc/regex/make.cmds diff --git a/lib/libc/regex/GNO.notes b/lib/libc/regex/GNO.notes new file mode 100644 index 0000000..56803f3 --- /dev/null +++ b/lib/libc/regex/GNO.notes @@ -0,0 +1,71 @@ +Notes on the porting of regex to GNO + + Dave Tribby * November 7, 1997 + + +Devin Reade did the initial conversion of the BSD sources to compile +under GNO 2.0.6 headers with ORCA/C on the Apple IIGS. + +I completed the porting to the extent that the program sed (which +uses regex) works for many different test cases. + +The most time-consuming aspect of the port was finding all the places +where long integers should be used instead of int or unsigned int. + + +DEBUGGING + +The file engine.c is used in an interesting way by regexec.c: +it includes engine.c *twice*, after muchos fiddling with the +macros that code uses. This lets the same code operate on two +different representations for state sets. + +When it was necessary to do source-level debugging, Splat! got +very confused about line numbers until I added the following at +the beginning of the file: + #ifdef __ORCAC__ + #line 2 "engine.c" + #endif + +The regex code had a compilation macro, REDEBUG, that would +turn on diagnostic messages when defined. I added additional +output statements that are turned on by REDEBUG.
+ + +PERFORMANCE ENHANCEMENT + +The program sed took a long time to execute a complicated sed +program (provided with the BSD source code) that evaluates +expressions. I attempted to speed up the regex routines by +recoding two routines in assembly language (using the asm {} +construct available in ORCA/C). The two routines are +isinsets() and samesets() in regcomp.c. The C code was left +in place and can be turned on by compiling with the macro +__NOASM__ set. + +I also noticed an instance in regcomp.c where many fields of +a structure were individually set to 0 at initialization. I +recoded to set all the fields to 0 by a call to memset() and +then set only the non-zero fields individually. + +The following script was used to time the results: + + cd /src/gno/usr.bin/sed + echo '(4+4)*3' | time ./sed -f tests/math.sed + +The unmodified code took 51 seconds to run on my 8MHz Apple IIGS. +It took 49 seconds after I recoded isinsets() in asm, and 48 seconds +after I recoded samesets(). Initializing fields via memset() brought +it down to 47 seconds. + +I saw no other obvious candidates for recoding. The modest results +from these changes (even though they were made in routines that +ranked high in the profile) did not warrant further efforts. + + +BUILDING + +Because I did not rebuild all of libc, I created a library called +regex that only includes the regex routines. The commands I used +to build regex are included in the file make.cmds. I will leave it +to Devin to incorporate regex into the full libc build structure. diff --git a/lib/libc/regex/Makefile b/lib/libc/regex/Makefile index eef8820..e4af528 100644 --- a/lib/libc/regex/Makefile +++ b/lib/libc/regex/Makefile @@ -1,5 +1,5 @@ # -# $Id: Makefile,v 1.1 1997/10/08 07:07:49 gdr Exp $ +# $Id: Makefile,v 1.2 1997/11/17 04:13:07 gdr Exp $ # # Devin Reade, October 1997.
# @@ -11,7 +11,7 @@ OBJS = regcomp.o regerror.o regexec.o regfree.o # I've not bothered to determine what POSIX_MISTAKE means, but 4.4BSD # uses it. -CFLAGS += -DPOSIX_MISTAKE +CFLAGS += -v -r -DPOSIX_MISTAKE build .PHONY: ../libc obj: $(OBJS) diff --git a/lib/libc/regex/engine.c b/lib/libc/regex/engine.c index deaaa13..26484bd 100644 --- a/lib/libc/regex/engine.c +++ b/lib/libc/regex/engine.c @@ -1,3 +1,6 @@ +#ifdef __ORCAC__ /* For debugger since engine.c is included by regexec.c */ +#line 2 "engine.c" +#endif /*- * Copyright (c) 1992, 1993, 1994 Henry Spencer. * Copyright (c) 1992, 1993, 1994 @@ -36,7 +39,7 @@ * * @(#)engine.c 8.5 (Berkeley) 3/20/94 * - * $Id: engine.c,v 1.2 1997/10/08 07:07:50 gdr Exp $ + * $Id: engine.c,v 1.3 1997/11/17 04:13:07 gdr Exp $ */ #ifdef __ORCAC__ @@ -898,7 +901,11 @@ step(register struct re_guts *g, register sopno pc; register onestate here; /* note, macros know this name */ register sopno look; +#ifndef __ORCAC__ register int i; +#else + unsigned long i; +#endif for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { s = g->strip[pc]; diff --git a/lib/libc/regex/make.cmds b/lib/libc/regex/make.cmds new file mode 100644 index 0000000..2f8a0ed --- /dev/null +++ b/lib/libc/regex/make.cmds @@ -0,0 +1,28 @@ +# Commands to create regex.lib, since normal Makefile +# assumes that it will be part of libc. + +# Name of library +set libname=/lib/regex.lib + +# Optimization compile option +set optimize=-O72 + +# Any debug needed?
+set debug="" + +# Macro definitions +# set macros="-DPOSIX_MISTAKE -DREDEBUG -D__NOASM__" +set macros="-DPOSIX_MISTAKE" + +# Places to look for libraries +set inc="-I/usr/include -I/lang/orca/libraries/orcacdefs" + +# Compile the source files +17/occ $optimize $debug -o regcomp.o -c -a0 -i -r -w $inc -v -r $macros regcomp.c +17/occ $optimize $debug -o regerror.o -c -a0 -i -r -w $inc -v -r $macros regerror.c +17/occ $optimize $debug -o regexec.o -c -a0 -i -r -w $inc -v -r $macros regexec.c +17/occ $optimize $debug -o regfree.o -c -a0 -i -r -w $inc -v -r $macros regfree.c + +# Create library +rm $libname +17/makelib -p $libname +regcomp.o +regerror.o +regexec.o +regfree.o diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c index be765be..56c104a 100644 --- a/lib/libc/regex/regcomp.c +++ b/lib/libc/regex/regcomp.c @@ -36,7 +36,7 @@ * * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 * - * $Id: regcomp.c,v 1.2 1997/10/08 07:07:50 gdr Exp $ + * $Id: regcomp.c,v 1.3 1997/11/17 04:13:07 gdr Exp $ */ #ifdef __ORCAC__ @@ -56,6 +56,10 @@ static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; #include #include "utils.h" +#ifdef __ORCAC__ +/* For some reason this macro definition causes a problem after regex2.h */ +#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) +#endif #include "regex2.h" #include "cclass.h" @@ -157,7 +161,9 @@ static char nuls[10]; /* place to point scanner in event of error */ #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) +#ifndef __ORCAC__ #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) +#endif #define HERE() (p->slen) #define THERE() (p->slen - 1) #define THERETHERE() (p->slen - 2) @@ -197,6 +203,10 @@ regcomp(regex_t *preg, # define GOODFLAGS(f) ((f)&~REG_DUMP) #endif +#ifdef REDEBUG + fprintf(stdout, "regcomp: pattern \"%s\"\n", pattern); +#endif + cflags = GOODFLAGS(cflags); if ((cflags&REG_EXTENDED) &&
(cflags&REG_NOSPEC)) return(REG_INVARG); @@ -227,6 +237,7 @@ regcomp(regex_t *preg, p->end = p->next + len; p->error = 0; p->ncsalloc = 0; +#ifndef __ORCAC__ for (i = 0; i < NPAREN; i++) { p->pbegin[i] = 0; p->pend[i] = 0; @@ -246,6 +257,20 @@ regcomp(regex_t *preg, g->categories = &g->catspace[-(CHAR_MIN)]; (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); g->backrefs = 0; +#else + /* Performance tune-up for Apple IIGS: set all 0-valued fields in */ + /* record via memset() and then set only non-0 values. */ + (void) memset((char *)g, 0, + sizeof(struct re_guts)+(NC-1)*sizeof(cat_t)); + p->g = g; + p->next = (char *)pattern; /* convenience; we do not modify it */ + p->end = p->next + len; + (void) memset((char *)p->pbegin, 0, sizeof(sopno)*NPAREN*2); + g->csetsize = NC; + g->cflags = cflags; + g->ncategories = 1; /* category 0 is "everything else" */ + g->categories = &g->catspace[-(CHAR_MIN)]; +#endif /* do it */ EMIT(OEND, 0); @@ -1342,6 +1367,7 @@ isinsets(register struct re_guts *g, { register uch *col; register int i; +#if defined(__NOASM__) || !defined(__ORCAC__) register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; register unsigned uc = (unsigned char)c; @@ -1349,6 +1375,56 @@ isinsets(register struct re_guts *g, if (col[uc] != 0) return(1); return(0); +#else + /* Hand-optimized code for Apple IIGS */ + int ncols; + int setsize; + int rtnval; + + col = g->setbits; + setsize = g->csetsize; + ncols = g->ncsets+(CHAR_BIT-1); +#if CHAR_BIT == 8 + asm{ + lda ncols + lsr A + lsr A + lsr A + sta ncols + } +#else + ncols = ncols / CHAR_BIT; +#endif + asm{ + stz rtnval ; Assume return = 0. + lda ncols ; Count i down from ncols + beq done + sta i ; to 0 to see when done. + lda c ; Y-reg = offset from col + and #0x00FF + tay + + ; Loop through cols + nextcol: + lda [col],y ; Get the byte value. + and #0x00FF + bne nonzero ; Done if != 0. + dec i ; Decrement loop counter. + beq done ; If 0, return 0.
+ lda col ; Increment base pointer + clc + adc setsize + sta col + bcc nextcol + inc col+2 + bra nextcol ; and stay in loop. + + nonzero: ; Non-zero value found: + inc rtnval ; Set return value to 1 + } + done: + return(rtnval); +#endif } /* @@ -1362,6 +1438,7 @@ samesets(register struct re_guts *g, { register uch *col; register int i; +#if defined(__NOASM__) || !defined(__ORCAC__) register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; register unsigned uc1 = (unsigned char)c1; register unsigned uc2 = (unsigned char)c2; @@ -1370,6 +1447,57 @@ samesets(register struct re_guts *g, if (col[uc1] != col[uc2]) return(0); return(1); +#else + /* Hand-optimized code for Apple IIGS */ + int ncols; + int setsize; + int rtnval; + int c1val; + + col = g->setbits; + setsize = g->csetsize; +#if CHAR_BIT == 8 + ncols = (g->ncsets+(CHAR_BIT-1)) >> 3; +#else + ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; +#endif + asm{ + stz rtnval ; Assume return = 0 + lda ncols ; Count i down from ncols + beq notfound + sta i ; to 0 to check when done. + + ; Loop through cols + nextcol: + lda c1 + and #0x00FF + tay + lda [col],y ; Get the 1st value + and #0x00FF + sta c1val + lda c2 + and #0x00FF + tay + lda [col],y ; Get the 2nd value + and #0x00FF + cmp c1val + bne done ; Done if != (return 0) + dec i ; Decrement loop counter + beq notfound ; If done, return 1 + lda col ; Increment base pointer + clc + adc setsize + sta col + bcc nextcol + inc col+2 + bra nextcol ; and stay in loop. 
+ + notfound: + inc rtnval ; rtnval = 1 + } + done: + return(rtnval); +#endif } /* @@ -1435,6 +1563,10 @@ doemit(register struct parse *p, sop op, size_t opnd) { +#ifdef REDEBUG + /* Debug code added by DMT for GNO implementation */ + char *opname; +#endif /* avoid making error situations worse */ if (p->error != 0) return; @@ -1449,6 +1581,81 @@ doemit(register struct parse *p, /* finally, it's all reduced to the easy case */ p->strip[p->slen++] = SOP(op, opnd); + +#ifdef REDEBUG + /* Debug code added by DMT for GNO implementation */ + switch (op) { + case OEND: + opname = "OEND"; + break; + case OCHAR: + opname = "OCHAR"; + break; + case OBOL: + opname = "OBOL"; + break; + case OEOL: + opname = "OEOL"; + break; + case OANY: + opname = "OANY"; + break; + case OANYOF: + opname = "OANYOF"; + break; + case OBACK_: + opname = "OBACK_"; + break; + case O_BACK: + opname = "O_BACK"; + break; + case OPLUS_: + opname = "OPLUS_"; + break; + case O_PLUS: + opname = "O_PLUS"; + break; + case OQUEST_: + opname = "OQUEST_"; + break; + case O_QUEST: + opname = "O_QUEST"; + break; + case OLPAREN: + opname = "OLPAREN"; + break; + case ORPAREN: + opname = "ORPAREN"; + break; + case OCH_: + opname = "OCH_"; + break; + case OOR1: + opname = "OOR1"; + break; + case OOR2: + opname = "OOR2"; + break; + case O_CH: + opname = "O_CH"; + break; + case OBOW: + opname = "OBOW"; + break; + case OEOW: + opname = "OEOW"; + break; + default: /* uh oh */ + opname = "unknown"; + break; + } + + fprintf(stdout, "emit %2ld: %s %lu", p->slen-1,opname,opnd); + if (op == OCHAR && opnd>=32 && opnd<=126) { + fprintf(stdout, " \"%c\"", (char)opnd); + } + fprintf(stdout, "\n"); +#endif } /* diff --git a/lib/libc/regex/regexec.c b/lib/libc/regex/regexec.c index 01a4708..11b4570 100644 --- a/lib/libc/regex/regexec.c +++ b/lib/libc/regex/regexec.c @@ -36,7 +36,7 @@ * * @(#)regexec.c 8.3 (Berkeley) 3/20/94 * - * $Id: regexec.c,v 1.2 1997/10/08 07:07:50 gdr Exp $ + * $Id: regexec.c,v 1.3 1997/11/17 04:13:08 gdr 
Exp $ */ #ifdef __ORCAC__ @@ -71,24 +71,41 @@ static int nope = 0; /* for use in asserts; shuts lint up */ #define states long #define states1 states /* for later use in regexec() decision */ #define CLEAR(v) ((v) = 0) +#ifndef __ORCAC__ #define SET0(v, n) ((v) &= ~(1 << (n))) #define SET1(v, n) ((v) |= 1 << (n)) #define ISSET(v, n) ((v) & (1 << (n))) +#else /* ORCA/C doesn't use 32 bits by default */ +#define SET0(v, n) ((v) &= ~((unsigned long)1 << (n))) +#define SET1(v, n) ((v) |= (unsigned long)1 << (n)) +#define ISSET(v, n) ((v) & ((unsigned long)1 << (n))) +#endif #define ASSIGN(d, s) ((d) = (s)) #define EQ(a, b) ((a) == (b)) #define STATEVARS int dummy /* dummy version */ #define STATESETUP(m, n) /* nothing */ #define STATETEARDOWN(m) /* nothing */ #define SETUP(v) ((v) = 0) +#ifndef __ORCAC__ #define onestate int #define INIT(o, n) ((o) = (unsigned)1 << (n)) +#else /* ORCA/C doesn't use 32 bits by default */ +#define onestate long +#define INIT(o, n) ((o) = (unsigned long)1 << (n)) +#endif #define INC(o) ((o) <<= 1) #define ISSTATEIN(v, o) ((v) & (o)) /* some abbreviations; note that some of these know variable names! 
*/ /* do "if I'm here, I can also be there" etc without branches */ +#ifndef __ORCAC__ #define FWD(dst, src, n) ((dst) |= ((unsigned)(src)&(here)) << (n)) #define BACK(dst, src, n) ((dst) |= ((unsigned)(src)&(here)) >> (n)) #define ISSETBACK(v, n) ((v) & ((unsigned)here >> (n))) +#else /* ORCA/C doesn't use 32 bits by default */ +#define FWD(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) << (n)) +#define BACK(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) >> (n)) +#define ISSETBACK(v, n) ((v) & ((unsigned long)(here) >> (n))) +#endif /* function names */ #define SNAMES /* engine.c looks after details */ @@ -129,7 +146,11 @@ static int nope = 0; /* for use in asserts; shuts lint up */ (m)->vn = 0; } #define STATETEARDOWN(m) { free((m)->space); } #define SETUP(v) ((v) = &m->space[m->vn++ * m->g->nstates]) +#ifndef __ORCAC__ #define onestate int +#else /* ORCA/C doesn't use 32 bits by default */ +#define onestate long +#endif #define INIT(o, n) ((o) = (n)) #define INC(o) ((o)++) #define ISSTATEIN(v, o) ((v)[o]) @@ -172,6 +193,10 @@ regexec(const regex_t *preg, # define GOODFLAGS(f) ((f)&(REG_NOTBOL|REG_NOTEOL|REG_STARTEND)) #endif +#ifdef REDEBUG + fprintf(stdout, "regexec: string \"%s\"\n", string); +#endif + if (preg->re_magic != MAGIC1 || g->magic != MAGIC2) return(REG_BADPAT); assert(!(g->iflags&BAD)); @@ -179,6 +204,9 @@ regexec(const regex_t *preg, return(REG_BADPAT); eflags = GOODFLAGS(eflags); + /* Special...for debugging turn on register tracing */ + eflags |= REG_TRACE; + if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags&REG_LARGE)) return(smatcher(g, (char *)string, nmatch, pmatch, eflags)); else