These are Dave Tribby's changes that were necessary to make

sed (which uses these routines) work correctly.  See the file
GNO.notes for more details.
This commit is contained in:
gdr 1997-11-17 04:13:08 +00:00
parent a3f1e8a401
commit 8590ffb646
6 changed files with 346 additions and 5 deletions

71
lib/libc/regex/GNO.notes Normal file
View File

@ -0,0 +1,71 @@
Notes on the porting of regex to GNO
Dave Tribby * November 7, 1997
Devin Reade did the initial conversion of the BSD sources to compile
under GNO 2.0.6 headers with ORCA/C on the Apple IIGS.
I completed the porting to the extent that the program sed (which
uses regex) works for many different test cases.
The most time-consuming aspect of the port was finding all the places
where long integers should be used instead of int or unsigned int.
DEBUGGING
The file engine.c is used in an interesting way by regexec.c:
it includes engine.c *twice*, after muchos fiddling with the
macros that code uses. This lets the same code operate on two
different representations for state sets.
When it was necessary to do source-level debugging, Splat! got
very confused about line numbers until I added the following at
the beginning of the file:
#ifdef __ORCAC__
#line 2 "engine.c"
#endif
The regex code had a compilation macro, REDEBUG, that would
turn on diagnostic messages when defined. I added additional
output statements that are turned on by REDEBUG.
PERFORMANCE ENHANCEMENT
The program sed took a long time to execute a complicated sed
program (provided with the BSD source code) that evaluates
expressions. I attempted to speed up the regex routines by
recoding two routines in assembly language (using the asm {}
construct available in ORCA/C). The two routines are
isinsets() and samesets() in regcomp.c. The C code was left
in place and can be turned on by compiling with the macro
__NOASM__ set.
I also noticed an instance in regcomp.c where many fields of
a structure were individually set to 0 at initialization. I
recoded to set all the fields to 0 by a call to memset() and
then set only the non-zero fields individually.
The following script was used to time the results:
cd /src/gno/usr.bin/sed
echo '(4+4)*3' | time ./sed -f tests/math.sed
The unmodified code took 51 seconds to run on my 8MHz Apple IIGS.
It took 49 seconds after I recoded isinsets() in asm, and 48 seconds
after I recoded samesets(). Initializing fields via memset() brought
it down to 47 seconds.
I saw no other obvious candidates for recoding. The modest results
from these changes (even though they were made in routines that
ranked high in the profile) did not warrant further efforts.
BUILDING
Because I did not rebuild all of libc, I created a library called
regex that only includes the regex routines. The commands I used
to build regex are included in the file make.cmds. I will leave it
to Devin to incorporate regex into the full libc build structure.

View File

@ -1,5 +1,5 @@
#
# $Id: Makefile,v 1.1 1997/10/08 07:07:49 gdr Exp $
# $Id: Makefile,v 1.2 1997/11/17 04:13:07 gdr Exp $
#
# Devin Reade, October 1997.
#
@ -11,7 +11,7 @@ OBJS = regcomp.o regerror.o regexec.o regfree.o
# I've not bothered to determine what POSIX_MISTAKE means, but 4.4BSD
# uses it.
CFLAGS += -DPOSIX_MISTAKE
CFLAGS += -v -r -DPOSIX_MISTAKE
build .PHONY: ../libc
obj: $(OBJS)

View File

@ -1,3 +1,6 @@
#ifdef __ORCAC__ /* For debugger since engine.c is included by regexec.c */
#line 2 "engine.c"
#endif
/*-
* Copyright (c) 1992, 1993, 1994 Henry Spencer.
* Copyright (c) 1992, 1993, 1994
@ -36,7 +39,7 @@
*
* @(#)engine.c 8.5 (Berkeley) 3/20/94
*
* $Id: engine.c,v 1.2 1997/10/08 07:07:50 gdr Exp $
* $Id: engine.c,v 1.3 1997/11/17 04:13:07 gdr Exp $
*/
#ifdef __ORCAC__
@ -898,7 +901,11 @@ step(register struct re_guts *g,
register sopno pc;
register onestate here; /* note, macros know this name */
register sopno look;
#ifndef __ORCAC__
register int i;
#else
unsigned long i;
#endif
for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
s = g->strip[pc];

28
lib/libc/regex/make.cmds Normal file
View File

@ -0,0 +1,28 @@
# Commands to create regex.lib, since normal Makefile
# assumes that it will be part of libc.
#
# Builds a stand-alone library containing only the four regex
# object files (regcomp, regerror, regexec, regfree).
# Name of library
set libname=/lib/regex.lib
# Optimization compile option
set optimize=-O72
# Any debug needed?  (empty = no extra debug options passed to occ)
set debug=""
# Macro definitions
# Alternative setting: also define REDEBUG (diagnostic output) and
# __NOASM__ (use the C versions of the routines recoded in asm):
# set macros="-DPOSIX_MISTAKE -DREDEBUG -D__NOASM__"
set macros="-DPOSIX_MISTAKE"
# Places to look for include (header) files (-I directories)
set inc="-I/usr/include -I/lang/orca/libraries/orcacdefs"
# Compile the source files (one object file per source file)
17/occ $optimize $debug -o regcomp.o -c -a0 -i -r -w $inc -v -r $macros regcomp.c
17/occ $optimize $debug -o regerror.o -c -a0 -i -r -w $inc -v -r $macros regerror.c
17/occ $optimize $debug -o regexec.o -c -a0 -i -r -w $inc -v -r $macros regexec.c
17/occ $optimize $debug -o regfree.o -c -a0 -i -r -w $inc -v -r $macros regfree.c
# Create library
# Remove any previous copy of the library before rebuilding it.
rm $libname
# Add the four object files to the library named by $libname.
17/makelib -p $libname +regcomp.o +regerror.o +regexec.o +regfree.o

View File

@ -36,7 +36,7 @@
*
* @(#)regcomp.c 8.5 (Berkeley) 3/20/94
*
* $Id: regcomp.c,v 1.2 1997/10/08 07:07:50 gdr Exp $
* $Id: regcomp.c,v 1.3 1997/11/17 04:13:07 gdr Exp $
*/
#ifdef __ORCAC__
@ -56,6 +56,10 @@ static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
#include <regex.h>
#include "utils.h"
#ifdef __ORCAC__
/* For some reason this macro definition causes a problem after regex2.h */
#define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
#endif
#include "regex2.h"
#include "cclass.h"
@ -157,7 +161,9 @@ static char nuls[10]; /* place to point scanner in event of error */
#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
#define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
#ifndef __ORCAC__
#define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
#endif
#define HERE() (p->slen)
#define THERE() (p->slen - 1)
#define THERETHERE() (p->slen - 2)
@ -197,6 +203,10 @@ regcomp(regex_t *preg,
# define GOODFLAGS(f) ((f)&~REG_DUMP)
#endif
#ifdef REDEBUG
fprintf(stdout, "regcomp: pattern \"%s\"\n", pattern);
#endif
cflags = GOODFLAGS(cflags);
if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
return(REG_INVARG);
@ -227,6 +237,7 @@ regcomp(regex_t *preg,
p->end = p->next + len;
p->error = 0;
p->ncsalloc = 0;
#ifndef __ORCAC__
for (i = 0; i < NPAREN; i++) {
p->pbegin[i] = 0;
p->pend[i] = 0;
@ -246,6 +257,20 @@ regcomp(regex_t *preg,
g->categories = &g->catspace[-(CHAR_MIN)];
(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
g->backrefs = 0;
#else
/* Performance tune-up for Apple IIGS: set all 0-valued fields in */
/* record via memset() and then set only non-0 values. */
(void) memset((char *)g, 0,
sizeof(struct re_guts)+(NC-1)*sizeof(cat_t));
p->g = g;
p->next = (char *)pattern; /* convenience; we do not modify it */
p->end = p->next + len;
(void) memset((char *)p->pbegin, 0, sizeof(sopno)*NPAREN*2);
g->csetsize = NC;
g->cflags = cflags;
g->ncategories = 1; /* category 0 is "everything else" */
g->categories = &g->catspace[-(CHAR_MIN)];
#endif
/* do it */
EMIT(OEND, 0);
@ -1342,6 +1367,7 @@ isinsets(register struct re_guts *g,
{
register uch *col;
register int i;
#if defined(__NOASM__) || !defined(__ORCAC__)
register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
register unsigned uc = (unsigned char)c;
@ -1349,6 +1375,56 @@ isinsets(register struct re_guts *g,
if (col[uc] != 0)
return(1);
return(0);
#else
/* Hand-optimized code for Apple IIGS */
int ncols;
int setsize;
int rtnval;
col = g->setbits;
setsize = g->csetsize;
ncols = g->ncsets+(CHAR_BIT-1);
#if CHAR_BIT == 8
asm{
lda ncols
lsr A
lsr A
lsr A
sta ncols
}
#else
ncols = ncols / CHAR_BIT;
#endif
asm{
stz rtnval ; Assume return = 0.
lda ncols ; Count i down from ncols
beq done
sta i ; to 0 to see when done.
lda c ; Y-reg = offset from col
and #0x00FF
tay
; Loop through cols
nextcol:
lda [col],y ; Get the byte value.
and #0x00FF
bne nonzero ; Done if != 0.
dec i ; Decrement loop counter.
beq done ; If 0, return 0.
lda col ; Increment base pointer
clc
adc setsize
sta col
bcc nextcol
inc col+2
bra nextcol ; and stay in loop.
nonzero: ; Non-zero value found:
inc rtnval ; Set return value to 1
}
done:
return(rtnval);
#endif
}
/*
@ -1362,6 +1438,7 @@ samesets(register struct re_guts *g,
{
register uch *col;
register int i;
#if defined(__NOASM__) || !defined(__ORCAC__)
register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
register unsigned uc1 = (unsigned char)c1;
register unsigned uc2 = (unsigned char)c2;
@ -1370,6 +1447,57 @@ samesets(register struct re_guts *g,
if (col[uc1] != col[uc2])
return(0);
return(1);
#else
/* Hand-optimized code for Apple IIGS */
int ncols;
int setsize;
int rtnval;
int c1val;
col = g->setbits;
setsize = g->csetsize;
#if CHAR_BIT == 8
ncols = (g->ncsets+(CHAR_BIT-1)) >> 3;
#else
ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
#endif
asm{
stz rtnval ; Assume return = 0
lda ncols ; Count i down from ncols
beq notfound
sta i ; to 0 to check when done.
; Loop through cols
nextcol:
lda c1
and #0x00FF
tay
lda [col],y ; Get the 1st value
and #0x00FF
sta c1val
lda c2
and #0x00FF
tay
lda [col],y ; Get the 2nd value
and #0x00FF
cmp c1val
bne done ; Done if != (return 0)
dec i ; Decrement loop counter
beq notfound ; If done, return 1
lda col ; Increment base pointer
clc
adc setsize
sta col
bcc nextcol
inc col+2
bra nextcol ; and stay in loop.
notfound:
inc rtnval ; rtnval = 1
}
done:
return(rtnval);
#endif
}
/*
@ -1435,6 +1563,10 @@ doemit(register struct parse *p,
sop op,
size_t opnd)
{
#ifdef REDEBUG
/* Debug code added by DMT for GNO implementation */
char *opname;
#endif
/* avoid making error situations worse */
if (p->error != 0)
return;
@ -1449,6 +1581,81 @@ doemit(register struct parse *p,
/* finally, it's all reduced to the easy case */
p->strip[p->slen++] = SOP(op, opnd);
#ifdef REDEBUG
/* Debug code added by DMT for GNO implementation */
switch (op) {
case OEND:
opname = "OEND";
break;
case OCHAR:
opname = "OCHAR";
break;
case OBOL:
opname = "OBOL";
break;
case OEOL:
opname = "OEOL";
break;
case OANY:
opname = "OANY";
break;
case OANYOF:
opname = "OANYOF";
break;
case OBACK_:
opname = "OBACK_";
break;
case O_BACK:
opname = "O_BACK";
break;
case OPLUS_:
opname = "OPLUS_";
break;
case O_PLUS:
opname = "O_PLUS";
break;
case OQUEST_:
opname = "OQUEST_";
break;
case O_QUEST:
opname = "O_QUEST";
break;
case OLPAREN:
opname = "OLPAREN";
break;
case ORPAREN:
opname = "ORPAREN";
break;
case OCH_:
opname = "OCH_";
break;
case OOR1:
opname = "OOR1";
break;
case OOR2:
opname = "OOR2";
break;
case O_CH:
opname = "O_CH";
break;
case OBOW:
opname = "OBOW";
break;
case OEOW:
opname = "OEOW";
break;
default: /* uh oh */
opname = "unknown";
break;
}
fprintf(stdout, "emit %2ld: %s %lu", p->slen-1,opname,opnd);
if (op == OCHAR && opnd>=32 && opnd<=126) {
fprintf(stdout, " \"%c\"", (char)opnd);
}
fprintf(stdout, "\n");
#endif
}
/*

View File

@ -36,7 +36,7 @@
*
* @(#)regexec.c 8.3 (Berkeley) 3/20/94
*
* $Id: regexec.c,v 1.2 1997/10/08 07:07:50 gdr Exp $
* $Id: regexec.c,v 1.3 1997/11/17 04:13:08 gdr Exp $
*/
#ifdef __ORCAC__
@ -71,24 +71,41 @@ static int nope = 0; /* for use in asserts; shuts lint up */
#define states long
#define states1 states /* for later use in regexec() decision */
#define CLEAR(v) ((v) = 0)
#ifndef __ORCAC__
#define SET0(v, n) ((v) &= ~(1 << (n)))
#define SET1(v, n) ((v) |= 1 << (n))
#define ISSET(v, n) ((v) & (1 << (n)))
#else /* ORCA/C doesn't use 32 bits by default */
#define SET0(v, n) ((v) &= ~((unsigned long)1 << (n)))
#define SET1(v, n) ((v) |= (unsigned long)1 << (n))
#define ISSET(v, n) ((v) & ((unsigned long)1 << (n)))
#endif
#define ASSIGN(d, s) ((d) = (s))
#define EQ(a, b) ((a) == (b))
#define STATEVARS int dummy /* dummy version */
#define STATESETUP(m, n) /* nothing */
#define STATETEARDOWN(m) /* nothing */
#define SETUP(v) ((v) = 0)
#ifndef __ORCAC__
#define onestate int
#define INIT(o, n) ((o) = (unsigned)1 << (n))
#else /* ORCA/C doesn't use 32 bits by default */
#define onestate long
#define INIT(o, n) ((o) = (unsigned long)1 << (n))
#endif
#define INC(o) ((o) <<= 1)
#define ISSTATEIN(v, o) ((v) & (o))
/* some abbreviations; note that some of these know variable names! */
/* do "if I'm here, I can also be there" etc without branches */
#ifndef __ORCAC__
#define FWD(dst, src, n) ((dst) |= ((unsigned)(src)&(here)) << (n))
#define BACK(dst, src, n) ((dst) |= ((unsigned)(src)&(here)) >> (n))
#define ISSETBACK(v, n) ((v) & ((unsigned)here >> (n)))
#else /* ORCA/C doesn't use 32 bits by default */
#define FWD(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) << (n))
#define BACK(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) >> (n))
#define ISSETBACK(v, n) ((v) & ((unsigned long)(here) >> (n)))
#endif
/* function names */
#define SNAMES /* engine.c looks after details */
@ -129,7 +146,11 @@ static int nope = 0; /* for use in asserts; shuts lint up */
(m)->vn = 0; }
#define STATETEARDOWN(m) { free((m)->space); }
#define SETUP(v) ((v) = &m->space[m->vn++ * m->g->nstates])
#ifndef __ORCAC__
#define onestate int
#else /* ORCA/C doesn't use 32 bits by default */
#define onestate long
#endif
#define INIT(o, n) ((o) = (n))
#define INC(o) ((o)++)
#define ISSTATEIN(v, o) ((v)[o])
@ -172,6 +193,10 @@ regexec(const regex_t *preg,
# define GOODFLAGS(f) ((f)&(REG_NOTBOL|REG_NOTEOL|REG_STARTEND))
#endif
#ifdef REDEBUG
fprintf(stdout, "regexec: string \"%s\"\n", string);
#endif
if (preg->re_magic != MAGIC1 || g->magic != MAGIC2)
return(REG_BADPAT);
assert(!(g->iflags&BAD));
@ -179,6 +204,9 @@ regexec(const regex_t *preg,
return(REG_BADPAT);
eflags = GOODFLAGS(eflags);
/* Special...for debugging turn on register tracing */
eflags |= REG_TRACE;
if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags&REG_LARGE))
return(smatcher(g, (char *)string, nmatch, pmatch, eflags));
else