From 23e5a88e4d97152e926f1a12f0874c0989453f8b Mon Sep 17 00:00:00 2001 From: Andy McFadden Date: Mon, 10 Mar 2003 23:05:48 +0000 Subject: [PATCH] Added support for automatic high-ASCII text stripping. The behavior is activated by a feature flag (default off), and only kicks in when EOL conversion is on for the file in question. --- nufxlib-0/Archive.c | 1 + nufxlib-0/ChangeLog.txt | 3 + nufxlib-0/Funnel.c | 144 +++++++++++++++++++++++++++++++--------- nufxlib-0/NufxLib.h | 3 +- nufxlib-0/NufxLibPriv.h | 7 +- nufxlib-0/Value.c | 11 +++ 6 files changed, 135 insertions(+), 34 deletions(-) diff --git a/nufxlib-0/Archive.c b/nufxlib-0/Archive.c index 0e273e0..6c7d316 100644 --- a/nufxlib-0/Archive.c +++ b/nufxlib-0/Archive.c @@ -92,6 +92,7 @@ Nu_NuArchiveNew(NuArchive** ppArchive) (*ppArchive)->valModifyOrig = false; (*ppArchive)->valMimicSHK = false; (*ppArchive)->valMaskDataless = false; + (*ppArchive)->valStripHighASCII = false; (*ppArchive)->messageHandlerFunc = gNuGlobalErrorMessageHandler; diff --git a/nufxlib-0/ChangeLog.txt b/nufxlib-0/ChangeLog.txt index 38f5585..8e37ca6 100644 --- a/nufxlib-0/ChangeLog.txt +++ b/nufxlib-0/ChangeLog.txt @@ -1,3 +1,6 @@ +2003/03/10 fadden + - Added support for automatic high-ASCII text stripping. + 2003/02/23 fadden - Added test-twirl to samples. diff --git a/nufxlib-0/Funnel.c b/nufxlib-0/Funnel.c index 476410b..3ee4ca5 100644 --- a/nufxlib-0/Funnel.c +++ b/nufxlib-0/Funnel.c @@ -255,8 +255,14 @@ Nu_FunnelNew(NuArchive* pArchive, NuDataSink* pDataSink, NuValue convertEOL, pFunnel->pDataSink = pDataSink; pFunnel->convertEOL = convertEOL; pFunnel->convertEOLTo = convertEOLTo; + pFunnel->convertEOLFrom = kNuEOLUnknown; pFunnel->pProgress = pProgress; + pFunnel->checkStripHighASCII = pArchive->valStripHighASCII; + pFunnel->doStripHighASCII = false; /* determined on first write */ + + pFunnel->isFirstWrite = true; + bail: if (err != kNuErrNone) Nu_FunnelFree(pArchive, pFunnel); @@ -313,6 +319,34 @@ Nu_FunnelSetMaxOutput(NuFunnel* pFunnel, ulong maxBytes) #endif +/* + * Check to see if this is a high-ASCII file. To qualify, EVERY + * character must have its high bit set, except for spaces (0x20). + * (The exception is courtesy Glen Bredon's "Merlin".) + */ +static Boolean +Nu_CheckHighASCII(const NuFunnel* pFunnel, const unsigned char* buffer, + unsigned long count) +{ + Boolean isHighASCII; + + Assert(buffer != nil); + Assert(count != 0); + Assert(pFunnel->checkStripHighASCII); + + isHighASCII = true; + while (count--) { + if ((*buffer & 0x80) == 0 && *buffer != 0x20) { + isHighASCII = false; + break; + } + + buffer++; + } + + return isHighASCII; +} + /* * Table determining what's a binary character and what isn't. It would * possibly be more compact to generate this from a simple description, @@ -323,7 +357,8 @@ Nu_FunnelSetMaxOutput(NuFunnel* pFunnel, ulong maxBytes) * may be too loose by itself; we may want to require that the lower-ASCII * values appear in higher proportions than the upper-ASCII values. * Otherwise we run the risk of converting a binary file with specific - * properties. + * properties. (Note that "upper-ASCII" refers to umlauts and other + * accented characters, not DOS 3.3 "high ASCII".) * * The auto-detect mechanism will never be perfect though, so there's not * much point in tweaking it to death. @@ -347,7 +382,7 @@ static const char gNuIsBinary[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xf0 */ }; -#define kNuMaxHighASCII 1 /* max #of binary chars per 100 bytes */ +#define kNuMaxUpperASCII 1 /* max #of binary chars per 100 bytes */ #define kNuMinConvThreshold 40 /* min of 40 chars for auto-detect */ /* * Decide, based on the contents of the buffer, whether we should do an @@ -358,25 +393,43 @@ static const char gNuIsBinary[256] = { * * If we don't have enough data to make a determination, don't mess with it. * (Thought for the day: add a "bias" flag, based on the NuRecord fileType, - * that causes us to handle borderline cases more reasonably. If it's of - * type TXT, it's probably text.) + * that causes us to handle borderline or sub-min-threshold cases more + * reasonably. If it's of type TXT, it's probably text.) * * We try to figure out whether it's CR, LF, or CRLF, so that we can * skip the CPU-intensive conversion process if it isn't necessary. + * + * We will also enable a "high-ASCII" stripper if requested. This is + * only enabled when EOL conversions are enabled. */ static NuValue Nu_DetermineConversion(NuFunnel* pFunnel, const uchar* buffer, ulong count) { ulong bufCount, numBinary, numLF, numCR; + Boolean isHighASCII; uchar val; if (count < kNuMinConvThreshold) return kNuConvertOff; + /* + * Check to see if the buffer is all high-ASCII characters. If it is, + * we want to strip characters before we test them below. + */ + if (pFunnel->checkStripHighASCII) { + isHighASCII = Nu_CheckHighASCII(pFunnel, buffer, count); + DBUG(("+++ determined isHighASCII=%d\n", isHighASCII)); + } else { + isHighASCII = false; + DBUG(("+++ not even checking isHighASCII\n")); + } + bufCount = count; numBinary = numLF = numCR = 0; while (bufCount--) { val = *buffer++; + if (isHighASCII) + val &= 0x7f; if (gNuIsBinary[val]) numBinary++; if (val == kNuCharLF) @@ -388,9 +441,9 @@ Nu_DetermineConversion(NuFunnel* pFunnel, const uchar* buffer, ulong count) /* if #found is > #allowed, it's a binary file */ if (count < 100) { /* use simplified check on files between kNuMinConvThreshold and 100 */ - if (numBinary > kNuMaxHighASCII) + if (numBinary > kNuMaxUpperASCII) return kNuConvertOff; - } else if (numBinary > (count / 100) * kNuMaxHighASCII) + } else if (numBinary > (count / 100) * kNuMaxUpperASCII) return kNuConvertOff; /* @@ -402,20 +455,25 @@ Nu_DetermineConversion(NuFunnel* pFunnel, const uchar* buffer, ulong count) * and they just happen to be in equal amounts, but it's not clear * to me that an automatic EOL conversion makes sense on that sort * of file anyway. + * + * None of this applies if we also need to do a high-ASCII conversion. */ - if (numLF && !numCR) - pFunnel->convertEOLFrom = kNuEOLLF; - else if (!numLF && numCR) - pFunnel->convertEOLFrom = kNuEOLCR; - else if (numLF && numLF == numCR) - pFunnel->convertEOLFrom = kNuEOLCRLF; - else - pFunnel->convertEOLFrom = kNuEOLUnknown; + if (isHighASCII) { + pFunnel->doStripHighASCII = true; + } else { + if (numLF && !numCR) + pFunnel->convertEOLFrom = kNuEOLLF; + else if (!numLF && numCR) + pFunnel->convertEOLFrom = kNuEOLCR; + else if (numLF && numLF == numCR) + pFunnel->convertEOLFrom = kNuEOLCRLF; + else + pFunnel->convertEOLFrom = kNuEOLUnknown; + } return kNuConvertOn; } - /* * Write a block of data to the appropriate output device. Test for * excessive data, and raise "outMaxExceeded" if we overrun. @@ -488,53 +546,75 @@ Nu_FunnelWriteConvert(NuFunnel* pFunnel, const uchar* buffer, ulong count) /*if (pFunnel->outMaxExceeded) return kNuErrOutMax;*/ - if (pFunnel->convertEOL == kNuConvertAuto) { + if (pFunnel->isFirstWrite) { /* * This is the first write/flush we've done on this Funnel. * Check the data we have buffered to decide whether or not - * we want to convert this. + * we want to do text conversions. */ - pFunnel->convertEOL = Nu_DetermineConversion(pFunnel, buffer, count); - DBUG(("+++ DetermineConversion --> %ld / %ld\n", pFunnel->convertEOL, - pFunnel->convertEOLFrom)); + if (pFunnel->convertEOL == kNuConvertAuto) { + pFunnel->convertEOL = Nu_DetermineConversion(pFunnel, buffer,count); + DBUG(("+++ DetermineConversion --> %ld / %ld (%d)\n", + pFunnel->convertEOL, pFunnel->convertEOLFrom, + pFunnel->doStripHighASCII)); - if (pFunnel->convertEOLFrom == pFunnel->convertEOLTo) { - DBUG(("+++ Switching redundant converter off\n")); - pFunnel->convertEOL = kNuConvertOff; + if (pFunnel->convertEOLFrom == pFunnel->convertEOLTo) { + DBUG(("+++ Switching redundant converter off\n")); + pFunnel->convertEOL = kNuConvertOff; + } + /* put it where the progress meter can see it */ + if (pFunnel->pProgress != nil) + pFunnel->pProgress->expand.convertEOL = pFunnel->convertEOL; + } else if (pFunnel->convertEOL == kNuConvertOn) { + if (pFunnel->checkStripHighASCII) { + /* assume this part of the buffer is representative */ + pFunnel->doStripHighASCII = Nu_CheckHighASCII(pFunnel, + buffer, count); + } else { + Assert(!pFunnel->doStripHighASCII); + } + DBUG(("+++ Converter is on, convHighASCII=%d\n", + pFunnel->doStripHighASCII)); } - /* put it where the progress meter can see it */ - if (pFunnel->pProgress != nil) - pFunnel->pProgress->expand.convertEOL = pFunnel->convertEOL; } + Assert(pFunnel->convertEOL != kNuConvertAuto); /* on or off now */ + pFunnel->isFirstWrite = false; if (pFunnel->convertEOL == kNuConvertOff) { /* write it straight */ Nu_FunnelPutBlock(pFunnel, buffer, count); } else { - /* do the LF conversion */ - Boolean lastCR = pFunnel->lastCR; /* local copy */ + /* do the EOL conversion and optional high-bit stripping */ + Boolean lastCR = pFunnel->lastCR; /* make local copy */ uchar uch; + int mask; + + if (pFunnel->doStripHighASCII) + mask = 0x7f; + else + mask = 0xff; /* * We could get a significant speed improvement here by writing * non-EOL chars as a larger block instead of single bytes. */ while (count--) { - if (*buffer == kNuCharCR) { + uch = (*buffer) & mask; + + if (uch == kNuCharCR) { Nu_PutEOL(pFunnel); lastCR = true; - } else if (*buffer == kNuCharLF) { + } else if (uch == kNuCharLF) { if (!lastCR) Nu_PutEOL(pFunnel); lastCR = false; } else { - uch = *buffer; Nu_FunnelPutBlock(pFunnel, &uch, 1); lastCR = false; } buffer++; } - pFunnel->lastCR = lastCR; + pFunnel->lastCR = lastCR; /* save copy */ } diff --git a/nufxlib-0/NufxLib.h b/nufxlib-0/NufxLib.h index c1e3820..331fcae 100644 --- a/nufxlib-0/NufxLib.h +++ b/nufxlib-0/NufxLib.h @@ -267,7 +267,8 @@ typedef enum NuValueID { kNuValueHandleExisting = 8, kNuValueModifyOrig = 9, kNuValueMimicSHK = 10, - kNuValueMaskDataless = 11 + kNuValueMaskDataless = 11, + kNuValueStripHighASCII = 12 } NuValueID; typedef unsigned long NuValue; diff --git a/nufxlib-0/NufxLibPriv.h b/nufxlib-0/NufxLibPriv.h index 52bb4b2..60df09c 100644 --- a/nufxlib-0/NufxLibPriv.h +++ b/nufxlib-0/NufxLibPriv.h @@ -151,6 +151,7 @@ struct NuArchive { NuValue valMimicSHK; /* mimic some ShrinkIt quirks */ NuValue valModifyOrig; /* modify original arc in place? */ NuValue valOnlyUpdateOlder; /* modify original arc in place? */ + NuValue valStripHighASCII; /* during EOL conv, strip hi bit? */ /* callback functions */ NuCallback selectionFilterFunc; @@ -252,12 +253,16 @@ typedef struct NuFunnel { uchar* buffer; /* kNuFunnelBufSize worth of storage */ long bufCount; /* #of bytes in buffer */ - /* EOL conversion; if "auto", on first flush we convert to "on" or "off" */ + /* text conversion; if "auto", on first flush we convert to "on" or "off" */ NuValue convertEOL; /* on/off/auto */ NuValue convertEOLTo; /* EOL to switch to */ NuValue convertEOLFrom; /* EOL terminator we think we found */ + Boolean checkStripHighASCII; /* do we want to check for it? */ + Boolean doStripHighASCII; /* strip high ASCII during EOL conv */ Boolean lastCR; /* was last char a CR? */ + Boolean isFirstWrite; /* cleared on first write */ + #if 0 ulong inCount; /* total #of bytes in the top */ ulong outCount; /* total #of bytes out the bottom */ diff --git a/nufxlib-0/Value.c b/nufxlib-0/Value.c index c7a7549..7f56b65 100644 --- a/nufxlib-0/Value.c +++ b/nufxlib-0/Value.c @@ -54,6 +54,9 @@ Nu_GetValue(NuArchive* pArchive, NuValueID ident, NuValue* pValue) case kNuValueOnlyUpdateOlder: *pValue = pArchive->valOnlyUpdateOlder; break; + case kNuValueStripHighASCII: + *pValue = pArchive->valStripHighASCII; + break; default: err = kNuErrInvalidArg; Nu_ReportError(NU_BLOB, err, "Unknown ValueID %d requested", ident); @@ -162,6 +165,14 @@ Nu_SetValue(NuArchive* pArchive, NuValueID ident, NuValue value) } pArchive->valOnlyUpdateOlder = value; break; + case kNuValueStripHighASCII: + if (value != true && value != false) { + Nu_ReportError(NU_BLOB, err, + "Invalid kNuStripHighASCII value %ld", value); + goto bail; + } + pArchive->valStripHighASCII = value; + break; default: Nu_ReportError(NU_BLOB, err, "Unknown ValueID %d requested", ident); goto bail;