Added support for automatic high-ASCII text stripping. The behavior

is activated by a feature flag (default off), and only kicks in when
EOL conversion is on for the file in question.
This commit is contained in:
Andy McFadden 2003-03-10 23:05:48 +00:00
parent 4c253ada9f
commit 23e5a88e4d
6 changed files with 135 additions and 34 deletions

View File

@ -92,6 +92,7 @@ Nu_NuArchiveNew(NuArchive** ppArchive)
(*ppArchive)->valModifyOrig = false;
(*ppArchive)->valMimicSHK = false;
(*ppArchive)->valMaskDataless = false;
(*ppArchive)->valStripHighASCII = false;
(*ppArchive)->messageHandlerFunc = gNuGlobalErrorMessageHandler;

View File

@ -1,3 +1,6 @@
2003/03/10 fadden
- Added support for automatic high-ASCII text stripping.
2003/02/23 fadden
- Added test-twirl to samples.

View File

@ -255,8 +255,14 @@ Nu_FunnelNew(NuArchive* pArchive, NuDataSink* pDataSink, NuValue convertEOL,
pFunnel->pDataSink = pDataSink;
pFunnel->convertEOL = convertEOL;
pFunnel->convertEOLTo = convertEOLTo;
pFunnel->convertEOLFrom = kNuEOLUnknown;
pFunnel->pProgress = pProgress;
pFunnel->checkStripHighASCII = pArchive->valStripHighASCII;
pFunnel->doStripHighASCII = false; /* determined on first write */
pFunnel->isFirstWrite = true;
bail:
if (err != kNuErrNone)
Nu_FunnelFree(pArchive, pFunnel);
@ -313,6 +319,34 @@ Nu_FunnelSetMaxOutput(NuFunnel* pFunnel, ulong maxBytes)
#endif
/*
* Check to see if this is a high-ASCII file. To qualify, EVERY
* character must have its high bit set, except for spaces (0x20).
* (The exception is courtesy Glen Bredon's "Merlin".)
*/
static Boolean
Nu_CheckHighASCII(const NuFunnel* pFunnel, const unsigned char* buffer,
unsigned long count)
{
Boolean isHighASCII;
Assert(buffer != nil);
Assert(count != 0);
Assert(pFunnel->checkStripHighASCII);
isHighASCII = true;
while (count--) {
if ((*buffer & 0x80) == 0 && *buffer != 0x20) {
isHighASCII = false;
break;
}
buffer++;
}
return isHighASCII;
}
/*
* Table determining what's a binary character and what isn't. It would
* possibly be more compact to generate this from a simple description,
@ -323,7 +357,8 @@ Nu_FunnelSetMaxOutput(NuFunnel* pFunnel, ulong maxBytes)
* may be too loose by itself; we may want to require that the lower-ASCII
* values appear in higher proportions than the upper-ASCII values.
* Otherwise we run the risk of converting a binary file with specific
* properties.
* properties. (Note that "upper-ASCII" refers to umlauts and other
* accented characters, not DOS 3.3 "high ASCII".)
*
* The auto-detect mechanism will never be perfect though, so there's not
* much point in tweaking it to death.
@ -347,7 +382,7 @@ static const char gNuIsBinary[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xf0 */
};
#define kNuMaxHighASCII 1 /* max #of binary chars per 100 bytes */
#define kNuMaxUpperASCII 1 /* max #of binary chars per 100 bytes */
#define kNuMinConvThreshold 40 /* min of 40 chars for auto-detect */
/*
* Decide, based on the contents of the buffer, whether we should do an
@ -358,25 +393,43 @@ static const char gNuIsBinary[256] = {
*
* If we don't have enough data to make a determination, don't mess with it.
* (Thought for the day: add a "bias" flag, based on the NuRecord fileType,
* that causes us to handle borderline cases more reasonably. If it's of
* type TXT, it's probably text.)
* that causes us to handle borderline or sub-min-threshold cases more
* reasonably. If it's of type TXT, it's probably text.)
*
* We try to figure out whether it's CR, LF, or CRLF, so that we can
* skip the CPU-intensive conversion process if it isn't necessary.
*
* We will also enable a "high-ASCII" stripper if requested. This is
* only enabled when EOL conversions are enabled.
*/
static NuValue
Nu_DetermineConversion(NuFunnel* pFunnel, const uchar* buffer, ulong count)
{
ulong bufCount, numBinary, numLF, numCR;
Boolean isHighASCII;
uchar val;
if (count < kNuMinConvThreshold)
return kNuConvertOff;
/*
* Check to see if the buffer is all high-ASCII characters. If it is,
* we want to strip characters before we test them below.
*/
if (pFunnel->checkStripHighASCII) {
isHighASCII = Nu_CheckHighASCII(pFunnel, buffer, count);
DBUG(("+++ determined isHighASCII=%d\n", isHighASCII));
} else {
isHighASCII = false;
DBUG(("+++ not even checking isHighASCII\n"));
}
bufCount = count;
numBinary = numLF = numCR = 0;
while (bufCount--) {
val = *buffer++;
if (isHighASCII)
val &= 0x7f;
if (gNuIsBinary[val])
numBinary++;
if (val == kNuCharLF)
@ -388,9 +441,9 @@ Nu_DetermineConversion(NuFunnel* pFunnel, const uchar* buffer, ulong count)
/* if #found is > #allowed, it's a binary file */
if (count < 100) {
/* use simplified check on files between kNuMinConvThreshold and 100 */
if (numBinary > kNuMaxHighASCII)
if (numBinary > kNuMaxUpperASCII)
return kNuConvertOff;
} else if (numBinary > (count / 100) * kNuMaxHighASCII)
} else if (numBinary > (count / 100) * kNuMaxUpperASCII)
return kNuConvertOff;
/*
@ -402,20 +455,25 @@ Nu_DetermineConversion(NuFunnel* pFunnel, const uchar* buffer, ulong count)
* and they just happen to be in equal amounts, but it's not clear
* to me that an automatic EOL conversion makes sense on that sort
* of file anyway.
*
* None of this applies if we also need to do a high-ASCII conversion.
*/
if (numLF && !numCR)
pFunnel->convertEOLFrom = kNuEOLLF;
else if (!numLF && numCR)
pFunnel->convertEOLFrom = kNuEOLCR;
else if (numLF && numLF == numCR)
pFunnel->convertEOLFrom = kNuEOLCRLF;
else
pFunnel->convertEOLFrom = kNuEOLUnknown;
if (isHighASCII) {
pFunnel->doStripHighASCII = true;
} else {
if (numLF && !numCR)
pFunnel->convertEOLFrom = kNuEOLLF;
else if (!numLF && numCR)
pFunnel->convertEOLFrom = kNuEOLCR;
else if (numLF && numLF == numCR)
pFunnel->convertEOLFrom = kNuEOLCRLF;
else
pFunnel->convertEOLFrom = kNuEOLUnknown;
}
return kNuConvertOn;
}
/*
* Write a block of data to the appropriate output device. Test for
* excessive data, and raise "outMaxExceeded" if we overrun.
@ -488,53 +546,75 @@ Nu_FunnelWriteConvert(NuFunnel* pFunnel, const uchar* buffer, ulong count)
/*if (pFunnel->outMaxExceeded)
return kNuErrOutMax;*/
if (pFunnel->convertEOL == kNuConvertAuto) {
if (pFunnel->isFirstWrite) {
/*
* This is the first write/flush we've done on this Funnel.
* Check the data we have buffered to decide whether or not
* we want to convert this.
* we want to do text conversions.
*/
pFunnel->convertEOL = Nu_DetermineConversion(pFunnel, buffer, count);
DBUG(("+++ DetermineConversion --> %ld / %ld\n", pFunnel->convertEOL,
pFunnel->convertEOLFrom));
if (pFunnel->convertEOL == kNuConvertAuto) {
pFunnel->convertEOL = Nu_DetermineConversion(pFunnel, buffer,count);
DBUG(("+++ DetermineConversion --> %ld / %ld (%d)\n",
pFunnel->convertEOL, pFunnel->convertEOLFrom,
pFunnel->doStripHighASCII));
if (pFunnel->convertEOLFrom == pFunnel->convertEOLTo) {
DBUG(("+++ Switching redundant converter off\n"));
pFunnel->convertEOL = kNuConvertOff;
if (pFunnel->convertEOLFrom == pFunnel->convertEOLTo) {
DBUG(("+++ Switching redundant converter off\n"));
pFunnel->convertEOL = kNuConvertOff;
}
/* put it where the progress meter can see it */
if (pFunnel->pProgress != nil)
pFunnel->pProgress->expand.convertEOL = pFunnel->convertEOL;
} else if (pFunnel->convertEOL == kNuConvertOn) {
if (pFunnel->checkStripHighASCII) {
/* assume this part of the buffer is representative */
pFunnel->doStripHighASCII = Nu_CheckHighASCII(pFunnel,
buffer, count);
} else {
Assert(!pFunnel->doStripHighASCII);
}
DBUG(("+++ Converter is on, convHighASCII=%d\n",
pFunnel->doStripHighASCII));
}
/* put it where the progress meter can see it */
if (pFunnel->pProgress != nil)
pFunnel->pProgress->expand.convertEOL = pFunnel->convertEOL;
}
Assert(pFunnel->convertEOL != kNuConvertAuto); /* on or off now */
pFunnel->isFirstWrite = false;
if (pFunnel->convertEOL == kNuConvertOff) {
/* write it straight */
Nu_FunnelPutBlock(pFunnel, buffer, count);
} else {
/* do the LF conversion */
Boolean lastCR = pFunnel->lastCR; /* local copy */
/* do the EOL conversion and optional high-bit stripping */
Boolean lastCR = pFunnel->lastCR; /* make local copy */
uchar uch;
int mask;
if (pFunnel->doStripHighASCII)
mask = 0x7f;
else
mask = 0xff;
/*
* We could get a significant speed improvement here by writing
* non-EOL chars as a larger block instead of single bytes.
*/
while (count--) {
if (*buffer == kNuCharCR) {
uch = (*buffer) & mask;
if (uch == kNuCharCR) {
Nu_PutEOL(pFunnel);
lastCR = true;
} else if (*buffer == kNuCharLF) {
} else if (uch == kNuCharLF) {
if (!lastCR)
Nu_PutEOL(pFunnel);
lastCR = false;
} else {
uch = *buffer;
Nu_FunnelPutBlock(pFunnel, &uch, 1);
lastCR = false;
}
buffer++;
}
pFunnel->lastCR = lastCR;
pFunnel->lastCR = lastCR; /* save copy */
}

View File

@ -267,7 +267,8 @@ typedef enum NuValueID {
kNuValueHandleExisting = 8,
kNuValueModifyOrig = 9,
kNuValueMimicSHK = 10,
kNuValueMaskDataless = 11
kNuValueMaskDataless = 11,
kNuValueStripHighASCII = 12
} NuValueID;
typedef unsigned long NuValue;

View File

@ -151,6 +151,7 @@ struct NuArchive {
NuValue valMimicSHK; /* mimic some ShrinkIt quirks */
NuValue valModifyOrig; /* modify original arc in place? */
NuValue valOnlyUpdateOlder; /* modify original arc in place? */
NuValue valStripHighASCII; /* during EOL conv, strip hi bit? */
/* callback functions */
NuCallback selectionFilterFunc;
@ -252,12 +253,16 @@ typedef struct NuFunnel {
uchar* buffer; /* kNuFunnelBufSize worth of storage */
long bufCount; /* #of bytes in buffer */
/* EOL conversion; if "auto", on first flush we convert to "on" or "off" */
/* text conversion; if "auto", on first flush we convert to "on" or "off" */
NuValue convertEOL; /* on/off/auto */
NuValue convertEOLTo; /* EOL to switch to */
NuValue convertEOLFrom; /* EOL terminator we think we found */
Boolean checkStripHighASCII; /* do we want to check for it? */
Boolean doStripHighASCII; /* strip high ASCII during EOL conv */
Boolean lastCR; /* was last char a CR? */
Boolean isFirstWrite; /* cleared on first write */
#if 0
ulong inCount; /* total #of bytes in the top */
ulong outCount; /* total #of bytes out the bottom */

View File

@ -54,6 +54,9 @@ Nu_GetValue(NuArchive* pArchive, NuValueID ident, NuValue* pValue)
case kNuValueOnlyUpdateOlder:
*pValue = pArchive->valOnlyUpdateOlder;
break;
case kNuValueStripHighASCII:
*pValue = pArchive->valStripHighASCII;
break;
default:
err = kNuErrInvalidArg;
Nu_ReportError(NU_BLOB, err, "Unknown ValueID %d requested", ident);
@ -162,6 +165,14 @@ Nu_SetValue(NuArchive* pArchive, NuValueID ident, NuValue value)
}
pArchive->valOnlyUpdateOlder = value;
break;
case kNuValueStripHighASCII:
if (value != true && value != false) {
Nu_ReportError(NU_BLOB, err,
"Invalid kNuStripHighASCII value %ld", value);
goto bail;
}
pArchive->valStripHighASCII = value;
break;
default:
Nu_ReportError(NU_BLOB, err, "Unknown ValueID %d requested", ident);
goto bail;