mirror of
https://github.com/fadden/nulib2.git
synced 2024-05-28 23:41:29 +00:00
Added support for automatic high-ASCII text stripping. The behavior
is activated by a feature flag (default off), and only kicks in when EOL conversion is on for the file in question.
This commit is contained in:
parent
4c253ada9f
commit
23e5a88e4d
|
@ -92,6 +92,7 @@ Nu_NuArchiveNew(NuArchive** ppArchive)
|
||||||
(*ppArchive)->valModifyOrig = false;
|
(*ppArchive)->valModifyOrig = false;
|
||||||
(*ppArchive)->valMimicSHK = false;
|
(*ppArchive)->valMimicSHK = false;
|
||||||
(*ppArchive)->valMaskDataless = false;
|
(*ppArchive)->valMaskDataless = false;
|
||||||
|
(*ppArchive)->valStripHighASCII = false;
|
||||||
|
|
||||||
(*ppArchive)->messageHandlerFunc = gNuGlobalErrorMessageHandler;
|
(*ppArchive)->messageHandlerFunc = gNuGlobalErrorMessageHandler;
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
2003/03/10 fadden
|
||||||
|
- Added support for automatic high-ASCII text stripping.
|
||||||
|
|
||||||
2003/02/23 fadden
|
2003/02/23 fadden
|
||||||
- Added test-twirl to samples.
|
- Added test-twirl to samples.
|
||||||
|
|
||||||
|
|
|
@ -255,8 +255,14 @@ Nu_FunnelNew(NuArchive* pArchive, NuDataSink* pDataSink, NuValue convertEOL,
|
||||||
pFunnel->pDataSink = pDataSink;
|
pFunnel->pDataSink = pDataSink;
|
||||||
pFunnel->convertEOL = convertEOL;
|
pFunnel->convertEOL = convertEOL;
|
||||||
pFunnel->convertEOLTo = convertEOLTo;
|
pFunnel->convertEOLTo = convertEOLTo;
|
||||||
|
pFunnel->convertEOLFrom = kNuEOLUnknown;
|
||||||
pFunnel->pProgress = pProgress;
|
pFunnel->pProgress = pProgress;
|
||||||
|
|
||||||
|
pFunnel->checkStripHighASCII = pArchive->valStripHighASCII;
|
||||||
|
pFunnel->doStripHighASCII = false; /* determined on first write */
|
||||||
|
|
||||||
|
pFunnel->isFirstWrite = true;
|
||||||
|
|
||||||
bail:
|
bail:
|
||||||
if (err != kNuErrNone)
|
if (err != kNuErrNone)
|
||||||
Nu_FunnelFree(pArchive, pFunnel);
|
Nu_FunnelFree(pArchive, pFunnel);
|
||||||
|
@ -313,6 +319,34 @@ Nu_FunnelSetMaxOutput(NuFunnel* pFunnel, ulong maxBytes)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check to see if this is a high-ASCII file. To qualify, EVERY
|
||||||
|
* character must have its high bit set, except for spaces (0x20).
|
||||||
|
* (The exception is courtesy Glen Bredon's "Merlin".)
|
||||||
|
*/
|
||||||
|
static Boolean
|
||||||
|
Nu_CheckHighASCII(const NuFunnel* pFunnel, const unsigned char* buffer,
|
||||||
|
unsigned long count)
|
||||||
|
{
|
||||||
|
Boolean isHighASCII;
|
||||||
|
|
||||||
|
Assert(buffer != nil);
|
||||||
|
Assert(count != 0);
|
||||||
|
Assert(pFunnel->checkStripHighASCII);
|
||||||
|
|
||||||
|
isHighASCII = true;
|
||||||
|
while (count--) {
|
||||||
|
if ((*buffer & 0x80) == 0 && *buffer != 0x20) {
|
||||||
|
isHighASCII = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
buffer++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return isHighASCII;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Table determining what's a binary character and what isn't. It would
|
* Table determining what's a binary character and what isn't. It would
|
||||||
* possibly be more compact to generate this from a simple description,
|
* possibly be more compact to generate this from a simple description,
|
||||||
|
@ -323,7 +357,8 @@ Nu_FunnelSetMaxOutput(NuFunnel* pFunnel, ulong maxBytes)
|
||||||
* may be too loose by itself; we may want to require that the lower-ASCII
|
* may be too loose by itself; we may want to require that the lower-ASCII
|
||||||
* values appear in higher proportions than the upper-ASCII values.
|
* values appear in higher proportions than the upper-ASCII values.
|
||||||
* Otherwise we run the risk of converting a binary file with specific
|
* Otherwise we run the risk of converting a binary file with specific
|
||||||
* properties.
|
* properties. (Note that "upper-ASCII" refers to umlauts and other
|
||||||
|
* accented characters, not DOS 3.3 "high ASCII".)
|
||||||
*
|
*
|
||||||
* The auto-detect mechanism will never be perfect though, so there's not
|
* The auto-detect mechanism will never be perfect though, so there's not
|
||||||
* much point in tweaking it to death.
|
* much point in tweaking it to death.
|
||||||
|
@ -347,7 +382,7 @@ static const char gNuIsBinary[256] = {
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xf0 */
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xf0 */
|
||||||
};
|
};
|
||||||
|
|
||||||
#define kNuMaxHighASCII 1 /* max #of binary chars per 100 bytes */
|
#define kNuMaxUpperASCII 1 /* max #of binary chars per 100 bytes */
|
||||||
#define kNuMinConvThreshold 40 /* min of 40 chars for auto-detect */
|
#define kNuMinConvThreshold 40 /* min of 40 chars for auto-detect */
|
||||||
/*
|
/*
|
||||||
* Decide, based on the contents of the buffer, whether we should do an
|
* Decide, based on the contents of the buffer, whether we should do an
|
||||||
|
@ -358,25 +393,43 @@ static const char gNuIsBinary[256] = {
|
||||||
*
|
*
|
||||||
* If we don't have enough data to make a determination, don't mess with it.
|
* If we don't have enough data to make a determination, don't mess with it.
|
||||||
* (Thought for the day: add a "bias" flag, based on the NuRecord fileType,
|
* (Thought for the day: add a "bias" flag, based on the NuRecord fileType,
|
||||||
* that causes us to handle borderline cases more reasonably. If it's of
|
* that causes us to handle borderline or sub-min-threshold cases more
|
||||||
* type TXT, it's probably text.)
|
* reasonably. If it's of type TXT, it's probably text.)
|
||||||
*
|
*
|
||||||
* We try to figure out whether it's CR, LF, or CRLF, so that we can
|
* We try to figure out whether it's CR, LF, or CRLF, so that we can
|
||||||
* skip the CPU-intensive conversion process if it isn't necessary.
|
* skip the CPU-intensive conversion process if it isn't necessary.
|
||||||
|
*
|
||||||
|
* We will also enable a "high-ASCII" stripper if requested. This is
|
||||||
|
* only enabled when EOL conversions are enabled.
|
||||||
*/
|
*/
|
||||||
static NuValue
|
static NuValue
|
||||||
Nu_DetermineConversion(NuFunnel* pFunnel, const uchar* buffer, ulong count)
|
Nu_DetermineConversion(NuFunnel* pFunnel, const uchar* buffer, ulong count)
|
||||||
{
|
{
|
||||||
ulong bufCount, numBinary, numLF, numCR;
|
ulong bufCount, numBinary, numLF, numCR;
|
||||||
|
Boolean isHighASCII;
|
||||||
uchar val;
|
uchar val;
|
||||||
|
|
||||||
if (count < kNuMinConvThreshold)
|
if (count < kNuMinConvThreshold)
|
||||||
return kNuConvertOff;
|
return kNuConvertOff;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check to see if the buffer is all high-ASCII characters. If it is,
|
||||||
|
* we want to strip characters before we test them below.
|
||||||
|
*/
|
||||||
|
if (pFunnel->checkStripHighASCII) {
|
||||||
|
isHighASCII = Nu_CheckHighASCII(pFunnel, buffer, count);
|
||||||
|
DBUG(("+++ determined isHighASCII=%d\n", isHighASCII));
|
||||||
|
} else {
|
||||||
|
isHighASCII = false;
|
||||||
|
DBUG(("+++ not even checking isHighASCII\n"));
|
||||||
|
}
|
||||||
|
|
||||||
bufCount = count;
|
bufCount = count;
|
||||||
numBinary = numLF = numCR = 0;
|
numBinary = numLF = numCR = 0;
|
||||||
while (bufCount--) {
|
while (bufCount--) {
|
||||||
val = *buffer++;
|
val = *buffer++;
|
||||||
|
if (isHighASCII)
|
||||||
|
val &= 0x7f;
|
||||||
if (gNuIsBinary[val])
|
if (gNuIsBinary[val])
|
||||||
numBinary++;
|
numBinary++;
|
||||||
if (val == kNuCharLF)
|
if (val == kNuCharLF)
|
||||||
|
@ -388,9 +441,9 @@ Nu_DetermineConversion(NuFunnel* pFunnel, const uchar* buffer, ulong count)
|
||||||
/* if #found is > #allowed, it's a binary file */
|
/* if #found is > #allowed, it's a binary file */
|
||||||
if (count < 100) {
|
if (count < 100) {
|
||||||
/* use simplified check on files between kNuMinConvThreshold and 100 */
|
/* use simplified check on files between kNuMinConvThreshold and 100 */
|
||||||
if (numBinary > kNuMaxHighASCII)
|
if (numBinary > kNuMaxUpperASCII)
|
||||||
return kNuConvertOff;
|
return kNuConvertOff;
|
||||||
} else if (numBinary > (count / 100) * kNuMaxHighASCII)
|
} else if (numBinary > (count / 100) * kNuMaxUpperASCII)
|
||||||
return kNuConvertOff;
|
return kNuConvertOff;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -402,20 +455,25 @@ Nu_DetermineConversion(NuFunnel* pFunnel, const uchar* buffer, ulong count)
|
||||||
* and they just happen to be in equal amounts, but it's not clear
|
* and they just happen to be in equal amounts, but it's not clear
|
||||||
* to me that an automatic EOL conversion makes sense on that sort
|
* to me that an automatic EOL conversion makes sense on that sort
|
||||||
* of file anyway.
|
* of file anyway.
|
||||||
|
*
|
||||||
|
* None of this applies if we also need to do a high-ASCII conversion.
|
||||||
*/
|
*/
|
||||||
if (numLF && !numCR)
|
if (isHighASCII) {
|
||||||
pFunnel->convertEOLFrom = kNuEOLLF;
|
pFunnel->doStripHighASCII = true;
|
||||||
else if (!numLF && numCR)
|
} else {
|
||||||
pFunnel->convertEOLFrom = kNuEOLCR;
|
if (numLF && !numCR)
|
||||||
else if (numLF && numLF == numCR)
|
pFunnel->convertEOLFrom = kNuEOLLF;
|
||||||
pFunnel->convertEOLFrom = kNuEOLCRLF;
|
else if (!numLF && numCR)
|
||||||
else
|
pFunnel->convertEOLFrom = kNuEOLCR;
|
||||||
pFunnel->convertEOLFrom = kNuEOLUnknown;
|
else if (numLF && numLF == numCR)
|
||||||
|
pFunnel->convertEOLFrom = kNuEOLCRLF;
|
||||||
|
else
|
||||||
|
pFunnel->convertEOLFrom = kNuEOLUnknown;
|
||||||
|
}
|
||||||
|
|
||||||
return kNuConvertOn;
|
return kNuConvertOn;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Write a block of data to the appropriate output device. Test for
|
* Write a block of data to the appropriate output device. Test for
|
||||||
* excessive data, and raise "outMaxExceeded" if we overrun.
|
* excessive data, and raise "outMaxExceeded" if we overrun.
|
||||||
|
@ -488,53 +546,75 @@ Nu_FunnelWriteConvert(NuFunnel* pFunnel, const uchar* buffer, ulong count)
|
||||||
/*if (pFunnel->outMaxExceeded)
|
/*if (pFunnel->outMaxExceeded)
|
||||||
return kNuErrOutMax;*/
|
return kNuErrOutMax;*/
|
||||||
|
|
||||||
if (pFunnel->convertEOL == kNuConvertAuto) {
|
if (pFunnel->isFirstWrite) {
|
||||||
/*
|
/*
|
||||||
* This is the first write/flush we've done on this Funnel.
|
* This is the first write/flush we've done on this Funnel.
|
||||||
* Check the data we have buffered to decide whether or not
|
* Check the data we have buffered to decide whether or not
|
||||||
* we want to convert this.
|
* we want to do text conversions.
|
||||||
*/
|
*/
|
||||||
pFunnel->convertEOL = Nu_DetermineConversion(pFunnel, buffer, count);
|
if (pFunnel->convertEOL == kNuConvertAuto) {
|
||||||
DBUG(("+++ DetermineConversion --> %ld / %ld\n", pFunnel->convertEOL,
|
pFunnel->convertEOL = Nu_DetermineConversion(pFunnel, buffer,count);
|
||||||
pFunnel->convertEOLFrom));
|
DBUG(("+++ DetermineConversion --> %ld / %ld (%d)\n",
|
||||||
|
pFunnel->convertEOL, pFunnel->convertEOLFrom,
|
||||||
|
pFunnel->doStripHighASCII));
|
||||||
|
|
||||||
if (pFunnel->convertEOLFrom == pFunnel->convertEOLTo) {
|
if (pFunnel->convertEOLFrom == pFunnel->convertEOLTo) {
|
||||||
DBUG(("+++ Switching redundant converter off\n"));
|
DBUG(("+++ Switching redundant converter off\n"));
|
||||||
pFunnel->convertEOL = kNuConvertOff;
|
pFunnel->convertEOL = kNuConvertOff;
|
||||||
|
}
|
||||||
|
/* put it where the progress meter can see it */
|
||||||
|
if (pFunnel->pProgress != nil)
|
||||||
|
pFunnel->pProgress->expand.convertEOL = pFunnel->convertEOL;
|
||||||
|
} else if (pFunnel->convertEOL == kNuConvertOn) {
|
||||||
|
if (pFunnel->checkStripHighASCII) {
|
||||||
|
/* assume this part of the buffer is representative */
|
||||||
|
pFunnel->doStripHighASCII = Nu_CheckHighASCII(pFunnel,
|
||||||
|
buffer, count);
|
||||||
|
} else {
|
||||||
|
Assert(!pFunnel->doStripHighASCII);
|
||||||
|
}
|
||||||
|
DBUG(("+++ Converter is on, convHighASCII=%d\n",
|
||||||
|
pFunnel->doStripHighASCII));
|
||||||
}
|
}
|
||||||
/* put it where the progress meter can see it */
|
|
||||||
if (pFunnel->pProgress != nil)
|
|
||||||
pFunnel->pProgress->expand.convertEOL = pFunnel->convertEOL;
|
|
||||||
}
|
}
|
||||||
|
Assert(pFunnel->convertEOL != kNuConvertAuto); /* on or off now */
|
||||||
|
pFunnel->isFirstWrite = false;
|
||||||
|
|
||||||
if (pFunnel->convertEOL == kNuConvertOff) {
|
if (pFunnel->convertEOL == kNuConvertOff) {
|
||||||
/* write it straight */
|
/* write it straight */
|
||||||
Nu_FunnelPutBlock(pFunnel, buffer, count);
|
Nu_FunnelPutBlock(pFunnel, buffer, count);
|
||||||
} else {
|
} else {
|
||||||
/* do the LF conversion */
|
/* do the EOL conversion and optional high-bit stripping */
|
||||||
Boolean lastCR = pFunnel->lastCR; /* local copy */
|
Boolean lastCR = pFunnel->lastCR; /* make local copy */
|
||||||
uchar uch;
|
uchar uch;
|
||||||
|
int mask;
|
||||||
|
|
||||||
|
if (pFunnel->doStripHighASCII)
|
||||||
|
mask = 0x7f;
|
||||||
|
else
|
||||||
|
mask = 0xff;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We could get a significant speed improvement here by writing
|
* We could get a significant speed improvement here by writing
|
||||||
* non-EOL chars as a larger block instead of single bytes.
|
* non-EOL chars as a larger block instead of single bytes.
|
||||||
*/
|
*/
|
||||||
while (count--) {
|
while (count--) {
|
||||||
if (*buffer == kNuCharCR) {
|
uch = (*buffer) & mask;
|
||||||
|
|
||||||
|
if (uch == kNuCharCR) {
|
||||||
Nu_PutEOL(pFunnel);
|
Nu_PutEOL(pFunnel);
|
||||||
lastCR = true;
|
lastCR = true;
|
||||||
} else if (*buffer == kNuCharLF) {
|
} else if (uch == kNuCharLF) {
|
||||||
if (!lastCR)
|
if (!lastCR)
|
||||||
Nu_PutEOL(pFunnel);
|
Nu_PutEOL(pFunnel);
|
||||||
lastCR = false;
|
lastCR = false;
|
||||||
} else {
|
} else {
|
||||||
uch = *buffer;
|
|
||||||
Nu_FunnelPutBlock(pFunnel, &uch, 1);
|
Nu_FunnelPutBlock(pFunnel, &uch, 1);
|
||||||
lastCR = false;
|
lastCR = false;
|
||||||
}
|
}
|
||||||
buffer++;
|
buffer++;
|
||||||
}
|
}
|
||||||
pFunnel->lastCR = lastCR;
|
pFunnel->lastCR = lastCR; /* save copy */
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -267,7 +267,8 @@ typedef enum NuValueID {
|
||||||
kNuValueHandleExisting = 8,
|
kNuValueHandleExisting = 8,
|
||||||
kNuValueModifyOrig = 9,
|
kNuValueModifyOrig = 9,
|
||||||
kNuValueMimicSHK = 10,
|
kNuValueMimicSHK = 10,
|
||||||
kNuValueMaskDataless = 11
|
kNuValueMaskDataless = 11,
|
||||||
|
kNuValueStripHighASCII = 12
|
||||||
} NuValueID;
|
} NuValueID;
|
||||||
typedef unsigned long NuValue;
|
typedef unsigned long NuValue;
|
||||||
|
|
||||||
|
|
|
@ -151,6 +151,7 @@ struct NuArchive {
|
||||||
NuValue valMimicSHK; /* mimic some ShrinkIt quirks */
|
NuValue valMimicSHK; /* mimic some ShrinkIt quirks */
|
||||||
NuValue valModifyOrig; /* modify original arc in place? */
|
NuValue valModifyOrig; /* modify original arc in place? */
|
||||||
NuValue valOnlyUpdateOlder; /* modify original arc in place? */
|
NuValue valOnlyUpdateOlder; /* modify original arc in place? */
|
||||||
|
NuValue valStripHighASCII; /* during EOL conv, strip hi bit? */
|
||||||
|
|
||||||
/* callback functions */
|
/* callback functions */
|
||||||
NuCallback selectionFilterFunc;
|
NuCallback selectionFilterFunc;
|
||||||
|
@ -252,12 +253,16 @@ typedef struct NuFunnel {
|
||||||
uchar* buffer; /* kNuFunnelBufSize worth of storage */
|
uchar* buffer; /* kNuFunnelBufSize worth of storage */
|
||||||
long bufCount; /* #of bytes in buffer */
|
long bufCount; /* #of bytes in buffer */
|
||||||
|
|
||||||
/* EOL conversion; if "auto", on first flush we convert to "on" or "off" */
|
/* text conversion; if "auto", on first flush we convert to "on" or "off" */
|
||||||
NuValue convertEOL; /* on/off/auto */
|
NuValue convertEOL; /* on/off/auto */
|
||||||
NuValue convertEOLTo; /* EOL to switch to */
|
NuValue convertEOLTo; /* EOL to switch to */
|
||||||
NuValue convertEOLFrom; /* EOL terminator we think we found */
|
NuValue convertEOLFrom; /* EOL terminator we think we found */
|
||||||
|
Boolean checkStripHighASCII; /* do we want to check for it? */
|
||||||
|
Boolean doStripHighASCII; /* strip high ASCII during EOL conv */
|
||||||
Boolean lastCR; /* was last char a CR? */
|
Boolean lastCR; /* was last char a CR? */
|
||||||
|
|
||||||
|
Boolean isFirstWrite; /* cleared on first write */
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
ulong inCount; /* total #of bytes in the top */
|
ulong inCount; /* total #of bytes in the top */
|
||||||
ulong outCount; /* total #of bytes out the bottom */
|
ulong outCount; /* total #of bytes out the bottom */
|
||||||
|
|
|
@ -54,6 +54,9 @@ Nu_GetValue(NuArchive* pArchive, NuValueID ident, NuValue* pValue)
|
||||||
case kNuValueOnlyUpdateOlder:
|
case kNuValueOnlyUpdateOlder:
|
||||||
*pValue = pArchive->valOnlyUpdateOlder;
|
*pValue = pArchive->valOnlyUpdateOlder;
|
||||||
break;
|
break;
|
||||||
|
case kNuValueStripHighASCII:
|
||||||
|
*pValue = pArchive->valStripHighASCII;
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
err = kNuErrInvalidArg;
|
err = kNuErrInvalidArg;
|
||||||
Nu_ReportError(NU_BLOB, err, "Unknown ValueID %d requested", ident);
|
Nu_ReportError(NU_BLOB, err, "Unknown ValueID %d requested", ident);
|
||||||
|
@ -162,6 +165,14 @@ Nu_SetValue(NuArchive* pArchive, NuValueID ident, NuValue value)
|
||||||
}
|
}
|
||||||
pArchive->valOnlyUpdateOlder = value;
|
pArchive->valOnlyUpdateOlder = value;
|
||||||
break;
|
break;
|
||||||
|
case kNuValueStripHighASCII:
|
||||||
|
if (value != true && value != false) {
|
||||||
|
Nu_ReportError(NU_BLOB, err,
|
||||||
|
"Invalid kNuStripHighASCII value %ld", value);
|
||||||
|
goto bail;
|
||||||
|
}
|
||||||
|
pArchive->valStripHighASCII = value;
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
Nu_ReportError(NU_BLOB, err, "Unknown ValueID %d requested", ident);
|
Nu_ReportError(NU_BLOB, err, "Unknown ValueID %d requested", ident);
|
||||||
goto bail;
|
goto bail;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user