Added support for automatic high-ASCII text stripping. The behavior

is activated by a feature flag (default off), and only kicks in when EOL conversion is on for the file in question.
2024-05-28 23:41:29 +00:00 · 2003-03-10 23:05:48 +00:00 · 2003-03-10 23:05:48 +00:00 · 23e5a88e4d
commit 23e5a88e4d
parent 4c253ada9f
6 changed files with 135 additions and 34 deletions
--- a/nufxlib-0/Archive.c
+++ b/nufxlib-0/Archive.c
@ -92,6 +92,7 @@ Nu_NuArchiveNew(NuArchive** ppArchive)
    (*ppArchive)->valModifyOrig = false;
    (*ppArchive)->valMimicSHK = false;
    (*ppArchive)->valMaskDataless = false;
    (*ppArchive)->valStripHighASCII = false;
    (*ppArchive)->messageHandlerFunc = gNuGlobalErrorMessageHandler;
--- a/nufxlib-0/ChangeLog.txt
+++ b/nufxlib-0/ChangeLog.txt
@ -1,3 +1,6 @@
 2003/03/10  fadden
    - Added support for automatic high-ASCII text stripping.
 2003/02/23  fadden
    - Added test-twirl to samples.
--- a/nufxlib-0/Funnel.c
+++ b/nufxlib-0/Funnel.c
@ -255,8 +255,14 @@ Nu_FunnelNew(NuArchive* pArchive, NuDataSink* pDataSink, NuValue convertEOL,
    pFunnel->pDataSink = pDataSink;
    pFunnel->convertEOL = convertEOL;
    pFunnel->convertEOLTo = convertEOLTo;
    pFunnel->convertEOLFrom = kNuEOLUnknown;
    pFunnel->pProgress = pProgress;
    pFunnel->checkStripHighASCII = pArchive->valStripHighASCII;
    pFunnel->doStripHighASCII = false;  /* determined on first write */
    pFunnel->isFirstWrite = true;
 bail:
    if (err != kNuErrNone)
        Nu_FunnelFree(pArchive, pFunnel);
@ -313,6 +319,34 @@ Nu_FunnelSetMaxOutput(NuFunnel* pFunnel, ulong maxBytes)
 #endif
 /*
 * Check to see if this is a high-ASCII file.  To qualify, EVERY
 * character must have its high bit set, except for spaces (0x20).
 * (The exception is courtesy Glen Bredon's "Merlin".)
 */
 static Boolean
 Nu_CheckHighASCII(const NuFunnel* pFunnel, const unsigned char* buffer,
    unsigned long count)
 {
    Boolean isHighASCII;
    Assert(buffer != nil);
    Assert(count != 0);
    Assert(pFunnel->checkStripHighASCII);
    isHighASCII = true;
    while (count--) {
        if ((*buffer & 0x80) == 0 && *buffer != 0x20) {
            isHighASCII = false;
            break;
        }
        buffer++;
    }
    return isHighASCII;
 }
 /*
 * Table determining what's a binary character and what isn't.  It would
 * possibly be more compact to generate this from a simple description,
@ -323,7 +357,8 @@ Nu_FunnelSetMaxOutput(NuFunnel* pFunnel, ulong maxBytes)
 * may be too loose by itself; we may want to require that the lower-ASCII
 * values appear in higher proportions than the upper-ASCII values.
 * Otherwise we run the risk of converting a binary file with specific
- * properties.
+ * properties.  (Note that "upper-ASCII" refers to umlauts and other
 * accented characters, not DOS 3.3 "high ASCII".)
 *
 * The auto-detect mechanism will never be perfect though, so there's not
 * much point in tweaking it to death.
@ -347,7 +382,7 @@ static const char gNuIsBinary[256] = {
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,    /* 0xf0 */
 };
-#define kNuMaxHighASCII     1       /* max #of binary chars per 100 bytes */
+#define kNuMaxUpperASCII    1       /* max #of binary chars per 100 bytes */
 #define kNuMinConvThreshold 40      /* min of 40 chars for auto-detect */
 /*
 * Decide, based on the contents of the buffer, whether we should do an
@ -358,25 +393,43 @@ static const char gNuIsBinary[256] = {
 *
 * If we don't have enough data to make a determination, don't mess with it.
 * (Thought for the day: add a "bias" flag, based on the NuRecord fileType,
- * that causes us to handle borderline cases more reasonably.  If it's of
+ * that causes us to handle borderline or sub-min-threshold cases more
- * type TXT, it's probably text.)
+ * reasonably.  If it's of type TXT, it's probably text.)
 *
 * We try to figure out whether it's CR, LF, or CRLF, so that we can
 * skip the CPU-intensive conversion process if it isn't necessary.
 *
 * We will also enable a "high-ASCII" stripper if requested.  This is
 * only enabled when EOL conversions are enabled.
 */
 static NuValue
 Nu_DetermineConversion(NuFunnel* pFunnel, const uchar* buffer, ulong count)
 {
    ulong bufCount, numBinary, numLF, numCR;
    Boolean isHighASCII;
    uchar val;
    if (count < kNuMinConvThreshold)
        return kNuConvertOff;
    /*
     * Check to see if the buffer is all high-ASCII characters.  If it is,
     * we want to strip characters before we test them below.
     */
    if (pFunnel->checkStripHighASCII) {
        isHighASCII = Nu_CheckHighASCII(pFunnel, buffer, count);
        DBUG(("+++ determined isHighASCII=%d\n", isHighASCII));
    } else {
        isHighASCII = false;
        DBUG(("+++ not even checking isHighASCII\n"));
    }
    bufCount = count;
    numBinary = numLF = numCR = 0;
    while (bufCount--) {
        val = *buffer++;
        if (isHighASCII)
            val &= 0x7f;
        if (gNuIsBinary[val])
            numBinary++;
        if (val == kNuCharLF)
@ -388,9 +441,9 @@ Nu_DetermineConversion(NuFunnel* pFunnel, const uchar* buffer, ulong count)
    /* if #found is > #allowed, it's a binary file */
    if (count < 100) {
        /* use simplified check on files between kNuMinConvThreshold and 100 */
-        if (numBinary > kNuMaxHighASCII)
+        if (numBinary > kNuMaxUpperASCII)
            return kNuConvertOff;
-    } else if (numBinary > (count / 100) * kNuMaxHighASCII)
+    } else if (numBinary > (count / 100) * kNuMaxUpperASCII)
        return kNuConvertOff;
    /*
@ -402,20 +455,25 @@ Nu_DetermineConversion(NuFunnel* pFunnel, const uchar* buffer, ulong count)
     * and they just happen to be in equal amounts, but it's not clear
     * to me that an automatic EOL conversion makes sense on that sort
     * of file anyway.
     *
     * None of this applies if we also need to do a high-ASCII conversion.
     */
-    if (numLF && !numCR)
+    if (isHighASCII) {
-        pFunnel->convertEOLFrom = kNuEOLLF;
+        pFunnel->doStripHighASCII = true;
-    else if (!numLF && numCR)
+    } else {
-        pFunnel->convertEOLFrom = kNuEOLCR;
+        if (numLF && !numCR)
-    else if (numLF && numLF == numCR)
+            pFunnel->convertEOLFrom = kNuEOLLF;
-        pFunnel->convertEOLFrom = kNuEOLCRLF;
+        else if (!numLF && numCR)
-    else
+            pFunnel->convertEOLFrom = kNuEOLCR;
-        pFunnel->convertEOLFrom = kNuEOLUnknown;
+        else if (numLF && numLF == numCR)
            pFunnel->convertEOLFrom = kNuEOLCRLF;
        else
            pFunnel->convertEOLFrom = kNuEOLUnknown;
    }
    return kNuConvertOn;
 }
 /*
 * Write a block of data to the appropriate output device.  Test for
 * excessive data, and raise "outMaxExceeded" if we overrun.
@ -488,53 +546,75 @@ Nu_FunnelWriteConvert(NuFunnel* pFunnel, const uchar* buffer, ulong count)
    /*if (pFunnel->outMaxExceeded)
        return kNuErrOutMax;*/
-    if (pFunnel->convertEOL == kNuConvertAuto) {
+    if (pFunnel->isFirstWrite) {
        /*
         * This is the first write/flush we've done on this Funnel.
         * Check the data we have buffered to decide whether or not
-         * we want to convert this.
+         * we want to do text conversions.
         */
-        pFunnel->convertEOL = Nu_DetermineConversion(pFunnel, buffer, count);
+        if (pFunnel->convertEOL == kNuConvertAuto) {
-        DBUG(("+++ DetermineConversion --> %ld / %ld\n", pFunnel->convertEOL,
+            pFunnel->convertEOL = Nu_DetermineConversion(pFunnel, buffer,count);
-            pFunnel->convertEOLFrom));
+            DBUG(("+++ DetermineConversion --> %ld / %ld (%d)\n",
                pFunnel->convertEOL, pFunnel->convertEOLFrom,
                pFunnel->doStripHighASCII));
-        if (pFunnel->convertEOLFrom == pFunnel->convertEOLTo) {
+            if (pFunnel->convertEOLFrom == pFunnel->convertEOLTo) {
-            DBUG(("+++ Switching redundant converter off\n"));
+                DBUG(("+++ Switching redundant converter off\n"));
-            pFunnel->convertEOL = kNuConvertOff;
+                pFunnel->convertEOL = kNuConvertOff;
            }
            /* put it where the progress meter can see it */
            if (pFunnel->pProgress != nil)
                pFunnel->pProgress->expand.convertEOL = pFunnel->convertEOL;
        } else if (pFunnel->convertEOL == kNuConvertOn) {
            if (pFunnel->checkStripHighASCII) {
                /* assume this part of the buffer is representative */
                pFunnel->doStripHighASCII = Nu_CheckHighASCII(pFunnel,
                                                buffer, count);
            } else {
                Assert(!pFunnel->doStripHighASCII);
            }
            DBUG(("+++ Converter is on, convHighASCII=%d\n",
                pFunnel->doStripHighASCII));
        }
        /* put it where the progress meter can see it */
        if (pFunnel->pProgress != nil)
            pFunnel->pProgress->expand.convertEOL = pFunnel->convertEOL;
    }
    Assert(pFunnel->convertEOL != kNuConvertAuto);  /* on or off now */
    pFunnel->isFirstWrite = false;
    if (pFunnel->convertEOL == kNuConvertOff) {
        /* write it straight */
        Nu_FunnelPutBlock(pFunnel, buffer, count);
    } else {
-        /* do the LF conversion */
+        /* do the EOL conversion and optional high-bit stripping */
-        Boolean lastCR = pFunnel->lastCR;   /* local copy */
+        Boolean lastCR = pFunnel->lastCR;   /* make local copy */
        uchar uch;
        int mask;
        if (pFunnel->doStripHighASCII)
            mask = 0x7f;
        else
            mask = 0xff;
        /*
         * We could get a significant speed improvement here by writing
         * non-EOL chars as a larger block instead of single bytes.
         */
        while (count--) {
-            if (*buffer == kNuCharCR) {
+            uch = (*buffer) & mask;
            if (uch == kNuCharCR) {
                Nu_PutEOL(pFunnel);
                lastCR = true;
-            } else if (*buffer == kNuCharLF) {
+            } else if (uch == kNuCharLF) {
                if (!lastCR)
                    Nu_PutEOL(pFunnel);
                lastCR = false;
            } else {
                uch = *buffer;
                Nu_FunnelPutBlock(pFunnel, &uch, 1);
                lastCR = false;
            }
            buffer++;
        }
-        pFunnel->lastCR = lastCR;
+        pFunnel->lastCR = lastCR;   /* save copy */
    }
--- a/nufxlib-0/NufxLib.h
+++ b/nufxlib-0/NufxLib.h
@ -267,7 +267,8 @@ typedef enum NuValueID {
    kNuValueHandleExisting      = 8,
    kNuValueModifyOrig          = 9,
    kNuValueMimicSHK            = 10,
-    kNuValueMaskDataless        = 11
+    kNuValueMaskDataless        = 11,
    kNuValueStripHighASCII      = 12
 } NuValueID;
 typedef unsigned long NuValue;
--- a/nufxlib-0/NufxLibPriv.h
+++ b/nufxlib-0/NufxLibPriv.h
@ -151,6 +151,7 @@ struct NuArchive {
    NuValue         valMimicSHK;            /* mimic some ShrinkIt quirks */
    NuValue         valModifyOrig;          /* modify original arc in place? */
    NuValue         valOnlyUpdateOlder;     /* modify original arc in place? */
    NuValue         valStripHighASCII;      /* during EOL conv, strip hi bit? */
    /* callback functions */
    NuCallback      selectionFilterFunc;
@ -252,12 +253,16 @@ typedef struct NuFunnel {
    uchar*          buffer;         /* kNuFunnelBufSize worth of storage */
    long            bufCount;       /* #of bytes in buffer */
-    /* EOL conversion; if "auto", on first flush we convert to "on" or "off" */
+    /* text conversion; if "auto", on first flush we convert to "on" or "off" */
    NuValue         convertEOL;     /* on/off/auto */
    NuValue         convertEOLTo;   /* EOL to switch to */
    NuValue         convertEOLFrom; /* EOL terminator we think we found */
    Boolean         checkStripHighASCII;    /* do we want to check for it? */
    Boolean         doStripHighASCII;   /* strip high ASCII during EOL conv */
    Boolean         lastCR;         /* was last char a CR? */
    Boolean         isFirstWrite;   /* cleared on first write */
 #if 0
    ulong           inCount;        /* total #of bytes in the top */
    ulong           outCount;       /* total #of bytes out the bottom */
--- a/nufxlib-0/Value.c
+++ b/nufxlib-0/Value.c
@ -54,6 +54,9 @@ Nu_GetValue(NuArchive* pArchive, NuValueID ident, NuValue* pValue)
    case kNuValueOnlyUpdateOlder:
        *pValue = pArchive->valOnlyUpdateOlder;
        break;
    case kNuValueStripHighASCII:
        *pValue = pArchive->valStripHighASCII;
        break;
    default:
        err = kNuErrInvalidArg;
        Nu_ReportError(NU_BLOB, err, "Unknown ValueID %d requested", ident);
@ -162,6 +165,14 @@ Nu_SetValue(NuArchive* pArchive, NuValueID ident, NuValue value)
        }
        pArchive->valOnlyUpdateOlder = value;
        break;
    case kNuValueStripHighASCII:
        if (value != true && value != false) {
            Nu_ReportError(NU_BLOB, err,
                "Invalid kNuStripHighASCII value %ld", value);
            goto bail;
        }
        pArchive->valStripHighASCII = value;
        break;
    default:
        Nu_ReportError(NU_BLOB, err, "Unknown ValueID %d requested", ident);
        goto bail;