Added support for automatic high-ASCII text stripping. The behavior is activated by a feature flag (default off), and only kicks in when EOL conversion is on for the file in question.
2024-12-27 17:29:57 +00:00 · 2003-03-10 23:05:48 +00:00 · 2003-03-10 23:05:48 +00:00 · 23e5a88e4d
commit 23e5a88e4d
parent 4c253ada9f
6 changed files with 135 additions and 34 deletions
--- a/nufxlib-0/Archive.c
+++ b/nufxlib-0/Archive.c
@@ -92,6 +92,7 @@ Nu_NuArchiveNew(NuArchive** ppArchive)
    (*ppArchive)->valModifyOrig = false;
    (*ppArchive)->valMimicSHK = false;
    (*ppArchive)->valMaskDataless = false;
+    (*ppArchive)->valStripHighASCII = false;

    (*ppArchive)->messageHandlerFunc = gNuGlobalErrorMessageHandler;

--- a/nufxlib-0/ChangeLog.txt
+++ b/nufxlib-0/ChangeLog.txt
@@ -1,3 +1,6 @@
+2003/03/10  fadden
+    - Added support for automatic high-ASCII text stripping.
+
 2003/02/23  fadden
    - Added test-twirl to samples.

--- a/nufxlib-0/Funnel.c
+++ b/nufxlib-0/Funnel.c
@@ -255,8 +255,14 @@ Nu_FunnelNew(NuArchive* pArchive, NuDataSink* pDataSink, NuValue convertEOL,
    pFunnel->pDataSink = pDataSink;
    pFunnel->convertEOL = convertEOL;
    pFunnel->convertEOLTo = convertEOLTo;
+    pFunnel->convertEOLFrom = kNuEOLUnknown;
    pFunnel->pProgress = pProgress;

+    pFunnel->checkStripHighASCII = pArchive->valStripHighASCII;
+    pFunnel->doStripHighASCII = false;  /* determined on first write */
+
+    pFunnel->isFirstWrite = true;
+
 bail:
    if (err != kNuErrNone)
        Nu_FunnelFree(pArchive, pFunnel);
@@ -313,6 +319,34 @@ Nu_FunnelSetMaxOutput(NuFunnel* pFunnel, ulong maxBytes)
 #endif


+/*
+ * Check to see if this is a high-ASCII file.  To qualify, EVERY
+ * character must have its high bit set, except for spaces (0x20).
+ * (The exception is courtesy Glen Bredon's "Merlin".)
+ */
+static Boolean
+Nu_CheckHighASCII(const NuFunnel* pFunnel, const unsigned char* buffer,
+    unsigned long count)
+{
+    Boolean isHighASCII;
+
+    Assert(buffer != nil);
+    Assert(count != 0);
+    Assert(pFunnel->checkStripHighASCII);
+
+    isHighASCII = true;
+    while (count--) {
+        if ((*buffer & 0x80) == 0 && *buffer != 0x20) {
+            isHighASCII = false;
+            break;
+        }
+        
+        buffer++;
+    }
+
+    return isHighASCII;
+}
+
 /*
 * Table determining what's a binary character and what isn't.  It would
 * possibly be more compact to generate this from a simple description,
@@ -323,7 +357,8 @@ Nu_FunnelSetMaxOutput(NuFunnel* pFunnel, ulong maxBytes)
 * may be too loose by itself; we may want to require that the lower-ASCII
 * values appear in higher proportions than the upper-ASCII values.
 * Otherwise we run the risk of converting a binary file with specific
- * properties.
+ * properties.  (Note that "upper-ASCII" refers to umlauts and other
+ * accented characters, not DOS 3.3 "high ASCII".)
 *
 * The auto-detect mechanism will never be perfect though, so there's not
 * much point in tweaking it to death.
@@ -347,7 +382,7 @@ static const char gNuIsBinary[256] = {
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,    /* 0xf0 */
 };

-#define kNuMaxHighASCII     1       /* max #of binary chars per 100 bytes */
+#define kNuMaxUpperASCII    1       /* max #of binary chars per 100 bytes */
 #define kNuMinConvThreshold 40      /* min of 40 chars for auto-detect */
 /*
 * Decide, based on the contents of the buffer, whether we should do an
@@ -358,25 +393,43 @@ static const char gNuIsBinary[256] = {
 *
 * If we don't have enough data to make a determination, don't mess with it.
 * (Thought for the day: add a "bias" flag, based on the NuRecord fileType,
- * that causes us to handle borderline cases more reasonably.  If it's of
- * type TXT, it's probably text.)
+ * that causes us to handle borderline or sub-min-threshold cases more
+ * reasonably.  If it's of type TXT, it's probably text.)
 *
 * We try to figure out whether it's CR, LF, or CRLF, so that we can
 * skip the CPU-intensive conversion process if it isn't necessary.
+ *
+ * We will also enable a "high-ASCII" stripper if requested.  This is
+ * only enabled when EOL conversions are enabled.
 */
 static NuValue
 Nu_DetermineConversion(NuFunnel* pFunnel, const uchar* buffer, ulong count)
 {
    ulong bufCount, numBinary, numLF, numCR;
+    Boolean isHighASCII;
    uchar val;

    if (count < kNuMinConvThreshold)
        return kNuConvertOff;

+    /*
+     * Check to see if the buffer is all high-ASCII characters.  If it is,
+     * we want to strip characters before we test them below.
+     */
+    if (pFunnel->checkStripHighASCII) {
+        isHighASCII = Nu_CheckHighASCII(pFunnel, buffer, count);
+        DBUG(("+++ determined isHighASCII=%d\n", isHighASCII));
+    } else {
+        isHighASCII = false;
+        DBUG(("+++ not even checking isHighASCII\n"));
+    }
+
    bufCount = count;
    numBinary = numLF = numCR = 0;
    while (bufCount--) {
        val = *buffer++;
+        if (isHighASCII)
+            val &= 0x7f;
        if (gNuIsBinary[val])
            numBinary++;
        if (val == kNuCharLF)
@@ -388,9 +441,9 @@ Nu_DetermineConversion(NuFunnel* pFunnel, const uchar* buffer, ulong count)
    /* if #found is > #allowed, it's a binary file */
    if (count < 100) {
        /* use simplified check on files between kNuMinConvThreshold and 100 */
-        if (numBinary > kNuMaxHighASCII)
+        if (numBinary > kNuMaxUpperASCII)
            return kNuConvertOff;
-    } else if (numBinary > (count / 100) * kNuMaxHighASCII)
+    } else if (numBinary > (count / 100) * kNuMaxUpperASCII)
        return kNuConvertOff;

    /*
@@ -402,20 +455,25 @@ Nu_DetermineConversion(NuFunnel* pFunnel, const uchar* buffer, ulong count)
     * and they just happen to be in equal amounts, but it's not clear
     * to me that an automatic EOL conversion makes sense on that sort
     * of file anyway.
+     *
+     * None of this applies if we also need to do a high-ASCII conversion.
     */
-    if (numLF && !numCR)
-        pFunnel->convertEOLFrom = kNuEOLLF;
-    else if (!numLF && numCR)
-        pFunnel->convertEOLFrom = kNuEOLCR;
-    else if (numLF && numLF == numCR)
-        pFunnel->convertEOLFrom = kNuEOLCRLF;
-    else
-        pFunnel->convertEOLFrom = kNuEOLUnknown;
+    if (isHighASCII) {
+        pFunnel->doStripHighASCII = true;
+    } else {
+        if (numLF && !numCR)
+            pFunnel->convertEOLFrom = kNuEOLLF;
+        else if (!numLF && numCR)
+            pFunnel->convertEOLFrom = kNuEOLCR;
+        else if (numLF && numLF == numCR)
+            pFunnel->convertEOLFrom = kNuEOLCRLF;
+        else
+            pFunnel->convertEOLFrom = kNuEOLUnknown;
+    }

    return kNuConvertOn;
 }

-
 /*
 * Write a block of data to the appropriate output device.  Test for
 * excessive data, and raise "outMaxExceeded" if we overrun.
@@ -488,53 +546,75 @@ Nu_FunnelWriteConvert(NuFunnel* pFunnel, const uchar* buffer, ulong count)
    /*if (pFunnel->outMaxExceeded)
        return kNuErrOutMax;*/

-    if (pFunnel->convertEOL == kNuConvertAuto) {
+    if (pFunnel->isFirstWrite) {
        /*
         * This is the first write/flush we've done on this Funnel.
         * Check the data we have buffered to decide whether or not
-         * we want to convert this.
+         * we want to do text conversions.
         */
-        pFunnel->convertEOL = Nu_DetermineConversion(pFunnel, buffer, count);
-        DBUG(("+++ DetermineConversion --> %ld / %ld\n", pFunnel->convertEOL,
-            pFunnel->convertEOLFrom));
+        if (pFunnel->convertEOL == kNuConvertAuto) {
+            pFunnel->convertEOL = Nu_DetermineConversion(pFunnel, buffer,count);
+            DBUG(("+++ DetermineConversion --> %ld / %ld (%d)\n",
+                pFunnel->convertEOL, pFunnel->convertEOLFrom,
+                pFunnel->doStripHighASCII));

-        if (pFunnel->convertEOLFrom == pFunnel->convertEOLTo) {
-            DBUG(("+++ Switching redundant converter off\n"));
-            pFunnel->convertEOL = kNuConvertOff;
+            if (pFunnel->convertEOLFrom == pFunnel->convertEOLTo) {
+                DBUG(("+++ Switching redundant converter off\n"));
+                pFunnel->convertEOL = kNuConvertOff;
+            }
+            /* put it where the progress meter can see it */
+            if (pFunnel->pProgress != nil)
+                pFunnel->pProgress->expand.convertEOL = pFunnel->convertEOL;
+        } else if (pFunnel->convertEOL == kNuConvertOn) {
+            if (pFunnel->checkStripHighASCII) {
+                /* assume this part of the buffer is representative */
+                pFunnel->doStripHighASCII = Nu_CheckHighASCII(pFunnel,
+                                                buffer, count);
+            } else {
+                Assert(!pFunnel->doStripHighASCII);
+            }
+            DBUG(("+++ Converter is on, convHighASCII=%d\n",
+                pFunnel->doStripHighASCII));
        }
-        /* put it where the progress meter can see it */
-        if (pFunnel->pProgress != nil)
-            pFunnel->pProgress->expand.convertEOL = pFunnel->convertEOL;
    }
+    Assert(pFunnel->convertEOL != kNuConvertAuto);  /* on or off now */
+    pFunnel->isFirstWrite = false;

    if (pFunnel->convertEOL == kNuConvertOff) {
        /* write it straight */
        Nu_FunnelPutBlock(pFunnel, buffer, count);
    } else {
-        /* do the LF conversion */
-        Boolean lastCR = pFunnel->lastCR;   /* local copy */
+        /* do the EOL conversion and optional high-bit stripping */
+        Boolean lastCR = pFunnel->lastCR;   /* make local copy */
        uchar uch;
+        int mask;
+
+        if (pFunnel->doStripHighASCII)
+            mask = 0x7f;
+        else
+            mask = 0xff;

        /*
         * We could get a significant speed improvement here by writing
         * non-EOL chars as a larger block instead of single bytes.
         */
        while (count--) {
-            if (*buffer == kNuCharCR) {
+            uch = (*buffer) & mask;
+
+            if (uch == kNuCharCR) {
                Nu_PutEOL(pFunnel);
                lastCR = true;
-            } else if (*buffer == kNuCharLF) {
+            } else if (uch == kNuCharLF) {
                if (!lastCR)
                    Nu_PutEOL(pFunnel);
                lastCR = false;
            } else {
-                uch = *buffer;
                Nu_FunnelPutBlock(pFunnel, &uch, 1);
                lastCR = false;
            }
            buffer++;
        }
-        pFunnel->lastCR = lastCR;
+        pFunnel->lastCR = lastCR;   /* save copy */

    }

--- a/nufxlib-0/NufxLib.h
+++ b/nufxlib-0/NufxLib.h
@@ -267,7 +267,8 @@ typedef enum NuValueID {
    kNuValueHandleExisting      = 8,
    kNuValueModifyOrig          = 9,
    kNuValueMimicSHK            = 10,
-    kNuValueMaskDataless        = 11
+    kNuValueMaskDataless        = 11,
+    kNuValueStripHighASCII      = 12
 } NuValueID;
 typedef unsigned long NuValue;

--- a/nufxlib-0/NufxLibPriv.h
+++ b/nufxlib-0/NufxLibPriv.h
@@ -151,6 +151,7 @@ struct NuArchive {
    NuValue         valMimicSHK;            /* mimic some ShrinkIt quirks */
    NuValue         valModifyOrig;          /* modify original arc in place? */
    NuValue         valOnlyUpdateOlder;     /* modify original arc in place? */
+    NuValue         valStripHighASCII;      /* during EOL conv, strip hi bit? */

    /* callback functions */
    NuCallback      selectionFilterFunc;
@@ -252,12 +253,16 @@ typedef struct NuFunnel {
    uchar*          buffer;         /* kNuFunnelBufSize worth of storage */
    long            bufCount;       /* #of bytes in buffer */

-    /* EOL conversion; if "auto", on first flush we convert to "on" or "off" */
+    /* text conversion; if "auto", on first flush we convert to "on" or "off" */
    NuValue         convertEOL;     /* on/off/auto */
    NuValue         convertEOLTo;   /* EOL to switch to */
    NuValue         convertEOLFrom; /* EOL terminator we think we found */
+    Boolean         checkStripHighASCII;    /* do we want to check for it? */
+    Boolean         doStripHighASCII;   /* strip high ASCII during EOL conv */
    Boolean         lastCR;         /* was last char a CR? */

+    Boolean         isFirstWrite;   /* cleared on first write */
+
 #if 0
    ulong           inCount;        /* total #of bytes in the top */
    ulong           outCount;       /* total #of bytes out the bottom */
--- a/nufxlib-0/Value.c
+++ b/nufxlib-0/Value.c
@@ -54,6 +54,9 @@ Nu_GetValue(NuArchive* pArchive, NuValueID ident, NuValue* pValue)
    case kNuValueOnlyUpdateOlder:
        *pValue = pArchive->valOnlyUpdateOlder;
        break;
+    case kNuValueStripHighASCII:
+        *pValue = pArchive->valStripHighASCII;
+        break;
    default:
        err = kNuErrInvalidArg;
        Nu_ReportError(NU_BLOB, err, "Unknown ValueID %d requested", ident);
@@ -162,6 +165,14 @@ Nu_SetValue(NuArchive* pArchive, NuValueID ident, NuValue value)
        }
        pArchive->valOnlyUpdateOlder = value;
        break;
+    case kNuValueStripHighASCII:
+        if (value != true && value != false) {
+            Nu_ReportError(NU_BLOB, err,
+                "Invalid kNuStripHighASCII value %ld", value);
+            goto bail;
+        }
+        pArchive->valStripHighASCII = value;
+        break;
    default:
        Nu_ReportError(NU_BLOB, err, "Unknown ValueID %d requested", ident);
        goto bail;