ciderpress/reformat/Text8.cpp

/*
 * CiderPress
 * Copyright (C) 2009 by CiderPress authors.  All Rights Reserved.
 * Copyright (C) 2007 by faddenSoft, LLC.  All Rights Reserved.
 * See the file LICENSE for distribution terms.
 */
/*
 * Convert 8-bit word processor files.
 *
 * Most formats convert reasonably well with "Converted Text", but this
 * allows the files to be handled more transparently (e.g. Magic Window
 * "formatted files", which can be mistaken for code).
 */
#include "StdAfx.h"
#include "Text8.h"


/*
 * ===========================================================================
 *      Magic Window / Magic Window II
 * ===========================================================================
 */

/*
 * Magic Window and Magic Window II appear to use the same format for their
 * "formatted files".  The files are of type 'B', with a valid address field,
 * and what looks like junk in the length field.  The files have a 256-byte
 * header that seems to hold some sort of title string as well as some
 * binary data whose purpose I haven't determined.
 *
 * The data from offset 256 on is entirely mixed-case high-ASCII text.  It
 * may contain printer-specific escape codes for bold, italic, etc.
 *
 * A ".MW" filename suffix is enforced by the program.
 */

/*
 * Decide whether or not we want to handle this file.
 *
 * Only BIN files are considered.  The data-fork verdict combines the
 * content test (IsFormatted) with a case-insensitive check for a ".MW"
 * filename extension; the remaining two applicability values passed to
 * SetApplic are always "not".
 */
void
ReformatMagicWindow::Examine(ReformatHolder* pHolder)
{
    if (pHolder->GetFileType() != kTypeBIN) {
        /* "unformatted" text even if ".MW"; nothing special required */
        pHolder->SetApplic(ReformatHolder::kReformatMagicWindow,
            ReformatHolder::kApplicNot,
            ReformatHolder::kApplicNot, ReformatHolder::kApplicNot);
        return;
    }

    const bool looksFormatted = ReformatMagicWindow::IsFormatted(pHolder);
    const bool hasMWSuffix = (stricmp(pHolder->GetNameExt(), ".MW") == 0);

    if (hasMWSuffix) {
        /*
         * Right type and name.  With matching contents it's gotta be ours;
         * without, maybe our content test is broken, so still favor it.
         */
        pHolder->SetApplic(ReformatHolder::kReformatMagicWindow,
            looksFormatted ? ReformatHolder::kApplicYes
                           : ReformatHolder::kApplicProbably,
            ReformatHolder::kApplicNot, ReformatHolder::kApplicNot);
    } else {
        /*
         * Wrong name.  A content match alone is not likely to be real, but
         * offer it as a non-default option; otherwise it's not one of ours.
         */
        pHolder->SetApplic(ReformatHolder::kReformatMagicWindow,
            looksFormatted ? ReformatHolder::kApplicProbablyNot
                           : ReformatHolder::kApplicNot,
            ReformatHolder::kApplicNot, ReformatHolder::kApplicNot);
    }
}

/*
 * Figure out if this is a Magic Window "formatted" file.
 *
 * I don't know much about the format, so this is based on the similarities
 * observed between half a dozen documents from different sources.
 *
 * Examines the data part of "pHolder".  Returns true only if every
 * heuristic below passes: minimum length, first byte, header fill pattern,
 * the bytes at 0xa2-0xa4, and an all-high-ASCII body after the header.
 */
/*static*/ bool
ReformatMagicWindow::IsFormatted(const ReformatHolder* pHolder)
{
    const uint8_t* ptr = pHolder->GetSourceBuf(ReformatHolder::kPartData);
    long srcLen = pHolder->GetSourceLen(ReformatHolder::kPartData);
    int i, count00, count20;


    /* want 256-byte header, plus a few bytes to check text */
    if (srcLen < kHeaderLen+8)
        return false;

    /*
     * First byte always seems to be 0x8d.
     */
    if (ptr[0x00] != 0x8d)
        return false;

    /*
     * 0x58 - 0xa0 is mostly filled with 0x00 (for Magic Window) or 0x20
     * (for Magic Window II).  Both seem to have space for the title in the
     * preceding part, but it's high-ASCII for MW and low-ASCII for MW2.
     *
     * Expect 50 out of 72 to match.  If this is actually just uninitialized
     * data then this test will be bogus.
     */
    count00 = count20 = 0;
    for (i = 0x58; i < 0xa0; i++) {
        if (ptr[i] == 0x00)
            count00++;
        if (ptr[i] == 0x20)
            count20++;
    }
    if (count00 < 50 && count20 < 50)
        return false;

    /*
     * 0xa2 has some recognizable bytes; sample values:
     *  MW  42 06 36 50 08 40
     *  MW2 42 06 36 55 08 40
     *  MW2 42 04 3a 50 00 50
     * Not really sure what to make of these.  If we can bracket these
     * values we might have something.
     *
     * Reject when 0xa3 falls outside [2,10] or 0xa4 falls outside
     * [0x30,0x40].  The out-of-range tests must be joined with "||"; a
     * value can never be below the low bound and above the high bound at
     * the same time, so "&&" would make the check a no-op.
     */
    if (ptr[0xa2] != 0x42 ||
        (ptr[0xa3] < 2 || ptr[0xa3] > 10) ||
        (ptr[0xa4] < 0x30 || ptr[0xa4] > 0x40))
        return false;

    /*
     * Make sure the rest of the file is 100% high ASCII.  The pointer has
     * to advance on every iteration so that each byte after the header is
     * examined, not just the first one.
     */
    ptr += kHeaderLen;
    srcLen -= kHeaderLen;
    while (srcLen--) {
        if ((*ptr++ & 0x80) == 0)
            return false;
    }

    return true;
}


/*
 * Skip the header and text-convert the rest.
 *
 * Reads the "part" fork of "pHolder", passes everything past the first
 * kHeaderLen bytes to ConvertEOL, and hands the result to "pOutput" via
 * SetResultBuffer.  The "id" argument is not used.
 *
 * Returns 0 on success, or -1 if the source has no data beyond the header.
 */
int
ReformatMagicWindow::Process(const ReformatHolder* pHolder,
    ReformatHolder::ReformatID id, ReformatHolder::ReformatPart part,
    ReformatOutput* pOutput)
{
    const uint8_t* srcPtr = pHolder->GetSourceBuf(part);
    long srcLen = pHolder->GetSourceLen(part);

    fUseRTF = false;

    RTFBegin();

    /* nothing to convert unless there is data past the header */
    if (srcLen <= kHeaderLen)
        return -1;

    /* NOTE(review): third argument presumably requests high-bit stripping; confirm against ConvertEOL */
    ConvertEOL(srcPtr + kHeaderLen, srcLen - kHeaderLen, true);

    RTFEnd();

    SetResultBuffer(pOutput);
    return 0;
}


/*
 * ===========================================================================
 *      Gutenberg Word Processor
 * ===========================================================================
 */

/*
 * Decide whether or not we want to handle this file.
 *
 * The file is ours only when it is of type TXT and the holder reports the
 * Gutenberg source format.  Any other file is explicitly marked as not
 * applicable, the same way ReformatMagicWindow::Examine reports its
 * non-matching cases, rather than leaving the entry untouched.
 */
void
ReformatGutenberg::Examine(ReformatHolder* pHolder)
{
    if ((pHolder->GetFileType() == kTypeTXT) &&
        (pHolder->GetSourceFormat() == ReformatHolder::kSourceFormatGutenberg)) {

        pHolder->SetApplic(ReformatHolder::kReformatGutenberg,
            ReformatHolder::kApplicYes,
            ReformatHolder::kApplicNot, ReformatHolder::kApplicNot);
    } else {
        /* not one of ours */
        pHolder->SetApplic(ReformatHolder::kReformatGutenberg,
            ReformatHolder::kApplicNot,
            ReformatHolder::kApplicNot, ReformatHolder::kApplicNot);
    }
}


/*
 * Convert the text.
 *
 * Passes the entire "part" fork of "pHolder" to ConvertEOL and hands the
 * result to "pOutput" via SetResultBuffer.  The "id" argument is not used.
 *
 * Always returns 0.
 */
int
ReformatGutenberg::Process(const ReformatHolder* pHolder,
    ReformatHolder::ReformatID id, ReformatHolder::ReformatPart part,
    ReformatOutput* pOutput)
{
    const uint8_t* srcPtr = pHolder->GetSourceBuf(part);
    long srcLen = pHolder->GetSourceLen(part);

    fUseRTF = false;

    RTFBegin();

    /* NOTE(review): meaning of the two boolean flags is not visible here; confirm against ConvertEOL */
    ConvertEOL(srcPtr, srcLen, true, true);

    RTFEnd();

    SetResultBuffer(pOutput);
    return 0;
}
Normalize indentation and EOL This updates all source files to use spaces instead of tabs for indentation. It also normalizes the end-of-line markers to be Windows-style CRLF, and ensures that all files end with EOL. No substantive changes were made; "diff -w" is empty. 2014-11-04 00:26:53 +00:00			`/*`
			`* CiderPress`
			`* Copyright (C) 2009 by CiderPress authors. All Rights Reserved.`
			`* Copyright (C) 2007 by faddenSoft, LLC. All Rights Reserved.`
			`* See the file LICENSE for distribution terms.`
			`*/`
			`/*`
			`* Convert 8-bit word processor files.`
			`*`
			`* Most formats convert reasonably well with "Converted Text", but this`
			`* allows the files to be handled more transparently (e.g. Magic Window`
			`* "formatted files", which can be mistaken for code.`
			`*/`
			`#include "StdAfx.h"`
			`#include "Text8.h"`


			`/*`
			`* ===========================================================================`
			`* Magic Window / Magic Window II`
			`* ===========================================================================`
			`*/`

			`/*`
			`* Magic Window and Magic Window II appear to use the same format for their`
			`* "formatted files". The files are of type 'B', with a valid address field,`
			`* and what looks like junk in the length field. The files have a 256-byte`
			`* header that seems to hold some sort of title string as well as some`
			`* binary goodies that I'm not sure what they are.`
			`*`
			`* The data from offset 256 on is entirely mixed-case high-ASCII text. It`
			`* may contain printer-specific escape codes for bold, italic, etc.`
			`*`
			`* A ".MW" filename suffix is enforced by the program.`
			`*/`

			`/*`
			`* Decide whether or not we want to handle this file.`
			`*/`
			`void`
			`ReformatMagicWindow::Examine(ReformatHolder* pHolder)`
			`{`
			`if (pHolder->GetFileType() == kTypeBIN) {`
			`bool isMW = ReformatMagicWindow::IsFormatted(pHolder);`
Minor fixes - Drop strcasecmp / strncasecmp defines; just use the VC++ version. - Fix LOG_WARNING vs. LOG_WARN. - Fix crash when NiftyList data file not available. 2014-11-19 01:10:23 +00:00			`bool isDotMW = stricmp(pHolder->GetNameExt(), ".MW") == 0;`
Normalize indentation and EOL This updates all source files to use spaces instead of tabs for indentation. It also normalizes the end-of-line markers to be Windows-style CRLF, and ensures that all files end with EOL. No substantive changes were made; "diff -w" is empty. 2014-11-04 00:26:53 +00:00
			`if (isMW && isDotMW) {`
			`/* gotta be */`
			`pHolder->SetApplic(ReformatHolder::kReformatMagicWindow,`
			`ReformatHolder::kApplicYes,`
			`ReformatHolder::kApplicNot, ReformatHolder::kApplicNot);`
			`} else if (isDotMW) {`
			`/* right type and name; maybe our test is broken? */`
			`pHolder->SetApplic(ReformatHolder::kReformatMagicWindow,`
			`ReformatHolder::kApplicProbably,`
			`ReformatHolder::kApplicNot, ReformatHolder::kApplicNot);`
			`} else if (isMW) {`
			`/* not likely, but offer it as non-default option */`
			`pHolder->SetApplic(ReformatHolder::kReformatMagicWindow,`
			`ReformatHolder::kApplicProbablyNot,`
			`ReformatHolder::kApplicNot, ReformatHolder::kApplicNot);`
			`} else {`
			`/* not one of ours */`
			`pHolder->SetApplic(ReformatHolder::kReformatMagicWindow,`
			`ReformatHolder::kApplicNot,`
			`ReformatHolder::kApplicNot, ReformatHolder::kApplicNot);`
			`}`
			`} else {`
			`/* "unformatted" text even if ".MW"; nothing special required */`
			`pHolder->SetApplic(ReformatHolder::kReformatMagicWindow,`
			`ReformatHolder::kApplicNot,`
			`ReformatHolder::kApplicNot, ReformatHolder::kApplicNot);`
			`}`
			`}`

			`/*`
			`* Figure out if this is a Magic Window "formatted" file.`
			`*`
			`* I don't know much about the format, so this is based on the similarities`
			`* observed between half a dozen documents from different sources.`
			`*/`
			`/static/ bool`
			`ReformatMagicWindow::IsFormatted(const ReformatHolder* pHolder)`
			`{`
Use types with explicit sizes Much of what the "reformat" code does involves processing data that is 8, 16, or 32 bits. We want to use size-specific types from stdint.h (e.g. uint16_t) rather than "unsigned short". This was a quick pass to replace the various "unsigned" declarations. More can be done here and elsewhere. 2014-11-21 02:10:18 +00:00			`const uint8_t* ptr = pHolder->GetSourceBuf(ReformatHolder::kPartData);`
Normalize indentation and EOL This updates all source files to use spaces instead of tabs for indentation. It also normalizes the end-of-line markers to be Windows-style CRLF, and ensures that all files end with EOL. No substantive changes were made; "diff -w" is empty. 2014-11-04 00:26:53 +00:00			`long srcLen = pHolder->GetSourceLen(ReformatHolder::kPartData);`
			`int i, count00, count20;`


			`/* want 256-byte header, plus a few bytes to check text */`
			`if (srcLen < kHeaderLen+8)`
			`return false;`

			`/*`
			`* First byte always seems to be 0x8d.`
			`*/`
			`if (ptr[0x00] != 0x8d)`
			`return false;`

			`/*`
			`* 0x58 - 0xa0 is mostly filled with 0x00 (for Magic Window) or 0x20`
			`* (for Magic Window II). Both seem to have space for the title in the`
			`* preceeding part, but it's high-ASCII for MW and low-ASCII for MW2.`
			`*`
			`* Expect 50 out of 72 to match. If this is actually just uninitialized`
			`* data then this test will be bogus.`
			`*/`
			`count00 = count20 = 0;`
			`for (i = 0x58; i < 0xa0; i++) {`
			`if (ptr[i] == 0x00)`
			`count00++;`
			`if (ptr[i] == 0x20)`
			`count20++;`
			`}`
			`if (count00 < 50 && count20 < 50)`
			`return false;`

			`/*`
			`* 0xa2 has some recognizeable bytes; sample values:`
			`* MW 42 06 36 50 08 40`
			`* MW2 42 06 36 55 08 40`
			`* MW2 42 04 3a 50 00 50`
			`* Not really sure what to make of these. If we can bracket these`
			`* values we might have something.`
			`*/`
			`if (ptr[0xa2] != 0x42 \|\|`
			`(ptr[0xa3] < 2 && ptr[0xa3] > 10) \|\|`
			`(ptr[0xa4] < 0x30 && ptr[0xa4] > 0x40))`
			`return false;`

			`/*`
			`* Make sure the rest of the file is 100% high ASCII.`
			`*/`
			`ptr += kHeaderLen;`
			`srcLen -= kHeaderLen;`
			`while (srcLen--) {`
			`if ((*ptr & 0x80) == 0)`
			`return false;`
			`}`

			`return true;`
			`}`


			`/*`
			`* Skip the header and text-convert the rest.`
			`*/`
			`int`
			`ReformatMagicWindow::Process(const ReformatHolder* pHolder,`
			`ReformatHolder::ReformatID id, ReformatHolder::ReformatPart part,`
			`ReformatOutput* pOutput)`
			`{`
Use types with explicit sizes Much of what the "reformat" code does involves processing data that is 8, 16, or 32 bits. We want to use size-specific types from stdint.h (e.g. uint16_t) rather than "unsigned short". This was a quick pass to replace the various "unsigned" declarations. More can be done here and elsewhere. 2014-11-21 02:10:18 +00:00			`const uint8_t* srcPtr = pHolder->GetSourceBuf(part);`
Normalize indentation and EOL This updates all source files to use spaces instead of tabs for indentation. It also normalizes the end-of-line markers to be Windows-style CRLF, and ensures that all files end with EOL. No substantive changes were made; "diff -w" is empty. 2014-11-04 00:26:53 +00:00			`long srcLen = pHolder->GetSourceLen(part);`
			`long length = srcLen;`
			`int retval = -1;`

			`fUseRTF = false;`

			`RTFBegin();`

			`if (srcLen <= kHeaderLen)`
			`goto bail;`

			`ConvertEOL(srcPtr + kHeaderLen, srcLen - kHeaderLen, true);`

			`//done:`
			`RTFEnd();`

			`SetResultBuffer(pOutput);`
			`retval = 0;`

			`bail:`
			`return retval;`
			`}`


			`/*`
			`* ===========================================================================`
			`* Gutenberg Word Processor`
			`* ===========================================================================`
			`*/`

			`/*`
			`* Decide whether or not we want to handle this file.`
			`*/`
			`void`
			`ReformatGutenberg::Examine(ReformatHolder* pHolder)`
			`{`
			`if ((pHolder->GetFileType() == kTypeTXT) &&`
			`(pHolder->GetSourceFormat() == ReformatHolder::kSourceFormatGutenberg)) {`

			`pHolder->SetApplic(ReformatHolder::kReformatGutenberg,`
			`ReformatHolder::kApplicYes,`
			`ReformatHolder::kApplicNot, ReformatHolder::kApplicNot);`
			`}`
			`}`


			`/*`
			`* Convert the text.`
			`*/`
			`int`
			`ReformatGutenberg::Process(const ReformatHolder* pHolder,`
			`ReformatHolder::ReformatID id, ReformatHolder::ReformatPart part,`
			`ReformatOutput* pOutput)`
			`{`
Use types with explicit sizes Much of what the "reformat" code does involves processing data that is 8, 16, or 32 bits. We want to use size-specific types from stdint.h (e.g. uint16_t) rather than "unsigned short". This was a quick pass to replace the various "unsigned" declarations. More can be done here and elsewhere. 2014-11-21 02:10:18 +00:00			`const uint8_t* srcPtr = pHolder->GetSourceBuf(part);`
Normalize indentation and EOL This updates all source files to use spaces instead of tabs for indentation. It also normalizes the end-of-line markers to be Windows-style CRLF, and ensures that all files end with EOL. No substantive changes were made; "diff -w" is empty. 2014-11-04 00:26:53 +00:00			`long srcLen = pHolder->GetSourceLen(part);`
			`long length = srcLen;`
			`int retval = -1;`

			`fUseRTF = false;`

			`RTFBegin();`

			`ConvertEOL(srcPtr, srcLen, true, true);`

			`RTFEnd();`

			`SetResultBuffer(pOutput);`
			`retval = 0;`

			`return retval;`
			`}`