diff --git a/app/DiskArchive.cpp b/app/DiskArchive.cpp index 52d4f81..2ebeb7d 100644 --- a/app/DiskArchive.cpp +++ b/app/DiskArchive.cpp @@ -16,6 +16,7 @@ #include "RenameEntryDialog.h" #include "ConfirmOverwriteDialog.h" #include "../diskimg/DiskImgDetail.h" +#include "../reformat/Charset.h" static const char kEmptyFolderMarker[] = ".$$EmptyFolder"; @@ -748,7 +749,7 @@ CString DiskArchive::New(const WCHAR* fileName, const void* vOptions) * want to do it under Win2K/XP because it can be slow for larger * volumes. */ - fileNameA = fileName; + fileNameA = fileName; // TODO(Unicode) if (numBlocks > 0) { dierr = fDiskImg.CreateImage(fileNameA, NULL, DiskImg::kOuterFormatNone, @@ -797,6 +798,7 @@ CString DiskArchive::New(const WCHAR* fileName, const void* vOptions) volName.MakeUpper(); /* format it */ + // TODO(Unicode): for HFS, we need to convert Unicode to MOR volNameA = volName; dierr = fDiskImg.FormatImage(pOptions->base.format, volNameA); if (dierr != kDIErrNone) { @@ -1086,18 +1088,21 @@ int DiskArchive::LoadDiskFSContents(DiskFS* pDiskFS, const WCHAR* volName) */ pSubVol = pDiskFS->GetNextSubVolume(NULL); while (pSubVol != NULL) { + CStringA subVolNameMOR; CString concatSubVolName; - const char* subVolName; int ret; - subVolName = pSubVol->GetDiskFS()->GetVolumeName(); - if (subVolName == NULL) - subVolName = "+++"; // call it *something* + subVolNameMOR = pSubVol->GetDiskFS()->GetVolumeName(); + if (subVolNameMOR.IsEmpty()) { + subVolNameMOR = "+++"; // call it *something* + } + CString subVolName(Charset::ConvertMORToUNI(subVolNameMOR)); - if (volName[0] == '\0') - concatSubVolName.Format(L"_%hs", subVolName); - else - concatSubVolName.Format(L"%ls_%hs", volName, subVolName); + if (volName[0] == '\0') { + concatSubVolName.Format(L"_%ls", (LPCWSTR) subVolName); + } else { + concatSubVolName.Format(L"%ls_%ls", volName, (LPCWSTR) subVolName); + } ret = LoadDiskFSContents(pSubVol->GetDiskFS(), concatSubVolName); if (ret != 0) return ret; diff --git 
a/app/GenericArchive.cpp b/app/GenericArchive.cpp index a4f8cf8..96be02f 100644 --- a/app/GenericArchive.cpp +++ b/app/GenericArchive.cpp @@ -139,21 +139,6 @@ void GenericEntry::SetSubVolName(const WCHAR* name) fSubVolName = name; } -// Simple Mac OS Roman to Unicode conversion. -static CString ConvertMORToUNI(const CStringA& strMOR) -{ - // We know that all MOR characters are represented in Unicode with a - // single BMP code point, so we know that strlen(MOR) == wcslen(UNI). - const int len = strMOR.GetLength(); - CString strUNI; - WCHAR* uniBuf = strUNI.GetBuffer(len); - for (int i = 0; i < len; i++) { - uniBuf[i] = ReformatText::ConvertMacRomanToUTF16(strMOR[i]); - } - strUNI.ReleaseBuffer(len); - return strUNI; -} - const CString& GenericEntry::GetDisplayName(void) const { ASSERT(!fPathNameMOR.IsEmpty()); @@ -164,7 +149,7 @@ const CString& GenericEntry::GetDisplayName(void) const if (!fSubVolName.IsEmpty()) { fDisplayName = fSubVolName + (WCHAR) DiskFS::kDIFssep; } - fDisplayName += ConvertMORToUNI(fPathNameMOR); + fDisplayName += Charset::ConvertMORToUNI(fPathNameMOR); return fDisplayName; } @@ -1024,8 +1009,8 @@ void GenericArchive::LocalFileDetails::GenerateStoragePathName() // TODO(Unicode): generate MOR name from Unicode, instead of just // doing a generic CP-1252 conversion. We need to do this on both // sides though, so until we can extract MOR->Unicode we don't - // want to add Unicode->MOR. And it all depends on NufxLib and - // DiskImg being able to handle UTF-16 filenames. + // want to add Unicode->MOR. For this all to work well we need NufxLib + // and DiskImgLib to be able to handle UTF-16 filenames. 
fStoragePathNameMOR = fStrippedLocalPathName; } diff --git a/reformat/AWGS.cpp b/reformat/AWGS.cpp index 8556fa5..4b9efab 100644 --- a/reformat/AWGS.cpp +++ b/reformat/AWGS.cpp @@ -36,7 +36,7 @@ int ReformatAWGS_WP::Process(const ReformatHolder* pHolder, Chunk doc, header, footer; uint16_t val; - CheckGSCharConv(); + Charset::CheckGSCharConv(); /* must at least have the doc header and globals */ if (srcLen < kMinExpectedLen) { @@ -388,7 +388,7 @@ int ReformatAWGS_WP::PrintParagraph(const uint8_t* ptr, long maxLen) RTFTab(); break; default: - RTFPrintUTF16Char(ConvertMacRomanToUTF16(uch)); + RTFPrintUTF16Char(Charset::ConvertMacRomanToUTF16(uch)); break; } } diff --git a/reformat/Charset.cpp b/reformat/Charset.cpp new file mode 100644 index 0000000..3dd75d9 --- /dev/null +++ b/reformat/Charset.cpp @@ -0,0 +1,451 @@ +/* + * CiderPress + * Copyright (C) 2015 by faddenSoft. All Rights Reserved. + * See the file LICENSE for distribution terms. + */ +/* + * Character set conversion tables: Mac OS Roman to Windows CP1252 and UTF-16. + */ +#include "StdAfx.h" +#include "Charset.h" + +/* + * Convert Mac OS Roman to Windows CP1252. + */ +const int kUnk = 0x3f; // for unmappable chars, use '?' + +/*static*/ const uint8_t Charset::kCP1252Conv[128] = { + 0xc4, // 0x80 A + umlaut (diaeresis?) 
+ 0xc5, // 0x81 A + overcircle + 0xc7, // 0x82 C + cedilla + 0xc9, // 0x83 E + acute + 0xd1, // 0x84 N + tilde + 0xd6, // 0x85 O + umlaut + 0xdc, // 0x86 U + umlaut + 0xe1, // 0x87 a + acute + 0xe0, // 0x88 a + grave + 0xe2, // 0x89 a + circumflex + 0xe4, // 0x8a a + umlaut + 0xe3, // 0x8b a + tilde + 0xe5, // 0x8c a + overcircle + 0xe7, // 0x8d c + cedilla + 0xe9, // 0x8e e + acute + 0xe8, // 0x8f e + grave + 0xea, // 0x90 e + circumflex + 0xeb, // 0x91 e + umlaut + 0xed, // 0x92 i + acute + 0xec, // 0x93 i + grave + 0xee, // 0x94 i + circumflex + 0xef, // 0x95 i + umlaut + 0xf1, // 0x96 n + tilde + 0xf3, // 0x97 o + acute + 0xf2, // 0x98 o + grave + 0xf4, // 0x99 o + circumflex + 0xf6, // 0x9a o + umlaut + 0xf5, // 0x9b o + tilde + 0xfa, // 0x9c u + acute + 0xf9, // 0x9d u + grave + 0xfb, // 0x9e u + circumflex + 0xfc, // 0x9f u + umlaut + 0x87, // 0xa0 double cross (dagger) + 0xb0, // 0xa1 degrees + 0xa2, // 0xa2 cents + 0xa3, // 0xa3 pounds (UK$) + 0xa7, // 0xa4 section start + 0x95, // 0xa5 small square (bullet) [using fat bullet] + 0xb6, // 0xa6 paragraph (pilcrow) + 0xdf, // 0xa7 curly B (latin small letter sharp S) + 0xae, // 0xa8 raised 'R' (registered) + 0xa9, // 0xa9 raised 'C' (copyright) + 0x99, // 0xaa raised 'TM' (trademark) + 0xb4, // 0xab acute accent + 0xa8, // 0xac umlaut (diaeresis) + kUnk, // 0xad not-equal + 0xc6, // 0xae merged AE + 0xd8, // 0xaf O + slash (upper-case nil?) 
+ kUnk, // 0xb0 infinity + 0xb1, // 0xb1 +/- + kUnk, // 0xb2 <= + kUnk, // 0xb3 >= + 0xa5, // 0xb4 Yen (Japan$) + 0xb5, // 0xb5 mu (micro) + kUnk, // 0xb6 delta (partial differentiation) [could use D-bar 0xd0] + kUnk, // 0xb7 epsilon (N-ary summation) [could use C-double-bar 0x80] + kUnk, // 0xb8 PI (N-ary product) + kUnk, // 0xb9 pi + kUnk, // 0xba integral + 0xaa, // 0xbb a underbar (feminine ordinal) [using raised a] + 0xba, // 0xbc o underbar (masculine ordinal) [using raised o] + kUnk, // 0xbd omega (Ohm) + 0xe6, // 0xbe merged ae + 0xf8, // 0xbf o + slash (lower-case NULL?) + 0xbf, // 0xc0 upside-down question mark + 0xa1, // 0xc1 upside-down exclamation point + 0xac, // 0xc2 rotated L ("not" sign) + 0xb7, // 0xc3 checkmark (square root) [using small bullet] + 0x83, // 0xc4 script f + kUnk, // 0xc5 approximately equal + kUnk, // 0xc6 delta (triangle / increment) + 0xab, // 0xc7 much less than + 0xbb, // 0xc8 much greater than + 0x85, // 0xc9 ellipsis + 0xa0, // 0xca blank (sticky space) + 0xc0, // 0xcb A + grave + 0xc3, // 0xcc A + tilde + 0xd5, // 0xcd O + tilde + 0x8c, // 0xce merged OE + 0x9c, // 0xcf merged oe + 0x96, // 0xd0 short hyphen (en dash) + 0x97, // 0xd1 long hyphen (em dash) + 0x93, // 0xd2 smart double-quote start + 0x94, // 0xd3 smart double-quote end + 0x91, // 0xd4 smart single-quote start + 0x92, // 0xd5 smart single-quote end + 0xf7, // 0xd6 divide + 0xa4, // 0xd7 diamond (lozenge) [using spiky circle] + 0xff, // 0xd8 y + umlaut + // [nothing below here is part of standard Windows-ASCII?] 
+ // remaining descriptions based on hfsutils' "charset.txt" + kUnk, // 0xd9 Y + umlaut + kUnk, // 0xda fraction slash + kUnk, // 0xdb currency sign + kUnk, // 0xdc single left-pointing angle quotation mark + kUnk, // 0xdd single right-pointing angle quotation mark + kUnk, // 0xde merged fi + kUnk, // 0xdf merged FL + kUnk, // 0xe0 double dagger + kUnk, // 0xe1 middle dot + kUnk, // 0xe2 single low-9 quotation mark + kUnk, // 0xe3 double low-9 quotation mark + kUnk, // 0xe4 per mille sign + kUnk, // 0xe5 A + circumflex + kUnk, // 0xe6 E + circumflex + kUnk, // 0xe7 A + acute accent + kUnk, // 0xe8 E + diaeresis + kUnk, // 0xe9 E + grave accent + kUnk, // 0xea I + acute accent + kUnk, // 0xeb I + circumflex + kUnk, // 0xec I + diaeresis + kUnk, // 0xed I + grave accent + kUnk, // 0xee O + acute accent + kUnk, // 0xef O + circumflex + kUnk, // 0xf0 apple logo + kUnk, // 0xf1 O + grave accent + kUnk, // 0xf2 U + acute accent + kUnk, // 0xf3 U + circumflex + kUnk, // 0xf4 U + grave accent + kUnk, // 0xf5 i without dot + kUnk, // 0xf6 modifier letter circumflex accent + kUnk, // 0xf7 small tilde + kUnk, // 0xf8 macron + kUnk, // 0xf9 breve + kUnk, // 0xfa dot above + kUnk, // 0xfb ring above + kUnk, // 0xfc cedilla + kUnk, // 0xfd double acute accent + kUnk, // 0xfe ogonek + kUnk, // 0xff caron +}; + +/* + * Convert Mac OS Roman to Unicode. Mapping comes from: + * + * http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT + * + * We use the "Control Pictures" block for the control characters + * (0x00-0x1f, 0x7f). 
+ */ +/*static*/ const uint16_t Charset::kUTF16Conv[256] = { + /*0x00*/ 0x2400, // [control] NULL + /*0x01*/ 0x2401, // [control] START OF HEADING + /*0x02*/ 0x2402, // [control] START OF TEXT + /*0x03*/ 0x2403, // [control] END OF TEXT + /*0x04*/ 0x2404, // [control] END OF TRANSMISSION + /*0x05*/ 0x2405, // [control] ENQUIRY + /*0x06*/ 0x2406, // [control] ACKNOWLEDGE + /*0x07*/ 0x2407, // [control] BELL + /*0x08*/ 0x2408, // [control] BACKSPACE + /*0x09*/ 0x2409, // [control] HORIZONTAL TABULATION + /*0x0a*/ 0x240a, // [control] LINE FEED + /*0x0b*/ 0x240b, // [control] VERTICAL TABULATION + /*0x0c*/ 0x240c, // [control] FORM FEED + /*0x0d*/ 0x240d, // [control] CARRIAGE RETURN + /*0x0e*/ 0x240e, // [control] SHIFT OUT + /*0x0f*/ 0x240f, // [control] SHIFT IN + /*0x10*/ 0x2410, // [control] DATA LINK ESCAPE + /*0x11*/ 0x2411, // [control] DEVICE CONTROL ONE + /*0x12*/ 0x2412, // [control] DEVICE CONTROL TWO + /*0x13*/ 0x2413, // [control] DEVICE CONTROL THREE + /*0x14*/ 0x2414, // [control] DEVICE CONTROL FOUR + /*0x15*/ 0x2415, // [control] NEGATIVE ACKNOWLEDGE + /*0x16*/ 0x2416, // [control] SYNCHRONOUS IDLE + /*0x17*/ 0x2417, // [control] END OF TRANSMISSION BLOCK + /*0x18*/ 0x2418, // [control] CANCEL + /*0x19*/ 0x2419, // [control] END OF MEDIUM + /*0x1a*/ 0x241a, // [control] SUBSTITUTE + /*0x1b*/ 0x241b, // [control] ESCAPE + /*0x1c*/ 0x241c, // [control] FILE SEPARATOR + /*0x1d*/ 0x241d, // [control] GROUP SEPARATOR + /*0x1e*/ 0x241e, // [control] RECORD SEPARATOR + /*0x1f*/ 0x241f, // [control] UNIT SEPARATOR + /*0x20*/ 0x0020, // SPACE + /*0x21*/ 0x0021, // EXCLAMATION MARK + /*0x22*/ 0x0022, // QUOTATION MARK + /*0x23*/ 0x0023, // NUMBER SIGN + /*0x24*/ 0x0024, // DOLLAR SIGN + /*0x25*/ 0x0025, // PERCENT SIGN + /*0x26*/ 0x0026, // AMPERSAND + /*0x27*/ 0x0027, // APOSTROPHE + /*0x28*/ 0x0028, // LEFT PARENTHESIS + /*0x29*/ 0x0029, // RIGHT PARENTHESIS + /*0x2A*/ 0x002A, // ASTERISK + /*0x2B*/ 0x002B, // PLUS SIGN + /*0x2C*/ 0x002C, // COMMA + /*0x2D*/ 
0x002D, // HYPHEN-MINUS + /*0x2E*/ 0x002E, // FULL STOP + /*0x2F*/ 0x002F, // SOLIDUS + /*0x30*/ 0x0030, // DIGIT ZERO + /*0x31*/ 0x0031, // DIGIT ONE + /*0x32*/ 0x0032, // DIGIT TWO + /*0x33*/ 0x0033, // DIGIT THREE + /*0x34*/ 0x0034, // DIGIT FOUR + /*0x35*/ 0x0035, // DIGIT FIVE + /*0x36*/ 0x0036, // DIGIT SIX + /*0x37*/ 0x0037, // DIGIT SEVEN + /*0x38*/ 0x0038, // DIGIT EIGHT + /*0x39*/ 0x0039, // DIGIT NINE + /*0x3A*/ 0x003A, // COLON + /*0x3B*/ 0x003B, // SEMICOLON + /*0x3C*/ 0x003C, // LESS-THAN SIGN + /*0x3D*/ 0x003D, // EQUALS SIGN + /*0x3E*/ 0x003E, // GREATER-THAN SIGN + /*0x3F*/ 0x003F, // QUESTION MARK + /*0x40*/ 0x0040, // COMMERCIAL AT + /*0x41*/ 0x0041, // LATIN CAPITAL LETTER A + /*0x42*/ 0x0042, // LATIN CAPITAL LETTER B + /*0x43*/ 0x0043, // LATIN CAPITAL LETTER C + /*0x44*/ 0x0044, // LATIN CAPITAL LETTER D + /*0x45*/ 0x0045, // LATIN CAPITAL LETTER E + /*0x46*/ 0x0046, // LATIN CAPITAL LETTER F + /*0x47*/ 0x0047, // LATIN CAPITAL LETTER G + /*0x48*/ 0x0048, // LATIN CAPITAL LETTER H + /*0x49*/ 0x0049, // LATIN CAPITAL LETTER I + /*0x4A*/ 0x004A, // LATIN CAPITAL LETTER J + /*0x4B*/ 0x004B, // LATIN CAPITAL LETTER K + /*0x4C*/ 0x004C, // LATIN CAPITAL LETTER L + /*0x4D*/ 0x004D, // LATIN CAPITAL LETTER M + /*0x4E*/ 0x004E, // LATIN CAPITAL LETTER N + /*0x4F*/ 0x004F, // LATIN CAPITAL LETTER O + /*0x50*/ 0x0050, // LATIN CAPITAL LETTER P + /*0x51*/ 0x0051, // LATIN CAPITAL LETTER Q + /*0x52*/ 0x0052, // LATIN CAPITAL LETTER R + /*0x53*/ 0x0053, // LATIN CAPITAL LETTER S + /*0x54*/ 0x0054, // LATIN CAPITAL LETTER T + /*0x55*/ 0x0055, // LATIN CAPITAL LETTER U + /*0x56*/ 0x0056, // LATIN CAPITAL LETTER V + /*0x57*/ 0x0057, // LATIN CAPITAL LETTER W + /*0x58*/ 0x0058, // LATIN CAPITAL LETTER X + /*0x59*/ 0x0059, // LATIN CAPITAL LETTER Y + /*0x5A*/ 0x005A, // LATIN CAPITAL LETTER Z + /*0x5B*/ 0x005B, // LEFT SQUARE BRACKET + /*0x5C*/ 0x005C, // REVERSE SOLIDUS + /*0x5D*/ 0x005D, // RIGHT SQUARE BRACKET + /*0x5E*/ 0x005E, // CIRCUMFLEX ACCENT + 
/*0x5F*/ 0x005F, // LOW LINE + /*0x60*/ 0x0060, // GRAVE ACCENT + /*0x61*/ 0x0061, // LATIN SMALL LETTER A + /*0x62*/ 0x0062, // LATIN SMALL LETTER B + /*0x63*/ 0x0063, // LATIN SMALL LETTER C + /*0x64*/ 0x0064, // LATIN SMALL LETTER D + /*0x65*/ 0x0065, // LATIN SMALL LETTER E + /*0x66*/ 0x0066, // LATIN SMALL LETTER F + /*0x67*/ 0x0067, // LATIN SMALL LETTER G + /*0x68*/ 0x0068, // LATIN SMALL LETTER H + /*0x69*/ 0x0069, // LATIN SMALL LETTER I + /*0x6A*/ 0x006A, // LATIN SMALL LETTER J + /*0x6B*/ 0x006B, // LATIN SMALL LETTER K + /*0x6C*/ 0x006C, // LATIN SMALL LETTER L + /*0x6D*/ 0x006D, // LATIN SMALL LETTER M + /*0x6E*/ 0x006E, // LATIN SMALL LETTER N + /*0x6F*/ 0x006F, // LATIN SMALL LETTER O + /*0x70*/ 0x0070, // LATIN SMALL LETTER P + /*0x71*/ 0x0071, // LATIN SMALL LETTER Q + /*0x72*/ 0x0072, // LATIN SMALL LETTER R + /*0x73*/ 0x0073, // LATIN SMALL LETTER S + /*0x74*/ 0x0074, // LATIN SMALL LETTER T + /*0x75*/ 0x0075, // LATIN SMALL LETTER U + /*0x76*/ 0x0076, // LATIN SMALL LETTER V + /*0x77*/ 0x0077, // LATIN SMALL LETTER W + /*0x78*/ 0x0078, // LATIN SMALL LETTER X + /*0x79*/ 0x0079, // LATIN SMALL LETTER Y + /*0x7A*/ 0x007A, // LATIN SMALL LETTER Z + /*0x7B*/ 0x007B, // LEFT CURLY BRACKET + /*0x7C*/ 0x007C, // VERTICAL LINE + /*0x7D*/ 0x007D, // RIGHT CURLY BRACKET + /*0x7E*/ 0x007E, // TILDE + /*0x7f*/ 0x2421, // [control] DELETE + /*0x80*/ 0x00C4, // LATIN CAPITAL LETTER A WITH DIAERESIS + /*0x81*/ 0x00C5, // LATIN CAPITAL LETTER A WITH RING ABOVE + /*0x82*/ 0x00C7, // LATIN CAPITAL LETTER C WITH CEDILLA + /*0x83*/ 0x00C9, // LATIN CAPITAL LETTER E WITH ACUTE + /*0x84*/ 0x00D1, // LATIN CAPITAL LETTER N WITH TILDE + /*0x85*/ 0x00D6, // LATIN CAPITAL LETTER O WITH DIAERESIS + /*0x86*/ 0x00DC, // LATIN CAPITAL LETTER U WITH DIAERESIS + /*0x87*/ 0x00E1, // LATIN SMALL LETTER A WITH ACUTE + /*0x88*/ 0x00E0, // LATIN SMALL LETTER A WITH GRAVE + /*0x89*/ 0x00E2, // LATIN SMALL LETTER A WITH CIRCUMFLEX + /*0x8A*/ 0x00E4, // LATIN SMALL LETTER A WITH 
DIAERESIS + /*0x8B*/ 0x00E3, // LATIN SMALL LETTER A WITH TILDE + /*0x8C*/ 0x00E5, // LATIN SMALL LETTER A WITH RING ABOVE + /*0x8D*/ 0x00E7, // LATIN SMALL LETTER C WITH CEDILLA + /*0x8E*/ 0x00E9, // LATIN SMALL LETTER E WITH ACUTE + /*0x8F*/ 0x00E8, // LATIN SMALL LETTER E WITH GRAVE + /*0x90*/ 0x00EA, // LATIN SMALL LETTER E WITH CIRCUMFLEX + /*0x91*/ 0x00EB, // LATIN SMALL LETTER E WITH DIAERESIS + /*0x92*/ 0x00ED, // LATIN SMALL LETTER I WITH ACUTE + /*0x93*/ 0x00EC, // LATIN SMALL LETTER I WITH GRAVE + /*0x94*/ 0x00EE, // LATIN SMALL LETTER I WITH CIRCUMFLEX + /*0x95*/ 0x00EF, // LATIN SMALL LETTER I WITH DIAERESIS + /*0x96*/ 0x00F1, // LATIN SMALL LETTER N WITH TILDE + /*0x97*/ 0x00F3, // LATIN SMALL LETTER O WITH ACUTE + /*0x98*/ 0x00F2, // LATIN SMALL LETTER O WITH GRAVE + /*0x99*/ 0x00F4, // LATIN SMALL LETTER O WITH CIRCUMFLEX + /*0x9A*/ 0x00F6, // LATIN SMALL LETTER O WITH DIAERESIS + /*0x9B*/ 0x00F5, // LATIN SMALL LETTER O WITH TILDE + /*0x9C*/ 0x00FA, // LATIN SMALL LETTER U WITH ACUTE + /*0x9D*/ 0x00F9, // LATIN SMALL LETTER U WITH GRAVE + /*0x9E*/ 0x00FB, // LATIN SMALL LETTER U WITH CIRCUMFLEX + /*0x9F*/ 0x00FC, // LATIN SMALL LETTER U WITH DIAERESIS + /*0xA0*/ 0x2020, // DAGGER + /*0xA1*/ 0x00B0, // DEGREE SIGN + /*0xA2*/ 0x00A2, // CENT SIGN + /*0xA3*/ 0x00A3, // POUND SIGN + /*0xA4*/ 0x00A7, // SECTION SIGN + /*0xA5*/ 0x2022, // BULLET + /*0xA6*/ 0x00B6, // PILCROW SIGN + /*0xA7*/ 0x00DF, // LATIN SMALL LETTER SHARP S + /*0xA8*/ 0x00AE, // REGISTERED SIGN + /*0xA9*/ 0x00A9, // COPYRIGHT SIGN + /*0xAA*/ 0x2122, // TRADE MARK SIGN + /*0xAB*/ 0x00B4, // ACUTE ACCENT + /*0xAC*/ 0x00A8, // DIAERESIS + /*0xAD*/ 0x2260, // NOT EQUAL TO + /*0xAE*/ 0x00C6, // LATIN CAPITAL LETTER AE + /*0xAF*/ 0x00D8, // LATIN CAPITAL LETTER O WITH STROKE + /*0xB0*/ 0x221E, // INFINITY + /*0xB1*/ 0x00B1, // PLUS-MINUS SIGN + /*0xB2*/ 0x2264, // LESS-THAN OR EQUAL TO + /*0xB3*/ 0x2265, // GREATER-THAN OR EQUAL TO + /*0xB4*/ 0x00A5, // YEN SIGN + /*0xB5*/ 0x00B5, // MICRO 
SIGN + /*0xB6*/ 0x2202, // PARTIAL DIFFERENTIAL + /*0xB7*/ 0x2211, // N-ARY SUMMATION + /*0xB8*/ 0x220F, // N-ARY PRODUCT + /*0xB9*/ 0x03C0, // GREEK SMALL LETTER PI + /*0xBA*/ 0x222B, // INTEGRAL + /*0xBB*/ 0x00AA, // FEMININE ORDINAL INDICATOR + /*0xBC*/ 0x00BA, // MASCULINE ORDINAL INDICATOR + /*0xBD*/ 0x03A9, // GREEK CAPITAL LETTER OMEGA + /*0xBE*/ 0x00E6, // LATIN SMALL LETTER AE + /*0xBF*/ 0x00F8, // LATIN SMALL LETTER O WITH STROKE + /*0xC0*/ 0x00BF, // INVERTED QUESTION MARK + /*0xC1*/ 0x00A1, // INVERTED EXCLAMATION MARK + /*0xC2*/ 0x00AC, // NOT SIGN + /*0xC3*/ 0x221A, // SQUARE ROOT + /*0xC4*/ 0x0192, // LATIN SMALL LETTER F WITH HOOK + /*0xC5*/ 0x2248, // ALMOST EQUAL TO + /*0xC6*/ 0x2206, // INCREMENT + /*0xC7*/ 0x00AB, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + /*0xC8*/ 0x00BB, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + /*0xC9*/ 0x2026, // HORIZONTAL ELLIPSIS + /*0xCA*/ 0x00A0, // NO-BREAK SPACE + /*0xCB*/ 0x00C0, // LATIN CAPITAL LETTER A WITH GRAVE + /*0xCC*/ 0x00C3, // LATIN CAPITAL LETTER A WITH TILDE + /*0xCD*/ 0x00D5, // LATIN CAPITAL LETTER O WITH TILDE + /*0xCE*/ 0x0152, // LATIN CAPITAL LIGATURE OE + /*0xCF*/ 0x0153, // LATIN SMALL LIGATURE OE + /*0xD0*/ 0x2013, // EN DASH + /*0xD1*/ 0x2014, // EM DASH + /*0xD2*/ 0x201C, // LEFT DOUBLE QUOTATION MARK + /*0xD3*/ 0x201D, // RIGHT DOUBLE QUOTATION MARK + /*0xD4*/ 0x2018, // LEFT SINGLE QUOTATION MARK + /*0xD5*/ 0x2019, // RIGHT SINGLE QUOTATION MARK + /*0xD6*/ 0x00F7, // DIVISION SIGN + /*0xD7*/ 0x25CA, // LOZENGE + /*0xD8*/ 0x00FF, // LATIN SMALL LETTER Y WITH DIAERESIS + /*0xD9*/ 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS + /*0xDA*/ 0x2044, // FRACTION SLASH + /*0xDB*/ 0x00A4, // CURRENCY SIGN (was EURO SIGN) + /*0xDC*/ 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK + /*0xDD*/ 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + /*0xDE*/ 0xFB01, // LATIN SMALL LIGATURE FI + /*0xDF*/ 0xFB02, // LATIN SMALL LIGATURE FL + /*0xE0*/ 0x2021, // DOUBLE DAGGER + /*0xE1*/ 
0x00B7, // MIDDLE DOT + /*0xE2*/ 0x201A, // SINGLE LOW-9 QUOTATION MARK + /*0xE3*/ 0x201E, // DOUBLE LOW-9 QUOTATION MARK + /*0xE4*/ 0x2030, // PER MILLE SIGN + /*0xE5*/ 0x00C2, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX + /*0xE6*/ 0x00CA, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX + /*0xE7*/ 0x00C1, // LATIN CAPITAL LETTER A WITH ACUTE + /*0xE8*/ 0x00CB, // LATIN CAPITAL LETTER E WITH DIAERESIS + /*0xE9*/ 0x00C8, // LATIN CAPITAL LETTER E WITH GRAVE + /*0xEA*/ 0x00CD, // LATIN CAPITAL LETTER I WITH ACUTE + /*0xEB*/ 0x00CE, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX + /*0xEC*/ 0x00CF, // LATIN CAPITAL LETTER I WITH DIAERESIS + /*0xED*/ 0x00CC, // LATIN CAPITAL LETTER I WITH GRAVE + /*0xEE*/ 0x00D3, // LATIN CAPITAL LETTER O WITH ACUTE + /*0xEF*/ 0x00D4, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX + /*0xF0*/ 0xF8FF, // Apple logo + /*0xF1*/ 0x00D2, // LATIN CAPITAL LETTER O WITH GRAVE + /*0xF2*/ 0x00DA, // LATIN CAPITAL LETTER U WITH ACUTE + /*0xF3*/ 0x00DB, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX + /*0xF4*/ 0x00D9, // LATIN CAPITAL LETTER U WITH GRAVE + /*0xF5*/ 0x0131, // LATIN SMALL LETTER DOTLESS I + /*0xF6*/ 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT + /*0xF7*/ 0x02DC, // SMALL TILDE + /*0xF8*/ 0x00AF, // MACRON + /*0xF9*/ 0x02D8, // BREVE + /*0xFA*/ 0x02D9, // DOT ABOVE + /*0xFB*/ 0x02DA, // RING ABOVE + /*0xFC*/ 0x00B8, // CEDILLA + /*0xFD*/ 0x02DD, // DOUBLE ACUTE ACCENT + /*0xFE*/ 0x02DB, // OGONEK + /*0xFF*/ 0x02C7, // CARON +}; + +/* + * Quick sanity check on contents of array. + * + * No two characters should map to the same thing. This isn't vital, but + * if we want to have a reversible transformation someday, it'll make our + * lives easier then. 
+ */ +/*static*/ void Charset::CheckGSCharConv(void) +{ +#ifdef _DEBUG + bool* test = (bool*) malloc(65536 * sizeof(bool)); + + memset(test, 0, 65536 * sizeof(bool)); + for (int i = 0; i < NELEM(kCP1252Conv); i++) { + if (test[kCP1252Conv[i]] && kCP1252Conv[i] != kUnk) { + LOGW("Character used twice: 0x%02x at %d (0x%02x)", + kCP1252Conv[i], i, i+128); + assert(false); + } + test[kCP1252Conv[i]] = true; + } + + memset(test, 0, 65536 * sizeof(bool)); + for (int i = 0; i < NELEM(kUTF16Conv); i++) { + if (test[kUTF16Conv[i]]) { + LOGW("Character used twice: 0x%02x at %d (0x%02x)", + kUTF16Conv[i], i, i+128); + assert(false); + } + test[kUTF16Conv[i]] = true; + } + + free(test); +#endif +} diff --git a/reformat/Charset.h b/reformat/Charset.h new file mode 100644 index 0000000..94808c3 --- /dev/null +++ b/reformat/Charset.h @@ -0,0 +1,52 @@ +/* + * CiderPress + * Copyright (C) 2015 by faddenSoft. All Rights Reserved. + * See the file LICENSE for distribution terms. + */ +#ifndef REFORMAT_CHARSET_H +#define REFORMAT_CHARSET_H + +/* + * Character set conversions. + */ +class Charset { +public: + // Convert a Mac OS Roman character value (from a IIgs document) to + // its UTF-16 Unicode equivalent. This also includes a conversion + // for the control characters. The transformation is reversible. + static uint16_t ConvertMacRomanToUTF16(uint8_t ch) { + return kUTF16Conv[ch]; + } + + // Convert a Mac OS Roman character value to an 8-bit Windows CP1252 + // equivalent. The transformation is NOT reversible. + static uint8_t ConvertMacRomanTo1252(uint8_t ch) { + if (ch < 128) + return ch; + else + return kCP1252Conv[ch-128]; + } + + // Simple Mac OS Roman to Unicode string conversion. + static CString ConvertMORToUNI(const CStringA& strMOR) + { + // We know that all MOR characters are represented in Unicode with a + // single BMP code point, so we know that strlen(MOR) == wcslen(UNI). 
+ const int len = strMOR.GetLength(); + CString strUNI; + WCHAR* uniBuf = strUNI.GetBuffer(len); + for (int i = 0; i < len; i++) { + uniBuf[i] = Charset::ConvertMacRomanToUTF16(strMOR[i]); + } + strUNI.ReleaseBuffer(len); + return strUNI; + } + + static void CheckGSCharConv(void); + +private: + static const uint8_t kCP1252Conv[]; + static const uint16_t kUTF16Conv[]; +}; + +#endif /*REFORMAT_CHARSET_H*/ diff --git a/reformat/ReformatBase.cpp b/reformat/ReformatBase.cpp index d399298..af7f0c3 100644 --- a/reformat/ReformatBase.cpp +++ b/reformat/ReformatBase.cpp @@ -17,447 +17,6 @@ * ========================================================================== */ -/* - * Convert Mac OS Roman to Windows CP1252. - */ -const int kUnk = 0x3f; // for unmappable chars, use '?' - -/*static*/ const uint8_t ReformatText::kCP1252Conv[128] = { - 0xc4, // 0x80 A + umlaut (diaeresis?) - 0xc5, // 0x81 A + overcircle - 0xc7, // 0x82 C + cedilla - 0xc9, // 0x83 E + acute - 0xd1, // 0x84 N + tilde - 0xd6, // 0x85 O + umlaut - 0xdc, // 0x86 U + umlaut - 0xe1, // 0x87 a + acute - 0xe0, // 0x88 a + grave - 0xe2, // 0x89 a + circumflex - 0xe4, // 0x8a a + umlaut - 0xe3, // 0x8b a + tilde - 0xe5, // 0x8c a + overcircle - 0xe7, // 0x8d c + cedilla - 0xe9, // 0x8e e + acute - 0xe8, // 0x8f e + grave - 0xea, // 0x90 e + circumflex - 0xeb, // 0x91 e + umlaut - 0xed, // 0x92 i + acute - 0xec, // 0x93 i + grave - 0xee, // 0x94 i + circumflex - 0xef, // 0x95 i + umlaut - 0xf1, // 0x96 n + tilde - 0xf3, // 0x97 o + acute - 0xf2, // 0x98 o + grave - 0xf4, // 0x99 o + circumflex - 0xf6, // 0x9a o + umlaut - 0xf5, // 0x9b o + tilde - 0xfa, // 0x9c u + acute - 0xf9, // 0x9d u + grave - 0xfb, // 0x9e u + circumflex - 0xfc, // 0x9f u + umlaut - 0x87, // 0xa0 double cross (dagger) - 0xb0, // 0xa1 degrees - 0xa2, // 0xa2 cents - 0xa3, // 0xa3 pounds (UK$) - 0xa7, // 0xa4 section start - 0x95, // 0xa5 small square (bullet) [using fat bullet] - 0xb6, // 0xa6 paragraph (pilcrow) - 0xdf, // 0xa7 curly B (latin 
small letter sharp S) - 0xae, // 0xa8 raised 'R' (registered) - 0xa9, // 0xa9 raised 'C' (copyright) - 0x99, // 0xaa raised 'TM' (trademark) - 0xb4, // 0xab acute accent - 0xa8, // 0xac umlaut (diaeresis) - kUnk, // 0xad not-equal - 0xc6, // 0xae merged AE - 0xd8, // 0xaf O + slash (upper-case nil?) - kUnk, // 0xb0 infinity - 0xb1, // 0xb1 +/- - kUnk, // 0xb2 <= - kUnk, // 0xb3 >= - 0xa5, // 0xb4 Yen (Japan$) - 0xb5, // 0xb5 mu (micro) - kUnk, // 0xb6 delta (partial differentiation) [could use D-bar 0xd0] - kUnk, // 0xb7 epsilon (N-ary summation) [could use C-double-bar 0x80] - kUnk, // 0xb8 PI (N-ary product) - kUnk, // 0xb9 pi - kUnk, // 0xba integral - 0xaa, // 0xbb a underbar (feminine ordinal) [using raised a] - 0xba, // 0xbc o underbar (masculine ordinal) [using raised o] - kUnk, // 0xbd omega (Ohm) - 0xe6, // 0xbe merged ae - 0xf8, // 0xbf o + slash (lower-case NULL?) - 0xbf, // 0xc0 upside-down question mark - 0xa1, // 0xc1 upside-down exclamation point - 0xac, // 0xc2 rotated L ("not" sign) - 0xb7, // 0xc3 checkmark (square root) [using small bullet] - 0x83, // 0xc4 script f - kUnk, // 0xc5 approximately equal - kUnk, // 0xc6 delta (triangle / increment) - 0xab, // 0xc7 much less than - 0xbb, // 0xc8 much greater than - 0x85, // 0xc9 ellipsis - 0xa0, // 0xca blank (sticky space) - 0xc0, // 0xcb A + grave - 0xc3, // 0xcc A + tilde - 0xd5, // 0xcd O + tilde - 0x8c, // 0xce merged OE - 0x9c, // 0xcf merged oe - 0x96, // 0xd0 short hyphen (en dash) - 0x97, // 0xd1 long hyphen (em dash) - 0x93, // 0xd2 smart double-quote start - 0x94, // 0xd3 smart double-quote end - 0x91, // 0xd4 smart single-quote start - 0x92, // 0xd5 smart single-quote end - 0xf7, // 0xd6 divide - 0xa4, // 0xd7 diamond (lozenge) [using spiky circle] - 0xff, // 0xd8 y + umlaut - // [nothing below here is part of standard Windows-ASCII?] 
- // remaining descriptions based on hfsutils' "charset.txt" - kUnk, // 0xd9 Y + umlaut - kUnk, // 0xda fraction slash - kUnk, // 0xdb currency sign - kUnk, // 0xdc single left-pointing angle quotation mark - kUnk, // 0xdd single right-pointing angle quotation mark - kUnk, // 0xde merged fi - kUnk, // 0xdf merged FL - kUnk, // 0xe0 double dagger - kUnk, // 0xe1 middle dot - kUnk, // 0xe2 single low-9 quotation mark - kUnk, // 0xe3 double low-9 quotation mark - kUnk, // 0xe4 per mille sign - kUnk, // 0xe5 A + circumflex - kUnk, // 0xe6 E + circumflex - kUnk, // 0xe7 A + acute accent - kUnk, // 0xe8 E + diaeresis - kUnk, // 0xe9 E + grave accent - kUnk, // 0xea I + acute accent - kUnk, // 0xeb I + circumflex - kUnk, // 0xec I + diaeresis - kUnk, // 0xed I + grave accent - kUnk, // 0xee O + acute accent - kUnk, // 0xef O + circumflex - kUnk, // 0xf0 apple logo - kUnk, // 0xf1 O + grave accent - kUnk, // 0xf2 U + acute accent - kUnk, // 0xf3 U + circumflex - kUnk, // 0xf4 U + grave accent - kUnk, // 0xf5 i without dot - kUnk, // 0xf6 modifier letter circumflex accent - kUnk, // 0xf7 small tilde - kUnk, // 0xf8 macron - kUnk, // 0xf9 breve - kUnk, // 0xfa dot above - kUnk, // 0xfb ring above - kUnk, // 0xfc cedilla - kUnk, // 0xfd double acute accent - kUnk, // 0xfe ogonek - kUnk, // 0xff caron -}; - -/* - * Convert Mac OS Roman to Unicode. Mapping comes from: - * - * http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT - * - * We use the "Control Pictures" block for the control characters - * (0x00-0x1f, 0x7f). 
- */ -/*static*/ const uint16_t ReformatText::kUTF16Conv[256] = { - /*0x00*/ 0x2400, // [control] NULL - /*0x01*/ 0x2401, // [control] START OF HEADING - /*0x02*/ 0x2402, // [control] START OF TEXT - /*0x03*/ 0x2403, // [control] END OF TEXT - /*0x04*/ 0x2404, // [control] END OF TRANSMISSION - /*0x05*/ 0x2405, // [control] ENQUIRY - /*0x06*/ 0x2406, // [control] ACKNOWLEDGE - /*0x07*/ 0x2407, // [control] BELL - /*0x08*/ 0x2408, // [control] BACKSPACE - /*0x09*/ 0x2409, // [control] HORIZONTAL TABULATION - /*0x0a*/ 0x240a, // [control] LINE FEED - /*0x0b*/ 0x240b, // [control] VERTICAL TABULATION - /*0x0c*/ 0x240c, // [control] FORM FEED - /*0x0d*/ 0x240d, // [control] CARRIAGE RETURN - /*0x0e*/ 0x240e, // [control] SHIFT OUT - /*0x0f*/ 0x240f, // [control] SHIFT IN - /*0x10*/ 0x2410, // [control] DATA LINK ESCAPE - /*0x11*/ 0x2411, // [control] DEVICE CONTROL ONE - /*0x12*/ 0x2412, // [control] DEVICE CONTROL TWO - /*0x13*/ 0x2413, // [control] DEVICE CONTROL THREE - /*0x14*/ 0x2414, // [control] DEVICE CONTROL FOUR - /*0x15*/ 0x2415, // [control] NEGATIVE ACKNOWLEDGE - /*0x16*/ 0x2416, // [control] SYNCHRONOUS IDLE - /*0x17*/ 0x2417, // [control] END OF TRANSMISSION BLOCK - /*0x18*/ 0x2418, // [control] CANCEL - /*0x19*/ 0x2419, // [control] END OF MEDIUM - /*0x1a*/ 0x241a, // [control] SUBSTITUTE - /*0x1b*/ 0x241b, // [control] ESCAPE - /*0x1c*/ 0x241c, // [control] FILE SEPARATOR - /*0x1d*/ 0x241d, // [control] GROUP SEPARATOR - /*0x1e*/ 0x241e, // [control] RECORD SEPARATOR - /*0x1f*/ 0x241f, // [control] UNIT SEPARATOR - /*0x20*/ 0x0020, // SPACE - /*0x21*/ 0x0021, // EXCLAMATION MARK - /*0x22*/ 0x0022, // QUOTATION MARK - /*0x23*/ 0x0023, // NUMBER SIGN - /*0x24*/ 0x0024, // DOLLAR SIGN - /*0x25*/ 0x0025, // PERCENT SIGN - /*0x26*/ 0x0026, // AMPERSAND - /*0x27*/ 0x0027, // APOSTROPHE - /*0x28*/ 0x0028, // LEFT PARENTHESIS - /*0x29*/ 0x0029, // RIGHT PARENTHESIS - /*0x2A*/ 0x002A, // ASTERISK - /*0x2B*/ 0x002B, // PLUS SIGN - /*0x2C*/ 0x002C, // COMMA - 
/*0x2D*/ 0x002D, // HYPHEN-MINUS - /*0x2E*/ 0x002E, // FULL STOP - /*0x2F*/ 0x002F, // SOLIDUS - /*0x30*/ 0x0030, // DIGIT ZERO - /*0x31*/ 0x0031, // DIGIT ONE - /*0x32*/ 0x0032, // DIGIT TWO - /*0x33*/ 0x0033, // DIGIT THREE - /*0x34*/ 0x0034, // DIGIT FOUR - /*0x35*/ 0x0035, // DIGIT FIVE - /*0x36*/ 0x0036, // DIGIT SIX - /*0x37*/ 0x0037, // DIGIT SEVEN - /*0x38*/ 0x0038, // DIGIT EIGHT - /*0x39*/ 0x0039, // DIGIT NINE - /*0x3A*/ 0x003A, // COLON - /*0x3B*/ 0x003B, // SEMICOLON - /*0x3C*/ 0x003C, // LESS-THAN SIGN - /*0x3D*/ 0x003D, // EQUALS SIGN - /*0x3E*/ 0x003E, // GREATER-THAN SIGN - /*0x3F*/ 0x003F, // QUESTION MARK - /*0x40*/ 0x0040, // COMMERCIAL AT - /*0x41*/ 0x0041, // LATIN CAPITAL LETTER A - /*0x42*/ 0x0042, // LATIN CAPITAL LETTER B - /*0x43*/ 0x0043, // LATIN CAPITAL LETTER C - /*0x44*/ 0x0044, // LATIN CAPITAL LETTER D - /*0x45*/ 0x0045, // LATIN CAPITAL LETTER E - /*0x46*/ 0x0046, // LATIN CAPITAL LETTER F - /*0x47*/ 0x0047, // LATIN CAPITAL LETTER G - /*0x48*/ 0x0048, // LATIN CAPITAL LETTER H - /*0x49*/ 0x0049, // LATIN CAPITAL LETTER I - /*0x4A*/ 0x004A, // LATIN CAPITAL LETTER J - /*0x4B*/ 0x004B, // LATIN CAPITAL LETTER K - /*0x4C*/ 0x004C, // LATIN CAPITAL LETTER L - /*0x4D*/ 0x004D, // LATIN CAPITAL LETTER M - /*0x4E*/ 0x004E, // LATIN CAPITAL LETTER N - /*0x4F*/ 0x004F, // LATIN CAPITAL LETTER O - /*0x50*/ 0x0050, // LATIN CAPITAL LETTER P - /*0x51*/ 0x0051, // LATIN CAPITAL LETTER Q - /*0x52*/ 0x0052, // LATIN CAPITAL LETTER R - /*0x53*/ 0x0053, // LATIN CAPITAL LETTER S - /*0x54*/ 0x0054, // LATIN CAPITAL LETTER T - /*0x55*/ 0x0055, // LATIN CAPITAL LETTER U - /*0x56*/ 0x0056, // LATIN CAPITAL LETTER V - /*0x57*/ 0x0057, // LATIN CAPITAL LETTER W - /*0x58*/ 0x0058, // LATIN CAPITAL LETTER X - /*0x59*/ 0x0059, // LATIN CAPITAL LETTER Y - /*0x5A*/ 0x005A, // LATIN CAPITAL LETTER Z - /*0x5B*/ 0x005B, // LEFT SQUARE BRACKET - /*0x5C*/ 0x005C, // REVERSE SOLIDUS - /*0x5D*/ 0x005D, // RIGHT SQUARE BRACKET - /*0x5E*/ 0x005E, // CIRCUMFLEX 
ACCENT - /*0x5F*/ 0x005F, // LOW LINE - /*0x60*/ 0x0060, // GRAVE ACCENT - /*0x61*/ 0x0061, // LATIN SMALL LETTER A - /*0x62*/ 0x0062, // LATIN SMALL LETTER B - /*0x63*/ 0x0063, // LATIN SMALL LETTER C - /*0x64*/ 0x0064, // LATIN SMALL LETTER D - /*0x65*/ 0x0065, // LATIN SMALL LETTER E - /*0x66*/ 0x0066, // LATIN SMALL LETTER F - /*0x67*/ 0x0067, // LATIN SMALL LETTER G - /*0x68*/ 0x0068, // LATIN SMALL LETTER H - /*0x69*/ 0x0069, // LATIN SMALL LETTER I - /*0x6A*/ 0x006A, // LATIN SMALL LETTER J - /*0x6B*/ 0x006B, // LATIN SMALL LETTER K - /*0x6C*/ 0x006C, // LATIN SMALL LETTER L - /*0x6D*/ 0x006D, // LATIN SMALL LETTER M - /*0x6E*/ 0x006E, // LATIN SMALL LETTER N - /*0x6F*/ 0x006F, // LATIN SMALL LETTER O - /*0x70*/ 0x0070, // LATIN SMALL LETTER P - /*0x71*/ 0x0071, // LATIN SMALL LETTER Q - /*0x72*/ 0x0072, // LATIN SMALL LETTER R - /*0x73*/ 0x0073, // LATIN SMALL LETTER S - /*0x74*/ 0x0074, // LATIN SMALL LETTER T - /*0x75*/ 0x0075, // LATIN SMALL LETTER U - /*0x76*/ 0x0076, // LATIN SMALL LETTER V - /*0x77*/ 0x0077, // LATIN SMALL LETTER W - /*0x78*/ 0x0078, // LATIN SMALL LETTER X - /*0x79*/ 0x0079, // LATIN SMALL LETTER Y - /*0x7A*/ 0x007A, // LATIN SMALL LETTER Z - /*0x7B*/ 0x007B, // LEFT CURLY BRACKET - /*0x7C*/ 0x007C, // VERTICAL LINE - /*0x7D*/ 0x007D, // RIGHT CURLY BRACKET - /*0x7E*/ 0x007E, // TILDE - /*0x7f*/ 0x2421, // [control] DELETE - /*0x80*/ 0x00C4, // LATIN CAPITAL LETTER A WITH DIAERESIS - /*0x81*/ 0x00C5, // LATIN CAPITAL LETTER A WITH RING ABOVE - /*0x82*/ 0x00C7, // LATIN CAPITAL LETTER C WITH CEDILLA - /*0x83*/ 0x00C9, // LATIN CAPITAL LETTER E WITH ACUTE - /*0x84*/ 0x00D1, // LATIN CAPITAL LETTER N WITH TILDE - /*0x85*/ 0x00D6, // LATIN CAPITAL LETTER O WITH DIAERESIS - /*0x86*/ 0x00DC, // LATIN CAPITAL LETTER U WITH DIAERESIS - /*0x87*/ 0x00E1, // LATIN SMALL LETTER A WITH ACUTE - /*0x88*/ 0x00E0, // LATIN SMALL LETTER A WITH GRAVE - /*0x89*/ 0x00E2, // LATIN SMALL LETTER A WITH CIRCUMFLEX - /*0x8A*/ 0x00E4, // LATIN SMALL LETTER A 
WITH DIAERESIS - /*0x8B*/ 0x00E3, // LATIN SMALL LETTER A WITH TILDE - /*0x8C*/ 0x00E5, // LATIN SMALL LETTER A WITH RING ABOVE - /*0x8D*/ 0x00E7, // LATIN SMALL LETTER C WITH CEDILLA - /*0x8E*/ 0x00E9, // LATIN SMALL LETTER E WITH ACUTE - /*0x8F*/ 0x00E8, // LATIN SMALL LETTER E WITH GRAVE - /*0x90*/ 0x00EA, // LATIN SMALL LETTER E WITH CIRCUMFLEX - /*0x91*/ 0x00EB, // LATIN SMALL LETTER E WITH DIAERESIS - /*0x92*/ 0x00ED, // LATIN SMALL LETTER I WITH ACUTE - /*0x93*/ 0x00EC, // LATIN SMALL LETTER I WITH GRAVE - /*0x94*/ 0x00EE, // LATIN SMALL LETTER I WITH CIRCUMFLEX - /*0x95*/ 0x00EF, // LATIN SMALL LETTER I WITH DIAERESIS - /*0x96*/ 0x00F1, // LATIN SMALL LETTER N WITH TILDE - /*0x97*/ 0x00F3, // LATIN SMALL LETTER O WITH ACUTE - /*0x98*/ 0x00F2, // LATIN SMALL LETTER O WITH GRAVE - /*0x99*/ 0x00F4, // LATIN SMALL LETTER O WITH CIRCUMFLEX - /*0x9A*/ 0x00F6, // LATIN SMALL LETTER O WITH DIAERESIS - /*0x9B*/ 0x00F5, // LATIN SMALL LETTER O WITH TILDE - /*0x9C*/ 0x00FA, // LATIN SMALL LETTER U WITH ACUTE - /*0x9D*/ 0x00F9, // LATIN SMALL LETTER U WITH GRAVE - /*0x9E*/ 0x00FB, // LATIN SMALL LETTER U WITH CIRCUMFLEX - /*0x9F*/ 0x00FC, // LATIN SMALL LETTER U WITH DIAERESIS - /*0xA0*/ 0x2020, // DAGGER - /*0xA1*/ 0x00B0, // DEGREE SIGN - /*0xA2*/ 0x00A2, // CENT SIGN - /*0xA3*/ 0x00A3, // POUND SIGN - /*0xA4*/ 0x00A7, // SECTION SIGN - /*0xA5*/ 0x2022, // BULLET - /*0xA6*/ 0x00B6, // PILCROW SIGN - /*0xA7*/ 0x00DF, // LATIN SMALL LETTER SHARP S - /*0xA8*/ 0x00AE, // REGISTERED SIGN - /*0xA9*/ 0x00A9, // COPYRIGHT SIGN - /*0xAA*/ 0x2122, // TRADE MARK SIGN - /*0xAB*/ 0x00B4, // ACUTE ACCENT - /*0xAC*/ 0x00A8, // DIAERESIS - /*0xAD*/ 0x2260, // NOT EQUAL TO - /*0xAE*/ 0x00C6, // LATIN CAPITAL LETTER AE - /*0xAF*/ 0x00D8, // LATIN CAPITAL LETTER O WITH STROKE - /*0xB0*/ 0x221E, // INFINITY - /*0xB1*/ 0x00B1, // PLUS-MINUS SIGN - /*0xB2*/ 0x2264, // LESS-THAN OR EQUAL TO - /*0xB3*/ 0x2265, // GREATER-THAN OR EQUAL TO - /*0xB4*/ 0x00A5, // YEN SIGN - /*0xB5*/ 0x00B5, // 
MICRO SIGN - /*0xB6*/ 0x2202, // PARTIAL DIFFERENTIAL - /*0xB7*/ 0x2211, // N-ARY SUMMATION - /*0xB8*/ 0x220F, // N-ARY PRODUCT - /*0xB9*/ 0x03C0, // GREEK SMALL LETTER PI - /*0xBA*/ 0x222B, // INTEGRAL - /*0xBB*/ 0x00AA, // FEMININE ORDINAL INDICATOR - /*0xBC*/ 0x00BA, // MASCULINE ORDINAL INDICATOR - /*0xBD*/ 0x03A9, // GREEK CAPITAL LETTER OMEGA - /*0xBE*/ 0x00E6, // LATIN SMALL LETTER AE - /*0xBF*/ 0x00F8, // LATIN SMALL LETTER O WITH STROKE - /*0xC0*/ 0x00BF, // INVERTED QUESTION MARK - /*0xC1*/ 0x00A1, // INVERTED EXCLAMATION MARK - /*0xC2*/ 0x00AC, // NOT SIGN - /*0xC3*/ 0x221A, // SQUARE ROOT - /*0xC4*/ 0x0192, // LATIN SMALL LETTER F WITH HOOK - /*0xC5*/ 0x2248, // ALMOST EQUAL TO - /*0xC6*/ 0x2206, // INCREMENT - /*0xC7*/ 0x00AB, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK - /*0xC8*/ 0x00BB, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK - /*0xC9*/ 0x2026, // HORIZONTAL ELLIPSIS - /*0xCA*/ 0x00A0, // NO-BREAK SPACE - /*0xCB*/ 0x00C0, // LATIN CAPITAL LETTER A WITH GRAVE - /*0xCC*/ 0x00C3, // LATIN CAPITAL LETTER A WITH TILDE - /*0xCD*/ 0x00D5, // LATIN CAPITAL LETTER O WITH TILDE - /*0xCE*/ 0x0152, // LATIN CAPITAL LIGATURE OE - /*0xCF*/ 0x0153, // LATIN SMALL LIGATURE OE - /*0xD0*/ 0x2013, // EN DASH - /*0xD1*/ 0x2014, // EM DASH - /*0xD2*/ 0x201C, // LEFT DOUBLE QUOTATION MARK - /*0xD3*/ 0x201D, // RIGHT DOUBLE QUOTATION MARK - /*0xD4*/ 0x2018, // LEFT SINGLE QUOTATION MARK - /*0xD5*/ 0x2019, // RIGHT SINGLE QUOTATION MARK - /*0xD6*/ 0x00F7, // DIVISION SIGN - /*0xD7*/ 0x25CA, // LOZENGE - /*0xD8*/ 0x00FF, // LATIN SMALL LETTER Y WITH DIAERESIS - /*0xD9*/ 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS - /*0xDA*/ 0x2044, // FRACTION SLASH - /*0xDB*/ 0x00A4, // CURRENCY SIGN (was EURO SIGN) - /*0xDC*/ 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK - /*0xDD*/ 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK - /*0xDE*/ 0xFB01, // LATIN SMALL LIGATURE FI - /*0xDF*/ 0xFB02, // LATIN SMALL LIGATURE FL - /*0xE0*/ 0x2021, // DOUBLE DAGGER - /*0xE1*/ 
0x00B7, // MIDDLE DOT - /*0xE2*/ 0x201A, // SINGLE LOW-9 QUOTATION MARK - /*0xE3*/ 0x201E, // DOUBLE LOW-9 QUOTATION MARK - /*0xE4*/ 0x2030, // PER MILLE SIGN - /*0xE5*/ 0x00C2, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX - /*0xE6*/ 0x00CA, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX - /*0xE7*/ 0x00C1, // LATIN CAPITAL LETTER A WITH ACUTE - /*0xE8*/ 0x00CB, // LATIN CAPITAL LETTER E WITH DIAERESIS - /*0xE9*/ 0x00C8, // LATIN CAPITAL LETTER E WITH GRAVE - /*0xEA*/ 0x00CD, // LATIN CAPITAL LETTER I WITH ACUTE - /*0xEB*/ 0x00CE, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX - /*0xEC*/ 0x00CF, // LATIN CAPITAL LETTER I WITH DIAERESIS - /*0xED*/ 0x00CC, // LATIN CAPITAL LETTER I WITH GRAVE - /*0xEE*/ 0x00D3, // LATIN CAPITAL LETTER O WITH ACUTE - /*0xEF*/ 0x00D4, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX - /*0xF0*/ 0xF8FF, // Apple logo - /*0xF1*/ 0x00D2, // LATIN CAPITAL LETTER O WITH GRAVE - /*0xF2*/ 0x00DA, // LATIN CAPITAL LETTER U WITH ACUTE - /*0xF3*/ 0x00DB, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX - /*0xF4*/ 0x00D9, // LATIN CAPITAL LETTER U WITH GRAVE - /*0xF5*/ 0x0131, // LATIN SMALL LETTER DOTLESS I - /*0xF6*/ 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT - /*0xF7*/ 0x02DC, // SMALL TILDE - /*0xF8*/ 0x00AF, // MACRON - /*0xF9*/ 0x02D8, // BREVE - /*0xFA*/ 0x02D9, // DOT ABOVE - /*0xFB*/ 0x02DA, // RING ABOVE - /*0xFC*/ 0x00B8, // CEDILLA - /*0xFD*/ 0x02DD, // DOUBLE ACUTE ACCENT - /*0xFE*/ 0x02DB, // OGONEK - /*0xFF*/ 0x02C7, // CARON -}; - -/* - * Quick sanity check on contents of array. - * - * No two characters should map to the same thing. This isn't vital, but - * if we want to have a reversible transformation someday, it'll make our - * lives easier then. 
- */ -void ReformatText::CheckGSCharConv(void) -{ -#ifdef _DEBUG - bool* test = (bool*) malloc(65536 * sizeof(bool)); - - memset(test, 0, 65536 * sizeof(bool)); - for (int i = 0; i < NELEM(kCP1252Conv); i++) { - if (test[kCP1252Conv[i]] && kCP1252Conv[i] != kUnk) { - LOGW("Character used twice: 0x%02x at %d (0x%02x)", - kCP1252Conv[i], i, i+128); - assert(false); - } - test[kCP1252Conv[i]] = true; - } - - memset(test, 0, 65536 * sizeof(bool)); - for (int i = 0; i < NELEM(kUTF16Conv); i++) { - if (test[kUTF16Conv[i]]) { - LOGW("Character used twice: 0x%02x at %d (0x%02x)", - kUTF16Conv[i], i, i+128); - assert(false); - } - test[kUTF16Conv[i]] = true; - } - - free(test); -#endif -} - /* * Set the output format and buffer. * diff --git a/reformat/ReformatBase.h b/reformat/ReformatBase.h index d00f2ed..d52c28d 100644 --- a/reformat/ReformatBase.h +++ b/reformat/ReformatBase.h @@ -11,10 +11,11 @@ * that, but we'd have to figure out what that means when extracting a file * (i.e. figure out the RTF embedded bitmap format). */ -#ifndef REFORMAT_REFORMATBASE -#define REFORMAT_REFORMATBASE +#ifndef REFORMAT_REFORMATBASE_H +#define REFORMAT_REFORMATBASE_H #include "Reformat.h" +#include "Charset.h" #define BufPrintf fExpBuf.Printf @@ -290,13 +291,6 @@ public: kRTFFlagColorTable = 1, // include color table }; - // Convert a Mac OS Roman character value (from a IIgs document) to - // its UTF-16 Unicode equivalent. This also includes a conversion - // for the control characters. - static uint16_t ConvertMacRomanToUTF16(uint8_t ch) { - return kUTF16Conv[ch]; - } - protected: void RTFBegin(int flags = 0); void RTFEnd(void); @@ -387,25 +381,11 @@ protected: fExpBuf.Printf("%c", ch); } - // Convert a Mac OS Roman character value (from a IIgs document) to - // an 8-bit Windows CP1252 equivalent. 
- static uint8_t ConvertMacRomanTo1252(uint8_t ch) { - if (ch < 128) - return ch; - else - return kCP1252Conv[ch-128]; - } - - void CheckGSCharConv(void); - private: DECLARE_COPY_AND_OPEQ(ReformatText) int CreateWorkBuf(void); enum { kRTFUnitsPerInch = 1440 }; // TWIPS - static const uint8_t kCP1252Conv[]; - static const uint16_t kUTF16Conv[]; - int fLeftMargin, fRightMargin; // for documents, in 1/10th inch int fPointSize; int fPreMultPointSize; @@ -421,4 +401,4 @@ private: TextColor fTextColor; }; -#endif /*REFORMAT_REFORMATBASE*/ +#endif /*REFORMAT_REFORMATBASE_H*/ diff --git a/reformat/Teach.cpp b/reformat/Teach.cpp index 21f657e..82436e9 100644 --- a/reformat/Teach.cpp +++ b/reformat/Teach.cpp @@ -47,7 +47,7 @@ int ReformatGWP::Process(const ReformatHolder* pHolder, long srcLen = pHolder->GetSourceLen(part); fUseRTF = false; - CheckGSCharConv(); + Charset::CheckGSCharConv(); RTFBegin(); /* convert EOL markers and IIgs characters */ @@ -67,7 +67,7 @@ int ReformatGWP::Process(const ReformatHolder* pHolder, BufPrintf("\r\n"); } else { // RTF is always off, so just use BufPrintf - BufPrintf("%c", ConvertMacRomanTo1252(ch)); + BufPrintf("%c", Charset::ConvertMacRomanTo1252(ch)); } } @@ -124,7 +124,7 @@ int ReformatTeach::Process(const ReformatHolder* pHolder, LOGI("Teach reformatter missing one fork of the file"); return -1; } - CheckGSCharConv(); + Charset::CheckGSCharConv(); /* find the rStyleBlock */ if (!ReformatResourceFork::GetResource(rsrcBuf, rsrcLen, 0x8012, 0x0001, @@ -206,7 +206,7 @@ int ReformatTeach::Process(const ReformatHolder* pHolder, } else if (uch == '\t') { RTFTab(); } else { - RTFPrintUTF16Char(ConvertMacRomanToUTF16(uch)); + RTFPrintUTF16Char(Charset::ConvertMacRomanToUTF16(uch)); } dataBuf++; dataLen--; diff --git a/reformat/reformat.vcxproj b/reformat/reformat.vcxproj index 176a023..1b94811 100644 --- a/reformat/reformat.vcxproj +++ b/reformat/reformat.vcxproj @@ -108,6 +108,7 @@ + @@ -130,6 +131,7 @@ + diff --git 
a/reformat/reformat.vcxproj.filters b/reformat/reformat.vcxproj.filters index fd56a68..0da9e0e 100644 --- a/reformat/reformat.vcxproj.filters +++ b/reformat/reformat.vcxproj.filters @@ -71,6 +71,9 @@ Header Files + + Header Files + @@ -142,5 +145,8 @@ Source Files + + Source Files + \ No newline at end of file