diff --git a/app/DiskArchive.cpp b/app/DiskArchive.cpp
index 52d4f81..2ebeb7d 100644
--- a/app/DiskArchive.cpp
+++ b/app/DiskArchive.cpp
@@ -16,6 +16,7 @@
#include "RenameEntryDialog.h"
#include "ConfirmOverwriteDialog.h"
#include "../diskimg/DiskImgDetail.h"
+#include "../reformat/Charset.h"
static const char kEmptyFolderMarker[] = ".$$EmptyFolder";
@@ -748,7 +749,7 @@ CString DiskArchive::New(const WCHAR* fileName, const void* vOptions)
* want to do it under Win2K/XP because it can be slow for larger
* volumes.
*/
- fileNameA = fileName;
+ fileNameA = fileName; // TODO(Unicode)
if (numBlocks > 0) {
dierr = fDiskImg.CreateImage(fileNameA, NULL,
DiskImg::kOuterFormatNone,
@@ -797,6 +798,7 @@ CString DiskArchive::New(const WCHAR* fileName, const void* vOptions)
volName.MakeUpper();
/* format it */
+ // TODO(Unicode): for HFS, we need to convert Unicode to MOR
volNameA = volName;
dierr = fDiskImg.FormatImage(pOptions->base.format, volNameA);
if (dierr != kDIErrNone) {
@@ -1086,18 +1088,21 @@ int DiskArchive::LoadDiskFSContents(DiskFS* pDiskFS, const WCHAR* volName)
*/
pSubVol = pDiskFS->GetNextSubVolume(NULL);
while (pSubVol != NULL) {
+ CStringA subVolNameMOR;
CString concatSubVolName;
- const char* subVolName;
int ret;
- subVolName = pSubVol->GetDiskFS()->GetVolumeName();
- if (subVolName == NULL)
- subVolName = "+++"; // call it *something*
+ subVolNameMOR = pSubVol->GetDiskFS()->GetVolumeName();
+ if (subVolNameMOR.IsEmpty()) {
+ subVolNameMOR = "+++"; // call it *something*
+ }
+ CString subVolName(Charset::ConvertMORToUNI(subVolNameMOR));
- if (volName[0] == '\0')
- concatSubVolName.Format(L"_%hs", subVolName);
- else
- concatSubVolName.Format(L"%ls_%hs", volName, subVolName);
+ if (volName[0] == '\0') {
+ concatSubVolName.Format(L"_%ls", (LPCWSTR) subVolName);
+ } else {
+ concatSubVolName.Format(L"%ls_%ls", volName, (LPCWSTR) subVolName);
+ }
ret = LoadDiskFSContents(pSubVol->GetDiskFS(), concatSubVolName);
if (ret != 0)
return ret;
diff --git a/app/GenericArchive.cpp b/app/GenericArchive.cpp
index a4f8cf8..96be02f 100644
--- a/app/GenericArchive.cpp
+++ b/app/GenericArchive.cpp
@@ -139,21 +139,6 @@ void GenericEntry::SetSubVolName(const WCHAR* name)
fSubVolName = name;
}
-// Simple Mac OS Roman to Unicode conversion.
-static CString ConvertMORToUNI(const CStringA& strMOR)
-{
- // We know that all MOR characters are represented in Unicode with a
- // single BMP code point, so we know that strlen(MOR) == wcslen(UNI).
- const int len = strMOR.GetLength();
- CString strUNI;
- WCHAR* uniBuf = strUNI.GetBuffer(len);
- for (int i = 0; i < len; i++) {
- uniBuf[i] = ReformatText::ConvertMacRomanToUTF16(strMOR[i]);
- }
- strUNI.ReleaseBuffer(len);
- return strUNI;
-}
-
const CString& GenericEntry::GetDisplayName(void) const
{
ASSERT(!fPathNameMOR.IsEmpty());
@@ -164,7 +149,7 @@ const CString& GenericEntry::GetDisplayName(void) const
if (!fSubVolName.IsEmpty()) {
fDisplayName = fSubVolName + (WCHAR) DiskFS::kDIFssep;
}
- fDisplayName += ConvertMORToUNI(fPathNameMOR);
+ fDisplayName += Charset::ConvertMORToUNI(fPathNameMOR);
return fDisplayName;
}
@@ -1024,8 +1009,8 @@ void GenericArchive::LocalFileDetails::GenerateStoragePathName()
// TODO(Unicode): generate MOR name from Unicode, instead of just
// doing a generic CP-1252 conversion. We need to do this on both
// sides though, so until we can extract MOR->Unicode we don't
- // want to add Unicode->MOR. And it all depends on NufxLib and
- // DiskImg being able to handle UTF-16 filenames.
+ // want to add Unicode->MOR. For this all to work well we need NufxLib
+ // and DiskImgLib to be able to handle UTF-16 filenames.
fStoragePathNameMOR = fStrippedLocalPathName;
}
diff --git a/reformat/AWGS.cpp b/reformat/AWGS.cpp
index 8556fa5..4b9efab 100644
--- a/reformat/AWGS.cpp
+++ b/reformat/AWGS.cpp
@@ -36,7 +36,7 @@ int ReformatAWGS_WP::Process(const ReformatHolder* pHolder,
Chunk doc, header, footer;
uint16_t val;
- CheckGSCharConv();
+ Charset::CheckGSCharConv();
/* must at least have the doc header and globals */
if (srcLen < kMinExpectedLen) {
@@ -388,7 +388,7 @@ int ReformatAWGS_WP::PrintParagraph(const uint8_t* ptr, long maxLen)
RTFTab();
break;
default:
- RTFPrintUTF16Char(ConvertMacRomanToUTF16(uch));
+ RTFPrintUTF16Char(Charset::ConvertMacRomanToUTF16(uch));
break;
}
}
diff --git a/reformat/Charset.cpp b/reformat/Charset.cpp
new file mode 100644
index 0000000..3dd75d9
--- /dev/null
+++ b/reformat/Charset.cpp
@@ -0,0 +1,451 @@
+/*
+ * CiderPress
+ * Copyright (C) 2015 by faddenSoft. All Rights Reserved.
+ * See the file LICENSE for distribution terms.
+ */
+/*
+ * Mac OS Roman character set conversion tables and their sanity check.
+ */
+#include "StdAfx.h"
+#include "Charset.h"
+
+/*
+ * Convert Mac OS Roman (upper half, 0x80-0xff) to Windows CP1252.
+ */
+const int kUnk = 0x3f; // for unmappable chars, use '?'
+
+/*static*/ const uint8_t Charset::kCP1252Conv[128] = {
+ 0xc4, // 0x80 A + umlaut (diaeresis?)
+ 0xc5, // 0x81 A + overcircle
+ 0xc7, // 0x82 C + cedilla
+ 0xc9, // 0x83 E + acute
+ 0xd1, // 0x84 N + tilde
+ 0xd6, // 0x85 O + umlaut
+ 0xdc, // 0x86 U + umlaut
+ 0xe1, // 0x87 a + acute
+ 0xe0, // 0x88 a + grave
+ 0xe2, // 0x89 a + circumflex
+ 0xe4, // 0x8a a + umlaut
+ 0xe3, // 0x8b a + tilde
+ 0xe5, // 0x8c a + overcircle
+ 0xe7, // 0x8d c + cedilla
+ 0xe9, // 0x8e e + acute
+ 0xe8, // 0x8f e + grave
+ 0xea, // 0x90 e + circumflex
+ 0xeb, // 0x91 e + umlaut
+ 0xed, // 0x92 i + acute
+ 0xec, // 0x93 i + grave
+ 0xee, // 0x94 i + circumflex
+ 0xef, // 0x95 i + umlaut
+ 0xf1, // 0x96 n + tilde
+ 0xf3, // 0x97 o + acute
+ 0xf2, // 0x98 o + grave
+ 0xf4, // 0x99 o + circumflex
+ 0xf6, // 0x9a o + umlaut
+ 0xf5, // 0x9b o + tilde
+ 0xfa, // 0x9c u + acute
+ 0xf9, // 0x9d u + grave
+ 0xfb, // 0x9e u + circumflex
+ 0xfc, // 0x9f u + umlaut
+ 0x87, // 0xa0 double cross (dagger)
+ 0xb0, // 0xa1 degrees
+ 0xa2, // 0xa2 cents
+ 0xa3, // 0xa3 pounds (UK$)
+ 0xa7, // 0xa4 section start
+ 0x95, // 0xa5 small square (bullet) [using fat bullet]
+ 0xb6, // 0xa6 paragraph (pilcrow)
+ 0xdf, // 0xa7 curly B (latin small letter sharp S)
+ 0xae, // 0xa8 raised 'R' (registered)
+ 0xa9, // 0xa9 raised 'C' (copyright)
+ 0x99, // 0xaa raised 'TM' (trademark)
+ 0xb4, // 0xab acute accent
+ 0xa8, // 0xac umlaut (diaeresis)
+ kUnk, // 0xad not-equal
+ 0xc6, // 0xae merged AE
+ 0xd8, // 0xaf O + slash (upper-case nil?)
+ kUnk, // 0xb0 infinity
+ 0xb1, // 0xb1 +/-
+ kUnk, // 0xb2 <=
+ kUnk, // 0xb3 >=
+ 0xa5, // 0xb4 Yen (Japan$)
+ 0xb5, // 0xb5 mu (micro)
+ kUnk, // 0xb6 delta (partial differentiation) [could use D-bar 0xd0]
+ kUnk, // 0xb7 epsilon (N-ary summation) [could use C-double-bar 0x80]
+ kUnk, // 0xb8 PI (N-ary product)
+ kUnk, // 0xb9 pi
+ kUnk, // 0xba integral
+ 0xaa, // 0xbb a underbar (feminine ordinal) [using raised a]
+ 0xba, // 0xbc o underbar (masculine ordinal) [using raised o]
+ kUnk, // 0xbd omega (Ohm)
+ 0xe6, // 0xbe merged ae
+ 0xf8, // 0xbf o + slash (lower-case NULL?)
+ 0xbf, // 0xc0 upside-down question mark
+ 0xa1, // 0xc1 upside-down exclamation point
+ 0xac, // 0xc2 rotated L ("not" sign)
+ 0xb7, // 0xc3 checkmark (square root) [using small bullet]
+ 0x83, // 0xc4 script f
+ kUnk, // 0xc5 approximately equal
+ kUnk, // 0xc6 delta (triangle / increment)
+ 0xab, // 0xc7 much less than
+ 0xbb, // 0xc8 much greater than
+ 0x85, // 0xc9 ellipsis
+ 0xa0, // 0xca blank (sticky space)
+ 0xc0, // 0xcb A + grave
+ 0xc3, // 0xcc A + tilde
+ 0xd5, // 0xcd O + tilde
+ 0x8c, // 0xce merged OE
+ 0x9c, // 0xcf merged oe
+ 0x96, // 0xd0 short hyphen (en dash)
+ 0x97, // 0xd1 long hyphen (em dash)
+ 0x93, // 0xd2 smart double-quote start
+ 0x94, // 0xd3 smart double-quote end
+ 0x91, // 0xd4 smart single-quote start
+ 0x92, // 0xd5 smart single-quote end
+ 0xf7, // 0xd6 divide
+ 0xa4, // 0xd7 diamond (lozenge) [using spiky circle]
+ 0xff, // 0xd8 y + umlaut
+ // [nothing below here is part of standard Windows-ASCII?]
+ // remaining descriptions based on hfsutils' "charset.txt"
+ kUnk, // 0xd9 Y + umlaut
+ kUnk, // 0xda fraction slash
+ kUnk, // 0xdb currency sign
+ kUnk, // 0xdc single left-pointing angle quotation mark
+ kUnk, // 0xdd single right-pointing angle quotation mark
+ kUnk, // 0xde merged fi
+ kUnk, // 0xdf merged FL
+ kUnk, // 0xe0 double dagger
+ kUnk, // 0xe1 middle dot
+ kUnk, // 0xe2 single low-9 quotation mark
+ kUnk, // 0xe3 double low-9 quotation mark
+ kUnk, // 0xe4 per mille sign
+ kUnk, // 0xe5 A + circumflex
+ kUnk, // 0xe6 E + circumflex
+ kUnk, // 0xe7 A + acute accent
+ kUnk, // 0xe8 E + diaeresis
+ kUnk, // 0xe9 E + grave accent
+ kUnk, // 0xea I + acute accent
+ kUnk, // 0xeb I + circumflex
+ kUnk, // 0xec I + diaeresis
+ kUnk, // 0xed I + grave accent
+ kUnk, // 0xee O + acute accent
+ kUnk, // 0xef O + circumflex
+ kUnk, // 0xf0 apple logo
+ kUnk, // 0xf1 O + grave accent
+ kUnk, // 0xf2 U + acute accent
+ kUnk, // 0xf3 U + circumflex
+ kUnk, // 0xf4 U + grave accent
+ kUnk, // 0xf5 i without dot
+ kUnk, // 0xf6 modifier letter circumflex accent
+ kUnk, // 0xf7 small tilde
+ kUnk, // 0xf8 macron
+ kUnk, // 0xf9 breve
+ kUnk, // 0xfa dot above
+ kUnk, // 0xfb ring above
+ kUnk, // 0xfc cedilla
+ kUnk, // 0xfd double acute accent
+ kUnk, // 0xfe ogonek
+ kUnk, // 0xff caron
+};
+
+/*
+ * Convert Mac OS Roman to Unicode. Mapping comes from:
+ *
+ * http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT
+ *
+ * We use the "Control Pictures" block for the control characters
+ * (0x00-0x1f, 0x7f).
+ */
+/*static*/ const uint16_t Charset::kUTF16Conv[256] = {
+ /*0x00*/ 0x2400, // [control] NULL
+ /*0x01*/ 0x2401, // [control] START OF HEADING
+ /*0x02*/ 0x2402, // [control] START OF TEXT
+ /*0x03*/ 0x2403, // [control] END OF TEXT
+ /*0x04*/ 0x2404, // [control] END OF TRANSMISSION
+ /*0x05*/ 0x2405, // [control] ENQUIRY
+ /*0x06*/ 0x2406, // [control] ACKNOWLEDGE
+ /*0x07*/ 0x2407, // [control] BELL
+ /*0x08*/ 0x2408, // [control] BACKSPACE
+ /*0x09*/ 0x2409, // [control] HORIZONTAL TABULATION
+ /*0x0a*/ 0x240a, // [control] LINE FEED
+ /*0x0b*/ 0x240b, // [control] VERTICAL TABULATION
+ /*0x0c*/ 0x240c, // [control] FORM FEED
+ /*0x0d*/ 0x240d, // [control] CARRIAGE RETURN
+ /*0x0e*/ 0x240e, // [control] SHIFT OUT
+ /*0x0f*/ 0x240f, // [control] SHIFT IN
+ /*0x10*/ 0x2410, // [control] DATA LINK ESCAPE
+ /*0x11*/ 0x2411, // [control] DEVICE CONTROL ONE
+ /*0x12*/ 0x2412, // [control] DEVICE CONTROL TWO
+ /*0x13*/ 0x2413, // [control] DEVICE CONTROL THREE
+ /*0x14*/ 0x2414, // [control] DEVICE CONTROL FOUR
+ /*0x15*/ 0x2415, // [control] NEGATIVE ACKNOWLEDGE
+ /*0x16*/ 0x2416, // [control] SYNCHRONOUS IDLE
+ /*0x17*/ 0x2417, // [control] END OF TRANSMISSION BLOCK
+ /*0x18*/ 0x2418, // [control] CANCEL
+ /*0x19*/ 0x2419, // [control] END OF MEDIUM
+ /*0x1a*/ 0x241a, // [control] SUBSTITUTE
+ /*0x1b*/ 0x241b, // [control] ESCAPE
+ /*0x1c*/ 0x241c, // [control] FILE SEPARATOR
+ /*0x1d*/ 0x241d, // [control] GROUP SEPARATOR
+ /*0x1e*/ 0x241e, // [control] RECORD SEPARATOR
+ /*0x1f*/ 0x241f, // [control] UNIT SEPARATOR
+ /*0x20*/ 0x0020, // SPACE
+ /*0x21*/ 0x0021, // EXCLAMATION MARK
+ /*0x22*/ 0x0022, // QUOTATION MARK
+ /*0x23*/ 0x0023, // NUMBER SIGN
+ /*0x24*/ 0x0024, // DOLLAR SIGN
+ /*0x25*/ 0x0025, // PERCENT SIGN
+ /*0x26*/ 0x0026, // AMPERSAND
+ /*0x27*/ 0x0027, // APOSTROPHE
+ /*0x28*/ 0x0028, // LEFT PARENTHESIS
+ /*0x29*/ 0x0029, // RIGHT PARENTHESIS
+ /*0x2A*/ 0x002A, // ASTERISK
+ /*0x2B*/ 0x002B, // PLUS SIGN
+ /*0x2C*/ 0x002C, // COMMA
+ /*0x2D*/ 0x002D, // HYPHEN-MINUS
+ /*0x2E*/ 0x002E, // FULL STOP
+ /*0x2F*/ 0x002F, // SOLIDUS
+ /*0x30*/ 0x0030, // DIGIT ZERO
+ /*0x31*/ 0x0031, // DIGIT ONE
+ /*0x32*/ 0x0032, // DIGIT TWO
+ /*0x33*/ 0x0033, // DIGIT THREE
+ /*0x34*/ 0x0034, // DIGIT FOUR
+ /*0x35*/ 0x0035, // DIGIT FIVE
+ /*0x36*/ 0x0036, // DIGIT SIX
+ /*0x37*/ 0x0037, // DIGIT SEVEN
+ /*0x38*/ 0x0038, // DIGIT EIGHT
+ /*0x39*/ 0x0039, // DIGIT NINE
+ /*0x3A*/ 0x003A, // COLON
+ /*0x3B*/ 0x003B, // SEMICOLON
+ /*0x3C*/ 0x003C, // LESS-THAN SIGN
+ /*0x3D*/ 0x003D, // EQUALS SIGN
+ /*0x3E*/ 0x003E, // GREATER-THAN SIGN
+ /*0x3F*/ 0x003F, // QUESTION MARK
+ /*0x40*/ 0x0040, // COMMERCIAL AT
+ /*0x41*/ 0x0041, // LATIN CAPITAL LETTER A
+ /*0x42*/ 0x0042, // LATIN CAPITAL LETTER B
+ /*0x43*/ 0x0043, // LATIN CAPITAL LETTER C
+ /*0x44*/ 0x0044, // LATIN CAPITAL LETTER D
+ /*0x45*/ 0x0045, // LATIN CAPITAL LETTER E
+ /*0x46*/ 0x0046, // LATIN CAPITAL LETTER F
+ /*0x47*/ 0x0047, // LATIN CAPITAL LETTER G
+ /*0x48*/ 0x0048, // LATIN CAPITAL LETTER H
+ /*0x49*/ 0x0049, // LATIN CAPITAL LETTER I
+ /*0x4A*/ 0x004A, // LATIN CAPITAL LETTER J
+ /*0x4B*/ 0x004B, // LATIN CAPITAL LETTER K
+ /*0x4C*/ 0x004C, // LATIN CAPITAL LETTER L
+ /*0x4D*/ 0x004D, // LATIN CAPITAL LETTER M
+ /*0x4E*/ 0x004E, // LATIN CAPITAL LETTER N
+ /*0x4F*/ 0x004F, // LATIN CAPITAL LETTER O
+ /*0x50*/ 0x0050, // LATIN CAPITAL LETTER P
+ /*0x51*/ 0x0051, // LATIN CAPITAL LETTER Q
+ /*0x52*/ 0x0052, // LATIN CAPITAL LETTER R
+ /*0x53*/ 0x0053, // LATIN CAPITAL LETTER S
+ /*0x54*/ 0x0054, // LATIN CAPITAL LETTER T
+ /*0x55*/ 0x0055, // LATIN CAPITAL LETTER U
+ /*0x56*/ 0x0056, // LATIN CAPITAL LETTER V
+ /*0x57*/ 0x0057, // LATIN CAPITAL LETTER W
+ /*0x58*/ 0x0058, // LATIN CAPITAL LETTER X
+ /*0x59*/ 0x0059, // LATIN CAPITAL LETTER Y
+ /*0x5A*/ 0x005A, // LATIN CAPITAL LETTER Z
+ /*0x5B*/ 0x005B, // LEFT SQUARE BRACKET
+ /*0x5C*/ 0x005C, // REVERSE SOLIDUS
+ /*0x5D*/ 0x005D, // RIGHT SQUARE BRACKET
+ /*0x5E*/ 0x005E, // CIRCUMFLEX ACCENT
+ /*0x5F*/ 0x005F, // LOW LINE
+ /*0x60*/ 0x0060, // GRAVE ACCENT
+ /*0x61*/ 0x0061, // LATIN SMALL LETTER A
+ /*0x62*/ 0x0062, // LATIN SMALL LETTER B
+ /*0x63*/ 0x0063, // LATIN SMALL LETTER C
+ /*0x64*/ 0x0064, // LATIN SMALL LETTER D
+ /*0x65*/ 0x0065, // LATIN SMALL LETTER E
+ /*0x66*/ 0x0066, // LATIN SMALL LETTER F
+ /*0x67*/ 0x0067, // LATIN SMALL LETTER G
+ /*0x68*/ 0x0068, // LATIN SMALL LETTER H
+ /*0x69*/ 0x0069, // LATIN SMALL LETTER I
+ /*0x6A*/ 0x006A, // LATIN SMALL LETTER J
+ /*0x6B*/ 0x006B, // LATIN SMALL LETTER K
+ /*0x6C*/ 0x006C, // LATIN SMALL LETTER L
+ /*0x6D*/ 0x006D, // LATIN SMALL LETTER M
+ /*0x6E*/ 0x006E, // LATIN SMALL LETTER N
+ /*0x6F*/ 0x006F, // LATIN SMALL LETTER O
+ /*0x70*/ 0x0070, // LATIN SMALL LETTER P
+ /*0x71*/ 0x0071, // LATIN SMALL LETTER Q
+ /*0x72*/ 0x0072, // LATIN SMALL LETTER R
+ /*0x73*/ 0x0073, // LATIN SMALL LETTER S
+ /*0x74*/ 0x0074, // LATIN SMALL LETTER T
+ /*0x75*/ 0x0075, // LATIN SMALL LETTER U
+ /*0x76*/ 0x0076, // LATIN SMALL LETTER V
+ /*0x77*/ 0x0077, // LATIN SMALL LETTER W
+ /*0x78*/ 0x0078, // LATIN SMALL LETTER X
+ /*0x79*/ 0x0079, // LATIN SMALL LETTER Y
+ /*0x7A*/ 0x007A, // LATIN SMALL LETTER Z
+ /*0x7B*/ 0x007B, // LEFT CURLY BRACKET
+ /*0x7C*/ 0x007C, // VERTICAL LINE
+ /*0x7D*/ 0x007D, // RIGHT CURLY BRACKET
+ /*0x7E*/ 0x007E, // TILDE
+ /*0x7f*/ 0x2421, // [control] DELETE
+ /*0x80*/ 0x00C4, // LATIN CAPITAL LETTER A WITH DIAERESIS
+ /*0x81*/ 0x00C5, // LATIN CAPITAL LETTER A WITH RING ABOVE
+ /*0x82*/ 0x00C7, // LATIN CAPITAL LETTER C WITH CEDILLA
+ /*0x83*/ 0x00C9, // LATIN CAPITAL LETTER E WITH ACUTE
+ /*0x84*/ 0x00D1, // LATIN CAPITAL LETTER N WITH TILDE
+ /*0x85*/ 0x00D6, // LATIN CAPITAL LETTER O WITH DIAERESIS
+ /*0x86*/ 0x00DC, // LATIN CAPITAL LETTER U WITH DIAERESIS
+ /*0x87*/ 0x00E1, // LATIN SMALL LETTER A WITH ACUTE
+ /*0x88*/ 0x00E0, // LATIN SMALL LETTER A WITH GRAVE
+ /*0x89*/ 0x00E2, // LATIN SMALL LETTER A WITH CIRCUMFLEX
+ /*0x8A*/ 0x00E4, // LATIN SMALL LETTER A WITH DIAERESIS
+ /*0x8B*/ 0x00E3, // LATIN SMALL LETTER A WITH TILDE
+ /*0x8C*/ 0x00E5, // LATIN SMALL LETTER A WITH RING ABOVE
+ /*0x8D*/ 0x00E7, // LATIN SMALL LETTER C WITH CEDILLA
+ /*0x8E*/ 0x00E9, // LATIN SMALL LETTER E WITH ACUTE
+ /*0x8F*/ 0x00E8, // LATIN SMALL LETTER E WITH GRAVE
+ /*0x90*/ 0x00EA, // LATIN SMALL LETTER E WITH CIRCUMFLEX
+ /*0x91*/ 0x00EB, // LATIN SMALL LETTER E WITH DIAERESIS
+ /*0x92*/ 0x00ED, // LATIN SMALL LETTER I WITH ACUTE
+ /*0x93*/ 0x00EC, // LATIN SMALL LETTER I WITH GRAVE
+ /*0x94*/ 0x00EE, // LATIN SMALL LETTER I WITH CIRCUMFLEX
+ /*0x95*/ 0x00EF, // LATIN SMALL LETTER I WITH DIAERESIS
+ /*0x96*/ 0x00F1, // LATIN SMALL LETTER N WITH TILDE
+ /*0x97*/ 0x00F3, // LATIN SMALL LETTER O WITH ACUTE
+ /*0x98*/ 0x00F2, // LATIN SMALL LETTER O WITH GRAVE
+ /*0x99*/ 0x00F4, // LATIN SMALL LETTER O WITH CIRCUMFLEX
+ /*0x9A*/ 0x00F6, // LATIN SMALL LETTER O WITH DIAERESIS
+ /*0x9B*/ 0x00F5, // LATIN SMALL LETTER O WITH TILDE
+ /*0x9C*/ 0x00FA, // LATIN SMALL LETTER U WITH ACUTE
+ /*0x9D*/ 0x00F9, // LATIN SMALL LETTER U WITH GRAVE
+ /*0x9E*/ 0x00FB, // LATIN SMALL LETTER U WITH CIRCUMFLEX
+ /*0x9F*/ 0x00FC, // LATIN SMALL LETTER U WITH DIAERESIS
+ /*0xA0*/ 0x2020, // DAGGER
+ /*0xA1*/ 0x00B0, // DEGREE SIGN
+ /*0xA2*/ 0x00A2, // CENT SIGN
+ /*0xA3*/ 0x00A3, // POUND SIGN
+ /*0xA4*/ 0x00A7, // SECTION SIGN
+ /*0xA5*/ 0x2022, // BULLET
+ /*0xA6*/ 0x00B6, // PILCROW SIGN
+ /*0xA7*/ 0x00DF, // LATIN SMALL LETTER SHARP S
+ /*0xA8*/ 0x00AE, // REGISTERED SIGN
+ /*0xA9*/ 0x00A9, // COPYRIGHT SIGN
+ /*0xAA*/ 0x2122, // TRADE MARK SIGN
+ /*0xAB*/ 0x00B4, // ACUTE ACCENT
+ /*0xAC*/ 0x00A8, // DIAERESIS
+ /*0xAD*/ 0x2260, // NOT EQUAL TO
+ /*0xAE*/ 0x00C6, // LATIN CAPITAL LETTER AE
+ /*0xAF*/ 0x00D8, // LATIN CAPITAL LETTER O WITH STROKE
+ /*0xB0*/ 0x221E, // INFINITY
+ /*0xB1*/ 0x00B1, // PLUS-MINUS SIGN
+ /*0xB2*/ 0x2264, // LESS-THAN OR EQUAL TO
+ /*0xB3*/ 0x2265, // GREATER-THAN OR EQUAL TO
+ /*0xB4*/ 0x00A5, // YEN SIGN
+ /*0xB5*/ 0x00B5, // MICRO SIGN
+ /*0xB6*/ 0x2202, // PARTIAL DIFFERENTIAL
+ /*0xB7*/ 0x2211, // N-ARY SUMMATION
+ /*0xB8*/ 0x220F, // N-ARY PRODUCT
+ /*0xB9*/ 0x03C0, // GREEK SMALL LETTER PI
+ /*0xBA*/ 0x222B, // INTEGRAL
+ /*0xBB*/ 0x00AA, // FEMININE ORDINAL INDICATOR
+ /*0xBC*/ 0x00BA, // MASCULINE ORDINAL INDICATOR
+ /*0xBD*/ 0x03A9, // GREEK CAPITAL LETTER OMEGA
+ /*0xBE*/ 0x00E6, // LATIN SMALL LETTER AE
+ /*0xBF*/ 0x00F8, // LATIN SMALL LETTER O WITH STROKE
+ /*0xC0*/ 0x00BF, // INVERTED QUESTION MARK
+ /*0xC1*/ 0x00A1, // INVERTED EXCLAMATION MARK
+ /*0xC2*/ 0x00AC, // NOT SIGN
+ /*0xC3*/ 0x221A, // SQUARE ROOT
+ /*0xC4*/ 0x0192, // LATIN SMALL LETTER F WITH HOOK
+ /*0xC5*/ 0x2248, // ALMOST EQUAL TO
+ /*0xC6*/ 0x2206, // INCREMENT
+ /*0xC7*/ 0x00AB, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+ /*0xC8*/ 0x00BB, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+ /*0xC9*/ 0x2026, // HORIZONTAL ELLIPSIS
+ /*0xCA*/ 0x00A0, // NO-BREAK SPACE
+ /*0xCB*/ 0x00C0, // LATIN CAPITAL LETTER A WITH GRAVE
+ /*0xCC*/ 0x00C3, // LATIN CAPITAL LETTER A WITH TILDE
+ /*0xCD*/ 0x00D5, // LATIN CAPITAL LETTER O WITH TILDE
+ /*0xCE*/ 0x0152, // LATIN CAPITAL LIGATURE OE
+ /*0xCF*/ 0x0153, // LATIN SMALL LIGATURE OE
+ /*0xD0*/ 0x2013, // EN DASH
+ /*0xD1*/ 0x2014, // EM DASH
+ /*0xD2*/ 0x201C, // LEFT DOUBLE QUOTATION MARK
+ /*0xD3*/ 0x201D, // RIGHT DOUBLE QUOTATION MARK
+ /*0xD4*/ 0x2018, // LEFT SINGLE QUOTATION MARK
+ /*0xD5*/ 0x2019, // RIGHT SINGLE QUOTATION MARK
+ /*0xD6*/ 0x00F7, // DIVISION SIGN
+ /*0xD7*/ 0x25CA, // LOZENGE
+ /*0xD8*/ 0x00FF, // LATIN SMALL LETTER Y WITH DIAERESIS
+ /*0xD9*/ 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS
+ /*0xDA*/ 0x2044, // FRACTION SLASH
+ /*0xDB*/ 0x00A4, // CURRENCY SIGN (was EURO SIGN)
+ /*0xDC*/ 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+ /*0xDD*/ 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+ /*0xDE*/ 0xFB01, // LATIN SMALL LIGATURE FI
+ /*0xDF*/ 0xFB02, // LATIN SMALL LIGATURE FL
+ /*0xE0*/ 0x2021, // DOUBLE DAGGER
+ /*0xE1*/ 0x00B7, // MIDDLE DOT
+ /*0xE2*/ 0x201A, // SINGLE LOW-9 QUOTATION MARK
+ /*0xE3*/ 0x201E, // DOUBLE LOW-9 QUOTATION MARK
+ /*0xE4*/ 0x2030, // PER MILLE SIGN
+ /*0xE5*/ 0x00C2, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
+ /*0xE6*/ 0x00CA, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
+ /*0xE7*/ 0x00C1, // LATIN CAPITAL LETTER A WITH ACUTE
+ /*0xE8*/ 0x00CB, // LATIN CAPITAL LETTER E WITH DIAERESIS
+ /*0xE9*/ 0x00C8, // LATIN CAPITAL LETTER E WITH GRAVE
+ /*0xEA*/ 0x00CD, // LATIN CAPITAL LETTER I WITH ACUTE
+ /*0xEB*/ 0x00CE, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX
+ /*0xEC*/ 0x00CF, // LATIN CAPITAL LETTER I WITH DIAERESIS
+ /*0xED*/ 0x00CC, // LATIN CAPITAL LETTER I WITH GRAVE
+ /*0xEE*/ 0x00D3, // LATIN CAPITAL LETTER O WITH ACUTE
+ /*0xEF*/ 0x00D4, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX
+ /*0xF0*/ 0xF8FF, // Apple logo
+ /*0xF1*/ 0x00D2, // LATIN CAPITAL LETTER O WITH GRAVE
+ /*0xF2*/ 0x00DA, // LATIN CAPITAL LETTER U WITH ACUTE
+ /*0xF3*/ 0x00DB, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX
+ /*0xF4*/ 0x00D9, // LATIN CAPITAL LETTER U WITH GRAVE
+ /*0xF5*/ 0x0131, // LATIN SMALL LETTER DOTLESS I
+ /*0xF6*/ 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT
+ /*0xF7*/ 0x02DC, // SMALL TILDE
+ /*0xF8*/ 0x00AF, // MACRON
+ /*0xF9*/ 0x02D8, // BREVE
+ /*0xFA*/ 0x02D9, // DOT ABOVE
+ /*0xFB*/ 0x02DA, // RING ABOVE
+ /*0xFC*/ 0x00B8, // CEDILLA
+ /*0xFD*/ 0x02DD, // DOUBLE ACUTE ACCENT
+ /*0xFE*/ 0x02DB, // OGONEK
+ /*0xFF*/ 0x02C7, // CARON
+};
+
+/*
+ * Debug-only sanity check on the contents of the conversion tables; this
+ * compiles to a no-op unless _DEBUG is defined.  No two characters should
+ * map to the same thing (kUnk is exempt in the CP1252 table).  This isn't
+ * vital, but if we want to have a reversible transformation someday, it'll
+ * make our lives easier then.  A duplicate is logged and trips an assert.
+ */
+/*static*/ void Charset::CheckGSCharConv(void)
+{
+#ifdef _DEBUG
+    bool* test = (bool*) calloc(65536, sizeof(bool));   // "seen" flags
+    if (test == NULL)
+        return;     // out of memory; the check is optional, so skip it
+    for (int i = 0; i < NELEM(kCP1252Conv); i++) {
+        if (test[kCP1252Conv[i]] && kCP1252Conv[i] != kUnk) {
+            LOGW("Character used twice: 0x%02x at %d (0x%02x)",
+                kCP1252Conv[i], i, i+128);
+            assert(false);
+        }
+        test[kCP1252Conv[i]] = true;
+    }
+    // The UTF-16 table covers all 256 codes, so its index is the MOR value.
+    memset(test, 0, 65536 * sizeof(bool));
+    for (int i = 0; i < NELEM(kUTF16Conv); i++) {
+        if (test[kUTF16Conv[i]]) {
+            LOGW("Character used twice: 0x%04x at %d (0x%02x)",
+                kUTF16Conv[i], i, i);
+            assert(false);
+        }
+        test[kUTF16Conv[i]] = true;
+    }
+
+    free(test);
+#endif
+}
diff --git a/reformat/Charset.h b/reformat/Charset.h
new file mode 100644
index 0000000..94808c3
--- /dev/null
+++ b/reformat/Charset.h
@@ -0,0 +1,52 @@
+/*
+ * CiderPress
+ * Copyright (C) 2015 by faddenSoft. All Rights Reserved.
+ * See the file LICENSE for distribution terms.
+ */
+#ifndef REFORMAT_CHARSET_H
+#define REFORMAT_CHARSET_H
+
+/*
+ * Character set conversions.
+ */
+class Charset {
+public:
+    // Map one Mac OS Roman byte (e.g. from a IIgs document) to the UTF-16
+    // code unit listed for it in kUTF16Conv.  Control characters are
+    // converted as well.  The transformation is reversible.
+    static uint16_t ConvertMacRomanToUTF16(uint8_t ch) {
+        const uint16_t utf16 = kUTF16Conv[ch];
+        return utf16;
+    }
+
+    // Map one Mac OS Roman byte to an 8-bit Windows CP1252 value.  Values
+    // below 128 pass through unchanged.  The transformation is NOT reversible.
+    static uint8_t ConvertMacRomanTo1252(uint8_t ch) {
+        const bool isHighHalf = (ch >= 128);
+        // kCP1252Conv only holds entries for the upper half (0x80-0xff).
+        return isHighHalf ? kCP1252Conv[ch-128] : ch;
+    }
+
+    // Convert a whole Mac OS Roman string to a Unicode (UTF-16) string.
+    static CString ConvertMORToUNI(const CStringA& strMOR)
+    {
+        // Every MOR character becomes exactly one BMP code point, so the
+        // output has the same number of characters as the input.
+        CString converted;
+        const int charCount = strMOR.GetLength();
+        WCHAR* dest = converted.GetBuffer(charCount);
+        for (int pos = 0; pos != charCount; ++pos)
+            dest[pos] = ConvertMacRomanToUTF16(strMOR[pos]);
+        converted.ReleaseBuffer(charCount);
+        return converted;
+    }
+
+    // Debug-build check of the conversion tables (defined in Charset.cpp).
+    static void CheckGSCharConv(void);
+
+private:
+    static const uint8_t kCP1252Conv[];     // indexed by (ch - 128)
+    static const uint16_t kUTF16Conv[];     // indexed by ch
+};
+
+#endif /*REFORMAT_CHARSET_H*/
diff --git a/reformat/ReformatBase.cpp b/reformat/ReformatBase.cpp
index d399298..af7f0c3 100644
--- a/reformat/ReformatBase.cpp
+++ b/reformat/ReformatBase.cpp
@@ -17,447 +17,6 @@
* ==========================================================================
*/
-/*
- * Convert Mac OS Roman to Windows CP1252.
- */
-const int kUnk = 0x3f; // for unmappable chars, use '?'
-
-/*static*/ const uint8_t ReformatText::kCP1252Conv[128] = {
- 0xc4, // 0x80 A + umlaut (diaeresis?)
- 0xc5, // 0x81 A + overcircle
- 0xc7, // 0x82 C + cedilla
- 0xc9, // 0x83 E + acute
- 0xd1, // 0x84 N + tilde
- 0xd6, // 0x85 O + umlaut
- 0xdc, // 0x86 U + umlaut
- 0xe1, // 0x87 a + acute
- 0xe0, // 0x88 a + grave
- 0xe2, // 0x89 a + circumflex
- 0xe4, // 0x8a a + umlaut
- 0xe3, // 0x8b a + tilde
- 0xe5, // 0x8c a + overcircle
- 0xe7, // 0x8d c + cedilla
- 0xe9, // 0x8e e + acute
- 0xe8, // 0x8f e + grave
- 0xea, // 0x90 e + circumflex
- 0xeb, // 0x91 e + umlaut
- 0xed, // 0x92 i + acute
- 0xec, // 0x93 i + grave
- 0xee, // 0x94 i + circumflex
- 0xef, // 0x95 i + umlaut
- 0xf1, // 0x96 n + tilde
- 0xf3, // 0x97 o + acute
- 0xf2, // 0x98 o + grave
- 0xf4, // 0x99 o + circumflex
- 0xf6, // 0x9a o + umlaut
- 0xf5, // 0x9b o + tilde
- 0xfa, // 0x9c u + acute
- 0xf9, // 0x9d u + grave
- 0xfb, // 0x9e u + circumflex
- 0xfc, // 0x9f u + umlaut
- 0x87, // 0xa0 double cross (dagger)
- 0xb0, // 0xa1 degrees
- 0xa2, // 0xa2 cents
- 0xa3, // 0xa3 pounds (UK$)
- 0xa7, // 0xa4 section start
- 0x95, // 0xa5 small square (bullet) [using fat bullet]
- 0xb6, // 0xa6 paragraph (pilcrow)
- 0xdf, // 0xa7 curly B (latin small letter sharp S)
- 0xae, // 0xa8 raised 'R' (registered)
- 0xa9, // 0xa9 raised 'C' (copyright)
- 0x99, // 0xaa raised 'TM' (trademark)
- 0xb4, // 0xab acute accent
- 0xa8, // 0xac umlaut (diaeresis)
- kUnk, // 0xad not-equal
- 0xc6, // 0xae merged AE
- 0xd8, // 0xaf O + slash (upper-case nil?)
- kUnk, // 0xb0 infinity
- 0xb1, // 0xb1 +/-
- kUnk, // 0xb2 <=
- kUnk, // 0xb3 >=
- 0xa5, // 0xb4 Yen (Japan$)
- 0xb5, // 0xb5 mu (micro)
- kUnk, // 0xb6 delta (partial differentiation) [could use D-bar 0xd0]
- kUnk, // 0xb7 epsilon (N-ary summation) [could use C-double-bar 0x80]
- kUnk, // 0xb8 PI (N-ary product)
- kUnk, // 0xb9 pi
- kUnk, // 0xba integral
- 0xaa, // 0xbb a underbar (feminine ordinal) [using raised a]
- 0xba, // 0xbc o underbar (masculine ordinal) [using raised o]
- kUnk, // 0xbd omega (Ohm)
- 0xe6, // 0xbe merged ae
- 0xf8, // 0xbf o + slash (lower-case NULL?)
- 0xbf, // 0xc0 upside-down question mark
- 0xa1, // 0xc1 upside-down exclamation point
- 0xac, // 0xc2 rotated L ("not" sign)
- 0xb7, // 0xc3 checkmark (square root) [using small bullet]
- 0x83, // 0xc4 script f
- kUnk, // 0xc5 approximately equal
- kUnk, // 0xc6 delta (triangle / increment)
- 0xab, // 0xc7 much less than
- 0xbb, // 0xc8 much greater than
- 0x85, // 0xc9 ellipsis
- 0xa0, // 0xca blank (sticky space)
- 0xc0, // 0xcb A + grave
- 0xc3, // 0xcc A + tilde
- 0xd5, // 0xcd O + tilde
- 0x8c, // 0xce merged OE
- 0x9c, // 0xcf merged oe
- 0x96, // 0xd0 short hyphen (en dash)
- 0x97, // 0xd1 long hyphen (em dash)
- 0x93, // 0xd2 smart double-quote start
- 0x94, // 0xd3 smart double-quote end
- 0x91, // 0xd4 smart single-quote start
- 0x92, // 0xd5 smart single-quote end
- 0xf7, // 0xd6 divide
- 0xa4, // 0xd7 diamond (lozenge) [using spiky circle]
- 0xff, // 0xd8 y + umlaut
- // [nothing below here is part of standard Windows-ASCII?]
- // remaining descriptions based on hfsutils' "charset.txt"
- kUnk, // 0xd9 Y + umlaut
- kUnk, // 0xda fraction slash
- kUnk, // 0xdb currency sign
- kUnk, // 0xdc single left-pointing angle quotation mark
- kUnk, // 0xdd single right-pointing angle quotation mark
- kUnk, // 0xde merged fi
- kUnk, // 0xdf merged FL
- kUnk, // 0xe0 double dagger
- kUnk, // 0xe1 middle dot
- kUnk, // 0xe2 single low-9 quotation mark
- kUnk, // 0xe3 double low-9 quotation mark
- kUnk, // 0xe4 per mille sign
- kUnk, // 0xe5 A + circumflex
- kUnk, // 0xe6 E + circumflex
- kUnk, // 0xe7 A + acute accent
- kUnk, // 0xe8 E + diaeresis
- kUnk, // 0xe9 E + grave accent
- kUnk, // 0xea I + acute accent
- kUnk, // 0xeb I + circumflex
- kUnk, // 0xec I + diaeresis
- kUnk, // 0xed I + grave accent
- kUnk, // 0xee O + acute accent
- kUnk, // 0xef O + circumflex
- kUnk, // 0xf0 apple logo
- kUnk, // 0xf1 O + grave accent
- kUnk, // 0xf2 U + acute accent
- kUnk, // 0xf3 U + circumflex
- kUnk, // 0xf4 U + grave accent
- kUnk, // 0xf5 i without dot
- kUnk, // 0xf6 modifier letter circumflex accent
- kUnk, // 0xf7 small tilde
- kUnk, // 0xf8 macron
- kUnk, // 0xf9 breve
- kUnk, // 0xfa dot above
- kUnk, // 0xfb ring above
- kUnk, // 0xfc cedilla
- kUnk, // 0xfd double acute accent
- kUnk, // 0xfe ogonek
- kUnk, // 0xff caron
-};
-
-/*
- * Convert Mac OS Roman to Unicode. Mapping comes from:
- *
- * http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT
- *
- * We use the "Control Pictures" block for the control characters
- * (0x00-0x1f, 0x7f).
- */
-/*static*/ const uint16_t ReformatText::kUTF16Conv[256] = {
- /*0x00*/ 0x2400, // [control] NULL
- /*0x01*/ 0x2401, // [control] START OF HEADING
- /*0x02*/ 0x2402, // [control] START OF TEXT
- /*0x03*/ 0x2403, // [control] END OF TEXT
- /*0x04*/ 0x2404, // [control] END OF TRANSMISSION
- /*0x05*/ 0x2405, // [control] ENQUIRY
- /*0x06*/ 0x2406, // [control] ACKNOWLEDGE
- /*0x07*/ 0x2407, // [control] BELL
- /*0x08*/ 0x2408, // [control] BACKSPACE
- /*0x09*/ 0x2409, // [control] HORIZONTAL TABULATION
- /*0x0a*/ 0x240a, // [control] LINE FEED
- /*0x0b*/ 0x240b, // [control] VERTICAL TABULATION
- /*0x0c*/ 0x240c, // [control] FORM FEED
- /*0x0d*/ 0x240d, // [control] CARRIAGE RETURN
- /*0x0e*/ 0x240e, // [control] SHIFT OUT
- /*0x0f*/ 0x240f, // [control] SHIFT IN
- /*0x10*/ 0x2410, // [control] DATA LINK ESCAPE
- /*0x11*/ 0x2411, // [control] DEVICE CONTROL ONE
- /*0x12*/ 0x2412, // [control] DEVICE CONTROL TWO
- /*0x13*/ 0x2413, // [control] DEVICE CONTROL THREE
- /*0x14*/ 0x2414, // [control] DEVICE CONTROL FOUR
- /*0x15*/ 0x2415, // [control] NEGATIVE ACKNOWLEDGE
- /*0x16*/ 0x2416, // [control] SYNCHRONOUS IDLE
- /*0x17*/ 0x2417, // [control] END OF TRANSMISSION BLOCK
- /*0x18*/ 0x2418, // [control] CANCEL
- /*0x19*/ 0x2419, // [control] END OF MEDIUM
- /*0x1a*/ 0x241a, // [control] SUBSTITUTE
- /*0x1b*/ 0x241b, // [control] ESCAPE
- /*0x1c*/ 0x241c, // [control] FILE SEPARATOR
- /*0x1d*/ 0x241d, // [control] GROUP SEPARATOR
- /*0x1e*/ 0x241e, // [control] RECORD SEPARATOR
- /*0x1f*/ 0x241f, // [control] UNIT SEPARATOR
- /*0x20*/ 0x0020, // SPACE
- /*0x21*/ 0x0021, // EXCLAMATION MARK
- /*0x22*/ 0x0022, // QUOTATION MARK
- /*0x23*/ 0x0023, // NUMBER SIGN
- /*0x24*/ 0x0024, // DOLLAR SIGN
- /*0x25*/ 0x0025, // PERCENT SIGN
- /*0x26*/ 0x0026, // AMPERSAND
- /*0x27*/ 0x0027, // APOSTROPHE
- /*0x28*/ 0x0028, // LEFT PARENTHESIS
- /*0x29*/ 0x0029, // RIGHT PARENTHESIS
- /*0x2A*/ 0x002A, // ASTERISK
- /*0x2B*/ 0x002B, // PLUS SIGN
- /*0x2C*/ 0x002C, // COMMA
- /*0x2D*/ 0x002D, // HYPHEN-MINUS
- /*0x2E*/ 0x002E, // FULL STOP
- /*0x2F*/ 0x002F, // SOLIDUS
- /*0x30*/ 0x0030, // DIGIT ZERO
- /*0x31*/ 0x0031, // DIGIT ONE
- /*0x32*/ 0x0032, // DIGIT TWO
- /*0x33*/ 0x0033, // DIGIT THREE
- /*0x34*/ 0x0034, // DIGIT FOUR
- /*0x35*/ 0x0035, // DIGIT FIVE
- /*0x36*/ 0x0036, // DIGIT SIX
- /*0x37*/ 0x0037, // DIGIT SEVEN
- /*0x38*/ 0x0038, // DIGIT EIGHT
- /*0x39*/ 0x0039, // DIGIT NINE
- /*0x3A*/ 0x003A, // COLON
- /*0x3B*/ 0x003B, // SEMICOLON
- /*0x3C*/ 0x003C, // LESS-THAN SIGN
- /*0x3D*/ 0x003D, // EQUALS SIGN
- /*0x3E*/ 0x003E, // GREATER-THAN SIGN
- /*0x3F*/ 0x003F, // QUESTION MARK
- /*0x40*/ 0x0040, // COMMERCIAL AT
- /*0x41*/ 0x0041, // LATIN CAPITAL LETTER A
- /*0x42*/ 0x0042, // LATIN CAPITAL LETTER B
- /*0x43*/ 0x0043, // LATIN CAPITAL LETTER C
- /*0x44*/ 0x0044, // LATIN CAPITAL LETTER D
- /*0x45*/ 0x0045, // LATIN CAPITAL LETTER E
- /*0x46*/ 0x0046, // LATIN CAPITAL LETTER F
- /*0x47*/ 0x0047, // LATIN CAPITAL LETTER G
- /*0x48*/ 0x0048, // LATIN CAPITAL LETTER H
- /*0x49*/ 0x0049, // LATIN CAPITAL LETTER I
- /*0x4A*/ 0x004A, // LATIN CAPITAL LETTER J
- /*0x4B*/ 0x004B, // LATIN CAPITAL LETTER K
- /*0x4C*/ 0x004C, // LATIN CAPITAL LETTER L
- /*0x4D*/ 0x004D, // LATIN CAPITAL LETTER M
- /*0x4E*/ 0x004E, // LATIN CAPITAL LETTER N
- /*0x4F*/ 0x004F, // LATIN CAPITAL LETTER O
- /*0x50*/ 0x0050, // LATIN CAPITAL LETTER P
- /*0x51*/ 0x0051, // LATIN CAPITAL LETTER Q
- /*0x52*/ 0x0052, // LATIN CAPITAL LETTER R
- /*0x53*/ 0x0053, // LATIN CAPITAL LETTER S
- /*0x54*/ 0x0054, // LATIN CAPITAL LETTER T
- /*0x55*/ 0x0055, // LATIN CAPITAL LETTER U
- /*0x56*/ 0x0056, // LATIN CAPITAL LETTER V
- /*0x57*/ 0x0057, // LATIN CAPITAL LETTER W
- /*0x58*/ 0x0058, // LATIN CAPITAL LETTER X
- /*0x59*/ 0x0059, // LATIN CAPITAL LETTER Y
- /*0x5A*/ 0x005A, // LATIN CAPITAL LETTER Z
- /*0x5B*/ 0x005B, // LEFT SQUARE BRACKET
- /*0x5C*/ 0x005C, // REVERSE SOLIDUS
- /*0x5D*/ 0x005D, // RIGHT SQUARE BRACKET
- /*0x5E*/ 0x005E, // CIRCUMFLEX ACCENT
- /*0x5F*/ 0x005F, // LOW LINE
- /*0x60*/ 0x0060, // GRAVE ACCENT
- /*0x61*/ 0x0061, // LATIN SMALL LETTER A
- /*0x62*/ 0x0062, // LATIN SMALL LETTER B
- /*0x63*/ 0x0063, // LATIN SMALL LETTER C
- /*0x64*/ 0x0064, // LATIN SMALL LETTER D
- /*0x65*/ 0x0065, // LATIN SMALL LETTER E
- /*0x66*/ 0x0066, // LATIN SMALL LETTER F
- /*0x67*/ 0x0067, // LATIN SMALL LETTER G
- /*0x68*/ 0x0068, // LATIN SMALL LETTER H
- /*0x69*/ 0x0069, // LATIN SMALL LETTER I
- /*0x6A*/ 0x006A, // LATIN SMALL LETTER J
- /*0x6B*/ 0x006B, // LATIN SMALL LETTER K
- /*0x6C*/ 0x006C, // LATIN SMALL LETTER L
- /*0x6D*/ 0x006D, // LATIN SMALL LETTER M
- /*0x6E*/ 0x006E, // LATIN SMALL LETTER N
- /*0x6F*/ 0x006F, // LATIN SMALL LETTER O
- /*0x70*/ 0x0070, // LATIN SMALL LETTER P
- /*0x71*/ 0x0071, // LATIN SMALL LETTER Q
- /*0x72*/ 0x0072, // LATIN SMALL LETTER R
- /*0x73*/ 0x0073, // LATIN SMALL LETTER S
- /*0x74*/ 0x0074, // LATIN SMALL LETTER T
- /*0x75*/ 0x0075, // LATIN SMALL LETTER U
- /*0x76*/ 0x0076, // LATIN SMALL LETTER V
- /*0x77*/ 0x0077, // LATIN SMALL LETTER W
- /*0x78*/ 0x0078, // LATIN SMALL LETTER X
- /*0x79*/ 0x0079, // LATIN SMALL LETTER Y
- /*0x7A*/ 0x007A, // LATIN SMALL LETTER Z
- /*0x7B*/ 0x007B, // LEFT CURLY BRACKET
- /*0x7C*/ 0x007C, // VERTICAL LINE
- /*0x7D*/ 0x007D, // RIGHT CURLY BRACKET
- /*0x7E*/ 0x007E, // TILDE
- /*0x7f*/ 0x2421, // [control] DELETE
- /*0x80*/ 0x00C4, // LATIN CAPITAL LETTER A WITH DIAERESIS
- /*0x81*/ 0x00C5, // LATIN CAPITAL LETTER A WITH RING ABOVE
- /*0x82*/ 0x00C7, // LATIN CAPITAL LETTER C WITH CEDILLA
- /*0x83*/ 0x00C9, // LATIN CAPITAL LETTER E WITH ACUTE
- /*0x84*/ 0x00D1, // LATIN CAPITAL LETTER N WITH TILDE
- /*0x85*/ 0x00D6, // LATIN CAPITAL LETTER O WITH DIAERESIS
- /*0x86*/ 0x00DC, // LATIN CAPITAL LETTER U WITH DIAERESIS
- /*0x87*/ 0x00E1, // LATIN SMALL LETTER A WITH ACUTE
- /*0x88*/ 0x00E0, // LATIN SMALL LETTER A WITH GRAVE
- /*0x89*/ 0x00E2, // LATIN SMALL LETTER A WITH CIRCUMFLEX
- /*0x8A*/ 0x00E4, // LATIN SMALL LETTER A WITH DIAERESIS
- /*0x8B*/ 0x00E3, // LATIN SMALL LETTER A WITH TILDE
- /*0x8C*/ 0x00E5, // LATIN SMALL LETTER A WITH RING ABOVE
- /*0x8D*/ 0x00E7, // LATIN SMALL LETTER C WITH CEDILLA
- /*0x8E*/ 0x00E9, // LATIN SMALL LETTER E WITH ACUTE
- /*0x8F*/ 0x00E8, // LATIN SMALL LETTER E WITH GRAVE
- /*0x90*/ 0x00EA, // LATIN SMALL LETTER E WITH CIRCUMFLEX
- /*0x91*/ 0x00EB, // LATIN SMALL LETTER E WITH DIAERESIS
- /*0x92*/ 0x00ED, // LATIN SMALL LETTER I WITH ACUTE
- /*0x93*/ 0x00EC, // LATIN SMALL LETTER I WITH GRAVE
- /*0x94*/ 0x00EE, // LATIN SMALL LETTER I WITH CIRCUMFLEX
- /*0x95*/ 0x00EF, // LATIN SMALL LETTER I WITH DIAERESIS
- /*0x96*/ 0x00F1, // LATIN SMALL LETTER N WITH TILDE
- /*0x97*/ 0x00F3, // LATIN SMALL LETTER O WITH ACUTE
- /*0x98*/ 0x00F2, // LATIN SMALL LETTER O WITH GRAVE
- /*0x99*/ 0x00F4, // LATIN SMALL LETTER O WITH CIRCUMFLEX
- /*0x9A*/ 0x00F6, // LATIN SMALL LETTER O WITH DIAERESIS
- /*0x9B*/ 0x00F5, // LATIN SMALL LETTER O WITH TILDE
- /*0x9C*/ 0x00FA, // LATIN SMALL LETTER U WITH ACUTE
- /*0x9D*/ 0x00F9, // LATIN SMALL LETTER U WITH GRAVE
- /*0x9E*/ 0x00FB, // LATIN SMALL LETTER U WITH CIRCUMFLEX
- /*0x9F*/ 0x00FC, // LATIN SMALL LETTER U WITH DIAERESIS
- /*0xA0*/ 0x2020, // DAGGER
- /*0xA1*/ 0x00B0, // DEGREE SIGN
- /*0xA2*/ 0x00A2, // CENT SIGN
- /*0xA3*/ 0x00A3, // POUND SIGN
- /*0xA4*/ 0x00A7, // SECTION SIGN
- /*0xA5*/ 0x2022, // BULLET
- /*0xA6*/ 0x00B6, // PILCROW SIGN
- /*0xA7*/ 0x00DF, // LATIN SMALL LETTER SHARP S
- /*0xA8*/ 0x00AE, // REGISTERED SIGN
- /*0xA9*/ 0x00A9, // COPYRIGHT SIGN
- /*0xAA*/ 0x2122, // TRADE MARK SIGN
- /*0xAB*/ 0x00B4, // ACUTE ACCENT
- /*0xAC*/ 0x00A8, // DIAERESIS
- /*0xAD*/ 0x2260, // NOT EQUAL TO
- /*0xAE*/ 0x00C6, // LATIN CAPITAL LETTER AE
- /*0xAF*/ 0x00D8, // LATIN CAPITAL LETTER O WITH STROKE
- /*0xB0*/ 0x221E, // INFINITY
- /*0xB1*/ 0x00B1, // PLUS-MINUS SIGN
- /*0xB2*/ 0x2264, // LESS-THAN OR EQUAL TO
- /*0xB3*/ 0x2265, // GREATER-THAN OR EQUAL TO
- /*0xB4*/ 0x00A5, // YEN SIGN
- /*0xB5*/ 0x00B5, // MICRO SIGN
- /*0xB6*/ 0x2202, // PARTIAL DIFFERENTIAL
- /*0xB7*/ 0x2211, // N-ARY SUMMATION
- /*0xB8*/ 0x220F, // N-ARY PRODUCT
- /*0xB9*/ 0x03C0, // GREEK SMALL LETTER PI
- /*0xBA*/ 0x222B, // INTEGRAL
- /*0xBB*/ 0x00AA, // FEMININE ORDINAL INDICATOR
- /*0xBC*/ 0x00BA, // MASCULINE ORDINAL INDICATOR
- /*0xBD*/ 0x03A9, // GREEK CAPITAL LETTER OMEGA
- /*0xBE*/ 0x00E6, // LATIN SMALL LETTER AE
- /*0xBF*/ 0x00F8, // LATIN SMALL LETTER O WITH STROKE
- /*0xC0*/ 0x00BF, // INVERTED QUESTION MARK
- /*0xC1*/ 0x00A1, // INVERTED EXCLAMATION MARK
- /*0xC2*/ 0x00AC, // NOT SIGN
- /*0xC3*/ 0x221A, // SQUARE ROOT
- /*0xC4*/ 0x0192, // LATIN SMALL LETTER F WITH HOOK
- /*0xC5*/ 0x2248, // ALMOST EQUAL TO
- /*0xC6*/ 0x2206, // INCREMENT
- /*0xC7*/ 0x00AB, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
- /*0xC8*/ 0x00BB, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
- /*0xC9*/ 0x2026, // HORIZONTAL ELLIPSIS
- /*0xCA*/ 0x00A0, // NO-BREAK SPACE
- /*0xCB*/ 0x00C0, // LATIN CAPITAL LETTER A WITH GRAVE
- /*0xCC*/ 0x00C3, // LATIN CAPITAL LETTER A WITH TILDE
- /*0xCD*/ 0x00D5, // LATIN CAPITAL LETTER O WITH TILDE
- /*0xCE*/ 0x0152, // LATIN CAPITAL LIGATURE OE
- /*0xCF*/ 0x0153, // LATIN SMALL LIGATURE OE
- /*0xD0*/ 0x2013, // EN DASH
- /*0xD1*/ 0x2014, // EM DASH
- /*0xD2*/ 0x201C, // LEFT DOUBLE QUOTATION MARK
- /*0xD3*/ 0x201D, // RIGHT DOUBLE QUOTATION MARK
- /*0xD4*/ 0x2018, // LEFT SINGLE QUOTATION MARK
- /*0xD5*/ 0x2019, // RIGHT SINGLE QUOTATION MARK
- /*0xD6*/ 0x00F7, // DIVISION SIGN
- /*0xD7*/ 0x25CA, // LOZENGE
- /*0xD8*/ 0x00FF, // LATIN SMALL LETTER Y WITH DIAERESIS
- /*0xD9*/ 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS
- /*0xDA*/ 0x2044, // FRACTION SLASH
- /*0xDB*/ 0x00A4, // CURRENCY SIGN (was EURO SIGN)
- /*0xDC*/ 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
- /*0xDD*/ 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
- /*0xDE*/ 0xFB01, // LATIN SMALL LIGATURE FI
- /*0xDF*/ 0xFB02, // LATIN SMALL LIGATURE FL
- /*0xE0*/ 0x2021, // DOUBLE DAGGER
- /*0xE1*/ 0x00B7, // MIDDLE DOT
- /*0xE2*/ 0x201A, // SINGLE LOW-9 QUOTATION MARK
- /*0xE3*/ 0x201E, // DOUBLE LOW-9 QUOTATION MARK
- /*0xE4*/ 0x2030, // PER MILLE SIGN
- /*0xE5*/ 0x00C2, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
- /*0xE6*/ 0x00CA, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
- /*0xE7*/ 0x00C1, // LATIN CAPITAL LETTER A WITH ACUTE
- /*0xE8*/ 0x00CB, // LATIN CAPITAL LETTER E WITH DIAERESIS
- /*0xE9*/ 0x00C8, // LATIN CAPITAL LETTER E WITH GRAVE
- /*0xEA*/ 0x00CD, // LATIN CAPITAL LETTER I WITH ACUTE
- /*0xEB*/ 0x00CE, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX
- /*0xEC*/ 0x00CF, // LATIN CAPITAL LETTER I WITH DIAERESIS
- /*0xED*/ 0x00CC, // LATIN CAPITAL LETTER I WITH GRAVE
- /*0xEE*/ 0x00D3, // LATIN CAPITAL LETTER O WITH ACUTE
- /*0xEF*/ 0x00D4, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX
- /*0xF0*/ 0xF8FF, // Apple logo
- /*0xF1*/ 0x00D2, // LATIN CAPITAL LETTER O WITH GRAVE
- /*0xF2*/ 0x00DA, // LATIN CAPITAL LETTER U WITH ACUTE
- /*0xF3*/ 0x00DB, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX
- /*0xF4*/ 0x00D9, // LATIN CAPITAL LETTER U WITH GRAVE
- /*0xF5*/ 0x0131, // LATIN SMALL LETTER DOTLESS I
- /*0xF6*/ 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT
- /*0xF7*/ 0x02DC, // SMALL TILDE
- /*0xF8*/ 0x00AF, // MACRON
- /*0xF9*/ 0x02D8, // BREVE
- /*0xFA*/ 0x02D9, // DOT ABOVE
- /*0xFB*/ 0x02DA, // RING ABOVE
- /*0xFC*/ 0x00B8, // CEDILLA
- /*0xFD*/ 0x02DD, // DOUBLE ACUTE ACCENT
- /*0xFE*/ 0x02DB, // OGONEK
- /*0xFF*/ 0x02C7, // CARON
-};
-
-/*
- * Quick sanity check on contents of array.
- *
- * No two characters should map to the same thing. This isn't vital, but
- * if we want to have a reversible transformation someday, it'll make our
- * lives easier then.
- */
-void ReformatText::CheckGSCharConv(void)
-{
-#ifdef _DEBUG
- bool* test = (bool*) malloc(65536 * sizeof(bool));
-
- memset(test, 0, 65536 * sizeof(bool));
- for (int i = 0; i < NELEM(kCP1252Conv); i++) {
- if (test[kCP1252Conv[i]] && kCP1252Conv[i] != kUnk) {
- LOGW("Character used twice: 0x%02x at %d (0x%02x)",
- kCP1252Conv[i], i, i+128);
- assert(false);
- }
- test[kCP1252Conv[i]] = true;
- }
-
- memset(test, 0, 65536 * sizeof(bool));
- for (int i = 0; i < NELEM(kUTF16Conv); i++) {
- if (test[kUTF16Conv[i]]) {
- LOGW("Character used twice: 0x%02x at %d (0x%02x)",
- kUTF16Conv[i], i, i+128);
- assert(false);
- }
- test[kUTF16Conv[i]] = true;
- }
-
- free(test);
-#endif
-}
-
/*
* Set the output format and buffer.
*
diff --git a/reformat/ReformatBase.h b/reformat/ReformatBase.h
index d00f2ed..d52c28d 100644
--- a/reformat/ReformatBase.h
+++ b/reformat/ReformatBase.h
@@ -11,10 +11,11 @@
* that, but we'd have to figure out what that means when extracting a file
* (i.e. figure out the RTF embedded bitmap format).
*/
-#ifndef REFORMAT_REFORMATBASE
-#define REFORMAT_REFORMATBASE
+#ifndef REFORMAT_REFORMATBASE_H
+#define REFORMAT_REFORMATBASE_H
#include "Reformat.h"
+#include "Charset.h"
#define BufPrintf fExpBuf.Printf
@@ -290,13 +291,6 @@ public:
kRTFFlagColorTable = 1, // include color table
};
- // Convert a Mac OS Roman character value (from a IIgs document) to
- // its UTF-16 Unicode equivalent. This also includes a conversion
- // for the control characters.
- static uint16_t ConvertMacRomanToUTF16(uint8_t ch) {
- return kUTF16Conv[ch];
- }
-
protected:
void RTFBegin(int flags = 0);
void RTFEnd(void);
@@ -387,25 +381,11 @@ protected:
fExpBuf.Printf("%c", ch);
}
- // Convert a Mac OS Roman character value (from a IIgs document) to
- // an 8-bit Windows CP1252 equivalent.
- static uint8_t ConvertMacRomanTo1252(uint8_t ch) {
- if (ch < 128)
- return ch;
- else
- return kCP1252Conv[ch-128];
- }
-
- void CheckGSCharConv(void);
-
private:
DECLARE_COPY_AND_OPEQ(ReformatText)
int CreateWorkBuf(void);
enum { kRTFUnitsPerInch = 1440 }; // TWIPS
- static const uint8_t kCP1252Conv[];
- static const uint16_t kUTF16Conv[];
-
int fLeftMargin, fRightMargin; // for documents, in 1/10th inch
int fPointSize;
int fPreMultPointSize;
@@ -421,4 +401,4 @@ private:
TextColor fTextColor;
};
-#endif /*REFORMAT_REFORMATBASE*/
+#endif /*REFORMAT_REFORMATBASE_H*/
diff --git a/reformat/Teach.cpp b/reformat/Teach.cpp
index 21f657e..82436e9 100644
--- a/reformat/Teach.cpp
+++ b/reformat/Teach.cpp
@@ -47,7 +47,7 @@ int ReformatGWP::Process(const ReformatHolder* pHolder,
long srcLen = pHolder->GetSourceLen(part);
fUseRTF = false;
- CheckGSCharConv();
+ Charset::CheckGSCharConv();
RTFBegin();
/* convert EOL markers and IIgs characters */
@@ -67,7 +67,7 @@ int ReformatGWP::Process(const ReformatHolder* pHolder,
BufPrintf("\r\n");
} else {
// RTF is always off, so just use BufPrintf
- BufPrintf("%c", ConvertMacRomanTo1252(ch));
+ BufPrintf("%c", Charset::ConvertMacRomanTo1252(ch));
}
}
@@ -124,7 +124,7 @@ int ReformatTeach::Process(const ReformatHolder* pHolder,
LOGI("Teach reformatter missing one fork of the file");
return -1;
}
- CheckGSCharConv();
+ Charset::CheckGSCharConv();
/* find the rStyleBlock */
if (!ReformatResourceFork::GetResource(rsrcBuf, rsrcLen, 0x8012, 0x0001,
@@ -206,7 +206,7 @@ int ReformatTeach::Process(const ReformatHolder* pHolder,
} else if (uch == '\t') {
RTFTab();
} else {
- RTFPrintUTF16Char(ConvertMacRomanToUTF16(uch));
+ RTFPrintUTF16Char(Charset::ConvertMacRomanToUTF16(uch));
}
dataBuf++;
dataLen--;
diff --git a/reformat/reformat.vcxproj b/reformat/reformat.vcxproj
index 176a023..1b94811 100644
--- a/reformat/reformat.vcxproj
+++ b/reformat/reformat.vcxproj
@@ -108,6 +108,7 @@
+
@@ -130,6 +131,7 @@
+
diff --git a/reformat/reformat.vcxproj.filters b/reformat/reformat.vcxproj.filters
index fd56a68..0da9e0e 100644
--- a/reformat/reformat.vcxproj.filters
+++ b/reformat/reformat.vcxproj.filters
@@ -71,6 +71,9 @@
Header Files
+
+ Header Files
+
@@ -142,5 +145,8 @@
Source Files
+
+ Source Files
+
\ No newline at end of file