#include <algorithm>
#include <cstdint>
#include <string>

/**
 * Map one Unicode code point (BMP only) to its Mac OS Roman byte.
 *
 * Only the upper half of Mac OS Roman (0x80-0xFF) is covered by the table.
 * Any code point that is not in the table -- including the ASCII range
 * 0x00-0x7F -- yields 0, so callers must handle ASCII themselves.
 *
 * @param c Unicode code point to look up.
 * @return  The Mac OS Roman byte (0x80-0xFF), or 0 if `c` is not in the table.
 */
unsigned char unicode_to_macroman(uint16_t c) {
    // Entry i holds the Unicode code point for Mac OS Roman byte 0x80 + i.
    // http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT
    static constexpr uint16_t kHighHalf[] = {
        0x00C4, // 0x80 LATIN CAPITAL LETTER A WITH DIAERESIS
        0x00C5, // 0x81 LATIN CAPITAL LETTER A WITH RING ABOVE
        0x00C7, // 0x82 LATIN CAPITAL LETTER C WITH CEDILLA
        0x00C9, // 0x83 LATIN CAPITAL LETTER E WITH ACUTE
        0x00D1, // 0x84 LATIN CAPITAL LETTER N WITH TILDE
        0x00D6, // 0x85 LATIN CAPITAL LETTER O WITH DIAERESIS
        0x00DC, // 0x86 LATIN CAPITAL LETTER U WITH DIAERESIS
        0x00E1, // 0x87 LATIN SMALL LETTER A WITH ACUTE
        0x00E0, // 0x88 LATIN SMALL LETTER A WITH GRAVE
        0x00E2, // 0x89 LATIN SMALL LETTER A WITH CIRCUMFLEX
        0x00E4, // 0x8A LATIN SMALL LETTER A WITH DIAERESIS
        0x00E3, // 0x8B LATIN SMALL LETTER A WITH TILDE
        0x00E5, // 0x8C LATIN SMALL LETTER A WITH RING ABOVE
        0x00E7, // 0x8D LATIN SMALL LETTER C WITH CEDILLA
        0x00E9, // 0x8E LATIN SMALL LETTER E WITH ACUTE
        0x00E8, // 0x8F LATIN SMALL LETTER E WITH GRAVE
        0x00EA, // 0x90 LATIN SMALL LETTER E WITH CIRCUMFLEX
        0x00EB, // 0x91 LATIN SMALL LETTER E WITH DIAERESIS
        0x00ED, // 0x92 LATIN SMALL LETTER I WITH ACUTE
        0x00EC, // 0x93 LATIN SMALL LETTER I WITH GRAVE
        0x00EE, // 0x94 LATIN SMALL LETTER I WITH CIRCUMFLEX
        0x00EF, // 0x95 LATIN SMALL LETTER I WITH DIAERESIS
        0x00F1, // 0x96 LATIN SMALL LETTER N WITH TILDE
        0x00F3, // 0x97 LATIN SMALL LETTER O WITH ACUTE
        0x00F2, // 0x98 LATIN SMALL LETTER O WITH GRAVE
        0x00F4, // 0x99 LATIN SMALL LETTER O WITH CIRCUMFLEX
        0x00F6, // 0x9A LATIN SMALL LETTER O WITH DIAERESIS
        0x00F5, // 0x9B LATIN SMALL LETTER O WITH TILDE
        0x00FA, // 0x9C LATIN SMALL LETTER U WITH ACUTE
        0x00F9, // 0x9D LATIN SMALL LETTER U WITH GRAVE
        0x00FB, // 0x9E LATIN SMALL LETTER U WITH CIRCUMFLEX
        0x00FC, // 0x9F LATIN SMALL LETTER U WITH DIAERESIS
        0x2020, // 0xA0 DAGGER
        0x00B0, // 0xA1 DEGREE SIGN
        0x00A2, // 0xA2 CENT SIGN
        0x00A3, // 0xA3 POUND SIGN
        0x00A7, // 0xA4 SECTION SIGN
        0x2022, // 0xA5 BULLET
        0x00B6, // 0xA6 PILCROW SIGN
        0x00DF, // 0xA7 LATIN SMALL LETTER SHARP S
        0x00AE, // 0xA8 REGISTERED SIGN
        0x00A9, // 0xA9 COPYRIGHT SIGN
        0x2122, // 0xAA TRADE MARK SIGN
        0x00B4, // 0xAB ACUTE ACCENT
        0x00A8, // 0xAC DIAERESIS
        0x2260, // 0xAD NOT EQUAL TO
        0x00C6, // 0xAE LATIN CAPITAL LETTER AE
        0x00D8, // 0xAF LATIN CAPITAL LETTER O WITH STROKE
        0x221E, // 0xB0 INFINITY
        0x00B1, // 0xB1 PLUS-MINUS SIGN
        0x2264, // 0xB2 LESS-THAN OR EQUAL TO
        0x2265, // 0xB3 GREATER-THAN OR EQUAL TO
        0x00A5, // 0xB4 YEN SIGN
        0x00B5, // 0xB5 MICRO SIGN
        0x2202, // 0xB6 PARTIAL DIFFERENTIAL
        0x2211, // 0xB7 N-ARY SUMMATION
        0x220F, // 0xB8 N-ARY PRODUCT
        0x03C0, // 0xB9 GREEK SMALL LETTER PI
        0x222B, // 0xBA INTEGRAL
        0x00AA, // 0xBB FEMININE ORDINAL INDICATOR
        0x00BA, // 0xBC MASCULINE ORDINAL INDICATOR
        0x03A9, // 0xBD GREEK CAPITAL LETTER OMEGA
        0x00E6, // 0xBE LATIN SMALL LETTER AE
        0x00F8, // 0xBF LATIN SMALL LETTER O WITH STROKE
        0x00BF, // 0xC0 INVERTED QUESTION MARK
        0x00A1, // 0xC1 INVERTED EXCLAMATION MARK
        0x00AC, // 0xC2 NOT SIGN
        0x221A, // 0xC3 SQUARE ROOT
        0x0192, // 0xC4 LATIN SMALL LETTER F WITH HOOK
        0x2248, // 0xC5 ALMOST EQUAL TO
        0x2206, // 0xC6 INCREMENT
        0x00AB, // 0xC7 LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
        0x00BB, // 0xC8 RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
        0x2026, // 0xC9 HORIZONTAL ELLIPSIS
        0x00A0, // 0xCA NO-BREAK SPACE
        0x00C0, // 0xCB LATIN CAPITAL LETTER A WITH GRAVE
        0x00C3, // 0xCC LATIN CAPITAL LETTER A WITH TILDE
        0x00D5, // 0xCD LATIN CAPITAL LETTER O WITH TILDE
        0x0152, // 0xCE LATIN CAPITAL LIGATURE OE
        0x0153, // 0xCF LATIN SMALL LIGATURE OE
        0x2013, // 0xD0 EN DASH
        0x2014, // 0xD1 EM DASH
        0x201C, // 0xD2 LEFT DOUBLE QUOTATION MARK
        0x201D, // 0xD3 RIGHT DOUBLE QUOTATION MARK
        0x2018, // 0xD4 LEFT SINGLE QUOTATION MARK
        0x2019, // 0xD5 RIGHT SINGLE QUOTATION MARK
        0x00F7, // 0xD6 DIVISION SIGN
        0x25CA, // 0xD7 LOZENGE
        0x00FF, // 0xD8 LATIN SMALL LETTER Y WITH DIAERESIS
        0x0178, // 0xD9 LATIN CAPITAL LETTER Y WITH DIAERESIS
        0x2044, // 0xDA FRACTION SLASH
        0x20AC, // 0xDB EURO SIGN
        0x2039, // 0xDC SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        0x203A, // 0xDD SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        0xFB01, // 0xDE LATIN SMALL LIGATURE FI
        0xFB02, // 0xDF LATIN SMALL LIGATURE FL
        0x2021, // 0xE0 DOUBLE DAGGER
        0x00B7, // 0xE1 MIDDLE DOT
        0x201A, // 0xE2 SINGLE LOW-9 QUOTATION MARK
        0x201E, // 0xE3 DOUBLE LOW-9 QUOTATION MARK
        0x2030, // 0xE4 PER MILLE SIGN
        0x00C2, // 0xE5 LATIN CAPITAL LETTER A WITH CIRCUMFLEX
        0x00CA, // 0xE6 LATIN CAPITAL LETTER E WITH CIRCUMFLEX
        0x00C1, // 0xE7 LATIN CAPITAL LETTER A WITH ACUTE
        0x00CB, // 0xE8 LATIN CAPITAL LETTER E WITH DIAERESIS
        0x00C8, // 0xE9 LATIN CAPITAL LETTER E WITH GRAVE
        0x00CD, // 0xEA LATIN CAPITAL LETTER I WITH ACUTE
        0x00CE, // 0xEB LATIN CAPITAL LETTER I WITH CIRCUMFLEX
        0x00CF, // 0xEC LATIN CAPITAL LETTER I WITH DIAERESIS
        0x00CC, // 0xED LATIN CAPITAL LETTER I WITH GRAVE
        0x00D3, // 0xEE LATIN CAPITAL LETTER O WITH ACUTE
        0x00D4, // 0xEF LATIN CAPITAL LETTER O WITH CIRCUMFLEX
        0xF8FF, // 0xF0 Apple logo
        0x00D2, // 0xF1 LATIN CAPITAL LETTER O WITH GRAVE
        0x00DA, // 0xF2 LATIN CAPITAL LETTER U WITH ACUTE
        0x00DB, // 0xF3 LATIN CAPITAL LETTER U WITH CIRCUMFLEX
        0x00D9, // 0xF4 LATIN CAPITAL LETTER U WITH GRAVE
        0x0131, // 0xF5 LATIN SMALL LETTER DOTLESS I
        0x02C6, // 0xF6 MODIFIER LETTER CIRCUMFLEX ACCENT
        0x02DC, // 0xF7 SMALL TILDE
        0x00AF, // 0xF8 MACRON
        0x02D8, // 0xF9 BREVE
        0x02D9, // 0xFA DOT ABOVE
        0x02DA, // 0xFB RING ABOVE
        0x00B8, // 0xFC CEDILLA
        0x02DD, // 0xFD DOUBLE ACUTE ACCENT
        0x02DB, // 0xFE OGONEK
        0x02C7, // 0xFF CARON
    };
    constexpr unsigned kCount = sizeof(kHighHalf) / sizeof(kHighHalf[0]);
    static_assert(kCount == 128, "table must cover exactly 0x80-0xFF");

    // First match wins, mirroring the table order.
    for (unsigned i = 0; i < kCount; ++i) {
        if (kHighHalf[i] == c)
            return static_cast<unsigned char>(0x80 + i);
    }
    return 0;
}

/**
 * Convert a UTF-8 encoded string to Mac OS Roman.
 *
 * ASCII bytes are copied through unchanged. Each multi-byte sequence is
 * decoded and looked up with unicode_to_macroman(); characters with no
 * Mac OS Roman equivalent are dropped from the output. Malformed input is
 * handled on a best-effort basis: stray continuation bytes, invalid lead
 * bytes and sequences cut short (by a non-continuation byte or by the end of
 * the string) are dropped. Overlong encodings are not rejected.
 *
 * @param s UTF-8 input.
 * @return  The Mac OS Roman encoded string; never longer than `s`.
 */
std::string utf8_to_macroman(const std::string &s) {
    // Fast path: a string with no high-bit bytes needs no conversion.
    if (std::find_if(s.begin(), s.end(),
                     [](unsigned char c) { return (c & 0x80) != 0; }) == s.end())
        return s;

    std::string rv;
    rv.reserve(s.size());

    unsigned remaining = 0;  // continuation bytes still expected
    uint32_t cp = 0;         // 32 bits so 4-byte sequences cannot wrap around

    for (unsigned char c : s) {
        if (remaining != 0) {
            if ((c & 0b11000000) == 0b10000000) {
                cp = (cp << 6) | (c & 0b00111111u);
                if (--remaining == 0 && cp <= 0xFFFF) {
                    // Code points above the BMP have no Mac OS Roman mapping
                    // and must not be truncated into the 16-bit lookup.
                    const unsigned char m =
                        unicode_to_macroman(static_cast<uint16_t>(cp));
                    if (m)
                        rv.push_back(static_cast<char>(m));
                }
                continue;
            }
            // Not a continuation byte: abandon the broken sequence and fall
            // through so this byte is interpreted as the start of a new one.
            remaining = 0;
        }

        if (c <= 0x7f) {
            rv.push_back(static_cast<char>(c));
        } else if ((c & 0b11100000) == 0b11000000) {
            cp = c & 0b00011111u;
            remaining = 1;
        } else if ((c & 0b11110000) == 0b11100000) {
            cp = c & 0b00001111u;
            remaining = 2;
        } else if ((c & 0b11111000) == 0b11110000) {
            cp = c & 0b00000111u;
            remaining = 3;
        }
        // Anything else (stray continuation byte, 0xF8-0xFF) is not UTF-8
        // and is dropped.
    }
    return rv;
}