From 6d929aa87f0e8e386242b8ae17a91b350ede3e3d Mon Sep 17 00:00:00 2001
From: Kelvin Sherlock
Date: Thu, 21 Jul 2016 15:14:27 -0400
Subject: [PATCH] convert utf8 input to macroman.

Interactive input is converted from UTF-8 to MacRoman before it is handed
to phase1.  The decoder keeps code points in 32 bits so 4-byte sequences
are not truncated into the 16-bit mapping table, and a byte that interrupts
a multi-byte sequence is decoded on its own instead of being folded into
the previous code point.

---
Review notes (ignored by git am):
 * whitespace-only context lines in the CMakeLists.txt, mpw-shell.cpp and
   phase1.h hunks were reconstructed from the hunk line counts; verify them
   against the target tree before applying.
 * the blob id on the macroman.cpp index line predates the decoder changes.

 CMakeLists.txt |   1 +
 macroman.cpp   | 210 +++++++++++++++++++++++++++++++++++++++++++++++++
 mpw-shell.cpp  |  31 +++++---
 phase1.h       |   1 +
 4 files changed, 231 insertions(+), 12 deletions(-)
 create mode 100644 macroman.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62fa92d..cf53d3e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -78,6 +78,7 @@ add_executable(mpw-shell mpw-shell.cpp mpw-shell-token.cpp mpw-shell-expand.cpp
 	mpw-shell-parser.cpp value.cpp mpw-shell-quote.cpp
 	phase1.cpp phase2.cpp phase2-parser.cpp command.cpp environment.cpp builtins.cpp
 	pathnames.cpp
+	macroman.cpp
 	cxx/mapped_file.cpp
 	cxx/filesystem.cpp
 	cxx/path.cpp
diff --git a/macroman.cpp b/macroman.cpp
new file mode 100644
index 0000000..83272ff
--- /dev/null
+++ b/macroman.cpp
@@ -0,0 +1,210 @@
+// Converts UTF-8 text to the MacRoman encoding.
+
+#include <algorithm>
+#include <cstdint>
+#include <string>
+
+
+// Maps a Unicode code point to its MacRoman byte (0x80 - 0xFF).
+// Returns 0 when the code point is not in the table below. 7-bit ASCII is
+// not in the table, so callers have to pass ASCII through themselves.
+unsigned char unicode_to_macroman(uint16_t c){
+
+
+#undef _
+#define _(macroman,unicode,comment) if (c == unicode) return macroman;
+	// http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT
+	_(0x80, 0x00C4, "# LATIN CAPITAL LETTER A WITH DIAERESIS")
+	_(0x81, 0x00C5, "# LATIN CAPITAL LETTER A WITH RING ABOVE")
+	_(0x82, 0x00C7, "# LATIN CAPITAL LETTER C WITH CEDILLA")
+	_(0x83, 0x00C9, "# LATIN CAPITAL LETTER E WITH ACUTE")
+	_(0x84, 0x00D1, "# LATIN CAPITAL LETTER N WITH TILDE")
+	_(0x85, 0x00D6, "# LATIN CAPITAL LETTER O WITH DIAERESIS")
+	_(0x86, 0x00DC, "# LATIN CAPITAL LETTER U WITH DIAERESIS")
+	_(0x87, 0x00E1, "# LATIN SMALL LETTER A WITH ACUTE")
+	_(0x88, 0x00E0, "# LATIN SMALL LETTER A WITH GRAVE")
+	_(0x89, 0x00E2, "# LATIN SMALL LETTER A WITH CIRCUMFLEX")
+	_(0x8A, 0x00E4, "# LATIN SMALL LETTER A WITH DIAERESIS")
+	_(0x8B, 0x00E3, "# LATIN SMALL LETTER A WITH TILDE")
+	_(0x8C, 0x00E5, "# LATIN SMALL LETTER A WITH RING ABOVE")
+	_(0x8D, 0x00E7, "# LATIN SMALL LETTER C WITH CEDILLA")
+	_(0x8E, 0x00E9, "# LATIN SMALL LETTER E WITH ACUTE")
+	_(0x8F, 0x00E8, "# LATIN SMALL LETTER E WITH GRAVE")
+	_(0x90, 0x00EA, "# LATIN SMALL LETTER E WITH CIRCUMFLEX")
+	_(0x91, 0x00EB, "# LATIN SMALL LETTER E WITH DIAERESIS")
+	_(0x92, 0x00ED, "# LATIN SMALL LETTER I WITH ACUTE")
+	_(0x93, 0x00EC, "# LATIN SMALL LETTER I WITH GRAVE")
+	_(0x94, 0x00EE, "# LATIN SMALL LETTER I WITH CIRCUMFLEX")
+	_(0x95, 0x00EF, "# LATIN SMALL LETTER I WITH DIAERESIS")
+	_(0x96, 0x00F1, "# LATIN SMALL LETTER N WITH TILDE")
+	_(0x97, 0x00F3, "# LATIN SMALL LETTER O WITH ACUTE")
+	_(0x98, 0x00F2, "# LATIN SMALL LETTER O WITH GRAVE")
+	_(0x99, 0x00F4, "# LATIN SMALL LETTER O WITH CIRCUMFLEX")
+	_(0x9A, 0x00F6, "# LATIN SMALL LETTER O WITH DIAERESIS")
+	_(0x9B, 0x00F5, "# LATIN SMALL LETTER O WITH TILDE")
+	_(0x9C, 0x00FA, "# LATIN SMALL LETTER U WITH ACUTE")
+	_(0x9D, 0x00F9, "# LATIN SMALL LETTER U WITH GRAVE")
+	_(0x9E, 0x00FB, "# LATIN SMALL LETTER U WITH CIRCUMFLEX")
+	_(0x9F, 0x00FC, "# LATIN SMALL LETTER U WITH DIAERESIS")
+	_(0xA0, 0x2020, "# DAGGER")
+	_(0xA1, 0x00B0, "# DEGREE SIGN")
+	_(0xA2, 0x00A2, "# CENT SIGN")
+	_(0xA3, 0x00A3, "# POUND SIGN")
+	_(0xA4, 0x00A7, "# SECTION SIGN")
+	_(0xA5, 0x2022, "# BULLET")
+	_(0xA6, 0x00B6, "# PILCROW SIGN")
+	_(0xA7, 0x00DF, "# LATIN SMALL LETTER SHARP S")
+	_(0xA8, 0x00AE, "# REGISTERED SIGN")
+	_(0xA9, 0x00A9, "# COPYRIGHT SIGN")
+	_(0xAA, 0x2122, "# TRADE MARK SIGN")
+	_(0xAB, 0x00B4, "# ACUTE ACCENT")
+	_(0xAC, 0x00A8, "# DIAERESIS")
+	_(0xAD, 0x2260, "# NOT EQUAL TO")
+	_(0xAE, 0x00C6, "# LATIN CAPITAL LETTER AE")
+	_(0xAF, 0x00D8, "# LATIN CAPITAL LETTER O WITH STROKE")
+	_(0xB0, 0x221E, "# INFINITY")
+	_(0xB1, 0x00B1, "# PLUS-MINUS SIGN")
+	_(0xB2, 0x2264, "# LESS-THAN OR EQUAL TO")
+	_(0xB3, 0x2265, "# GREATER-THAN OR EQUAL TO")
+	_(0xB4, 0x00A5, "# YEN SIGN")
+	_(0xB5, 0x00B5, "# MICRO SIGN")
+	_(0xB6, 0x2202, "# PARTIAL DIFFERENTIAL")
+	_(0xB7, 0x2211, "# N-ARY SUMMATION")
+	_(0xB8, 0x220F, "# N-ARY PRODUCT")
+	_(0xB9, 0x03C0, "# GREEK SMALL LETTER PI")
+	_(0xBA, 0x222B, "# INTEGRAL")
+	_(0xBB, 0x00AA, "# FEMININE ORDINAL INDICATOR")
+	_(0xBC, 0x00BA, "# MASCULINE ORDINAL INDICATOR")
+	_(0xBD, 0x03A9, "# GREEK CAPITAL LETTER OMEGA")
+	_(0xBE, 0x00E6, "# LATIN SMALL LETTER AE")
+	_(0xBF, 0x00F8, "# LATIN SMALL LETTER O WITH STROKE")
+	_(0xC0, 0x00BF, "# INVERTED QUESTION MARK")
+	_(0xC1, 0x00A1, "# INVERTED EXCLAMATION MARK")
+	_(0xC2, 0x00AC, "# NOT SIGN")
+	_(0xC3, 0x221A, "# SQUARE ROOT")
+	_(0xC4, 0x0192, "# LATIN SMALL LETTER F WITH HOOK")
+	_(0xC5, 0x2248, "# ALMOST EQUAL TO")
+	_(0xC6, 0x2206, "# INCREMENT")
+	_(0xC7, 0x00AB, "# LEFT-POINTING DOUBLE ANGLE QUOTATION MARK")
+	_(0xC8, 0x00BB, "# RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK")
+	_(0xC9, 0x2026, "# HORIZONTAL ELLIPSIS")
+	_(0xCA, 0x00A0, "# NO-BREAK SPACE")
+	_(0xCB, 0x00C0, "# LATIN CAPITAL LETTER A WITH GRAVE")
+	_(0xCC, 0x00C3, "# LATIN CAPITAL LETTER A WITH TILDE")
+	_(0xCD, 0x00D5, "# LATIN CAPITAL LETTER O WITH TILDE")
+	_(0xCE, 0x0152, "# LATIN CAPITAL LIGATURE OE")
+	_(0xCF, 0x0153, "# LATIN SMALL LIGATURE OE")
+	_(0xD0, 0x2013, "# EN DASH")
+	_(0xD1, 0x2014, "# EM DASH")
+	_(0xD2, 0x201C, "# LEFT DOUBLE QUOTATION MARK")
+	_(0xD3, 0x201D, "# RIGHT DOUBLE QUOTATION MARK")
+	_(0xD4, 0x2018, "# LEFT SINGLE QUOTATION MARK")
+	_(0xD5, 0x2019, "# RIGHT SINGLE QUOTATION MARK")
+	_(0xD6, 0x00F7, "# DIVISION SIGN")
+	_(0xD7, 0x25CA, "# LOZENGE")
+	_(0xD8, 0x00FF, "# LATIN SMALL LETTER Y WITH DIAERESIS")
+	_(0xD9, 0x0178, "# LATIN CAPITAL LETTER Y WITH DIAERESIS")
+	_(0xDA, 0x2044, "# FRACTION SLASH")
+	_(0xDB, 0x20AC, "# EURO SIGN")
+	_(0xDC, 0x2039, "# SINGLE LEFT-POINTING ANGLE QUOTATION MARK")
+	_(0xDD, 0x203A, "# SINGLE RIGHT-POINTING ANGLE QUOTATION MARK")
+	_(0xDE, 0xFB01, "# LATIN SMALL LIGATURE FI")
+	_(0xDF, 0xFB02, "# LATIN SMALL LIGATURE FL")
+	_(0xE0, 0x2021, "# DOUBLE DAGGER")
+	_(0xE1, 0x00B7, "# MIDDLE DOT")
+	_(0xE2, 0x201A, "# SINGLE LOW-9 QUOTATION MARK")
+	_(0xE3, 0x201E, "# DOUBLE LOW-9 QUOTATION MARK")
+	_(0xE4, 0x2030, "# PER MILLE SIGN")
+	_(0xE5, 0x00C2, "# LATIN CAPITAL LETTER A WITH CIRCUMFLEX")
+	_(0xE6, 0x00CA, "# LATIN CAPITAL LETTER E WITH CIRCUMFLEX")
+	_(0xE7, 0x00C1, "# LATIN CAPITAL LETTER A WITH ACUTE")
+	_(0xE8, 0x00CB, "# LATIN CAPITAL LETTER E WITH DIAERESIS")
+	_(0xE9, 0x00C8, "# LATIN CAPITAL LETTER E WITH GRAVE")
+	_(0xEA, 0x00CD, "# LATIN CAPITAL LETTER I WITH ACUTE")
+	_(0xEB, 0x00CE, "# LATIN CAPITAL LETTER I WITH CIRCUMFLEX")
+	_(0xEC, 0x00CF, "# LATIN CAPITAL LETTER I WITH DIAERESIS")
+	_(0xED, 0x00CC, "# LATIN CAPITAL LETTER I WITH GRAVE")
+	_(0xEE, 0x00D3, "# LATIN CAPITAL LETTER O WITH ACUTE")
+	_(0xEF, 0x00D4, "# LATIN CAPITAL LETTER O WITH CIRCUMFLEX")
+	_(0xF0, 0xF8FF, "# Apple logo")
+	_(0xF1, 0x00D2, "# LATIN CAPITAL LETTER O WITH GRAVE")
+	_(0xF2, 0x00DA, "# LATIN CAPITAL LETTER U WITH ACUTE")
+	_(0xF3, 0x00DB, "# LATIN CAPITAL LETTER U WITH CIRCUMFLEX")
+	_(0xF4, 0x00D9, "# LATIN CAPITAL LETTER U WITH GRAVE")
+	_(0xF5, 0x0131, "# LATIN SMALL LETTER DOTLESS I")
+	_(0xF6, 0x02C6, "# MODIFIER LETTER CIRCUMFLEX ACCENT")
+	_(0xF7, 0x02DC, "# SMALL TILDE")
+	_(0xF8, 0x00AF, "# MACRON")
+	_(0xF9, 0x02D8, "# BREVE")
+	_(0xFA, 0x02D9, "# DOT ABOVE")
+	_(0xFB, 0x02DA, "# RING ABOVE")
+	_(0xFC, 0x00B8, "# CEDILLA")
+	_(0xFD, 0x02DD, "# DOUBLE ACUTE ACCENT")
+	_(0xFE, 0x02DB, "# OGONEK")
+	_(0xFF, 0x02C7, "# CARON")
+
+#undef _
+	return 0;
+}
+
+
+// Converts a UTF-8 string to MacRoman.
+//
+// - A string without any high-bit bytes is returned unchanged.
+// - Code points that MacRoman cannot represent are dropped.
+// - Bytes that are not valid UTF-8 (stray continuation bytes, invalid lead
+//   bytes, sequences cut short) are dropped; the byte that interrupts a
+//   sequence is then decoded on its own instead of being swallowed.
+std::string utf8_to_macroman(const std::string &s) {
+
+	if (std::find_if(s.begin(), s.end(), [](unsigned char c){ return c & 0x80; }) == s.end())
+		return s;
+
+	std::string rv;
+	rv.reserve(s.size()); // the result is never longer than the input.
+
+	unsigned cs = 0;  // continuation bytes still expected.
+	uint32_t tmp = 0; // code point being assembled; 4-byte sequences need more than 16 bits.
+
+	for (unsigned char c : s) {
+
+		if (cs) {
+			if ((c & 0b11000000) == 0b10000000) {
+				tmp = (tmp << 6) | (c & 0b00111111);
+				if (--cs == 0) {
+					// the table only covers the BMP; anything above it
+					// must not be truncated to 16 bits and looked up.
+					unsigned char mr = tmp <= 0xffff ? unicode_to_macroman(static_cast<uint16_t>(tmp)) : 0;
+					if (mr) rv.push_back(mr);
+				}
+				continue;
+			}
+			// not utf8... drop the partial sequence and decode this
+			// byte as the start of a new one.
+			cs = 0;
+		}
+
+		if (c <= 0x7f) {
+			rv.push_back(c);
+			continue;
+		}
+		if ((c & 0b11100000) == 0b11000000) {
+			tmp = c & 0b00011111;
+			cs = 1;
+			continue;
+		}
+		if ((c & 0b11110000) == 0b11100000) {
+			tmp = c & 0b00001111;
+			cs = 2;
+			continue;
+		}
+		if ((c & 0b11111000) == 0b11110000) {
+			tmp = c & 0b00000111;
+			cs = 3;
+			continue;
+		}
+		// not utf8... (stray continuation byte or invalid lead byte); dropped.
+	}
+
+	return rv;
+}
diff --git a/mpw-shell.cpp b/mpw-shell.cpp
index a7de18c..901d249 100644
--- a/mpw-shell.cpp
+++ b/mpw-shell.cpp
@@ -181,7 +181,9 @@ void control_c_handler(int signal, siginfo_t *sinfo, void *context) {
 	//fprintf(stderr, "interrupt!\n");
 }
 
-int interactive(Environment &env, phase1 &p, phase2& p2) {
+std::string utf8_to_macroman(const std::string &s);
+
+int interactive(Environment &env, phase1 &p1, phase2& p2) {
 
 	std::string history_file = root();
 	history_file += ".history"; 
@@ -198,15 +200,17 @@ int interactive(Environment &env, phase1 &p, phase2& p2) {
 
 	sigaction(SIGINT, &act, &old_act);
 
+
+
 	for(;;) {
 		const char *prompt = "# ";
-		if (p2.continuation()) prompt = "> ";
+		if (p1.continuation() || p2.continuation()) prompt = "> ";
 		char *cp = readline(prompt);
 		if (!cp) {
 			if (control_c) {
 				control_c = 0;
 				fprintf(stdout, "\n");
-				p.abort();
+				p1.abort();
 				p2.abort();
 				env.status(-9, false);
 				continue;
@@ -217,29 +221,32 @@ int interactive(Environment &env, phase1 &p, phase2& p2) {
 		std::string s(cp);
 		free(cp);
 
-		if (s.empty()) continue;
+		//if (s.empty()) continue;
 
 		// don't add if same as previous entry.
-		HIST_ENTRY *he = history_get(history_length);
-		if (he == nullptr || s != he->line)
-				add_history(s.c_str());
-
+		if (!s.empty()) {
+			HIST_ENTRY *he = history_get(history_length);
+			if (he == nullptr || s != he->line)
+					add_history(s.c_str());
+		}
+		// only if utf8....
+		s = utf8_to_macroman(s);
 		s.push_back('\n');
 		try {
-			p.process(s);
+			p1.process(s);
 
 		} catch(std::exception &ex) {
 			fprintf(stderr, "%s\n", ex.what());
-			p.reset();
+			p1.reset();
 		}
 
 	}
 
 	try {
-		p.finish();
+		p1.finish();
 	} catch(std::exception &ex) {
 		fprintf(stderr, "%s\n", ex.what());
-		p.reset();
+		p1.reset();
 	}
 
 	sigaction(SIGINT, &old_act, nullptr);
diff --git a/phase1.h b/phase1.h
index e1995d4..0630df8 100644
--- a/phase1.h
+++ b/phase1.h
@@ -37,6 +37,7 @@ public:
 	}
 
 
+	bool continuation() const { return multiline; }
 
 private:
 