convert utf8 input to macroman.

This commit is contained in:
Kelvin Sherlock 2016-07-21 15:14:27 -04:00
parent 80aaaa208d
commit 6d929aa87f
4 changed files with 228 additions and 12 deletions

View File

@ -78,6 +78,7 @@ add_executable(mpw-shell mpw-shell.cpp mpw-shell-token.cpp mpw-shell-expand.cpp
mpw-shell-parser.cpp value.cpp mpw-shell-quote.cpp
phase1.cpp phase2.cpp phase2-parser.cpp command.cpp environment.cpp builtins.cpp
pathnames.cpp
macroman.cpp
cxx/mapped_file.cpp
cxx/filesystem.cpp
cxx/path.cpp

207
macroman.cpp Normal file
View File

@ -0,0 +1,207 @@
#include <algorithm>
#include <cstdint>
#include <string>
/*
 * Translate one 16-bit Unicode code point into its MacRoman byte.
 *
 * Only the upper half of MacRoman (0x80-0xFF) is listed, so the return
 * value is either in that range or 0.  0 means "no mapping"; note that
 * plain ASCII code points are not in the table and therefore also yield 0.
 */
unsigned char unicode_to_macroman(uint16_t c){

    struct mapping {
        unsigned char macroman;
        uint16_t unicode;
    };

    // http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT
    static const mapping table[] = {
        { 0x80, 0x00C4 }, // LATIN CAPITAL LETTER A WITH DIAERESIS
        { 0x81, 0x00C5 }, // LATIN CAPITAL LETTER A WITH RING ABOVE
        { 0x82, 0x00C7 }, // LATIN CAPITAL LETTER C WITH CEDILLA
        { 0x83, 0x00C9 }, // LATIN CAPITAL LETTER E WITH ACUTE
        { 0x84, 0x00D1 }, // LATIN CAPITAL LETTER N WITH TILDE
        { 0x85, 0x00D6 }, // LATIN CAPITAL LETTER O WITH DIAERESIS
        { 0x86, 0x00DC }, // LATIN CAPITAL LETTER U WITH DIAERESIS
        { 0x87, 0x00E1 }, // LATIN SMALL LETTER A WITH ACUTE
        { 0x88, 0x00E0 }, // LATIN SMALL LETTER A WITH GRAVE
        { 0x89, 0x00E2 }, // LATIN SMALL LETTER A WITH CIRCUMFLEX
        { 0x8A, 0x00E4 }, // LATIN SMALL LETTER A WITH DIAERESIS
        { 0x8B, 0x00E3 }, // LATIN SMALL LETTER A WITH TILDE
        { 0x8C, 0x00E5 }, // LATIN SMALL LETTER A WITH RING ABOVE
        { 0x8D, 0x00E7 }, // LATIN SMALL LETTER C WITH CEDILLA
        { 0x8E, 0x00E9 }, // LATIN SMALL LETTER E WITH ACUTE
        { 0x8F, 0x00E8 }, // LATIN SMALL LETTER E WITH GRAVE
        { 0x90, 0x00EA }, // LATIN SMALL LETTER E WITH CIRCUMFLEX
        { 0x91, 0x00EB }, // LATIN SMALL LETTER E WITH DIAERESIS
        { 0x92, 0x00ED }, // LATIN SMALL LETTER I WITH ACUTE
        { 0x93, 0x00EC }, // LATIN SMALL LETTER I WITH GRAVE
        { 0x94, 0x00EE }, // LATIN SMALL LETTER I WITH CIRCUMFLEX
        { 0x95, 0x00EF }, // LATIN SMALL LETTER I WITH DIAERESIS
        { 0x96, 0x00F1 }, // LATIN SMALL LETTER N WITH TILDE
        { 0x97, 0x00F3 }, // LATIN SMALL LETTER O WITH ACUTE
        { 0x98, 0x00F2 }, // LATIN SMALL LETTER O WITH GRAVE
        { 0x99, 0x00F4 }, // LATIN SMALL LETTER O WITH CIRCUMFLEX
        { 0x9A, 0x00F6 }, // LATIN SMALL LETTER O WITH DIAERESIS
        { 0x9B, 0x00F5 }, // LATIN SMALL LETTER O WITH TILDE
        { 0x9C, 0x00FA }, // LATIN SMALL LETTER U WITH ACUTE
        { 0x9D, 0x00F9 }, // LATIN SMALL LETTER U WITH GRAVE
        { 0x9E, 0x00FB }, // LATIN SMALL LETTER U WITH CIRCUMFLEX
        { 0x9F, 0x00FC }, // LATIN SMALL LETTER U WITH DIAERESIS
        { 0xA0, 0x2020 }, // DAGGER
        { 0xA1, 0x00B0 }, // DEGREE SIGN
        { 0xA2, 0x00A2 }, // CENT SIGN
        { 0xA3, 0x00A3 }, // POUND SIGN
        { 0xA4, 0x00A7 }, // SECTION SIGN
        { 0xA5, 0x2022 }, // BULLET
        { 0xA6, 0x00B6 }, // PILCROW SIGN
        { 0xA7, 0x00DF }, // LATIN SMALL LETTER SHARP S
        { 0xA8, 0x00AE }, // REGISTERED SIGN
        { 0xA9, 0x00A9 }, // COPYRIGHT SIGN
        { 0xAA, 0x2122 }, // TRADE MARK SIGN
        { 0xAB, 0x00B4 }, // ACUTE ACCENT
        { 0xAC, 0x00A8 }, // DIAERESIS
        { 0xAD, 0x2260 }, // NOT EQUAL TO
        { 0xAE, 0x00C6 }, // LATIN CAPITAL LETTER AE
        { 0xAF, 0x00D8 }, // LATIN CAPITAL LETTER O WITH STROKE
        { 0xB0, 0x221E }, // INFINITY
        { 0xB1, 0x00B1 }, // PLUS-MINUS SIGN
        { 0xB2, 0x2264 }, // LESS-THAN OR EQUAL TO
        { 0xB3, 0x2265 }, // GREATER-THAN OR EQUAL TO
        { 0xB4, 0x00A5 }, // YEN SIGN
        { 0xB5, 0x00B5 }, // MICRO SIGN
        { 0xB6, 0x2202 }, // PARTIAL DIFFERENTIAL
        { 0xB7, 0x2211 }, // N-ARY SUMMATION
        { 0xB8, 0x220F }, // N-ARY PRODUCT
        { 0xB9, 0x03C0 }, // GREEK SMALL LETTER PI
        { 0xBA, 0x222B }, // INTEGRAL
        { 0xBB, 0x00AA }, // FEMININE ORDINAL INDICATOR
        { 0xBC, 0x00BA }, // MASCULINE ORDINAL INDICATOR
        { 0xBD, 0x03A9 }, // GREEK CAPITAL LETTER OMEGA
        { 0xBE, 0x00E6 }, // LATIN SMALL LETTER AE
        { 0xBF, 0x00F8 }, // LATIN SMALL LETTER O WITH STROKE
        { 0xC0, 0x00BF }, // INVERTED QUESTION MARK
        { 0xC1, 0x00A1 }, // INVERTED EXCLAMATION MARK
        { 0xC2, 0x00AC }, // NOT SIGN
        { 0xC3, 0x221A }, // SQUARE ROOT
        { 0xC4, 0x0192 }, // LATIN SMALL LETTER F WITH HOOK
        { 0xC5, 0x2248 }, // ALMOST EQUAL TO
        { 0xC6, 0x2206 }, // INCREMENT
        { 0xC7, 0x00AB }, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
        { 0xC8, 0x00BB }, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
        { 0xC9, 0x2026 }, // HORIZONTAL ELLIPSIS
        { 0xCA, 0x00A0 }, // NO-BREAK SPACE
        { 0xCB, 0x00C0 }, // LATIN CAPITAL LETTER A WITH GRAVE
        { 0xCC, 0x00C3 }, // LATIN CAPITAL LETTER A WITH TILDE
        { 0xCD, 0x00D5 }, // LATIN CAPITAL LETTER O WITH TILDE
        { 0xCE, 0x0152 }, // LATIN CAPITAL LIGATURE OE
        { 0xCF, 0x0153 }, // LATIN SMALL LIGATURE OE
        { 0xD0, 0x2013 }, // EN DASH
        { 0xD1, 0x2014 }, // EM DASH
        { 0xD2, 0x201C }, // LEFT DOUBLE QUOTATION MARK
        { 0xD3, 0x201D }, // RIGHT DOUBLE QUOTATION MARK
        { 0xD4, 0x2018 }, // LEFT SINGLE QUOTATION MARK
        { 0xD5, 0x2019 }, // RIGHT SINGLE QUOTATION MARK
        { 0xD6, 0x00F7 }, // DIVISION SIGN
        { 0xD7, 0x25CA }, // LOZENGE
        { 0xD8, 0x00FF }, // LATIN SMALL LETTER Y WITH DIAERESIS
        { 0xD9, 0x0178 }, // LATIN CAPITAL LETTER Y WITH DIAERESIS
        { 0xDA, 0x2044 }, // FRACTION SLASH
        { 0xDB, 0x20AC }, // EURO SIGN
        { 0xDC, 0x2039 }, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        { 0xDD, 0x203A }, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        { 0xDE, 0xFB01 }, // LATIN SMALL LIGATURE FI
        { 0xDF, 0xFB02 }, // LATIN SMALL LIGATURE FL
        { 0xE0, 0x2021 }, // DOUBLE DAGGER
        { 0xE1, 0x00B7 }, // MIDDLE DOT
        { 0xE2, 0x201A }, // SINGLE LOW-9 QUOTATION MARK
        { 0xE3, 0x201E }, // DOUBLE LOW-9 QUOTATION MARK
        { 0xE4, 0x2030 }, // PER MILLE SIGN
        { 0xE5, 0x00C2 }, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
        { 0xE6, 0x00CA }, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
        { 0xE7, 0x00C1 }, // LATIN CAPITAL LETTER A WITH ACUTE
        { 0xE8, 0x00CB }, // LATIN CAPITAL LETTER E WITH DIAERESIS
        { 0xE9, 0x00C8 }, // LATIN CAPITAL LETTER E WITH GRAVE
        { 0xEA, 0x00CD }, // LATIN CAPITAL LETTER I WITH ACUTE
        { 0xEB, 0x00CE }, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX
        { 0xEC, 0x00CF }, // LATIN CAPITAL LETTER I WITH DIAERESIS
        { 0xED, 0x00CC }, // LATIN CAPITAL LETTER I WITH GRAVE
        { 0xEE, 0x00D3 }, // LATIN CAPITAL LETTER O WITH ACUTE
        { 0xEF, 0x00D4 }, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX
        { 0xF0, 0xF8FF }, // Apple logo
        { 0xF1, 0x00D2 }, // LATIN CAPITAL LETTER O WITH GRAVE
        { 0xF2, 0x00DA }, // LATIN CAPITAL LETTER U WITH ACUTE
        { 0xF3, 0x00DB }, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX
        { 0xF4, 0x00D9 }, // LATIN CAPITAL LETTER U WITH GRAVE
        { 0xF5, 0x0131 }, // LATIN SMALL LETTER DOTLESS I
        { 0xF6, 0x02C6 }, // MODIFIER LETTER CIRCUMFLEX ACCENT
        { 0xF7, 0x02DC }, // SMALL TILDE
        { 0xF8, 0x00AF }, // MACRON
        { 0xF9, 0x02D8 }, // BREVE
        { 0xFA, 0x02D9 }, // DOT ABOVE
        { 0xFB, 0x02DA }, // RING ABOVE
        { 0xFC, 0x00B8 }, // CEDILLA
        { 0xFD, 0x02DD }, // DOUBLE ACUTE ACCENT
        { 0xFE, 0x02DB }, // OGONEK
        { 0xFF, 0x02C7 }, // CARON
    };

    // Scan in table order; the first matching code point wins.
    for (const mapping &entry : table) {
        if (entry.unicode == c) return entry.macroman;
    }

    // no MacRoman equivalent.
    return 0;
}
/*
 * Convert a UTF-8 encoded string to MacRoman.
 *
 * - A string with no high-bit bytes is returned unchanged.
 * - ASCII bytes are copied through as-is.
 * - Each multi-byte sequence is decoded to a code point and translated
 *   with unicode_to_macroman(); code points it reports as unmapped
 *   (return value 0) are dropped from the output.
 * - Code points above U+FFFF are dropped: they do not fit the 16-bit
 *   lookup, and truncating them would alias unrelated characters
 *   (e.g. U+100C4 would otherwise be emitted as U+00C4).
 * - Malformed input (stray continuation bytes, invalid lead bytes,
 *   sequences cut short by a non-continuation byte or by end of input)
 *   is dropped; the byte that interrupts a sequence is not swallowed
 *   but re-examined as the start of a new character.
 */
std::string utf8_to_macroman(const std::string &s) {

    // fast path: nothing to translate.
    if (std::none_of(s.begin(), s.end(), [](unsigned char c){ return (c & 0x80) != 0; }))
        return s;

    std::string rv;
    rv.reserve(s.size()); // output is never longer than the input.

    unsigned remaining = 0; // continuation bytes still expected.
    uint32_t cp = 0;        // code point being assembled (up to 21 bits).

    for (unsigned char c : s) {

        if (remaining) {
            if ((c & 0b11000000) == 0b10000000) {
                cp = (cp << 6) | (c & 0b00111111);
                if (--remaining == 0 && cp <= 0xffff) {
                    unsigned char mr = unicode_to_macroman(static_cast<uint16_t>(cp));
                    if (mr) rv.push_back(mr);
                }
                continue;
            }
            // not a continuation byte: abandon the partial sequence and
            // fall through so this byte is handled as a fresh lead byte.
            remaining = 0;
        }

        if (c <= 0x7f) {
            rv.push_back(c);
            continue;
        }
        if ((c & 0b11100000) == 0b11000000) {
            cp = c & 0b00011111;
            remaining = 1;
            continue;
        }
        if ((c & 0b11110000) == 0b11100000) {
            cp = c & 0b00001111;
            remaining = 2;
            continue;
        }
        if ((c & 0b11111000) == 0b11110000) {
            cp = c & 0b00000111;
            remaining = 3;
            continue;
        }
        // stray continuation byte or invalid lead byte: not utf8, skip it.
    }

    return rv;
}

View File

@ -181,7 +181,9 @@ void control_c_handler(int signal, siginfo_t *sinfo, void *context) {
//fprintf(stderr, "interrupt!\n");
}
int interactive(Environment &env, phase1 &p, phase2& p2) {
std::string utf8_to_macroman(const std::string &s);
int interactive(Environment &env, phase1 &p1, phase2& p2) {
std::string history_file = root();
history_file += ".history";
@ -198,15 +200,17 @@ int interactive(Environment &env, phase1 &p, phase2& p2) {
sigaction(SIGINT, &act, &old_act);
for(;;) {
const char *prompt = "# ";
if (p2.continuation()) prompt = "> ";
if (p1.continuation() || p2.continuation()) prompt = "> ";
char *cp = readline(prompt);
if (!cp) {
if (control_c) {
control_c = 0;
fprintf(stdout, "\n");
p.abort();
p1.abort();
p2.abort();
env.status(-9, false);
continue;
@ -217,29 +221,32 @@ int interactive(Environment &env, phase1 &p, phase2& p2) {
std::string s(cp);
free(cp);
if (s.empty()) continue;
//if (s.empty()) continue;
// don't add if same as previous entry.
HIST_ENTRY *he = history_get(history_length);
if (he == nullptr || s != he->line)
add_history(s.c_str());
if (!s.empty()) {
HIST_ENTRY *he = history_get(history_length);
if (he == nullptr || s != he->line)
add_history(s.c_str());
}
// only if utf8....
s = utf8_to_macroman(s);
s.push_back('\n');
try {
p.process(s);
p1.process(s);
} catch(std::exception &ex) {
fprintf(stderr, "%s\n", ex.what());
p.reset();
p1.reset();
}
}
try {
p.finish();
p1.finish();
} catch(std::exception &ex) {
fprintf(stderr, "%s\n", ex.what());
p.reset();
p1.reset();
}
sigaction(SIGINT, &old_act, nullptr);

View File

@ -37,6 +37,7 @@ public:
}
bool continuation() const { return multiline; }
private: