diff --git a/CMakeLists.txt b/CMakeLists.txt index b5f3a75..21ebd87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,7 +110,7 @@ endif() add_executable(mpw-shell mpw-shell.cpp mpw-shell-token.cpp mpw-shell-expand.cpp - mpw-shell-parser.cpp mpw_parser.cpp value.cpp mpw-shell-quote.cpp + mpw-shell-parser.cpp mpw_parser.cpp value.cpp mpw-shell-quote.cpp mpw-regex.cpp phase1.cpp phase2.cpp phase3.cpp command.cpp environment.cpp builtins.cpp pathnames.cpp macroman.cpp diff --git a/builtins.cpp b/builtins.cpp index 2da1968..5dbdaaf 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -691,7 +691,7 @@ int builtin_evaluate(Environment &env, std::vector &&tokens, const fdmask tokens.pop_back(); tokens.pop_back(); - int32_t i = evaluate_expression("Evaluate", std::move(tokens)); + int32_t i = evaluate_expression(env, "Evaluate", std::move(tokens)); switch(type) { case '=': @@ -721,7 +721,7 @@ int builtin_evaluate(Environment &env, std::vector &&tokens, const fdmask } } - int32_t i = evaluate_expression("Evaluate", std::move(tokens)); + int32_t i = evaluate_expression(env, "Evaluate", std::move(tokens)); if (output == 'h') { fdprintf(stdout, "0x%08x\n", i); diff --git a/command.cpp b/command.cpp index d95d6c2..7c88fd1 100644 --- a/command.cpp +++ b/command.cpp @@ -101,7 +101,7 @@ namespace { case IF: tokens.pop_back(); try { - e = evaluate_expression("If", std::move(tokens)); + e = evaluate_expression(env, "If", std::move(tokens)); } catch (std::exception &ex) { fprintf(stderr, "%s\n", ex.what()); diff --git a/error.h b/error.h index ec7326b..486b072 100644 --- a/error.h +++ b/error.h @@ -81,6 +81,13 @@ public: {} }; +class regex_error : public mpw_error { /* mpw_error preset with the incorrect-file-name-pattern message; status defaults to -2 */ +public: + regex_error(int status = -2) : + mpw_error(status, "MPW Shell - File name pattern is incorrect") + {} + +}; /* these are used for flow-control. 
diff --git a/mpw-regex.cpp b/mpw-regex.cpp new file mode 100644 index 0000000..364177b --- /dev/null +++ b/mpw-regex.cpp @@ -0,0 +1,269 @@ + +#include "mpw-regex.h" +#include "environment.h" + +typedef std::string::const_iterator iterator; + +namespace { + bool ecma_special(unsigned char c) { + // true for characters that need a backslash to be literal in an ECMAScript regex + switch(c) { + case '|': + case '{': + case '}': + case '(': + case ')': + case '[': + case ']': + case '*': + case '+': + case '^': + case '$': + case '.': + case '\\': + return true; + default: + return false; + } + } +} + +mpw_regex::mpw_regex(const std::string &s, bool slash) { /* compile MPW pattern s; when slash is true, s must be wrapped in '/' delimiters */ + convert_re(s, slash); +} + +bool mpw_regex::is_glob(const std::string &s) { /* true when s holds an unescaped [ ? * 0xc7 0xc5 or plus sign; 0xb6 escapes the following byte */ + bool esc = false; + for (unsigned char c : s) { + if (esc) { + esc = false; + continue; + } + switch(c) { + case 0xb6: + esc = true; + break; + case '[': + case '?': + case '*': + case '+': + case 0xc7: + case 0xc5: + return true; + default: + break; + } + } + return false; +} + +bool mpw_regex::match(const std::string &s, Environment &e) { /* whole-string match; on success each mapped capture i that participated is stored in e under the key 0xa8 followed by the digit i */ + std::smatch m; + bool ok = std::regex_match(s, m, re); + if (!ok) return false; + + for (int i = 0; i < 10; ++i) { + int index = capture_map[i]; /* 0 means MPW capture i was never assigned a group */ + + if (index && static_cast<std::size_t>(index) < m.size() && m[index].matched) { + std::string v(m[index].first, m[index].second); + std::string k("\xa8"); + k += (i + '0'); + e.set(k, std::move(v)); + } + } + + return true; +} + +bool mpw_regex::match(const std::string &s) { /* whole-string match without recording captures */ + return std::regex_match(s, re); +} + + +// convert a mpw-flavor regex to std::regex flavor regex. +void mpw_regex::convert_re(const std::string &s, bool slash) { + + + std::string accumulator; + auto iter = s.begin(); + auto end = s.end(); + + if (slash) { + if (iter == end || *iter++ != '/') + throw std::regex_error(std::regex_constants::error_space); + } + + iter = convert_re(iter, end, accumulator, slash ? 
'/' : 0); + + if (iter != end) throw std::regex_error(std::regex_constants::error_space); /* text after the closing delimiter is rejected */ + + + re = std::regex(accumulator); + if (slash) key = s; + else key = "/" + s + "/"; +} + + +iterator mpw_regex::convert_re(iterator iter, iterator end, std::string &accumulator, unsigned char term) { /* translate up to the unescaped terminator term (0 means run to end); returns the position just past the terminator; throws error_paren if term is set but never found */ + + while (iter != end) { + unsigned char c = *iter++; + if (c == 0xb6) { + // escape + if (iter == end) throw std::regex_error(std::regex_constants::error_escape); + c = *iter++; + if (ecma_special(c)) + accumulator += '\\'; + accumulator += c; + continue; + } + if (term && c == term) { + return iter; + } + if (c == '?') { + // match any char + accumulator += '.'; + continue; + } + if (c == 0xc5) { + // match any string + accumulator += ".*"; + continue; + } + if (c == '[') { + // begin a set + iter = convert_re_set(iter, end, accumulator); + continue; + } + if (c == '(') { + // begin a capture + iter = convert_re_capture(iter, end, accumulator); + continue; + } + if (c == 0xc7) { + // repeat + iter = convert_re_repeat(iter, end, accumulator); + continue; + } + if (c == '+' || c == '*') { + // same meaning + accumulator += c; + continue; + } + if (ecma_special(c)) { + accumulator += '\\'; + } + accumulator += c; + } + + if (term) throw std::regex_error(std::regex_constants::error_paren); + return iter; +} + +iterator mpw_regex::convert_re_repeat(iterator iter, iterator end, std::string &accumulator) { /* copy the digits and commas that follow 0xc7 into a brace quantifier, closing it at 0xc8; any other byte or a missing 0xc8 throws error_brace */ + int min = -1; + int max = -1; + + accumulator += "{"; + + while (iter != end) { + unsigned char c = *iter++; + if (c == 0xc8) { + accumulator += "}"; + return iter; + } + if (c != ',' && !isdigit(c)) break; + accumulator += c; + } + throw std::regex_error(std::regex_constants::error_brace); +} + +iterator mpw_regex::convert_re_set(iterator iter, iterator end, std::string &accumulator) { + // need extra logic to block character classes. 
+ + unsigned char c; + accumulator += "["; + + if (iter != end && static_cast<unsigned char>(*iter) == 0xc2) { /* a leading 0xc2 negates the set */ + accumulator += "^"; + ++iter; + } else if (iter != end && *iter == '^') { + // leading ^ needs to be escaped. + accumulator += "\\^"; + ++iter; + } + while (iter != end) { + c = *iter++; + + if (c == 0xb6) { + // escape + if (iter == end) throw std::regex_error(std::regex_constants::error_escape); + c = *iter++; + accumulator += '\\'; /* NOTE(review): always backslash-escapes, so an escaped d, w or s would turn into a regex class escape; confirm intent */ + accumulator += c; + continue; + } + + if (c == ']') { + accumulator += "]"; + return iter; + } + if (c == '\\') { + accumulator += "\\\\"; + continue; + } + accumulator += c; + } + + throw std::regex_error(std::regex_constants::error_brack); +} + +iterator mpw_regex::convert_re_capture(iterator iter, iterator end, std::string &accumulator) { /* translate a parenthesised group; it stays a capturing group only when followed by 0xa8 and a digit, otherwise it is emitted as a non-capturing group */ + + + /* + * consider: (abc(abc)®1(xyz))®2 + * m[1] = (abcabcxyz) + * m[2] = (abc) + * BUT we don't know if it's captured until the ® is parsed. + */ + + std::string scratch; + bool capture = false; + int n = -1; + + int ecma_index = ++num_captures; /* tentative group number; given back below if the group is not captured */ + + if (iter != end && *iter == '?') { + // leading ? needs to be escaped. + scratch += "\\?"; + ++iter; + } + iter = convert_re(iter, end, scratch, ')'); + + // check for capture? + if (iter != end && static_cast<unsigned char>(*iter) == 0xa8) { + ++iter; + if (iter == end || !isdigit(static_cast<unsigned char>(*iter))) /* cast: isdigit on a negative char is undefined */ + throw std::regex_error(std::regex_constants::error_badbrace); // eh + n = *iter++ - '0'; + capture = true; + } + + accumulator += '('; + if (capture) { + /// ummm capture within a capture? backwards? + capture_map[n] = ecma_index; + } else { + accumulator += "?:"; + // re-number all sub-captures. 
+ --num_captures; + for (int &index : capture_map) { + if (index >= ecma_index) --index; + } + } + accumulator += scratch; + accumulator += ')'; + return iter; +} diff --git a/mpw-regex.h b/mpw-regex.h new file mode 100644 index 0000000..5893629 --- /dev/null +++ b/mpw-regex.h @@ -0,0 +1,48 @@ +#ifndef __mpw_regex_h__ +#define __mpw_regex_h__ + +#include "environment.h" + +#include <regex> +#include <string> + +class mpw_regex { /* compiles an MPW-flavor pattern into a std::regex and maps MPW capture digits 0-9 to its groups */ + +public: + + mpw_regex(const std::string &s, bool slash); + + mpw_regex(const mpw_regex &) = default; + // mpw_regex(mpw_regex &&) = default; + + ~mpw_regex() = default; + + + mpw_regex &operator=(const mpw_regex &) = default; + // mpw_regex &operator=(mpw_regex &&) = default; + + bool match(const std::string &, class Environment &); + bool match(const std::string &); + + static bool is_glob(const std::string &s); + +private: + typedef std::string::const_iterator iterator; + + + void convert_re(const std::string &, bool slash); + + iterator convert_re(iterator iter, iterator end, std::string &accumulator, unsigned char term); + iterator convert_re_repeat(iterator iter, iterator end, std::string &accumulator); + iterator convert_re_set(iterator iter, iterator end, std::string &accumulator); + iterator convert_re_capture(iterator iter, iterator end, std::string &accumulator); + + + std::regex re; + std::string key; + int capture_map[10] = {}; // map mpw capture number to ecma group + int num_captures = 0; + +}; + +#endif diff --git a/mpw-shell-expand.rl b/mpw-shell-expand.rl index d67f8b5..74508ef 100644 --- a/mpw-shell-expand.rl +++ b/mpw-shell-expand.rl @@ -80,12 +80,18 @@ escape = 0xb6; - char = any - escape - ['"{`]; + char = any - escape - ['"{`/\\]; escape_seq = escape any; schar = [^']; sstring = ['] schar** [']; + fchar = [^/]; + fstring = [/] fchar** [/]; + + bchar = [^\\]; + bstring = [\\] bchar** [\\]; + vchar = [^}] $vpush; vchar1 = [^{}] $vpush; @@ -114,6 +120,8 @@ main := ( escape_seq $push | sstring $push + | fstring $push + | bstring $push | 
dstring $push | vstring | estring diff --git a/mpw-shell-parser.cpp b/mpw-shell-parser.cpp index 88d4ffa..8556c01 100644 --- a/mpw-shell-parser.cpp +++ b/mpw-shell-parser.cpp @@ -2,6 +2,7 @@ #include "fdset.h" #include "value.h" #include "error.h" +#include "mpw-regex.h" #include #include @@ -172,8 +173,8 @@ class expression_parser { public: - expression_parser(const std::string &n, std::vector<token> &&t) : - name(n), tokens(std::move(t)) + expression_parser(Environment &e, const std::string &n, std::vector<token> &&t) : + environment(e), name(n), tokens(std::move(t)) {} expression_parser(const expression_parser &) = delete; @@ -194,6 +195,7 @@ private: value eval(int op, value &lhs, value &rhs); + value eval_regex(value &lhs, value &rhs); [[noreturn]] void expect_binary_operator(); [[noreturn]] void end_of_expression(); @@ -207,6 +209,7 @@ private: if (!tokens.empty()) tokens.pop_back(); } + Environment &environment; const std::string &name; std::vector<token> tokens; }; @@ -323,8 +326,8 @@ int expression_parser::precedence(int op) { case '==': case '!=': - case token::equivalent: - case token::not_equivalent: + case '=~': + case '!~': return 7; case '&': return 8; @@ -341,6 +344,25 @@ int expression_parser::precedence(int op) { //throw std::runtime_error("unimplemented op";); } +value expression_parser::eval_regex(value &lhs, value &rhs) { /* match lhs.string against the slash-delimited pattern in rhs.string; yields 1 or 0; any std::exception is rethrown as mpw_error(-5) naming the expression */ + try { + mpw_regex re(rhs.string, true); + // match() stores any numbered captures in the environment. + bool ok = re.match(lhs.string, environment); + return ok ? 
1 : 0; + + } catch (std::exception &ex) { + std::string error; + error = name; + if (rhs.string.empty() || rhs.string.front() != '/') + error += " - Missing /s around regular expression: "; + else + error += " - Invalid regular expression encountered: "; + error += rhs.string; + throw mpw_error(-5, error); + } +} + value expression_parser::eval(int op, value &lhs, value &rhs) { switch (op) { @@ -407,6 +429,12 @@ value expression_parser::eval(int op, value &lhs, value &rhs) { return lhs.string != rhs.string; + case '=~': + return eval_regex(lhs, rhs); + + case '!~': + return !eval_regex(lhs, rhs).number; + } // todo... throw std::runtime_error("unimplemented op"); @@ -469,8 +497,8 @@ int32_t expression_parser::evaluate() { return v.to_number(1); } -int32_t evaluate_expression(const std::string &name, std::vector<token> &&tokens) { +int32_t evaluate_expression(Environment &env, const std::string &name, std::vector<token> &&tokens) { - expression_parser p(name, std::move(tokens)); + expression_parser p(env, name, std::move(tokens)); return p.evaluate(); } diff --git a/mpw-shell-token.rl b/mpw-shell-token.rl index 0a6d5c3..80c9106 100644 --- a/mpw-shell-token.rl +++ b/mpw-shell-token.rl @@ -38,19 +38,21 @@ dstring = ["] dchar** ["] $err{ throw dstring_error(); } ; # search-forward string - fschar = escape_seq | (any - escape - [/]); - fsstring = [/] fschar** [/] $err{ throw fsstring_error(); } ; + # fschar = escape_seq | (any - escape - [/]); + fchar = [^/]; + fstring = [/] fchar** [/] $err{ throw fsstring_error(); } ; # search-backward string - bschar = escape_seq | (any - escape - [\\]); - bsstring = [\\] bschar** [\\] $err{ throw bsstring_error(); } ; + # bschar = escape_seq | (any - escape - [\\]); + bchar = [^\\]; + bstring = [\\] bchar** [\\] $err{ throw bsstring_error(); } ; action eval { eval } # > == start state (single char tokens or common prefix) # % == final state (multi char tokens w/ unique prefix) # $ == all states - char = any - ['"]; + char = any - ['"/\\]; main := 
|* ws+ >push_token; '>>' %push_token => { tokens.emplace_back(">>", '>>'); }; @@ -142,8 +144,8 @@ sstring => push_string; dstring => push_string; - fsstring => push_string; - bsstring => push_string; + fstring => push_string; + bstring => push_string; escape_seq => push_string; char => push; @@ -202,11 +204,19 @@ void unquote(token &t) { action push { scratch.push_back(fc); } escape = 0xb6; - char = any - escape - ['"]; + char = any - escape - ['"/\\]; schar = [^'] $push; sstring = ['] schar** [']; + # // and \\ strings retain the delimiter. + fchar = [^/]; + fstring = ([/] fchar** [/]) $push; + + bchar = [^\\]; + bstring = ([\\] bchar** [\\]) $push; + + ecode = 'f' ${ scratch.push_back('\f'); } | 'n' ${ scratch.push_back('\n'); } @@ -222,6 +232,8 @@ void unquote(token &t) { main := ( escape_seq | sstring + | fstring + | bstring | dstring | char $push )**; diff --git a/mpw-shell.h b/mpw-shell.h index c87df4a..a9126be 100644 --- a/mpw-shell.h +++ b/mpw-shell.h @@ -64,7 +64,7 @@ void parse_tokens(std::vector<token> &&tokens, process &p); -int32_t evaluate_expression(const std::string &name, std::vector<token> &&tokens); +int32_t evaluate_expression(Environment &e, const std::string &name, std::vector<token> &&tokens); diff --git a/phase1.cpp b/phase1.cpp index 32a50d6..7048bdf 100644 --- a/phase1.cpp +++ b/phase1.cpp @@ -25,6 +25,13 @@ enum { st_estring2_esc, st_estring3, + + st_fstring, + st_fstring_esc, + + st_bstring, + st_bstring_esc, + }; @@ -54,6 +61,8 @@ int phase1::process(unsigned char c, int st) { case st_sstring_esc: case st_estring1_esc: case st_estring2_esc: + case st_fstring_esc: + case st_bstring_esc: multiline = true; scratch.pop_back(); line++; @@ -81,6 +90,10 @@ text: return st_sstring; case '`': return st_estring; + case '/': + return st_fstring; + case '\\': + return st_bstring; default: return st_text; @@ -109,6 +122,23 @@ text: return st_sstring; break; + + case st_fstring_esc: + // fall through + case st_fstring: + if (c == '/') return st_text; + if (c == esc) 
return st_fstring_esc; + return st_fstring; + break; + + case st_bstring_esc: + // fall through + case st_bstring: + if (c == '\\') return st_text; + if (c == esc) return st_bstring_esc; + return st_bstring; + break; + case st_dstring: if (c == '\"') return st_text; if (c == esc) return st_dstring_esc; diff --git a/phase2.rl b/phase2.rl index 647affa..bad23f7 100644 --- a/phase2.rl +++ b/phase2.rl @@ -86,6 +86,13 @@ schar = [^']; sstring = ['] schar** ['] ; + fchar = [^/]; + fstring = [/] fchar** [/] ; + + bchar = [^\\]; + bstring = [\\] bchar** [\\] ; + + vchar = [^}]; vstring = [{] vchar** [}] ; @@ -112,6 +119,8 @@ | '&' '&' $parse_amp_amp | escape_seq | sstring + | fstring + | bstring | dstring | vstring | estring