From 413b9a805b7ec2f67989ac873eb252839d9ff439 Mon Sep 17 00:00:00 2001 From: Kelvin Sherlock Date: Thu, 28 Jul 2016 13:43:33 -0400 Subject: [PATCH] tokenizer -- remove quotes as separate step, return clean command line. --- mpw-shell-token.rl | 134 ++++++++++++++++++++++++++++++--------------- mpw-shell.h | 2 +- 2 files changed, 92 insertions(+), 44 deletions(-) diff --git a/mpw-shell-token.rl b/mpw-shell-token.rl index 1a3cd3d..31f447c 100644 --- a/mpw-shell-token.rl +++ b/mpw-shell-token.rl @@ -11,44 +11,31 @@ escape = 0xb6; - ws = [ \t]; - nl = '\n' | '\r'; + ws = [ \t\n\r]; action push_token { - if (!scratch.empty() || quoted) { + if (!scratch.empty()) { tokens.emplace_back(std::move(scratch)); scratch.clear(); - quoted = false; } } - action push_back { + action push { scratch.push_back(fc); } - schar = [^'] $push_back; - sstring = - ['] ${ quoted = true; } schar** ['] - $err{ throw sstring_error(); } - ; + action push_string { + scratch.append(ts, te); + } - # if eof, should push escape... - escape_seq = - escape $err{ scratch.push_back(escape); } - ( - 'f' ${scratch.push_back('\f'); } - | 'n' ${scratch.push_back('\n'); /* \r ? */ } - | 't' ${scratch.push_back('\t'); } - | [^fnt] $push_back - ) - ; + schar = [^'] ; + sstring = ['] schar** ['] $err{ throw sstring_error(); } ; + + escape_seq = escape any ; # double-quoted string. - dchar = escape_seq | (any - escape - ["]) $push_back; - dstring = - ["] ${ quoted = true; } dchar** ["] - $err{ throw dstring_error(); } - ; + dchar = escape_seq | (any - escape - ["]); + dstring = ["] dchar** ["] $err{ throw dstring_error(); } ; action eval { eval } @@ -56,7 +43,7 @@ # > == start state (single char tokens or common prefix) # % == final state (multi char tokens w/ unique prefix) # $ == all states - char = any - escape - ['"]; + char = any - ['"]; main := |* ws+ >push_token; '>>' %push_token => { tokens.emplace_back(">>", '>>'); }; @@ -141,33 +128,29 @@ %push_token => { tokens.emplace_back("-=", '-='); }; - sstring ; - dstring ; - escape_seq; + sstring => push_string; + dstring => push_string; + escape_seq => push_string; - char => push_back; + char => push; *| ; }%% -inline void replace_eval_token(token &t) { +void replace_eval_token(token &t) { %%{ machine eval_keywords; main := - /and/i %{ t.type = '&&'; } - | - /or/i %{ t.type = '||'; } - | - /not/i %{ t.type = '!'; } - | - /div/i %{ t.type = '/'; } - | - /mod/i %{ t.type = '%'; } + 'and'i %{ t.type = '&&'; } + | 'or'i %{ t.type = '||'; } + | 'not'i %{ t.type = '!'; } + | 'div'i %{ t.type = '/'; } + | 'mod'i %{ t.type = '%'; } ; }%% @@ -180,15 +163,68 @@ inline void replace_eval_token(token &t) { const char *pe = t.string.data() + t.string.size(); const char *eof = pe; int cs; + %%write init; %%write exec; } -std::vector tokenize(const std::string &s, bool eval) + + +void unquote(token &t) { + + if (t.string.find_first_of("'\"\xb6", 0, 3) == t.string.npos) return; + + int cs; + const unsigned char *p = (const unsigned char *)t.string.data(); + const unsigned char *pe = p + t.string.length(); + const unsigned char *eof = pe; + + std::string scratch; + scratch.reserve(t.string.length()); +%%{ + + machine unquote; + alphtype unsigned char; + + action push { scratch.push_back(fc); } + escape = 0xb6; + char = any - escape - ['"]; + + schar = [^'] $push; + sstring = ['] schar** [']; + + ecode = + 'f' ${ scratch.push_back('\f'); } + | 'n' ${ scratch.push_back('\n'); } + | 't' ${ scratch.push_back('\t'); } + | [^fnt] ${ scratch.push_back(fc); } + ; + + escape_seq = escape $err{ scratch.push_back(escape); } ecode; + + dchar = escape ecode | (any - escape - ["]) $push; + dstring = ["] dchar** ["]; + + main := ( + escape_seq + | sstring + | dstring + | char $push + )**; + + write data; + write init; + write exec; +}%% + + t.string = std::move(scratch); +} + + +std::vector tokenize(std::string &s, bool eval) { std::vector tokens; std::string scratch; - bool quoted = false; // found a quote character ("" creates a token) %%machine tokenizer; @@ -205,11 +241,23 @@ std::vector tokenize(const std::string &s, bool eval) %%write exec; - if (!scratch.empty() || quoted) { + if (!scratch.empty()) { tokens.emplace_back(std::move(scratch)); scratch.clear(); } + // re-build s. + s.clear(); + for (const token &t : tokens) { + s.append(t.string); + s.push_back(' '); + } + if (!s.empty()) s.pop_back(); + + for (token &t : tokens) { + if (t.type == token::text) unquote(t); + } + // alternate operator tokens for eval if (eval) { diff --git a/mpw-shell.h b/mpw-shell.h index 48fd696..dee4ec2 100644 --- a/mpw-shell.h +++ b/mpw-shell.h @@ -48,7 +48,7 @@ public: -std::vector tokenize(const std::string &s, bool eval = false); +std::vector tokenize(std::string &s, bool eval = false); std::string expand_vars(const std::string &s, const class Environment &); //std::string quote(std::string &&s);