tokenizer -- remove quotes as separate step, return clean command line.

This commit is contained in:
Kelvin Sherlock 2016-07-28 13:43:33 -04:00
parent a2a48fcba7
commit 413b9a805b
2 changed files with 92 additions and 44 deletions

View File

@ -11,44 +11,31 @@
escape = 0xb6; escape = 0xb6;
ws = [ \t]; ws = [ \t\n\r];
nl = '\n' | '\r';
action push_token { action push_token {
if (!scratch.empty() || quoted) { if (!scratch.empty()) {
tokens.emplace_back(std::move(scratch)); tokens.emplace_back(std::move(scratch));
scratch.clear(); scratch.clear();
quoted = false;
} }
} }
action push_back { action push {
scratch.push_back(fc); scratch.push_back(fc);
} }
schar = [^'] $push_back; action push_string {
sstring = scratch.append(ts, te);
['] ${ quoted = true; } schar** ['] }
$err{ throw sstring_error(); }
;
# if eof, should push escape... schar = [^'] ;
escape_seq = sstring = ['] schar** ['] $err{ throw sstring_error(); } ;
escape $err{ scratch.push_back(escape); }
( escape_seq = escape any ;
'f' ${scratch.push_back('\f'); }
| 'n' ${scratch.push_back('\n'); /* \r ? */ }
| 't' ${scratch.push_back('\t'); }
| [^fnt] $push_back
)
;
# double-quoted string. # double-quoted string.
dchar = escape_seq | (any - escape - ["]) $push_back; dchar = escape_seq | (any - escape - ["]);
dstring = dstring = ["] dchar** ["] $err{ throw dstring_error(); } ;
["] ${ quoted = true; } dchar** ["]
$err{ throw dstring_error(); }
;
action eval { eval } action eval { eval }
@ -56,7 +43,7 @@
# > == start state (single char tokens or common prefix) # > == start state (single char tokens or common prefix)
# % == final state (multi char tokens w/ unique prefix) # % == final state (multi char tokens w/ unique prefix)
# $ == all states # $ == all states
char = any - escape - ['"]; char = any - ['"];
main := |* main := |*
ws+ >push_token; ws+ >push_token;
'>>' %push_token => { tokens.emplace_back(">>", '>>'); }; '>>' %push_token => { tokens.emplace_back(">>", '>>'); };
@ -141,33 +128,29 @@
%push_token => { tokens.emplace_back("-=", '-='); }; %push_token => { tokens.emplace_back("-=", '-='); };
sstring ; sstring => push_string;
dstring ; dstring => push_string;
escape_seq; escape_seq => push_string;
char => push_back; char => push;
*| *|
; ;
}%% }%%
inline void replace_eval_token(token &t) { void replace_eval_token(token &t) {
%%{ %%{
machine eval_keywords; machine eval_keywords;
main := main :=
/and/i %{ t.type = '&&'; } 'and'i %{ t.type = '&&'; }
| | 'or'i %{ t.type = '||'; }
/or/i %{ t.type = '||'; } | 'not'i %{ t.type = '!'; }
| | 'div'i %{ t.type = '/'; }
/not/i %{ t.type = '!'; } | 'mod'i %{ t.type = '%'; }
|
/div/i %{ t.type = '/'; }
|
/mod/i %{ t.type = '%'; }
; ;
}%% }%%
@ -180,15 +163,68 @@ inline void replace_eval_token(token &t) {
const char *pe = t.string.data() + t.string.size(); const char *pe = t.string.data() + t.string.size();
const char *eof = pe; const char *eof = pe;
int cs; int cs;
%%write init; %%write init;
%%write exec; %%write exec;
} }
std::vector<token> tokenize(const std::string &s, bool eval)
// Rewrites t.string in place with quoting and escape syntax removed.
//
// As encoded by the Ragel machine below:
//   - '...'  : the characters between the single quotes are copied verbatim;
//              the quote characters themselves are not copied.
//   - "..."  : the characters between the double quotes are copied, except
//              that the escape byte (0xb6) followed by f, n or t yields
//              '\f', '\n' or '\t', and the escape byte followed by any other
//              character yields that character. The quotes are not copied.
//   - outside quotes, the same escape handling applies, and any other
//     character is copied unchanged.
//
// @param t  token whose string member is replaced by the unquoted text.
void unquote(token &t) {
// Fast path: no single quote, double quote or escape byte (3 candidate
// bytes) anywhere in the string, so leave the token untouched.
if (t.string.find_first_of("'\"\xb6", 0, 3) == t.string.npos) return;
// cs, p, pe and eof are the variables the generated Ragel code operates on:
// current state, input cursor, end of input, and end-of-file marker.
int cs;
// The machine's alphabet is declared as unsigned char (see alphtype below),
// so the input is viewed through unsigned char pointers.
const unsigned char *p = (const unsigned char *)t.string.data();
const unsigned char *pe = p + t.string.length();
// The whole token is presented as a single buffer, so eof coincides with pe.
const unsigned char *eof = pe;
// Output accumulator; reserved to the input length up front.
std::string scratch;
scratch.reserve(t.string.length());
%%{
machine unquote;
alphtype unsigned char;
action push { scratch.push_back(fc); }
escape = 0xb6;
char = any - escape - ['"];
schar = [^'] $push;
sstring = ['] schar** ['];
ecode =
'f' ${ scratch.push_back('\f'); }
| 'n' ${ scratch.push_back('\n'); }
| 't' ${ scratch.push_back('\t'); }
| [^fnt] ${ scratch.push_back(fc); }
;
escape_seq = escape $err{ scratch.push_back(escape); } ecode;
dchar = escape ecode | (any - escape - ["]) $push;
dstring = ["] dchar** ["];
main := (
escape_seq
| sstring
| dstring
| char $push
)**;
write data;
write init;
write exec;
}%%
// NOTE(review): the final machine state (cs) is not inspected here, so
// whatever was accumulated in scratch is used as-is.
t.string = std::move(scratch);
}
std::vector<token> tokenize(std::string &s, bool eval)
{ {
std::vector<token> tokens; std::vector<token> tokens;
std::string scratch; std::string scratch;
bool quoted = false; // found a quote character ("" creates a token)
%%machine tokenizer; %%machine tokenizer;
@ -205,11 +241,23 @@ std::vector<token> tokenize(const std::string &s, bool eval)
%%write exec; %%write exec;
if (!scratch.empty() || quoted) { if (!scratch.empty()) {
tokens.emplace_back(std::move(scratch)); tokens.emplace_back(std::move(scratch));
scratch.clear(); scratch.clear();
} }
// re-build s.
s.clear();
for (const token &t : tokens) {
s.append(t.string);
s.push_back(' ');
}
if (!s.empty()) s.pop_back();
for (token &t : tokens) {
if (t.type == token::text) unquote(t);
}
// alternate operator tokens for eval // alternate operator tokens for eval
if (eval) { if (eval) {

View File

@ -48,7 +48,7 @@ public:
std::vector<token> tokenize(const std::string &s, bool eval = false); std::vector<token> tokenize(std::string &s, bool eval = false);
std::string expand_vars(const std::string &s, const class Environment &); std::string expand_vars(const std::string &s, const class Environment &);
//std::string quote(std::string &&s); //std::string quote(std::string &&s);