pipe-lined lexing/parsing.

This commit is contained in:
Kelvin Sherlock 2016-01-29 22:23:14 -05:00
parent 4297c7095e
commit 2157dc0ba8
4 changed files with 445 additions and 0 deletions

39
phase1.h Normal file
View File

@ -0,0 +1,39 @@
#ifndef __phase1_h__
#define __phase1_h__
#include <string>
#include <functional>
/*
 * phase1 -- first stage of the lexing pipeline.
 *
 * Raw input bytes are fed in through process(); each completed line is
 * handed to the callback installed with operator>>=.
 */
class phase1 {
public:
	/* callback type; receives a completed line by rvalue reference. */
	using pipe_function = std::function<void(std::string &&)>;

	phase1();

	/* feed the byte range [begin, end); pass final = true for the last chunk. */
	void process(const unsigned char *begin, const unsigned char *end, bool final = false);

	/* same as above, for plain char buffers. */
	void process(const char *begin, const char *end, bool final = false) {
		const unsigned char *first = reinterpret_cast<const unsigned char *>(begin);
		const unsigned char *last = reinterpret_cast<const unsigned char *>(end);
		process(first, last, final);
	}

	/* feed the whole contents of a string (as a non-final chunk). */
	void process(const std::string &s) {
		const char *first = s.data();
		process(first, first + s.size());
	}

	/* signal end of input by processing an empty, final chunk. */
	void finish() {
		const char *nothing = "";
		process(nothing, nothing, true);
	}

	void reset();

	//template<class F>
	//phase1 &operator >>= (F &&f) { pipe_to = pipe_function(f); return *this; }

	/* install the callback that receives completed lines; returns *this. */
	phase1 &operator >>= (pipe_function f) {
		pipe_to = f;
		return *this;
	}

private:
	std::string scratch;    /* line currently being assembled. */
	pipe_function pipe_to;  /* downstream consumer; may be empty. */
	int line = 1;           /* line counter, incremented on each newline consumed. */
	int cs = 0;             /* current state of the ragel state machine. */
};
#endif

180
phase1.rl Normal file
View File

@ -0,0 +1,180 @@
/*
* phase1 -- strip comments and merge multi-lines.
*
*/
#include "phase1.h"
#include <stdexcept>
#include <stdint.h>
/*
 * the shell escape byte. the ragel machine below defines the same value under
 * the same name. presumably this is the MacRoman partial-differential sign
 * that MPW uses as its escape character -- TODO confirm.
 */
const unsigned char escape = 0xb6;
/*
* From experimentation, MPW splits commands on ';' after variable expansion,
* whereas this implementation splits before expansion. Something contrived like:
* set q '"'; echo {q} ; "
* will therefore not be handled correctly. Oh well.
* (We should probably just drop that; we could then combine tokenizing with
* variable expansion.)
*/
%%{
# ragel state machine for phase 1: strips comments and merges continued lines.
machine main;
# the input alphabet is unsigned bytes.
alphtype unsigned char;
# the escape byte; same value as the C++ constant named escape.
escape = 0xb6;
# horizontal white space: space or tab.
ws = [ \t];
# either line terminator byte counts as a newline.
nl = ('\n' | '\r');
# end-of-line action: trims trailing white space from the pending line, hands a
# non-empty line to the downstream callback, then restarts at the main entry.
action add_line {
	/*
	 * strip trailing ws.
	 * scratch may hold bytes >= 0x80 (the escape byte is 0xb6); passing a
	 * negative char to isspace is undefined, so convert to unsigned char.
	 */
	while (!scratch.empty() && isspace(static_cast<unsigned char>(scratch.back()))) scratch.pop_back();
	if (!scratch.empty()) {
		/* move the text out first so scratch is empty while the callback runs. */
		std::string tmp = std::move(scratch);
		scratch.clear();
		if (pipe_to) pipe_to(std::move(tmp));
	}
	fgoto main;
}
# append the current input byte to the pending line.
action push_back {
scratch.push_back(fc);
}
# append the escape byte and then the current input byte to the pending line.
# used for the byte that follows an escape, since the escape byte itself
# carries no action in escape_seq.
action push_back_escape {
scratch.push_back(escape);
scratch.push_back(fc);
}
# a comment: '#' followed by any run of non-newline bytes. no action is
# attached, so none of these bytes are appended to scratch.
comment = '#' (any-nl)*;
# an escape byte followed by one more byte:
# - escape + newline: only the line counter is incremented; nothing is stored.
# - escape + any other byte: the escape byte and that byte are both stored.
escape_seq =
escape
(
nl ${ /* esc newline */ line++; }
|
(any-nl) $push_back_escape
)
;
# single-quoted string. only escape \n is special.
# handling is so stupid I'm not going to support it.
# every byte, quotes included, is copied to scratch; a newline is not accepted
# inside the string. the $err action throws std::runtime_error.
sstring =
['] $push_back
( (any-nl-[']) $push_back )*
['] $push_back
$err{
throw std::runtime_error("### MPW Shell - 's must occur in pairs.");
}
;
# same quoting logic as ' string
# ({...} text: braces and contents are copied verbatim, no newline inside.)
vstring =
'{' $push_back
( (any-nl-'}') $push_back )*
'}' $push_back
$err{
throw std::runtime_error("### MPW Shell - {s must occur in pairs.");
}
;
# double-quoted string.
# escape \n is ignored. others do nothing.
# the body is a mix of escape sequences, {...} sections (vstring) and ordinary
# bytes; ordinary bytes and both quotes are copied to scratch.
dstring =
["] $push_back
(
escape_seq
|
vstring
|
(any-escape-nl-["{]) $push_back
)* ["] $push_back
$err{
throw std::runtime_error("### MPW Shell - \"s must occur in pairs.");
}
;
# this is a mess ...
# a run of blanks, possibly interleaved with escape-newline continuations
# (each of which increments the line counter), is replaced by one space.
# the space is pushed when the byte after the run is seen; fhold then leaves
# that byte to be processed again by the enclosing rule.
coalesce_ws =
ws
(
ws
|
escape nl ${ line++; }
)*
<:
any ${ scratch.push_back(' '); fhold; }
;
# body of one logical line: any mix of strings, escape sequences, collapsed
# white space and ordinary bytes, then an optional comment, then the newline.
# the newline increments the line counter and runs add_line.
line :=
(
sstring
|
dstring
|
vstring
|
escape_seq
|
coalesce_ws
|
(any-escape-nl-ws-[#'"{]) $push_back
)*
comment?
nl ${ line++; } $add_line
;
# entry point: skips leading blanks, then re-reads the first other byte in
# the line machine (fhold + fgoto).
main :=
# strip leading whitespace.
ws*
<: # left guard -- higher priority to ws.
any ${ fhold; fgoto line; }
;
}%%
/* the ragel-generated tables and constants for the machine are emitted here, in an anonymous namespace. */
namespace {
%% write data;
}
/* construct the lexer; the ragel init code sets up the state machine (cs). */
phase1::phase1() {
%% write init;
}
/*
 * Throw away any partially assembled line and re-run the ragel init code so
 * the state machine starts over.
 * NOTE(review): the line counter is deliberately left alone here; whether it
 * should go back to 1 is still an open question.
 */
void phase1::reset() {
	scratch.clear();
	%% write init;
	// line = 1?
}
/*
 * Run the state machine over the bytes in [begin, end).
 *
 * The machine state (cs) and the partially assembled line (scratch) are
 * members, so input may be supplied in several chunks. Completed lines are
 * delivered to pipe_to by the add_line action.
 *
 * begin, end -- the chunk of input to consume.
 * final      -- true when this is the last chunk; only then is the ragel
 *               eof pointer set.
 *
 * Throws std::runtime_error when the machine ends in its error state; the
 * string rules also throw std::runtime_error from their $err actions.
 *
 * NOTE(review): add_line only runs on a newline, so a last line without a
 * terminator appears to stay in scratch after the final chunk -- confirm
 * whether callers always terminate their input.
 */
void phase1::process(const unsigned char *begin, const unsigned char *end, bool final) {

	/* p, pe and eof are the variable names the ragel-generated code uses. */
	const unsigned char *p = begin;
	const unsigned char *pe = end;
	const unsigned char *eof = nullptr;

	if (final)
		eof = pe;

	%% write exec;

	if (cs == main_error) {
		throw std::runtime_error("MPW Shell - Lexer error.");
	}
#if 0
	if (cs != main_start && final) {
		// will this happen?
		throw std::runtime_error("MPW Shell - Lexer error.");
	}
#endif
}

44
phase2.h Normal file
View File

@ -0,0 +1,44 @@
#ifndef __phase2_h__
#define __phase2_h__
#include <string>
#include <vector>
#include <functional>
#include <memory>
typedef std::unique_ptr<struct command> command_ptr;
typedef std::vector<command_ptr> command_ptr_vector;

/*
 * phase2 -- second stage of the pipeline.
 *
 * Takes complete lines via process(), splits them into typed tokens and
 * passes each token to the virtual parse() hook. Commands collected in
 * command_queue are forwarded to the callback installed with operator>>=.
 */
class phase2 {
public:
	/* callback type; receives each queued command by rvalue reference. */
	typedef std::function<void(command_ptr &&)> pipe_function;

	/*
	 * the class has virtual members and is meant to be derived from, so it
	 * needs a virtual destructor for deletion through a phase2 pointer to be
	 * well defined. the other special members are defaulted explicitly so
	 * that declaring the destructor does not take away construction or moves.
	 */
	phase2() = default;
	phase2(const phase2 &) = default;
	phase2(phase2 &&) = default;
	phase2 &operator=(const phase2 &) = default;
	phase2 &operator=(phase2 &&) = default;
	virtual ~phase2() = default;

	/* tokenize one complete line and forward the queued commands. */
	void process(const std::string &line);

	/* signal end of input to the parser and forward the queued commands. */
	void finish();

	/* parser hooks; defined outside this header. */
	virtual void syntax_error();
	virtual void parse_accept();
	virtual void parse(int, std::string &&);

	/* install the callback that receives queued commands; returns *this. */
	phase2 &operator >>=(pipe_function f) { pipe_to = f; return *this; }

private:
	std::string scratch;     /* text of the token being assembled. */
	int type = 0;            /* token type of scratch; 0 means not yet classified. */
	bool error = false;
	bool immediate = false;
	pipe_function pipe_to;   /* downstream consumer; may be empty. */

	void flush();
	bool special();
	void classify();
	void exec();

	command_ptr_vector command_queue;  /* commands waiting to be forwarded by exec(). */
};
#endif

182
phase2.rl Normal file
View File

@ -0,0 +1,182 @@
/*
* phase2 -- parse a line into major control structures (begin/end/if/etc)
* input is a full line -- comments have been removed, escape-nl handled, trailing newline stripped.
*
*/
#include "mpw-shell-grammar.h"
#include "phase2.h"
#include "command.h"
%%{
# ragel scanner that splits one line into tokens for the parser.
machine main;
alphtype unsigned char;
# condition used with "when": true if special() returns false for the text
# gathered so far.
action not_special { !special() }
ws = [ \t];
main := |*
# the operators below are only recognized while not_special holds; otherwise
# their bytes are matched by the catch-all rule at the bottom and appended to
# scratch.
'||' when not_special => {
flush();
parse(PIPE_PIPE, std::string(ts, te));
};
'&&' when not_special => {
flush();
parse(AMP_AMP, std::string(ts, te));
};
'(' when not_special => {
flush();
parse(LPAREN, std::string(ts, te));
};
# ) may include redirection so start a new token but don't parse it yet.
')' when not_special => {
flush();
scratch.push_back(fc);
type = RPAREN;
};
# todo -- also add in strings and escapes.
# a semicolon always ends the pending token and is parsed as SEMI.
';' => { flush(); parse(SEMI, ";"); };
# white space is kept only inside a token; leading white space is dropped.
ws => { if (!scratch.empty()) scratch.push_back(fc); };
# anything else is appended to the pending token.
any => { scratch.push_back(fc); };
*|;
}%%
%%{
# ragel scanner that examines the start of scratch and sets the token type.
# each rule stores the type and returns from the enclosing C++ function.
machine classify;
alphtype unsigned char;
ws = [ \t];
# keywords are matched case-insensitively (the /i flag).
IF = /if/i;
ELSE = /else/i;
END = /end/i;
BEGIN = /begin/i;
EVALUATE = /evaluate/i;
# a keyword counts only if it is the whole text (the %eof form) or is
# followed by white space.
main := |*
IF %eof{ type = IF; return; };
IF ws => { type = IF; return; };
ELSE %eof{ type = ELSE; return; };
ELSE ws => { type = ELSE; return; };
ELSE ws+ IF %eof{ type = ELSE_IF; return; };
ELSE ws+ IF ws => { type = ELSE_IF; return; };
EVALUATE %eof{ type = EVALUATE; return; };
EVALUATE ws => { type = EVALUATE; return; };
END %eof{ type = END; return; };
END ws => { type = END; return; };
BEGIN %eof{ type = BEGIN; return; };
BEGIN ws => { type = BEGIN; return; };
# NOTE(review): a closing paren is given type LPAREN here, while the main
# scanner uses RPAREN for ')'. looks like a typo -- confirm the intended type.
')' => { type = LPAREN; return; };
*|;
}%%
/* ragel-generated data for both machines (classify and main) is emitted here, in an anonymous namespace. */
namespace {
%% machine classify;
%% write data;
%% machine main;
%% write data;
}
/*
 * Finish the token currently held in scratch.
 *
 * Trailing white space is removed; if any text remains it is classified
 * (unless a type was already assigned) and passed to parse(). The token
 * state is then reset for the next token.
 */
void phase2::flush() {
	// remove white space...
	// scratch may hold bytes >= 0x80; passing a negative char to isspace is
	// undefined, so convert to unsigned char first.
	while (!scratch.empty() && isspace(static_cast<unsigned char>(scratch.back()))) scratch.pop_back();

	if (!scratch.empty()) {
		if (!type) classify();
		parse(type, std::move(scratch));
	}

	type = 0;
	// scratch was moved from above; clear() puts it back in a known empty state.
	scratch.clear();
}
/*
 * Report whether the pending token is one of IF, ELSE, ELSE_IF or EVALUATE.
 * The token is classified first if it has no type yet.
 */
bool phase2::special() {
	if (type == 0) classify();

	return type == IF
		|| type == ELSE
		|| type == ELSE_IF
		|| type == EVALUATE;
}
/*
 * Assign a type to the text in scratch by running the classify machine.
 * Does nothing when a type is already set or scratch is empty. The type
 * defaults to COMMAND; a matching rule in the machine overrides it and
 * returns from this function directly.
 */
void phase2::classify() {
	if (type != 0 || scratch.empty()) return;

	/* variables referenced by the ragel-generated code. */
	const unsigned char *p = reinterpret_cast<const unsigned char *>(scratch.data());
	const unsigned char *pe = p + scratch.size();
	const unsigned char *eof = pe;
	const unsigned char *ts;
	const unsigned char *te;
	int cs;
	int act;

	type = COMMAND;

	%% machine classify;
	%% write init;
	%% write exec;
}
/*
 * Tokenize one complete line.
 *
 * The token state is reset, the main scanner runs over the line, any pending
 * token is flushed, an NL token is sent to the parser, and the queued
 * commands are forwarded.
 */
void phase2::process(const std::string &line) {
	/* variables referenced by the ragel-generated code. */
	const unsigned char *p = reinterpret_cast<const unsigned char *>(line.data());
	const unsigned char *pe = p + line.size();
	const unsigned char *eof = pe;
	const unsigned char *ts;
	const unsigned char *te;
	int cs;
	int act;

	/* start from a clean token. */
	type = 0;
	scratch.clear();

	%% machine main;
	%% write init;
	%% write exec;

	flush();
	parse(NL, std::string());
	exec();
}
/*
 * Signal end of input: pass token type 0 with empty text to the parser,
 * then forward the queued commands.
 */
void phase2::finish() {
	parse(0, std::string());
	exec();
}
void phase2::exec() {
if (pipe_to) {
for (auto &p : command_queue) {
if (p) {
pipe_to(std::move(p));
}
}
command_queue.clear();
}
}