// CODYlib -*- mode:c++ -*- // Copyright (C) 2020 Nathan Sidwell, nathan@acm.org // License: Apache v2.0 // Cody #include "internal.hh" // C++ #include // C #include // OS #include #include // MessageBuffer code // Lines consist of words and end with a NEWLINE (0xa) char // Whitespace characters are TAB (0x9) and SPACE (0x20) // Words consist of non-whitespace chars separated by whitespace. // Multiple lines in one transaction are indicated by ending non-final // lines with a SEMICOLON (0x3b) word, immediately before the NEWLINE // Continuations with ; preceding it // Words matching regexp [-+_/%.a-zA-Z0-9]+ need no quoting. // Quoting with '...' // Anything outside of [-+_/%.a-zA-Z0-9] needs quoting // Anything outside of <= or DEL or \' or \\ needs escaping. // Escapes are \\, \', \n, \t, \_, everything else as \? // Spaces separate words, UTF8 encoding for non-ascii chars namespace Cody { namespace Detail { static const char CONTINUE = S2C(u8";"); void MessageBuffer::BeginLine () { if (!buffer.empty ()) { // Terminate the previous line with a continuation buffer.reserve (buffer.size () + 3); buffer.push_back (S2C(u8" ")); buffer.push_back (CONTINUE); buffer.push_back (S2C(u8"\n")); } lastBol = buffer.size (); } // QUOTE means 'maybe quote', we search it for quote-needing chars void MessageBuffer::Append (char const *str, bool quote, size_t len) { if (len == ~size_t (0)) len = strlen (str); if (!len && !quote) return; // We want to quote characters outside of [-+_A-Za-z0-9/%.], anything // that could remotely be shell-active. UTF8 encoding for non-ascii. if (quote && len) { quote = false; // Scan looking for quote-needing characters. We could just // append until we find one, but that's probably confusing for (size_t ix = len; ix--;) { unsigned char c = (unsigned char)str[ix]; if (!((c >= S2C(u8"a") && c <= S2C(u8"z")) || (c >= S2C(u8"A") && c <= S2C(u8"Z")) || (c >= S2C(u8"0") && c <= S2C(u8"9")) || c == S2C(u8"-") || c == S2C(u8"+") || c == S2C(u8"_") || c == S2C(u8"/") || c == S2C(u8"%") || c == S2C(u8"."))) { quote = true; break; } } } // Maximal length of appended string buffer.reserve (buffer.size () + len * (quote ? 3 : 1) + 2); if (quote) buffer.push_back (S2C(u8"'")); for (auto *end = str + len; str != end;) { auto *e = end; if (quote) // Look for next escape-needing char. More relaxed than // the earlier needs-quoting check. for (e = str; e != end; ++e) { unsigned char c = (unsigned char)*e; if (c < S2C(u8" ") || c == 0x7f || c == S2C(u8"\\") || c == S2C(u8"'")) break; } buffer.insert (buffer.end (), str, e); str = e; if (str == end) break; buffer.push_back (S2C(u8"\\")); switch (unsigned char c = (unsigned char)*str++) { case S2C(u8"\t"): c = S2C(u8"t"); goto append; case S2C(u8"\n"): c = S2C(u8"n"); goto append; case S2C(u8"'"): case S2C(u8"\\"): append: buffer.push_back (c); break; default: // Full-on escape. Use 2 lower-case hex chars for (unsigned shift = 8; shift;) { shift -= 4; char nibble = (c >> shift) & 0xf; nibble += S2C(u8"0"); if (nibble > S2C(u8"9")) nibble += S2C(u8"a") - (S2C(u8"9") + 1); buffer.push_back (nibble); } } } if (quote) buffer.push_back (S2C(u8"'")); } void MessageBuffer::Append (char c) { buffer.push_back (c); } void MessageBuffer::AppendInteger (unsigned u) { // Sigh, even though std::to_string is C++11, we support building on // gcc 4.8, which is a C++11 compiler lacking std::to_string. so // have something horrible. std::string v (20, 0); size_t len = snprintf (const_cast (v.data ()), v.size (), "%u", u); v.erase (len); AppendWord (v); } int MessageBuffer::Write (int fd) noexcept { size_t limit = buffer.size () - lastBol; ssize_t count = write (fd, &buffer.data ()[lastBol], limit); int err = 0; if (count < 0) err = errno; else { lastBol += count; if (size_t (count) != limit) err = EAGAIN; } if (err != EAGAIN && err != EINTR) { // Reset for next message buffer.clear (); lastBol = 0; } return err; } int MessageBuffer::Read (int fd) noexcept { constexpr size_t blockSize = 200; size_t lwm = buffer.size (); size_t hwm = buffer.capacity (); if (hwm - lwm < blockSize / 2) hwm += blockSize; buffer.resize (hwm); auto iter = buffer.begin () + lwm; ssize_t count = read (fd, &*iter, hwm - lwm); buffer.resize (lwm + (count >= 0 ? count : 0)); if (count < 0) return errno; if (!count) // End of file return -1; bool more = true; for (;;) { auto newline = std::find (iter, buffer.end (), S2C(u8"\n")); if (newline == buffer.end ()) break; more = newline != buffer.begin () && newline[-1] == CONTINUE; iter = newline + 1; if (iter == buffer.end ()) break; if (!more) { // There is no continuation, but there are chars after the // newline. Truncate the buffer and return an error buffer.resize (iter - buffer.begin ()); return EINVAL; } } return more ? EAGAIN : 0; } int MessageBuffer::Lex (std::vector &result) { result.clear (); if (IsAtEnd ()) return ENOENT; Assert (buffer.back () == S2C(u8"\n")); auto iter = buffer.begin () + lastBol; for (std::string *word = nullptr;;) { char c = *iter; ++iter; if (c == S2C(u8" ") || c == S2C(u8"\t")) { word = nullptr; continue; } if (c == S2C(u8"\n")) break; if (c == CONTINUE) { // Line continuation if (word || *iter != S2C(u8"\n")) goto malformed; ++iter; break; } if (c <= S2C(u8" ") || c >= 0x7f) goto malformed; if (!word) { result.emplace_back (); word = &result.back (); } if (c == S2C(u8"'")) { // Quoted word for (;;) { c = *iter; if (c == S2C(u8"\n")) { malformed:; result.clear (); iter = std::find (iter, buffer.end (), S2C(u8"\n")); auto back = iter; if (back[-1] == CONTINUE && back[-2] == S2C(u8" ")) // Smells like a line continuation back -= 2; result.emplace_back (&buffer[lastBol], back - buffer.begin () - lastBol); ++iter; lastBol = iter - buffer.begin (); return EINVAL; } if (c < S2C(u8" ") || c >= 0x7f) goto malformed; ++iter; if (c == S2C(u8"'")) break; if (c == S2C(u8"\\")) // escape switch (c = *iter) { case S2C(u8"\\"): case S2C(u8"'"): ++iter; break; case S2C(u8"n"): c = S2C(u8"\n"); ++iter; break; case S2C(u8"_"): // We used to escape SPACE as \_, so accept that c = S2C(u8" "); ++iter; break; case S2C(u8"t"): c = S2C(u8"\t"); ++iter; break; default: { unsigned v = 0; for (unsigned nibble = 0; nibble != 2; nibble++) { c = *iter; if (c < S2C(u8"0")) { if (!nibble) goto malformed; break; } else if (c <= S2C(u8"9")) c -= S2C(u8"0"); else if (c < S2C(u8"a")) { if (!nibble) goto malformed; break; } else if (c <= S2C(u8"f")) c -= S2C(u8"a") - 10; else { if (!nibble) goto malformed; break; } ++iter; v = (v << 4) | c; } c = v; } } word->push_back (c); } } else // Unquoted character word->push_back (c); } lastBol = iter - buffer.begin (); if (result.empty ()) return ENOENT; return 0; } void MessageBuffer::LexedLine (std::string &str) { if (lastBol) { size_t pos = lastBol - 1; for (; pos; pos--) if (buffer[pos-1] == S2C(u8"\n")) break; size_t end = lastBol - 1; if (buffer[end-1] == CONTINUE && buffer[end-2] == S2C(u8" ")) // Strip line continuation end -= 2; str.append (&buffer[pos], end - pos); } } } // Detail } // Cody