YAML: Implement block scalar parsing.

This commit implements the parsing of YAML block scalars.
Some scanner code for block scalars existed before, but it could
not actually parse them.

This commit adds a new YAML node type to represent block
scalar values.

This commit also deletes the 'spec-09-27' and 'spec-09-28' tests
as they are identical to the test file 'spec-09-26'.

This commit introduces 3 new utility functions to the YAML scanner
class: `skip_s_space`, `advanceWhile` and `consumeLineBreakIfPresent`.

Reviewers: Duncan P. N. Exon Smith

Differential Revision: http://reviews.llvm.org/D9503


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@237314 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Alex Lorenz 2015-05-13 23:10:51 +00:00
parent 7c001dac7a
commit b96942f6ec
15 changed files with 363 additions and 59 deletions

View File

@ -107,6 +107,7 @@ public:
enum NodeKind { enum NodeKind {
NK_Null, NK_Null,
NK_Scalar, NK_Scalar,
NK_BlockScalar,
NK_KeyValue, NK_KeyValue,
NK_Mapping, NK_Mapping,
NK_Sequence, NK_Sequence,
@ -222,6 +223,36 @@ private:
SmallVectorImpl<char> &Storage) const; SmallVectorImpl<char> &Storage) const;
}; };
/// \brief A block scalar node is an opaque datum that can be presented as a
/// series of zero or more Unicode scalar values.
///
/// Example:
/// |
/// Hello
/// World
class BlockScalarNode : public Node {
// Out-of-line virtual method; its (empty) definition lives in the
// implementation file. NOTE(review): presumably present to pin the vtable
// to a single translation unit - confirm against project conventions.
void anchor() override;
public:
/// Construct a block scalar node of kind NK_BlockScalar.
///
/// \param D      Forwarded to the Node base constructor.
/// \param Anchor Forwarded to the Node base constructor.
/// \param Tag    Forwarded to the Node base constructor.
/// \param Value  The scalar's text. It is taken by non-const reference and
///               moved from, so the caller's string is left in a moved-from
///               state once this constructor returns.
/// \param RawVal Text whose begin()/end() pointers define this node's
///               source range.
BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
std::string &Value, StringRef RawVal)
: Node(NK_BlockScalar, D, Anchor, Tag), Value(std::move(Value)) {
// The source range covers exactly the bytes spanned by RawVal.
SMLoc Start = SMLoc::getFromPointer(RawVal.begin());
SMLoc End = SMLoc::getFromPointer(RawVal.end());
SourceRange = SMRange(Start, End);
}
/// \brief Gets the value of this node as a StringRef.
///
/// The returned StringRef refers to the string stored in this node.
StringRef getValue() const { return Value; }
/// Returns true iff \p N reports the NK_BlockScalar kind; this is the hook
/// that lets dyn_cast<BlockScalarNode>(N) work.
static inline bool classof(const Node *N) {
return N->getType() == NK_BlockScalar;
}
private:
// The scalar's text, stored by value inside the node.
std::string Value;
};
/// \brief A key and value pair. While not technically a Node under the YAML /// \brief A key and value pair. While not technically a Node under the YAML
/// representation graph, it is easier to treat them this way. /// representation graph, it is easier to treat them this way.
/// ///

View File

@ -101,6 +101,7 @@ namespace yaml {
void Node::anchor() {} void Node::anchor() {}
void NullNode::anchor() {} void NullNode::anchor() {}
void ScalarNode::anchor() {} void ScalarNode::anchor() {}
void BlockScalarNode::anchor() {}
void KeyValueNode::anchor() {} void KeyValueNode::anchor() {}
void MappingNode::anchor() {} void MappingNode::anchor() {}
void SequenceNode::anchor() {} void SequenceNode::anchor() {}
@ -128,6 +129,7 @@ struct Token : ilist_node<Token> {
TK_Key, TK_Key,
TK_Value, TK_Value,
TK_Scalar, TK_Scalar,
TK_BlockScalar,
TK_Alias, TK_Alias,
TK_Anchor, TK_Anchor,
TK_Tag TK_Tag
@ -137,6 +139,9 @@ struct Token : ilist_node<Token> {
/// of the token in the input. /// of the token in the input.
StringRef Range; StringRef Range;
/// The value of a block scalar node.
std::string Value;
Token() : Kind(TK_Error) {} Token() : Kind(TK_Error) {}
}; };
} }
@ -348,6 +353,14 @@ private:
/// b-break. /// b-break.
StringRef::iterator skip_b_break(StringRef::iterator Position); StringRef::iterator skip_b_break(StringRef::iterator Position);
/// Skip a single s-space[31] starting at Position.
///
/// An s-space is 0x20
///
/// @returns The code unit after the s-space, or Position if it's not a
/// s-space.
StringRef::iterator skip_s_space(StringRef::iterator Position);
/// @brief Skip a single s-white[33] starting at Position. /// @brief Skip a single s-white[33] starting at Position.
/// ///
/// A s-white is 0x20 | 0x9 /// A s-white is 0x20 | 0x9
@ -373,6 +386,10 @@ private:
StringRef::iterator skip_while( SkipWhileFunc Func StringRef::iterator skip_while( SkipWhileFunc Func
, StringRef::iterator Position); , StringRef::iterator Position);
/// Skip minimal well-formed code unit subsequences until Func returns its
/// input.
void advanceWhile(SkipWhileFunc Func);
/// @brief Scan ns-uri-char[39]s starting at Cur. /// @brief Scan ns-uri-char[39]s starting at Cur.
/// ///
/// This updates Cur and Column while scanning. /// This updates Cur and Column while scanning.
@ -393,6 +410,11 @@ private:
/// Pos is whitespace or a new line /// Pos is whitespace or a new line
bool isBlankOrBreak(StringRef::iterator Position); bool isBlankOrBreak(StringRef::iterator Position);
/// Consume a single b-break[28] if it's present at the current position.
///
/// Return false if the code unit at the current position isn't a line break.
bool consumeLineBreakIfPresent();
/// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
void saveSimpleKeyCandidate( TokenQueueT::iterator Tok void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
, unsigned AtColumn , unsigned AtColumn
@ -466,6 +488,30 @@ private:
/// @brief Scan a block scalar starting with | or >. /// @brief Scan a block scalar starting with | or >.
bool scanBlockScalar(bool IsLiteral); bool scanBlockScalar(bool IsLiteral);
/// Scan a chomping indicator in a block scalar header.
char scanBlockChompingIndicator();
/// Scan an indentation indicator in a block scalar header.
unsigned scanBlockIndentationIndicator();
/// Scan a block scalar header.
///
/// Return false if an error occurred.
bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator,
bool &IsDone);
/// Look for the indentation level of a block scalar.
///
/// Return false if an error occurred.
bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent,
unsigned &LineBreaks, bool &IsDone);
/// Scan the indentation of a text line in a block scalar.
///
/// Return false if an error occurred.
bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent,
bool &IsDone);
/// @brief Scan a tag of the form !stuff. /// @brief Scan a tag of the form !stuff.
bool scanTag(); bool scanTag();
@ -612,6 +658,9 @@ bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) {
case Token::TK_Scalar: case Token::TK_Scalar:
OS << "Scalar: "; OS << "Scalar: ";
break; break;
case Token::TK_BlockScalar:
OS << "Block Scalar: ";
break;
case Token::TK_Alias: case Token::TK_Alias:
OS << "Alias: "; OS << "Alias: ";
break; break;
@ -816,6 +865,13 @@ StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
return Position; return Position;
} }
StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) {
  // Step over exactly one ' ' (0x20) if there is one at Position; at the end
  // of input, or on any other code unit, hand Position back unchanged.
  const bool AtSpace = Position != End && *Position == ' ';
  return AtSpace ? Position + 1 : Position;
}
StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
if (Position == End) if (Position == End)
@ -844,6 +900,12 @@ StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
return Position; return Position;
} }
void Scanner::advanceWhile(SkipWhileFunc Func) {
  // Measure how far skip_while() gets from the current position, then move
  // both the cursor and the column counter forward by that many code units.
  const auto Distance = skip_while(Func, Current) - Current;
  Column += Distance;
  Current += Distance;
}
static bool is_ns_hex_digit(const char C) { static bool is_ns_hex_digit(const char C) {
return (C >= '0' && C <= '9') return (C >= '0' && C <= '9')
|| (C >= 'a' && C <= 'z') || (C >= 'a' && C <= 'z')
@ -906,6 +968,16 @@ bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
return false; return false;
} }
bool Scanner::consumeLineBreakIfPresent() {
auto Next = skip_b_break(Current);
if (Next == Current)
return false;
Column = 0;
++Line;
Current = Next;
return true;
}
void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
, unsigned AtColumn , unsigned AtColumn
, bool IsRequired) { , bool IsRequired) {
@ -1374,38 +1446,204 @@ bool Scanner::scanAliasOrAnchor(bool IsAlias) {
return true; return true;
} }
bool Scanner::scanBlockScalar(bool IsLiteral) { char Scanner::scanBlockChompingIndicator() {
StringRef::iterator Start = Current; char Indicator = ' ';
skip(1); // Eat | or > if (Current != End && (*Current == '+' || *Current == '-')) {
while(true) { Indicator = *Current;
StringRef::iterator i = skip_nb_char(Current); skip(1);
if (i == Current) { }
if (Column == 0) return Indicator;
break; }
i = skip_b_break(Current);
if (i != Current) { /// Get the number of line breaks after chomping.
// We got a line break. ///
Column = 0; /// Return the number of trailing line breaks to emit, depending on
++Line; /// \p ChompingIndicator.
Current = i; static unsigned getChompedLineBreaks(char ChompingIndicator,
continue; unsigned LineBreaks, StringRef Str) {
} else { if (ChompingIndicator == '-') // Strip all line breaks.
// There was an error, which should already have been printed out. return 0;
if (ChompingIndicator == '+') // Keep all line breaks.
return LineBreaks;
// Clip trailing lines.
return Str.empty() ? 0 : 1;
}
unsigned Scanner::scanBlockIndentationIndicator() {
  // An indentation indicator is a single digit in '1'..'9'. Anything else
  // (including end of input) means "no indicator": consume nothing, return 0.
  if (Current == End || *Current < '1' || *Current > '9')
    return 0;
  const unsigned Digit = unsigned(*Current - '0');
  skip(1); // Consume the digit.
  return Digit;
}
bool Scanner::scanBlockScalarHeader(char &ChompingIndicator,
unsigned &IndentIndicator, bool &IsDone) {
auto Start = Current;
ChompingIndicator = scanBlockChompingIndicator();
IndentIndicator = scanBlockIndentationIndicator();
// Check for the chomping indicator once again.
if (ChompingIndicator == ' ')
ChompingIndicator = scanBlockChompingIndicator();
Current = skip_while(&Scanner::skip_s_white, Current);
skipComment();
if (Current == End) { // EOF, we have an empty scalar.
Token T;
T.Kind = Token::TK_BlockScalar;
T.Range = StringRef(Start, Current - Start);
TokenQueue.push_back(T);
IsDone = true;
return true;
}
if (!consumeLineBreakIfPresent()) {
setError("Expected a line break after block scalar header", Current);
return false; return false;
} }
return true;
} }
Current = i;
// Auto-detect the indentation of a block scalar's content by walking lines
// from the current position until the first line with visible content.
//
// BlockIndent     [out] set to the column of the first content line found.
// BlockExitIndent [in]  a content line at or below this column ends the
//                       scalar instead of defining its indentation.
// LineBreaks      [in/out] incremented once per line break consumed while
//                       skipping leading empty / all-space lines.
// IsDone          [out] set to true when the scalar ends here (content at or
//                       below the exit indent, end of input, or no line
//                       break to consume).
//
// Returns false only after reporting an error via setError(); true otherwise.
bool Scanner::findBlockScalarIndent(unsigned &BlockIndent,
unsigned BlockExitIndent,
unsigned &LineBreaks, bool &IsDone) {
// Largest column reached by any all-space line seen so far, and where that
// line's break starts (used as the error location). LongestAllSpaceLine is
// only read after MaxAllSpaceLineCharacters became non-zero, i.e. after it
// has been assigned below.
unsigned MaxAllSpaceLineCharacters = 0;
StringRef::iterator LongestAllSpaceLine;
while (true) {
// Consume the leading spaces of this line; Column advances with them.
advanceWhile(&Scanner::skip_s_space);
if (skip_nb_char(Current) != Current) {
// This line isn't empty, so try and find the indentation.
if (Column <= BlockExitIndent) { // End of the block literal.
IsDone = true;
return true;
}
// We found the block's indentation.
BlockIndent = Column;
// An earlier all-space line that reached past this column is rejected.
if (MaxAllSpaceLineCharacters > BlockIndent) {
setError(
"Leading all-spaces line must be smaller than the block indent",
LongestAllSpaceLine);
return false;
}
return true;
}
// No content on this line. If it ends in a line break and is the widest
// all-space line so far, remember it.
if (skip_b_break(Current) != Current &&
Column > MaxAllSpaceLineCharacters) {
// Record the longest all-space line in case it's longer than the
// discovered block indent.
MaxAllSpaceLineCharacters = Column;
LongestAllSpaceLine = Current;
}
// Check for EOF.
if (Current == End) {
IsDone = true;
return true;
}
// Neither content, nor EOF, nor a line break: stop scanning the scalar.
if (!consumeLineBreakIfPresent()) {
IsDone = true;
return true;
}
++LineBreaks;
}
// Not reached: every path out of the loop above returns.
return true;
}
bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
unsigned BlockExitIndent, bool &IsDone) {
// Skip the indentation.
while (Column < BlockIndent) {
auto I = skip_s_space(Current);
if (I == Current)
break;
Current = I;
++Column; ++Column;
} }
if (Start == Current) { if (skip_nb_char(Current) == Current)
setError("Got empty block scalar", Start); return true;
if (Column <= BlockExitIndent) { // End of the block literal.
IsDone = true;
return true;
}
if (Column < BlockIndent) {
if (Current != End && *Current == '#') { // Trailing comment.
IsDone = true;
return true;
}
setError("A text line is less indented than the block scalar", Current);
return false;
}
return true; // A normal text line.
}
bool Scanner::scanBlockScalar(bool IsLiteral) {
// Eat '|' or '>'
assert(*Current == '|' || *Current == '>');
skip(1);
char ChompingIndicator;
unsigned BlockIndent;
bool IsDone = false;
if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone))
return false;
if (IsDone)
return true;
auto Start = Current;
unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
unsigned LineBreaks = 0;
if (BlockIndent == 0) {
if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
IsDone))
return false; return false;
} }
// Scan the block scalar's body.
SmallString<256> Str;
while (!IsDone) {
if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
return false;
if (IsDone)
break;
// Parse the current line.
auto LineStart = Current;
advanceWhile(&Scanner::skip_nb_char);
if (LineStart != Current) {
Str.append(LineBreaks, '\n');
Str.append(StringRef(LineStart, Current - LineStart));
LineBreaks = 0;
}
// Check for EOF.
if (Current == End)
break;
if (!consumeLineBreakIfPresent())
break;
++LineBreaks;
}
if (Current == End && !LineBreaks)
// Ensure that there is at least one line break before the end of file.
LineBreaks = 1;
Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n');
// New lines may start a simple key.
if (!FlowLevel)
IsSimpleKeyAllowed = true;
Token T; Token T;
T.Kind = Token::TK_Scalar; T.Kind = Token::TK_BlockScalar;
T.Range = StringRef(Start, Current - Start); T.Range = StringRef(Start, Current - Start);
T.Value = Str.str().str();
TokenQueue.push_back(T); TokenQueue.push_back(T);
return true; return true;
} }
@ -1607,6 +1845,7 @@ std::string Node::getVerbatimTag() const {
case NK_Null: case NK_Null:
return "tag:yaml.org,2002:null"; return "tag:yaml.org,2002:null";
case NK_Scalar: case NK_Scalar:
case NK_BlockScalar:
// TODO: Tag resolution. // TODO: Tag resolution.
return "tag:yaml.org,2002:str"; return "tag:yaml.org,2002:str";
case NK_Mapping: case NK_Mapping:
@ -2138,6 +2377,11 @@ parse_property:
, AnchorInfo.Range.substr(1) , AnchorInfo.Range.substr(1)
, TagInfo.Range , TagInfo.Range
, T.Range); , T.Range);
case Token::TK_BlockScalar:
getNext();
return new (NodeAllocator)
BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1),
TagInfo.Range, T.Value, T.Range);
case Token::TK_Key: case Token::TK_Key:
// Don't eat the TK_Key, KeyValueNode expects it. // Don't eat the TK_Key, KeyValueNode expects it.
return new (NodeAllocator) return new (NodeAllocator)

View File

@ -1,9 +1,6 @@
# RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s # RUN: not yaml-bench -canonical %s 2>&1 | FileCheck %s
# #
# FIXME: This test should actually fail. Yaml bench should report an error that # CHECK: error: Expected a line break after block scalar header
# says that the '---' and '...' document start/end markers must not be specified
# as the first content line of a non-indented plain scalar.
# CHECK: !!str
--- ---
--- ||| : foo --- ||| : foo

View File

@ -1,4 +1,8 @@
# RUN: yaml-bench -canonical %s # RUN: yaml-bench -canonical %s | FileCheck %s
# CHECK: !!str "literal\n"
# CHECK: !!str " folded\n"
# CHECK: !!str "keep\n\n"
# CHECK: !!str " strip"
- | # Just the style - | # Just the style
literal literal

View File

@ -1,4 +1,6 @@
# RUN: yaml-bench -canonical %s # RUN: yaml-bench -canonical %s | FileCheck %s
# CHECK: !!str "literal\n"
# CHECK: !!str "folded\n"
- | - |
literal literal

View File

@ -1,4 +1,8 @@
# RUN: yaml-bench -canonical %s # RUN: yaml-bench -canonical %s | FileCheck %s
# CHECK: !!str "detected\n"
# CHECK: !!str "\n\n# detected\n"
# CHECK: !!str " explicit\n"
# CHECK: !!str "\t\ndetected\n"
- | - |
detected detected

View File

@ -9,4 +9,4 @@
- |1 - |1
text text
# CHECK: error # CHECK: 8:2: error: A text line is less indented than the block scalar

View File

@ -1,6 +1,12 @@
# RUN: yaml-bench -canonical %s # RUN: yaml-bench -canonical %s | FileCheck %s
# CHECK: !!str "text"
# CHECK: !!str "text\n"
# CHECK: !!str "text\n\n"
strip: |- strip: |-
textclip: | text
text…keep: |+ clip: |
text text
keep: |+
text

View File

@ -1,8 +1,13 @@
# RUN: yaml-bench -canonical %s # RUN: yaml-bench -canonical %s | FileCheck %s
# CHECK: ? !!str "strip"
# CHECK: : !!str ""
# CHECK: ? !!str "clip"
# CHECK: : !!str ""
# CHECK: ? !!str "keep"
# CHECK: : !!str "\n"
strip: >- strip: >-
clip: > clip: >
keep: |+ keep: |+

View File

@ -1,4 +1,5 @@
# RUN: yaml-bench -canonical %s # RUN: yaml-bench -canonical %s | FileCheck %s
# CHECK: !!str "literal\n\ttext\n"
| # Simple block scalar | # Simple block scalar
literal literal

View File

@ -1,4 +1,5 @@
# RUN: yaml-bench -canonical %s # RUN: yaml-bench -canonical %s | FileCheck %s
# CHECK: !!str "\n\nliteral\n\ntext\n"
| |

View File

@ -1,10 +0,0 @@
# RUN: yaml-bench -canonical %s
|
literal
text
# Comment

View File

@ -1,10 +0,0 @@
# RUN: yaml-bench -canonical %s
|
literal
text
# Comment

View File

@ -130,6 +130,33 @@ TEST(YAMLParser, ParsesArrayOfArrays) {
ExpectParseSuccess("Array of arrays", "[[]]"); ExpectParseSuccess("Array of arrays", "[[]]");
} }
// Exercises block literal scalar inputs through the ExpectParseSuccess /
// ExpectParseError helpers (defined elsewhere in this test file).
// NOTE(review): going by their names, the helpers presumably assert that the
// parser accepts / rejects the given input - confirm against their definitions.
TEST(YAMLParser, ParsesBlockLiteralScalars) {
// Basic bodies, with and without a trailing line break at end of input.
ExpectParseSuccess("Block literal scalar", "test: |\n Hello\n World\n");
ExpectParseSuccess("Block literal scalar EOF", "test: |\n Hello\n World");
// Empty scalars: header only, header followed by another key, blank lines.
ExpectParseSuccess("Empty block literal scalar header EOF", "test: | ");
ExpectParseSuccess("Empty block literal scalar", "test: |\ntest2: 20");
ExpectParseSuccess("Empty block literal scalar 2", "- | \n \n\n \n- 42");
// Block scalars nested in a sequence and at document level.
ExpectParseSuccess("Block literal scalar in sequence",
"- |\n Testing\n Out\n\n- 22");
ExpectParseSuccess("Block literal scalar in document",
"--- |\n Document\n...");
ExpectParseSuccess("Empty non indented lines still count",
"- |\n First line\n \n\n Another line\n\n- 2");
// Header variations: comment, chomping indicators, indentation indicators.
ExpectParseSuccess("Comment in block literal scalar header",
"test: | # Comment \n No Comment\ntest 2: | # Void");
ExpectParseSuccess("Chomping indicators in block literal scalar header",
"test: |- \n Hello\n\ntest 2: |+ \n\n World\n\n\n");
ExpectParseSuccess("Indent indicators in block literal scalar header",
"test: |1 \n \n Hello \n World\n");
ExpectParseSuccess("Chomping and indent indicators in block literals",
"test: |-1\n Hello\ntest 2: |9+\n World");
ExpectParseSuccess("Trailing comments in block literals",
"test: |\n Content\n # Trailing\n #Comment\ntest 2: 3");
// Inputs passed to the error helper.
ExpectParseError("Invalid block scalar header", "test: | failure");
ExpectParseError("Invalid line indentation", "test: |\n First line\n Error");
ExpectParseError("Long leading space line", "test: |\n \n Test\n");
}
TEST(YAMLParser, HandlesEndOfFileGracefully) { TEST(YAMLParser, HandlesEndOfFileGracefully) {
ExpectParseError("In string starting with EOF", "[\""); ExpectParseError("In string starting with EOF", "[\"");
ExpectParseError("In string hitting EOF", "[\" "); ExpectParseError("In string hitting EOF", "[\" ");

View File

@ -96,6 +96,8 @@ static void dumpNode( yaml::Node *n
SmallString<32> Storage; SmallString<32> Storage;
StringRef Val = sn->getValue(Storage); StringRef Val = sn->getValue(Storage);
outs() << prettyTag(n) << " \"" << yaml::escape(Val) << "\""; outs() << prettyTag(n) << " \"" << yaml::escape(Val) << "\"";
} else if (yaml::BlockScalarNode *BN = dyn_cast<yaml::BlockScalarNode>(n)) {
outs() << prettyTag(n) << " \"" << yaml::escape(BN->getValue()) << "\"";
} else if (yaml::SequenceNode *sn = dyn_cast<yaml::SequenceNode>(n)) { } else if (yaml::SequenceNode *sn = dyn_cast<yaml::SequenceNode>(n)) {
outs() << prettyTag(n) << " [\n"; outs() << prettyTag(n) << " [\n";
++Indent; ++Indent;