1 
2 //          Copyright Ferdinand Majerech 2011-2014.
3 // Distributed under the Boost Software License, Version 1.0.
4 //    (See accompanying file LICENSE_1_0.txt or copy at
5 //          http://www.boost.org/LICENSE_1_0.txt)
6 
7 /// YAML scanner.
8 /// Code based on PyYAML: http://www.pyyaml.org
9 module dyaml.scanner;
10 
11 
12 import core.stdc.string;
13 
14 import std.algorithm;
15 import std.array;
16 import std.conv;
17 import std.ascii : isAlphaNum, isDigit, isHexDigit;
18 import std.exception;
19 import std.string;
20 import std.typecons;
21 import std.traits : Unqual;
22 import std.utf;
23 
24 import dyaml.escapes;
25 import dyaml.exception;
26 import dyaml.queue;
27 import dyaml.reader;
28 import dyaml.style;
29 import dyaml.token;
30 
31 package:
32 /// Scanner produces tokens of the following types:
33 /// STREAM-START
34 /// STREAM-END
35 /// DIRECTIVE(name, value)
36 /// DOCUMENT-START
37 /// DOCUMENT-END
38 /// BLOCK-SEQUENCE-START
39 /// BLOCK-MAPPING-START
40 /// BLOCK-END
41 /// FLOW-SEQUENCE-START
42 /// FLOW-MAPPING-START
43 /// FLOW-SEQUENCE-END
44 /// FLOW-MAPPING-END
45 /// BLOCK-ENTRY
46 /// FLOW-ENTRY
47 /// KEY
48 /// VALUE
49 /// ALIAS(value)
50 /// ANCHOR(value)
51 /// TAG(value)
52 /// SCALAR(value, plain, style)
53 
/// Matches a line break or NUL (NUL marks the end of the buffered stream).
alias isBreak = among!('\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Matches a space, a line break, or NUL.
alias isBreakOrSpace = among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Matches any whitespace (space/tab), any line break, or NUL.
alias isWhiteSpace = among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Matches whitespace that is not a line break (space or tab only).
alias isNonLinebreakWhitespace = among!(' ', '\t');

/// Characters that cannot begin a plain scalar (indicators, whitespace, breaks, NUL).
alias isNonScalarStartCharacter = among!('-', '?', ':', ',', '[', ']', '{', '}',
    '#', '&', '*', '!', '|', '>', '\'', '"', '%', '@', '`', ' ', '\t', '\0', '\n',
    '\r', '\u0085', '\u2028', '\u2029');

/// Punctuation characters permitted (alongside alphanumerics) in URIs and tags.
alias isURIChar = among!('-', ';', '/', '?', ':', '@', '&', '=', '+', '$', ',',
    '_', '.', '!', '~', '*', '\'', '(', ')', '[', ']', '%');

/// Matches a space or any line break (but not NUL).
alias isNSChar = among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Matches line break characters only.
alias isBChar = among!('\n', '\r', '\u0085', '\u2028', '\u2029');

/// Characters that end or need special handling in a flow-scalar run:
/// whitespace, breaks, NUL, quotes and backslash.
alias isFlowScalarBreakSpace = among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029', '\'', '"', '\\');
74 
/// Marked exception thrown at scanner errors.
///
/// See_Also: MarkedYAMLException
class ScannerException : MarkedYAMLException
{
    // Constructors (message + mark, and context + context-mark variants)
    // are generated by this mixin.
    mixin MarkedExceptionCtors;
}
82 
83 /// Generates tokens from data provided by a Reader.
84 struct Scanner
85 {
86     private:
        /// A simple key is a key that is not denoted by the '?' indicator.
        /// For example:
        ///   ---
        ///   block simple key: value
        ///   ? not a simple key:
        ///   : { flow simple key: value }
        /// We emit the KEY token before all keys, so when we find a potential simple
        /// key, we try to locate the corresponding ':' indicator. Simple keys should be
        /// limited to a single line and 1024 characters.
        ///
        /// 16 bytes on 64-bit.
        static struct SimpleKey
        {
            /// Character index in reader where the key starts.
            /// uint.max by default so a default-constructed entry is visibly invalid.
            uint charIndex = uint.max;
            /// Index of the key token from start (first token scanned being 0).
            uint tokenIndex;
            /// Line the key starts at.
            uint line;
            /// Column the key starts at (clamped to ushort.max when saved).
            ushort column;
            /// Is this required to be a simple key?
            bool required;
            /// Is this struct "null" (invalid)?.
            bool isNull;
        }
113 
        /// Block chomping types, controlling trailing line breaks of block scalars.
        enum Chomping
        {
            /// Strip all trailing line breaks. '-' indicator.
            strip,
            /// Line break of the last line is preserved, others discarded. Default.
            clip,
            /// All trailing line breaks are preserved. '+' indicator.
            keep
        }
124 
        /// Reader used to read from a file/stream.
        Reader reader_;
        /// Are we done scanning? Set when STREAM-END has been emitted.
        bool done_;

        /// Level of nesting in flow context. If 0, we're in block context.
        uint flowLevel_;
        /// Current indentation level. -1 means "before the document root".
        int indent_ = -1;
        /// Past indentation levels. Used as a stack.
        Appender!(int[]) indents_;

        /// Processed tokens not yet emitted. Used as a queue.
        Queue!Token tokens_;

        /// Number of tokens emitted through the getToken method.
        uint tokensTaken_;

        /// Can a simple key start at the current position? A simple key may start:
        /// - at the beginning of the line, not counting indentation spaces
        ///       (in block context),
        /// - after '{', '[', ',' (in the flow context),
        /// - after '?', ':', '-' (in the block context).
        /// In the block context, this flag also signifies if a block collection
        /// may start at the current position.
        bool allowSimpleKey_ = true;

        /// Possible simple keys indexed by flow levels.
        SimpleKey[] possibleSimpleKeys_;
154 
155     public:
        /// Construct a Scanner using specified Reader.
        this(Reader reader) @safe nothrow
        {
            reader_   = reader;
            // Queue the initial STREAM-START token so front() has something to return.
            fetchStreamStart();
        }
163 
        /// Advance to the next token, removing the current one from the queue.
        void popFront() @safe
        {
            // Track how many tokens have been handed out; simple-key bookkeeping
            // (tokenIndex in SimpleKey) is relative to this count.
            ++tokensTaken_;
            tokens_.pop();
        }
170 
        /// Return the current token without removing it from the queue.
        ///
        /// Throws: Exception if there is no token left.
        const(Token) front() @safe
        {
            // empty() fetches tokens as a side effect, so after this check
            // the queue is guaranteed non-empty.
            enforce(!empty, "No token left to peek");
            return tokens_.peek();
        }
177 
        /// Return whether there are any more tokens left.
        ///
        /// Fetches tokens from the reader as a side effect until either a token
        /// is available or the stream is exhausted.
        bool empty() @safe
        {
            while (needMoreTokens())
            {
                fetchToken();
            }
            return tokens_.empty;
        }
187 
        /// Set file name (used in error message marks).
        void name(string name) @safe pure nothrow @nogc
        {
            reader_.name = name;
        }
193 
194     private:
195         /// Most scanning error messages have the same format; so build them with this
196         /// function.
197         string expected(T)(string expected, T found)
198         {
199             return text("expected ", expected, ", but found ", found);
200         }
201 
        /// Determine whether or not we need to fetch more tokens before peeking/getting a token.
        bool needMoreTokens() @safe pure
        {
            // Once STREAM-END was emitted there is nothing more to fetch.
            if(done_)         { return false; }
            if(tokens_.empty) { return true; }

            /// The current token may be a potential simple key, so we need to look further.
            stalePossibleSimpleKeys();
            // If the next token to be emitted might still become a KEY, keep fetching
            // until the ':' (or the key going stale) resolves it.
            return nextPossibleSimpleKey() == tokensTaken_;
        }
212 
        /// Fetch a token, adding it to tokens_.
        ///
        /// Dispatches on the next character in the stream to the appropriate
        /// fetch* method. Throws ScannerException if no token can start here.
        void fetchToken() @safe
        {
            // Eat whitespaces and comments until we reach the next token.
            scanToNextToken();

            // Remove obsolete possible simple keys.
            stalePossibleSimpleKeys();

            // Compare current indentation and column. It may add some tokens
            // and decrease the current indentation level.
            unwindIndent(reader_.column);

            // Get the next character.
            const dchar c = reader_.peekByte();

            // Fetch the token.
            if(c == '\0')            { return fetchStreamEnd();     }
            if(checkDirective())     { return fetchDirective();     }
            if(checkDocumentStart()) { return fetchDocumentStart(); }
            if(checkDocumentEnd())   { return fetchDocumentEnd();   }
            // Order of the following checks is NOT significant.
            switch(c)
            {
                case '[':  return fetchFlowSequenceStart();
                case '{':  return fetchFlowMappingStart();
                case ']':  return fetchFlowSequenceEnd();
                case '}':  return fetchFlowMappingEnd();
                case ',':  return fetchFlowEntry();
                case '!':  return fetchTag();
                case '\'': return fetchSingle();
                case '\"': return fetchDouble();
                case '*':  return fetchAlias();
                case '&':  return fetchAnchor();
                // '?', ':' and '-' may also start a plain scalar, hence the fallthrough.
                case '?':  if(checkKey())        { return fetchKey();        } goto default;
                case ':':  if(checkValue())      { return fetchValue();      } goto default;
                case '-':  if(checkBlockEntry()) { return fetchBlockEntry(); } goto default;
                // Block scalars are only valid in block context.
                case '|':  if(flowLevel_ == 0)   { return fetchLiteral();    } break;
                case '>':  if(flowLevel_ == 0)   { return fetchFolded();     } break;
                default:   if(checkPlain())      { return fetchPlain();      }
            }

            throw new ScannerException("While scanning for the next token, found character " ~
                                       "\'%s\', index %s that cannot start any token"
                                       .format(c, to!int(c)), reader_.mark);
        }
259 
260 
261         /// Return the token number of the nearest possible simple key.
262         uint nextPossibleSimpleKey() @safe pure nothrow @nogc
263         {
264             uint minTokenNumber = uint.max;
265             foreach(k, ref simpleKey; possibleSimpleKeys_)
266             {
267                 if(simpleKey.isNull) { continue; }
268                 minTokenNumber = min(minTokenNumber, simpleKey.tokenIndex);
269             }
270             return minTokenNumber;
271         }
272 
273         /// Remove entries that are no longer possible simple keys.
274         ///
275         /// According to the YAML specification, simple keys
276         /// - should be limited to a single line,
277         /// - should be no longer than 1024 characters.
278         /// Disabling this will allow simple keys of any length and
279         /// height (may cause problems if indentation is broken though).
280         void stalePossibleSimpleKeys() @safe pure
281         {
282             foreach(level, ref key; possibleSimpleKeys_)
283             {
284                 if(key.isNull) { continue; }
285                 if(key.line != reader_.line || reader_.charIndex - key.charIndex > 1024)
286                 {
287                     enforce(!key.required,
288                             new ScannerException("While scanning a simple key",
289                                                  Mark(reader_.name, key.line, key.column),
290                                                  "could not find expected ':'", reader_.mark));
291                     key.isNull = true;
292                 }
293             }
294         }
295 
        /// Check if the next token starts a possible simple key and if so, save its position.
        ///
        /// This function is called for ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
        void savePossibleSimpleKey() @safe pure
        {
            // Check if a simple key is required at the current position.
            // It is required only in block context, at the exact indentation column.
            const required = (flowLevel_ == 0 && indent_ == reader_.column);
            assert(allowSimpleKey_ || !required, "A simple key is required only if it is " ~
                   "the first token in the current line. Therefore it is always allowed.");

            if(!allowSimpleKey_) { return; }

            // The next token might be a simple key, so save its number and position.
            removePossibleSimpleKey();
            // Absolute index of the token about to be scanned.
            const tokenCount = tokensTaken_ + cast(uint)tokens_.length;

            const line   = reader_.line;
            const column = reader_.column;
            // Column is clamped to fit the 16-bit SimpleKey.column field.
            const key    = SimpleKey(cast(uint)reader_.charIndex, tokenCount, line,
                                     cast(ushort)min(column, ushort.max), required);

            // Grow the per-flow-level array on demand.
            if(possibleSimpleKeys_.length <= flowLevel_)
            {
                const oldLength = possibleSimpleKeys_.length;
                possibleSimpleKeys_.length = flowLevel_ + 1;
                //No need to initialize the last element, it's already done in the next line.
                possibleSimpleKeys_[oldLength .. flowLevel_] = SimpleKey.init;
            }
            possibleSimpleKeys_[flowLevel_] = key;
        }
326 
327         /// Remove the saved possible key position at the current flow level.
328         void removePossibleSimpleKey() @safe pure
329         {
330             if(possibleSimpleKeys_.length <= flowLevel_) { return; }
331 
332             if(!possibleSimpleKeys_[flowLevel_].isNull)
333             {
334                 const key = possibleSimpleKeys_[flowLevel_];
335                 enforce(!key.required,
336                         new ScannerException("While scanning a simple key",
337                                              Mark(reader_.name, key.line, key.column),
338                                              "could not find expected ':'", reader_.mark));
339                 possibleSimpleKeys_[flowLevel_].isNull = true;
340             }
341         }
342 
343         /// Decrease indentation, removing entries in indents_.
344         ///
345         /// Params:  column = Current column in the file/stream.
346         void unwindIndent(const int column) @safe
347         {
348             if(flowLevel_ > 0)
349             {
350                 // In flow context, tokens should respect indentation.
351                 // The condition should be `indent >= column` according to the spec.
352                 // But this condition will prohibit intuitively correct
353                 // constructions such as
354                 // key : {
355                 // }
356 
357                 // In the flow context, indentation is ignored. We make the scanner less
358                 // restrictive than what the specification requires.
359                 // if(pedantic_ && flowLevel_ > 0 && indent_ > column)
360                 // {
361                 //     throw new ScannerException("Invalid intendation or unclosed '[' or '{'",
362                 //                                reader_.mark)
363                 // }
364                 return;
365             }
366 
367             // In block context, we may need to issue the BLOCK-END tokens.
368             while(indent_ > column)
369             {
370                 indent_ = indents_.data.back;
371                 assert(indents_.data.length);
372                 indents_.shrinkTo(indents_.data.length - 1);
373                 tokens_.push(blockEndToken(reader_.mark, reader_.mark));
374             }
375         }
376 
377         /// Increase indentation if needed.
378         ///
379         /// Params:  column = Current column in the file/stream.
380         ///
381         /// Returns: true if the indentation was increased, false otherwise.
382         bool addIndent(int column) @safe
383         {
384             if(indent_ >= column){return false;}
385             indents_ ~= indent_;
386             indent_ = column;
387             return true;
388         }
389 
390 
        /// Add STREAM-START token, carrying the detected stream encoding.
        void fetchStreamStart() @safe nothrow
        {
            tokens_.push(streamStartToken(reader_.mark, reader_.mark, reader_.encoding));
        }
396 
        /// Add STREAM-END token and mark scanning as finished.
        void fetchStreamEnd() @safe
        {
            // Set indentation to -1, emitting any pending BLOCK-END tokens.
            unwindIndent(-1);
            removePossibleSimpleKey();
            allowSimpleKey_ = false;
            possibleSimpleKeys_.destroy;

            tokens_.push(streamEndToken(reader_.mark, reader_.mark));
            done_ = true;
        }
409 
410         /// Add DIRECTIVE token.
411         void fetchDirective() @safe
412         {
413             // Set intendation to -1 .
414             unwindIndent(-1);
415             // Reset simple keys.
416             removePossibleSimpleKey();
417             allowSimpleKey_ = false;
418 
419             auto directive = scanDirective();
420             tokens_.push(directive);
421         }
422 
        /// Add DOCUMENT-START or DOCUMENT-END token.
        void fetchDocumentIndicator(TokenID id)()
            if(id == TokenID.documentStart || id == TokenID.documentEnd)
        {
            // Set indentation to -1 .
            unwindIndent(-1);
            // Reset simple keys. Note that there can't be a block collection after '---'.
            removePossibleSimpleKey();
            allowSimpleKey_ = false;

            Mark startMark = reader_.mark;
            // Consume the 3-character indicator ('---' or '...').
            reader_.forward(3);
            tokens_.push(simpleToken!id(startMark, reader_.mark));
        }

        /// Aliases to add DOCUMENT-START or DOCUMENT-END token.
        alias fetchDocumentStart = fetchDocumentIndicator!(TokenID.documentStart);
        alias fetchDocumentEnd = fetchDocumentIndicator!(TokenID.documentEnd);
441 
        /// Add FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
        void fetchFlowCollectionStart(TokenID id)() @safe
        {
            // '[' and '{' may start a simple key.
            savePossibleSimpleKey();
            // Simple keys are allowed after '[' and '{'.
            allowSimpleKey_ = true;
            ++flowLevel_;

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(simpleToken!id(startMark, reader_.mark));
        }

        /// Aliases to add FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
        alias fetchFlowSequenceStart = fetchFlowCollectionStart!(TokenID.flowSequenceStart);
        alias fetchFlowMappingStart = fetchFlowCollectionStart!(TokenID.flowMappingStart);
459 
        /// Add FLOW-SEQUENCE-END or FLOW-MAPPING-END token.
        void fetchFlowCollectionEnd(TokenID id)()
        {
            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // No simple keys after ']' and '}'.
            allowSimpleKey_ = false;
            --flowLevel_;

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(simpleToken!id(startMark, reader_.mark));
        }

        /// Aliases to add FLOW-SEQUENCE-END or FLOW-MAPPING-END token.
        alias fetchFlowSequenceEnd = fetchFlowCollectionEnd!(TokenID.flowSequenceEnd);
        alias fetchFlowMappingEnd = fetchFlowCollectionEnd!(TokenID.flowMappingEnd);
477 
478         /// Add FLOW-ENTRY token;
479         void fetchFlowEntry() @safe
480         {
481             // Reset possible simple key on the current level.
482             removePossibleSimpleKey();
483             // Simple keys are allowed after ','.
484             allowSimpleKey_ = true;
485 
486             Mark startMark = reader_.mark;
487             reader_.forward();
488             tokens_.push(flowEntryToken(startMark, reader_.mark));
489         }
490 
        /// Additional checks used in block context in fetchBlockEntry and fetchKey.
        ///
        /// Params:  type = String representing the token type we might need to add.
        ///          id   = Token type we might need to add.
        void blockChecks(string type, TokenID id)()
        {
            enum context = type ~ " keys are not allowed here";
            // Are we allowed to start a key (not necessarily a simple one)?
            enforce(allowSimpleKey_, new ScannerException(context, reader_.mark));

            // If indentation increased, this starts a new block collection.
            if(addIndent(reader_.column))
            {
                tokens_.push(simpleToken!id(reader_.mark, reader_.mark));
            }
        }
506 
        /// Add BLOCK-ENTRY token. Might add BLOCK-SEQUENCE-START in the process.
        void fetchBlockEntry() @safe
        {
            if(flowLevel_ == 0) { blockChecks!("Sequence", TokenID.blockSequenceStart)(); }

            // It's an error for the block entry to occur in the flow context,
            // but we let the parser detect this.

            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // Simple keys are allowed after '-'.
            allowSimpleKey_ = true;

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(blockEntryToken(startMark, reader_.mark));
        }
524 
        /// Add KEY token for an explicit '?' key. Might add BLOCK-MAPPING-START in the process.
        void fetchKey() @safe
        {
            if(flowLevel_ == 0) { blockChecks!("Mapping", TokenID.blockMappingStart)(); }

            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // Simple keys are allowed after '?' in the block context.
            allowSimpleKey_ = (flowLevel_ == 0);

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(keyToken(startMark, reader_.mark));
        }
539 
540         /// Add VALUE token. Might add KEY and/or BLOCK-MAPPING-START in the process.
541         void fetchValue() @safe
542         {
543             //Do we determine a simple key?
544             if(possibleSimpleKeys_.length > flowLevel_ &&
545                !possibleSimpleKeys_[flowLevel_].isNull)
546             {
547                 const key = possibleSimpleKeys_[flowLevel_];
548                 possibleSimpleKeys_[flowLevel_].isNull = true;
549                 Mark keyMark = Mark(reader_.name, key.line, key.column);
550                 const idx = key.tokenIndex - tokensTaken_;
551 
552                 assert(idx >= 0);
553 
554                 // Add KEY.
555                 // Manually inserting since tokens are immutable (need linked list).
556                 tokens_.insert(keyToken(keyMark, keyMark), idx);
557 
558                 // If this key starts a new block mapping, we need to add BLOCK-MAPPING-START.
559                 if(flowLevel_ == 0 && addIndent(key.column))
560                 {
561                     tokens_.insert(blockMappingStartToken(keyMark, keyMark), idx);
562                 }
563 
564                 // There cannot be two simple keys in a row.
565                 allowSimpleKey_ = false;
566             }
567             // Part of a complex key
568             else
569             {
570                 // We can start a complex value if and only if we can start a simple key.
571                 enforce(flowLevel_ > 0 || allowSimpleKey_,
572                         new ScannerException("Mapping values are not allowed here", reader_.mark));
573 
574                 // If this value starts a new block mapping, we need to add
575                 // BLOCK-MAPPING-START. It'll be detected as an error later by the parser.
576                 if(flowLevel_ == 0 && addIndent(reader_.column))
577                 {
578                     tokens_.push(blockMappingStartToken(reader_.mark, reader_.mark));
579                 }
580 
581                 // Reset possible simple key on the current level.
582                 removePossibleSimpleKey();
583                 // Simple keys are allowed after ':' in the block context.
584                 allowSimpleKey_ = (flowLevel_ == 0);
585             }
586 
587             // Add VALUE.
588             Mark startMark = reader_.mark;
589             reader_.forward();
590             tokens_.push(valueToken(startMark, reader_.mark));
591         }
592 
        /// Add ALIAS or ANCHOR token.
        void fetchAnchor_(TokenID id)() @safe
            if(id == TokenID.alias_ || id == TokenID.anchor)
        {
            // ALIAS/ANCHOR could be a simple key.
            savePossibleSimpleKey();
            // No simple keys after ALIAS/ANCHOR.
            allowSimpleKey_ = false;

            auto anchor = scanAnchor(id);
            tokens_.push(anchor);
        }

        /// Aliases to add ALIAS or ANCHOR token.
        alias fetchAlias = fetchAnchor_!(TokenID.alias_);
        alias fetchAnchor = fetchAnchor_!(TokenID.anchor);
609 
        /// Add TAG token.
        void fetchTag() @safe
        {
            // TAG could start a simple key.
            savePossibleSimpleKey();
            // No simple keys after TAG.
            allowSimpleKey_ = false;

            tokens_.push(scanTag());
        }
620 
        /// Add block SCALAR token (literal '|' or folded '>').
        void fetchBlockScalar(ScalarStyle style)() @safe
            if(style == ScalarStyle.literal || style == ScalarStyle.folded)
        {
            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // A simple key may follow a block scalar.
            allowSimpleKey_ = true;

            auto blockScalar = scanBlockScalar(style);
            tokens_.push(blockScalar);
        }

        /// Aliases to add literal or folded block scalar.
        alias fetchLiteral = fetchBlockScalar!(ScalarStyle.literal);
        alias fetchFolded = fetchBlockScalar!(ScalarStyle.folded);
637 
        /// Add quoted flow SCALAR token.
        void fetchFlowScalar(ScalarStyle quotes)()
        {
            // A flow scalar could be a simple key.
            savePossibleSimpleKey();
            // No simple keys after flow scalars.
            allowSimpleKey_ = false;

            // Scan and add SCALAR.
            auto scalar = scanFlowScalar(quotes);
            tokens_.push(scalar);
        }

        /// Aliases to add single or double quoted flow scalar.
        alias fetchSingle = fetchFlowScalar!(ScalarStyle.singleQuoted);
        alias fetchDouble = fetchFlowScalar!(ScalarStyle.doubleQuoted);
654 
        /// Add plain SCALAR token.
        void fetchPlain() @safe
        {
            // A plain scalar could be a simple key
            savePossibleSimpleKey();
            // No simple keys after plain scalars. But note that scanPlain() will
            // change this flag if the scan is finished at the beginning of the line.
            allowSimpleKey_ = false;
            auto plain = scanPlain();

            // Add the SCALAR (scanPlain() above may have re-enabled allowSimpleKey_).
            tokens_.push(plain);
        }
668 
669     pure:
670 
671         ///Check if the next token is DIRECTIVE:        ^ '%' ...
672         bool checkDirective() @safe
673         {
674             return reader_.peekByte() == '%' && reader_.column == 0;
675         }
676 
        /// Check if the next token is DOCUMENT-START:   ^ '---' (' '|'\n')
        bool checkDocumentStart() @safe
        {
            // Check one char first, then all 3, to prevent reading outside the buffer.
            return reader_.column     == 0     &&
                   reader_.peekByte() == '-'   &&
                   reader_.prefix(3)  == "---" &&
                   reader_.peek(3).isWhiteSpace;
        }
686 
        /// Check if the next token is DOCUMENT-END:     ^ '...' (' '|'\n')
        bool checkDocumentEnd() @safe
        {
            // Check one char first, then all 3, to prevent reading outside the buffer.
            return reader_.column     == 0     &&
                   reader_.peekByte() == '.'   &&
                   reader_.prefix(3)  == "..." &&
                   reader_.peek(3).isWhiteSpace;
        }
696 
        /// Check if the next token is BLOCK-ENTRY:      '-' (' '|'\n')
        bool checkBlockEntry() @safe
        {
            // among() returns a non-zero index; !! converts it to bool.
            return !!reader_.peek(1).isWhiteSpace;
        }
702 
703         /// Check if the next token is KEY(flow context):    '?'
704         ///
705         /// or KEY(block context):   '?' (' '|'\n')
706         bool checkKey() @safe
707         {
708             return (flowLevel_ > 0 || reader_.peek(1).isWhiteSpace);
709         }
710 
711         /// Check if the next token is VALUE(flow context):  ':'
712         ///
713         /// or VALUE(block context): ':' (' '|'\n')
714         bool checkValue() @safe
715         {
716             return flowLevel_ > 0 || reader_.peek(1).isWhiteSpace;
717         }
718 
        /// Check if the next token is a plain scalar.
        ///
        /// A plain scalar may start with any non-space character except:
        ///   '-', '?', ':', ',', '[', ']', '{', '}',
        ///   '#', '&', '*', '!', '|', '>', '\'', '\"',
        ///   '%', '@', '`'.
        ///
        /// It may also start with
        ///   '-', '?', ':'
        /// if it is followed by a non-space character.
        ///
        /// Note that we limit the last rule to the block context (except the
        /// '-' character) because we want the flow context to be space
        /// independent.
        bool checkPlain() @safe
        {
            const c = reader_.peek();
            // Any character outside the indicator/whitespace set starts a plain scalar.
            if(!c.isNonScalarStartCharacter)
            {
                return true;
            }
            // '-' anywhere, or '?'/':' in block context, followed by non-whitespace.
            return !reader_.peek(1).isWhiteSpace &&
                   (c == '-' || (flowLevel_ == 0 && (c == '?' || c == ':')));
        }
743 
744         /// Move to the next non-space character.
745         void findNextNonSpace() @safe
746         {
747             while(reader_.peekByte() == ' ') { reader_.forward(); }
748         }
749 
        /// Scan a string of alphanumeric or "-_" characters.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        ///
        /// Throws: ScannerException if no valid character is found at the current position.
        void scanAlphaNumericToSlice(string name)(const Mark startMark)
        {
            size_t length;
            dchar c = reader_.peek();
            // Count the run of valid characters without consuming them yet.
            while(c.isAlphaNum || c.among!('-', '_')) { c = reader_.peek(++length); }

            enforce(length > 0, new ScannerException("While scanning " ~ name,
                startMark, expected("alphanumeric, '-' or '_'", c), reader_.mark));

            // Consume the run and append it to the slice being built.
            reader_.sliceBuilder.write(reader_.get(length));
        }
765 
766         /// Scan and throw away all characters until next line break.
767         void scanToNextBreak() @safe
768         {
769             while(!reader_.peek().isBreak) { reader_.forward(); }
770         }
771 
772         /// Scan all characters until next line break.
773         ///
774         /// Assumes that the caller is building a slice in Reader, and puts the scanned
775         /// characters into that slice.
776         void scanToNextBreakToSlice() @safe
777         {
778             uint length;
779             while(!reader_.peek(length).isBreak)
780             {
781                 ++length;
782             }
783             reader_.sliceBuilder.write(reader_.get(length));
784         }
785 
786 
787         /// Move to next token in the file/stream.
788         ///
789         /// We ignore spaces, line breaks and comments.
790         /// If we find a line break in the block context, we set
791         /// allowSimpleKey` on.
792         ///
793         /// We do not yet support BOM inside the stream as the
794         /// specification requires. Any such mark will be considered as a part
795         /// of the document.
796         void scanToNextToken() @safe
797         {
798             // TODO(PyYAML): We need to make tab handling rules more sane. A good rule is:
799             //   Tabs cannot precede tokens
800             //   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
801             //   KEY(block), VALUE(block), BLOCK-ENTRY
802             // So the checking code is
803             //   if <TAB>:
804             //       allowSimpleKey_ = false
805             // We also need to add the check for `allowSimpleKey_ == true` to
806             // `unwindIndent` before issuing BLOCK-END.
807             // Scanners for block, flow, and plain scalars need to be modified.
808 
809             for(;;)
810             {
811                 //All whitespace in flow context is ignored, even whitespace
812                 // not allowed in other contexts
813                 if (flowLevel_ > 0)
814                 {
815                     while(reader_.peekByte().isNonLinebreakWhitespace) { reader_.forward(); }
816                 }
817                 else
818                 {
819                     findNextNonSpace();
820                 }
821                 if(reader_.peekByte() == '#') { scanToNextBreak(); }
822                 if(scanLineBreak() != '\0')
823                 {
824                     if(flowLevel_ == 0) { allowSimpleKey_ = true; }
825                 }
826                 else
827                 {
828                     break;
829                 }
830             }
831         }
832 
833         /// Scan directive token.
834         Token scanDirective() @safe
835         {
836             Mark startMark = reader_.mark;
837             // Skip the '%'.
838             reader_.forward();
839 
840             // Scan directive name
841             reader_.sliceBuilder.begin();
842             scanDirectiveNameToSlice(startMark);
843             const name = reader_.sliceBuilder.finish();
844 
845             reader_.sliceBuilder.begin();
846 
847             // Index where tag handle ends and suffix starts in a tag directive value.
848             uint tagHandleEnd = uint.max;
849             if(name == "YAML")     { scanYAMLDirectiveValueToSlice(startMark); }
850             else if(name == "TAG") { tagHandleEnd = scanTagDirectiveValueToSlice(startMark); }
851             char[] value = reader_.sliceBuilder.finish();
852 
853             Mark endMark = reader_.mark;
854 
855             DirectiveType directive;
856             if(name == "YAML")     { directive = DirectiveType.yaml; }
857             else if(name == "TAG") { directive = DirectiveType.tag; }
858             else
859             {
860                 directive = DirectiveType.reserved;
861                 scanToNextBreak();
862             }
863 
864             scanDirectiveIgnoredLine(startMark);
865 
866             return directiveToken(startMark, endMark, value, directive, tagHandleEnd);
867         }
868 
869         /// Scan name of a directive token.
870         ///
871         /// Assumes that the caller is building a slice in Reader, and puts the scanned
872         /// characters into that slice.
873         void scanDirectiveNameToSlice(const Mark startMark) @safe
874         {
875             // Scan directive name.
876             scanAlphaNumericToSlice!"a directive"(startMark);
877 
878             enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
879                 new ScannerException("While scanning a directive", startMark,
880                     expected("alphanumeric, '-' or '_'", reader_.peek()), reader_.mark));
881         }
882 
883         /// Scan value of a YAML directive token. Returns major, minor version separated by '.'.
884         ///
885         /// Assumes that the caller is building a slice in Reader, and puts the scanned
886         /// characters into that slice.
887         void scanYAMLDirectiveValueToSlice(const Mark startMark) @safe
888         {
889             findNextNonSpace();
890 
891             scanYAMLDirectiveNumberToSlice(startMark);
892 
893             enforce(reader_.peekByte() == '.',
894                 new ScannerException("While scanning a directive", startMark,
895                     expected("digit or '.'", reader_.peek()), reader_.mark));
896             // Skip the '.'.
897             reader_.forward();
898 
899             reader_.sliceBuilder.write('.');
900             scanYAMLDirectiveNumberToSlice(startMark);
901 
902             enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
903                 new ScannerException("While scanning a directive", startMark,
904                     expected("digit or '.'", reader_.peek()), reader_.mark));
905         }
906 
907         /// Scan a number from a YAML directive.
908         ///
909         /// Assumes that the caller is building a slice in Reader, and puts the scanned
910         /// characters into that slice.
911         void scanYAMLDirectiveNumberToSlice(const Mark startMark) @safe
912         {
913             enforce(isDigit(reader_.peek()),
914                 new ScannerException("While scanning a directive", startMark,
915                     expected("digit", reader_.peek()), reader_.mark));
916 
917             // Already found the first digit in the enforce(), so set length to 1.
918             uint length = 1;
919             while(reader_.peek(length).isDigit) { ++length; }
920 
921             reader_.sliceBuilder.write(reader_.get(length));
922         }
923 
924         /// Scan value of a tag directive.
925         ///
926         /// Assumes that the caller is building a slice in Reader, and puts the scanned
927         /// characters into that slice.
928         ///
929         /// Returns: Length of tag handle (which is before tag prefix) in scanned data
930         uint scanTagDirectiveValueToSlice(const Mark startMark) @safe
931         {
932             findNextNonSpace();
933             const startLength = reader_.sliceBuilder.length;
934             scanTagDirectiveHandleToSlice(startMark);
935             const handleLength = cast(uint)(reader_.sliceBuilder.length  - startLength);
936             findNextNonSpace();
937             scanTagDirectivePrefixToSlice(startMark);
938 
939             return handleLength;
940         }
941 
942         /// Scan handle of a tag directive.
943         ///
944         /// Assumes that the caller is building a slice in Reader, and puts the scanned
945         /// characters into that slice.
946         void scanTagDirectiveHandleToSlice(const Mark startMark) @safe
947         {
948             scanTagHandleToSlice!"directive"(startMark);
949             enforce(reader_.peekByte() == ' ',
950                 new ScannerException("While scanning a directive handle", startMark,
951                     expected("' '", reader_.peek()), reader_.mark));
952         }
953 
954         /// Scan prefix of a tag directive.
955         ///
956         /// Assumes that the caller is building a slice in Reader, and puts the scanned
957         /// characters into that slice.
958         void scanTagDirectivePrefixToSlice(const Mark startMark) @safe
959         {
960             scanTagURIToSlice!"directive"(startMark);
961             enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
962                 new ScannerException("While scanning a directive prefix", startMark,
963                     expected("' '", reader_.peek()), reader_.mark));
964         }
965 
966         /// Scan (and ignore) ignored line after a directive.
967         void scanDirectiveIgnoredLine(const Mark startMark) @safe
968         {
969             findNextNonSpace();
970             if(reader_.peekByte() == '#') { scanToNextBreak(); }
971             enforce(reader_.peek().isBreak,
972                 new ScannerException("While scanning a directive", startMark,
973                       expected("comment or a line break", reader_.peek()), reader_.mark));
974             scanLineBreak();
975         }
976 
977 
978         /// Scan an alias or an anchor.
979         ///
980         /// The specification does not restrict characters for anchors and
981         /// aliases. This may lead to problems, for instance, the document:
982         ///   [ *alias, value ]
983         /// can be interpteted in two ways, as
984         ///   [ "value" ]
985         /// and
986         ///   [ *alias , "value" ]
987         /// Therefore we restrict aliases to ASCII alphanumeric characters.
988         Token scanAnchor(const TokenID id) @safe
989         {
990             const startMark = reader_.mark;
991             const dchar i = reader_.get();
992 
993             reader_.sliceBuilder.begin();
994             if(i == '*') { scanAlphaNumericToSlice!"an alias"(startMark); }
995             else         { scanAlphaNumericToSlice!"an anchor"(startMark); }
996             // On error, value is discarded as we return immediately
997             char[] value = reader_.sliceBuilder.finish();
998 
999             enum anchorCtx = "While scanning an anchor";
1000             enum aliasCtx  = "While scanning an alias";
1001             enforce(reader_.peek().isWhiteSpace ||
1002                 reader_.peekByte().among!('?', ':', ',', ']', '}', '%', '@'),
1003                 new ScannerException(i == '*' ? aliasCtx : anchorCtx, startMark,
1004                     expected("alphanumeric, '-' or '_'", reader_.peek()), reader_.mark));
1005 
1006             if(id == TokenID.alias_)
1007             {
1008                 return aliasToken(startMark, reader_.mark, value);
1009             }
1010             if(id == TokenID.anchor)
1011             {
1012                 return anchorToken(startMark, reader_.mark, value);
1013             }
1014             assert(false, "This code should never be reached");
1015         }
1016 
        /// Scan a tag token.
        ///
        /// Handles the three tag forms: verbatim ('!<uri>'), the lone
        /// non-specific tag ('!'), and handle+suffix ('!handle!suffix' or
        /// '!suffix').
        Token scanTag() @safe
        {
            const startMark = reader_.mark;
            // Character right after the initial '!'.
            dchar c = reader_.peek(1);

            reader_.sliceBuilder.begin();
            // Ensure the slice is finished even if scanning throws.
            scope(failure) { reader_.sliceBuilder.finish(); }
            // Index where tag handle ends and tag suffix starts in the tag value
            // (slice) we will produce.
            uint handleEnd;

            if(c == '<')
            {
                // Verbatim tag: '!<' URI '>'. Skip the "!<".
                reader_.forward(2);

                handleEnd = 0;
                scanTagURIToSlice!"tag"(startMark);
                enforce(reader_.peekByte() == '>',
                    new ScannerException("While scanning a tag", startMark,
                        expected("'>'", reader_.peek()), reader_.mark));
                reader_.forward();
            }
            else if(c.isWhiteSpace)
            {
                // A lone '!': the non-specific tag.
                reader_.forward();
                handleEnd = 0;
                reader_.sliceBuilder.write('!');
            }
            else
            {
                // Either '!handle!suffix' or '!suffix'. Look ahead for a second
                // '!' before the end of the token to tell the two forms apart.
                uint length = 1;
                bool useHandle;

                while(!c.isBreakOrSpace)
                {
                    if(c == '!')
                    {
                        useHandle = true;
                        break;
                    }
                    ++length;
                    c = reader_.peek(length);
                }

                if(useHandle)
                {
                    scanTagHandleToSlice!"tag"(startMark);
                    handleEnd = cast(uint)reader_.sliceBuilder.length;
                }
                else
                {
                    // No second '!': the initial '!' itself is the handle.
                    reader_.forward();
                    reader_.sliceBuilder.write('!');
                    handleEnd = cast(uint)reader_.sliceBuilder.length;
                }

                scanTagURIToSlice!"tag"(startMark);
            }

            // A tag must be followed by whitespace or a line break.
            enforce(reader_.peek().isBreakOrSpace,
                new ScannerException("While scanning a tag", startMark, expected("' '", reader_.peek()),
                    reader_.mark));

            char[] slice = reader_.sliceBuilder.finish();
            return tagToken(startMark, reader_.mark, slice, handleEnd);
        }
1084 
        /// Scan a block scalar token with specified style (literal '|' or folded '>').
        ///
        /// Scans the header (chomping/indentation indicators), determines the
        /// indentation level, reads the scalar body line by line, and applies
        /// chomping to the trailing line breaks.
        Token scanBlockScalar(const ScalarStyle style) @safe
        {
            const startMark = reader_.mark;

            // Scan the header. Skip the '|' or '>' indicator first.
            reader_.forward();

            const indicators = scanBlockScalarIndicators(startMark);

            const chomping   = indicators[0];
            // int.min means no explicit indentation indicator was given.
            const increment  = indicators[1];
            scanBlockScalarIgnoredLine(startMark);

            // Determine the indentation level and go to the first non-empty line.
            Mark endMark;
            uint indent = max(1, indent_ + 1);

            reader_.sliceBuilder.begin();
            alias Transaction = SliceBuilder.Transaction;
            // Used to strip the last line breaks written to the slice at the end of the
            // scalar, which may be needed based on chomping.
            Transaction breaksTransaction = Transaction(&reader_.sliceBuilder);
            // Read the first indentation/line breaks before the scalar.
            size_t startLen = reader_.sliceBuilder.length;
            if(increment == int.min)
            {
                // No explicit indicator: detect indentation from the content.
                auto indentation = scanBlockScalarIndentationToSlice();
                endMark = indentation[1];
                indent  = max(indent, indentation[0]);
            }
            else
            {
                // Explicit indicator: indentation is relative to the parent indent.
                indent += increment - 1;
                endMark = scanBlockScalarBreaksToSlice(indent);
            }

            // int.max means there's no line break (int.max is outside UTF-32).
            dchar lineBreak = cast(dchar)int.max;

            // Scan the inner part of the block scalar.
            while(reader_.column == indent && reader_.peekByte() != '\0')
            {
                // Keep the breaks scanned at the end of the previous iteration.
                breaksTransaction.commit();
                const bool leadingNonSpace = !reader_.peekByte().among!(' ', '\t');
                // This is where the 'interesting' non-whitespace data gets read.
                scanToNextBreakToSlice();
                lineBreak = scanLineBreak();


                // This transaction serves to rollback data read in the
                // scanBlockScalarBreaksToSlice() call.
                breaksTransaction = Transaction(&reader_.sliceBuilder);
                startLen = reader_.sliceBuilder.length;
                // The line breaks should actually be written _after_ the if() block
                // below. We work around that by inserting into the middle of the
                // slice (see the insert() call below).
                endMark = scanBlockScalarBreaksToSlice(indent);

                // This will not run during the last iteration (see the if() vs the
                // while()), hence breaksTransaction rollback (which happens after this
                // loop) will never roll back data written in this if() block.
                if(reader_.column == indent && reader_.peekByte() != '\0')
                {
                    // Unfortunately, folding rules are ambiguous.

                    // This is the folding according to the specification:
                    if(style == ScalarStyle.folded && lineBreak == '\n' &&
                       leadingNonSpace && !reader_.peekByte().among!(' ', '\t'))
                    {
                        // No breaks were scanned; no need to insert the space in the
                        // middle of slice.
                        if(startLen == reader_.sliceBuilder.length)
                        {
                            reader_.sliceBuilder.write(' ');
                        }
                    }
                    else
                    {
                        // We need to insert in the middle of the slice in case any line
                        // breaks were scanned.
                        reader_.sliceBuilder.insert(lineBreak, startLen);
                    }

                    ////this is Clark Evans's interpretation (also in the spec
                    ////examples):
                    //
                    //if(style == ScalarStyle.folded && lineBreak == '\n')
                    //{
                    //    if(startLen == endLen)
                    //    {
                    //        if(!" \t"d.canFind(reader_.peekByte()))
                    //        {
                    //            reader_.sliceBuilder.write(' ');
                    //        }
                    //        else
                    //        {
                    //            chunks ~= lineBreak;
                    //        }
                    //    }
                    //}
                    //else
                    //{
                    //    reader_.sliceBuilder.insertBack(lineBreak, endLen - startLen);
                    //}
                }
                else
                {
                    break;
                }
            }

            // If chomping is Keep, we keep (commit) the last scanned line breaks
            // (which are at the end of the scalar). Otherwise we remove them (end the
            // transaction).
            if(chomping == Chomping.keep)  { breaksTransaction.commit(); }
            else                           { breaksTransaction.end(); }
            if(chomping != Chomping.strip && lineBreak != int.max)
            {
                // If chomping is Keep, we keep the line break but the first line break
                // that isn't stripped (since chomping isn't Strip in this branch) must
                // be inserted _before_ the other line breaks.
                if(chomping == Chomping.keep)
                {
                    reader_.sliceBuilder.insert(lineBreak, startLen);
                }
                // If chomping is not Keep, breaksTransaction was cancelled so we can
                // directly write the first line break (as it isn't stripped - chomping
                // is not Strip)
                else
                {
                    reader_.sliceBuilder.write(lineBreak);
                }
            }

            char[] slice = reader_.sliceBuilder.finish();
            return scalarToken(startMark, endMark, slice, style);
        }
1222 
1223         /// Scan chomping and indentation indicators of a scalar token.
1224         Tuple!(Chomping, int) scanBlockScalarIndicators(const Mark startMark) @safe
1225         {
1226             auto chomping = Chomping.clip;
1227             int increment = int.min;
1228             dchar c       = reader_.peek();
1229 
1230             /// Indicators can be in any order.
1231             if(getChomping(c, chomping))
1232             {
1233                 getIncrement(c, increment, startMark);
1234             }
1235             else
1236             {
1237                 const gotIncrement = getIncrement(c, increment, startMark);
1238                 if(gotIncrement) { getChomping(c, chomping); }
1239             }
1240 
1241             enforce(c.among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
1242                 new ScannerException("While scanning a block scalar", startMark,
1243                 expected("chomping or indentation indicator", c), reader_.mark));
1244 
1245             return tuple(chomping, increment);
1246         }
1247 
1248         /// Get chomping indicator, if detected. Return false otherwise.
1249         ///
1250         /// Used in scanBlockScalarIndicators.
1251         ///
1252         /// Params:
1253         ///
1254         /// c        = The character that may be a chomping indicator.
1255         /// chomping = Write the chomping value here, if detected.
1256         bool getChomping(ref dchar c, ref Chomping chomping) @safe
1257         {
1258             if(!c.among!('+', '-')) { return false; }
1259             chomping = c == '+' ? Chomping.keep : Chomping.strip;
1260             reader_.forward();
1261             c = reader_.peek();
1262             return true;
1263         }
1264 
1265         /// Get increment indicator, if detected. Return false otherwise.
1266         ///
1267         /// Used in scanBlockScalarIndicators.
1268         ///
1269         /// Params:
1270         ///
1271         /// c         = The character that may be an increment indicator.
1272         ///             If an increment indicator is detected, this will be updated to
1273         ///             the next character in the Reader.
1274         /// increment = Write the increment value here, if detected.
1275         /// startMark = Mark for error messages.
1276         bool getIncrement(ref dchar c, ref int increment, const Mark startMark) @safe
1277         {
1278             if(!c.isDigit) { return false; }
1279             // Convert a digit to integer.
1280             increment = c - '0';
1281             assert(increment < 10 && increment >= 0, "Digit has invalid value");
1282 
1283             enforce(increment > 0,
1284                 new ScannerException("While scanning a block scalar", startMark,
1285                     expected("indentation indicator in range 1-9", "0"), reader_.mark));
1286 
1287             reader_.forward();
1288             c = reader_.peek();
1289             return true;
1290         }
1291 
1292         /// Scan (and ignore) ignored line in a block scalar.
1293         void scanBlockScalarIgnoredLine(const Mark startMark) @safe
1294         {
1295             findNextNonSpace();
1296             if(reader_.peekByte()== '#') { scanToNextBreak(); }
1297 
1298             enforce(reader_.peek().isBreak,
1299                 new ScannerException("While scanning a block scalar", startMark,
1300                     expected("comment or line break", reader_.peek()), reader_.mark));
1301 
1302             scanLineBreak();
1303         }
1304 
1305         /// Scan indentation in a block scalar, returning line breaks, max indent and end mark.
1306         ///
1307         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1308         /// characters into that slice.
1309         Tuple!(uint, Mark) scanBlockScalarIndentationToSlice() @safe
1310         {
1311             uint maxIndent;
1312             Mark endMark = reader_.mark;
1313 
1314             while(reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029'))
1315             {
1316                 if(reader_.peekByte() != ' ')
1317                 {
1318                     reader_.sliceBuilder.write(scanLineBreak());
1319                     endMark = reader_.mark;
1320                     continue;
1321                 }
1322                 reader_.forward();
1323                 maxIndent = max(reader_.column, maxIndent);
1324             }
1325 
1326             return tuple(maxIndent, endMark);
1327         }
1328 
1329         /// Scan line breaks at lower or specified indentation in a block scalar.
1330         ///
1331         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1332         /// characters into that slice.
1333         Mark scanBlockScalarBreaksToSlice(const uint indent) @safe
1334         {
1335             Mark endMark = reader_.mark;
1336 
1337             for(;;)
1338             {
1339                 while(reader_.column < indent && reader_.peekByte() == ' ') { reader_.forward(); }
1340                 if(!reader_.peek().among!('\n', '\r', '\u0085', '\u2028', '\u2029'))  { break; }
1341                 reader_.sliceBuilder.write(scanLineBreak());
1342                 endMark = reader_.mark;
1343             }
1344 
1345             return endMark;
1346         }
1347 
1348         /// Scan a qouted flow scalar token with specified quotes.
1349         Token scanFlowScalar(const ScalarStyle quotes) @safe
1350         {
1351             const startMark = reader_.mark;
1352             const quote     = reader_.get();
1353 
1354             reader_.sliceBuilder.begin();
1355 
1356             scanFlowScalarNonSpacesToSlice(quotes, startMark);
1357 
1358             while(reader_.peek() != quote)
1359             {
1360                 scanFlowScalarSpacesToSlice(startMark);
1361                 scanFlowScalarNonSpacesToSlice(quotes, startMark);
1362             }
1363             reader_.forward();
1364 
1365             auto slice = reader_.sliceBuilder.finish();
1366             return scalarToken(startMark, reader_.mark, slice, quotes);
1367         }
1368 
        /// Scan nonspace characters in a flow scalar.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice. Returns when a space, line break or the
        /// closing quote is reached.
        void scanFlowScalarNonSpacesToSlice(const ScalarStyle quotes, const Mark startMark)
            @safe
        {
            for(;;)
            {
                dchar c = reader_.peek();

                // Copy any run of ordinary (non-break/space) characters wholesale.
                size_t numCodePoints;
                while(!reader_.peek(numCodePoints).isFlowScalarBreakSpace) { ++numCodePoints; }

                if (numCodePoints > 0) { reader_.sliceBuilder.write(reader_.get(numCodePoints)); }

                c = reader_.peek();
                // '' inside a single-quoted scalar is an escaped single quote.
                if(quotes == ScalarStyle.singleQuoted && c == '\'' && reader_.peek(1) == '\'')
                {
                    reader_.forward(2);
                    reader_.sliceBuilder.write('\'');
                }
                // Characters that are literal in the current quoting style.
                else if((quotes == ScalarStyle.doubleQuoted && c == '\'') ||
                        (quotes == ScalarStyle.singleQuoted && c.among!('"', '\\')))
                {
                    reader_.forward();
                    reader_.sliceBuilder.write(c);
                }
                // Backslash escapes are only recognized in double-quoted scalars.
                else if(quotes == ScalarStyle.doubleQuoted && c == '\\')
                {
                    reader_.forward();
                    c = reader_.peek();
                    if(c.among!(escapes))
                    {
                        reader_.forward();
                        // Escaping has been moved to Parser as it can't be done in
                        // place (in a slice) in case of '\P' and '\L' (very uncommon,
                        // but we don't want to break the spec)
                        char[2] escapeSequence = ['\\', cast(char)c];
                        reader_.sliceBuilder.write(escapeSequence);
                    }
                    else if(c.among!(escapeHexCodeList))
                    {
                        // Hex escape (e.g. \x, \u, \U): validate the expected number
                        // of hex digits and copy the raw sequence for the Parser.
                        const hexLength = dyaml.escapes.escapeHexLength(c);
                        reader_.forward();

                        foreach(i; 0 .. hexLength) {
                            enforce(reader_.peek(i).isHexDigit,
                                new ScannerException("While scanning a double quoted scalar", startMark,
                                    expected("escape sequence of hexadecimal numbers",
                                        reader_.peek(i)), reader_.mark));
                        }
                        char[] hex = reader_.get(hexLength);

                        enforce((hex.length > 0) && (hex.length <= 8),
                            new ScannerException("While scanning a double quoted scalar", startMark,
                                  "overflow when parsing an escape sequence of " ~
                                  "hexadecimal numbers.", reader_.mark));

                        char[2] escapeStart = ['\\', cast(char) c];
                        reader_.sliceBuilder.write(escapeStart);
                        reader_.sliceBuilder.write(hex);

                    }
                    // An escaped line break: fold away following breaks/indentation.
                    else if(c.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
                    {
                        scanLineBreak();
                        scanFlowScalarBreaksToSlice(startMark);
                    }
                    else
                    {
                        throw new ScannerException("While scanning a double quoted scalar", startMark,
                              text("found unsupported escape character ", c),
                              reader_.mark);
                    }
                }
                // Reached a space, break or quote - the caller handles it.
                else { return; }
            }
        }
1448 
1449         /// Scan space characters in a flow scalar.
1450         ///
1451         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1452         /// spaces into that slice.
1453         void scanFlowScalarSpacesToSlice(const Mark startMark) @safe
1454         {
1455             // Increase length as long as we see whitespace.
1456             size_t length;
1457             while(reader_.peekByte(length).among!(' ', '\t')) { ++length; }
1458             auto whitespaces = reader_.prefixBytes(length);
1459 
1460             // Can check the last byte without striding because '\0' is ASCII
1461             const c = reader_.peek(length);
1462             enforce(c != '\0',
1463                 new ScannerException("While scanning a quoted scalar", startMark,
1464                     "found unexpected end of buffer", reader_.mark));
1465 
1466             // Spaces not followed by a line break.
1467             if(!c.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
1468             {
1469                 reader_.forward(length);
1470                 reader_.sliceBuilder.write(whitespaces);
1471                 return;
1472             }
1473 
1474             // There's a line break after the spaces.
1475             reader_.forward(length);
1476             const lineBreak = scanLineBreak();
1477 
1478             if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }
1479 
1480             // If we have extra line breaks after the first, scan them into the
1481             // slice.
1482             const bool extraBreaks = scanFlowScalarBreaksToSlice(startMark);
1483 
1484             // No extra breaks, one normal line break. Replace it with a space.
1485             if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
1486         }
1487 
1488         /// Scan line breaks in a flow scalar.
1489         ///
1490         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1491         /// line breaks into that slice.
1492         bool scanFlowScalarBreaksToSlice(const Mark startMark) @safe
1493         {
1494             // True if at least one line break was found.
1495             bool anyBreaks;
1496             for(;;)
1497             {
1498                 // Instead of checking indentation, we check for document separators.
1499                 const prefix = reader_.prefix(3);
1500                 enforce(!(prefix == "---" || prefix == "...") ||
1501                     !reader_.peek(3).isWhiteSpace,
1502                     new ScannerException("While scanning a quoted scalar", startMark,
1503                         "found unexpected document separator", reader_.mark));
1504 
1505                 // Skip any whitespaces.
1506                 while(reader_.peekByte().among!(' ', '\t')) { reader_.forward(); }
1507 
1508                 // Encountered a non-whitespace non-linebreak character, so we're done.
1509                 if(!reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029')) { break; }
1510 
1511                 const lineBreak = scanLineBreak();
1512                 anyBreaks = true;
1513                 reader_.sliceBuilder.write(lineBreak);
1514             }
1515             return anyBreaks;
1516         }
1517 
        /// Scan plain scalar token (no block, no quotes).
        ///
        /// Returns: a SCALAR token with plain style covering the scanned text.
        Token scanPlain() @safe
        {
            // We keep track of the allowSimpleKey_ flag here.
            // Indentation rules are loosed for the flow context
            const startMark = reader_.mark;
            Mark endMark = startMark;
            // In block context the scalar must be indented deeper than indent_.
            const indent = indent_ + 1;

            // We allow zero indentation for scalars, but then we need to check for
            // document separators at the beginning of the line.
            // if(indent == 0) { indent = 1; }

            reader_.sliceBuilder.begin();

            alias Transaction = SliceBuilder.Transaction;
            // Holds a run of whitespace tentatively; it is only committed once more
            // scalar content follows, so trailing whitespace never enters the token.
            Transaction spacesTransaction;
            // Stop at a comment.
            while(reader_.peekByte() != '#')
            {
                // Scan the entire plain scalar.
                size_t length;
                dchar c = reader_.peek(length);
                for(;;)
                {
                    const cNext = reader_.peek(length + 1);
                    // Stop on whitespace, on ": " in block context, or on any flow
                    // indicator when inside a flow collection.
                    if(c.isWhiteSpace ||
                       (flowLevel_ == 0 && c == ':' && cNext.isWhiteSpace) ||
                       (flowLevel_ > 0 && c.among!(',', ':', '?', '[', ']', '{', '}')))
                    {
                        break;
                    }
                    ++length;
                    c = cNext;
                }

                // It's not clear what we should do with ':' in the flow context.
                enforce(flowLevel_ == 0 || c != ':' ||
                   reader_.peek(length + 1).isWhiteSpace ||
                   reader_.peek(length + 1).among!(',', '[', ']', '{', '}'),
                    new ScannerException("While scanning a plain scalar", startMark,
                        "found unexpected ':' . Please check " ~
                        "http://pyyaml.org/wiki/YAMLColonInFlowContext for details.",
                        reader_.mark));

                if(length == 0) { break; }

                // Once scalar content is consumed, this token can't be a simple key.
                allowSimpleKey_ = false;

                reader_.sliceBuilder.write(reader_.get(length));

                endMark = reader_.mark;

                // The previous whitespace run is now known to be interior to the
                // scalar: keep it, then open a new tentative transaction for the next.
                spacesTransaction.commit();
                spacesTransaction = Transaction(&reader_.sliceBuilder);

                const startLength = reader_.sliceBuilder.length;
                scanPlainSpacesToSlice();
                // Stop if no whitespace was consumed (scalar ended), or, in block
                // context, if the continuation line is not indented far enough.
                if(startLength == reader_.sliceBuilder.length ||
                   (flowLevel_ == 0 && reader_.column < indent))
                {
                    break;
                }
            }

            // NOTE(review): end() on an uncommitted transaction presumably drops the
            // tentative trailing whitespace — confirm against SliceBuilder.Transaction.
            spacesTransaction.end();
            char[] slice = reader_.sliceBuilder.finish();

            return scalarToken(startMark, endMark, slice, ScalarStyle.plain);
        }
1588 
        /// Scan spaces in a plain scalar.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the spaces
        /// into that slice.
        void scanPlainSpacesToSlice() @safe
        {
            // The specification is really confusing about tabs in plain scalars.
            // We just forbid them completely. Do not use tabs in YAML!

            // Get as many plain spaces as there are.
            size_t length;
            while(reader_.peekByte(length) == ' ') { ++length; }
            char[] whitespaces = reader_.prefixBytes(length);
            reader_.forward(length);

            const dchar c = reader_.peek();
            if(!c.isNSChar)
            {
                // We have spaces, but no newline.
                if(whitespaces.length > 0) { reader_.sliceBuilder.write(whitespaces); }
                return;
            }

            // Newline after the spaces (if any)
            const lineBreak = scanLineBreak();
            // After a line break a simple key is allowed again on the next line.
            allowSimpleKey_ = true;

            // True when positioned at a "---"/"..." document separator that is
            // followed by whitespace or end of buffer.
            static bool end(Reader reader_) @safe pure
            {
                const prefix = reader_.prefix(3);
                return ("---" == prefix || "..." == prefix)
                        && reader_.peek(3).among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');
            }

            // A document separator ends the scalar; write nothing further.
            if(end(reader_)) { return; }

            bool extraBreaks;

            alias Transaction = SliceBuilder.Transaction;
            // Breaks written below are tentative: returning before commit() (on a
            // document separator) presumably rolls them back via the transaction's
            // destructor — NOTE(review): confirm against SliceBuilder.Transaction.
            auto transaction = Transaction(&reader_.sliceBuilder);
            if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }
            while(reader_.peek().isNSChar)
            {
                if(reader_.peekByte() == ' ') { reader_.forward(); }
                else
                {
                    const lBreak = scanLineBreak();
                    extraBreaks  = true;
                    reader_.sliceBuilder.write(lBreak);

                    if(end(reader_)) { return; }
                }
            }
            transaction.commit();

            // No line breaks, only a space.
            // (A single folded '\n' becomes one space, per YAML line folding.)
            if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
        }
1647 
1648         /// Scan handle of a tag token.
1649         ///
1650         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1651         /// characters into that slice.
1652         void scanTagHandleToSlice(string name)(const Mark startMark)
1653         {
1654             dchar c = reader_.peek();
1655             enum contextMsg = "While scanning a " ~ name;
1656             enforce(c == '!',
1657                 new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark));
1658 
1659             uint length = 1;
1660             c = reader_.peek(length);
1661             if(c != ' ')
1662             {
1663                 while(c.isAlphaNum || c.among!('-', '_'))
1664                 {
1665                     ++length;
1666                     c = reader_.peek(length);
1667                 }
1668                 enforce(c == '!',
1669                     new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark));
1670                 ++length;
1671             }
1672 
1673             reader_.sliceBuilder.write(reader_.get(length));
1674         }
1675 
1676         /// Scan URI in a tag token.
1677         ///
1678         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1679         /// characters into that slice.
1680         void scanTagURIToSlice(string name)(const Mark startMark)
1681         {
1682             // Note: we do not check if URI is well-formed.
1683             dchar c = reader_.peek();
1684             const startLen = reader_.sliceBuilder.length;
1685             {
1686                 uint length;
1687                 while(c.isAlphaNum || c.isURIChar)
1688                 {
1689                     if(c == '%')
1690                     {
1691                         auto chars = reader_.get(length);
1692                         reader_.sliceBuilder.write(chars);
1693                         length = 0;
1694                         scanURIEscapesToSlice!name(startMark);
1695                     }
1696                     else { ++length; }
1697                     c = reader_.peek(length);
1698                 }
1699                 if(length > 0)
1700                 {
1701                     auto chars = reader_.get(length);
1702                     reader_.sliceBuilder.write(chars);
1703                     length = 0;
1704                 }
1705             }
1706             // OK if we scanned something, error otherwise.
1707             enum contextMsg = "While parsing a " ~ name;
1708             enforce(reader_.sliceBuilder.length > startLen,
1709                 new ScannerException(contextMsg, startMark, expected("URI", c), reader_.mark));
1710         }
1711 
1712         // Not @nogc yet because std.utf.decode is not @nogc
1713         /// Scan URI escape sequences.
1714         ///
1715         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1716         /// characters into that slice.
1717         void scanURIEscapesToSlice(string name)(const Mark startMark)
1718         {
1719             import core.exception : UnicodeException;
1720             // URI escapes encode a UTF-8 string. We store UTF-8 code units here for
1721             // decoding into UTF-32.
1722             Appender!string buffer;
1723 
1724 
1725             enum contextMsg = "While scanning a " ~ name;
1726             while(reader_.peekByte() == '%')
1727             {
1728                 reader_.forward();
1729                 char[2] nextByte = [reader_.peekByte(), reader_.peekByte(1)];
1730 
1731                 enforce(nextByte[0].isHexDigit && nextByte[1].isHexDigit,
1732                     new ScannerException(contextMsg, startMark,
1733                         expected("URI escape sequence of 2 hexadecimal " ~
1734                             "numbers", nextByte), reader_.mark));
1735 
1736                 buffer ~= nextByte[].to!ubyte(16);
1737 
1738                 reader_.forward(2);
1739             }
1740             try
1741             {
1742                 foreach (dchar chr; buffer.data)
1743                 {
1744                     reader_.sliceBuilder.write(chr);
1745                 }
1746             }
1747             catch (UnicodeException)
1748             {
1749                 throw new ScannerException(contextMsg, startMark,
1750                         "Invalid UTF-8 data encoded in URI escape sequence",
1751                         reader_.mark);
1752             }
1753         }
1754 
1755 
1756         /// Scan a line break, if any.
1757         ///
1758         /// Transforms:
1759         ///   '\r\n'      :   '\n'
1760         ///   '\r'        :   '\n'
1761         ///   '\n'        :   '\n'
1762         ///   '\u0085'    :   '\n'
1763         ///   '\u2028'    :   '\u2028'
1764         ///   '\u2029     :   '\u2029'
1765         ///   no break    :   '\0'
1766         dchar scanLineBreak() @safe
1767         {
1768             // Fast path for ASCII line breaks.
1769             const b = reader_.peekByte();
1770             if(b < 0x80)
1771             {
1772                 if(b == '\n' || b == '\r')
1773                 {
1774                     if(reader_.prefix(2) == "\r\n") { reader_.forward(2); }
1775                     else { reader_.forward(); }
1776                     return '\n';
1777                 }
1778                 return '\0';
1779             }
1780 
1781             const c = reader_.peek();
1782             if(c == '\x85')
1783             {
1784                 reader_.forward();
1785                 return '\n';
1786             }
1787             if(c == '\u2028' || c == '\u2029')
1788             {
1789                 reader_.forward();
1790                 return c;
1791             }
1792             return '\0';
1793         }
1794 }