1 
2 //          Copyright Ferdinand Majerech 2011-2014.
3 // Distributed under the Boost Software License, Version 1.0.
4 //    (See accompanying file LICENSE_1_0.txt or copy at
5 //          http://www.boost.org/LICENSE_1_0.txt)
6 
7 /// YAML scanner.
8 /// Code based on PyYAML: http://www.pyyaml.org
9 module dyaml.scanner;
10 
11 
12 import core.stdc..string;
13 
14 import std.algorithm;
15 import std.array;
16 import std.conv;
17 import std.ascii : isAlphaNum, isDigit, isHexDigit;
18 import std.exception;
19 import std..string;
20 import std.typecons;
21 import std.traits : Unqual;
22 import std.utf;
23 
24 import dyaml.escapes;
25 import dyaml.exception;
26 import dyaml.queue;
27 import dyaml.reader;
28 import dyaml.style;
29 import dyaml.token;
30 
31 package:
32 /// Scanner produces tokens of the following types:
33 /// STREAM-START
34 /// STREAM-END
35 /// DIRECTIVE(name, value)
36 /// DOCUMENT-START
37 /// DOCUMENT-END
38 /// BLOCK-SEQUENCE-START
39 /// BLOCK-MAPPING-START
40 /// BLOCK-END
41 /// FLOW-SEQUENCE-START
42 /// FLOW-MAPPING-START
43 /// FLOW-SEQUENCE-END
44 /// FLOW-MAPPING-END
45 /// BLOCK-ENTRY
46 /// FLOW-ENTRY
47 /// KEY
48 /// VALUE
49 /// ALIAS(value)
50 /// ANCHOR(value)
51 /// TAG(value)
52 /// SCALAR(value, plain, style)
53 
// Character-class predicates. Each alias is an instantiation of
// std.algorithm.among, so it returns 0 when the character is not in the set
// and a non-zero 1-based position otherwise.

/// NUL or one of the line break characters.
alias isBreak = among!(
    '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Space, NUL or one of the line break characters.
alias isBreakOrSpace = among!(
    ' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Space, tab, NUL or one of the line break characters.
alias isWhiteSpace = among!(
    ' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Space or tab.
alias isNonLinebreakWhitespace = among!(
    ' ', '\t');

/// Indicator characters plus whitespace/NUL/line breaks. Used by
/// Scanner.checkPlain as the set that cannot unconditionally start a plain scalar.
alias isNonScalarStartCharacter = among!(
    '-', '?', ':', ',', '[', ']', '{', '}',
    '#', '&', '*', '!', '|', '>', '\'', '"', '%', '@', '`',
    ' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Punctuation characters of the URI set (alphanumerics are not part of this set).
alias isURIChar = among!(
    '-', ';', '/', '?', ':', '@', '&', '=', '+', '$', ',',
    '_', '.', '!', '~', '*', '\'', '(', ')', '[', ']', '%');

/// Space or one of the line break characters (neither tab nor NUL).
// NOTE(review): despite the name, this set lists space/break characters - confirm
// how callers interpret the result.
alias isNSChar = among!(
    ' ', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// One of the line break characters.
alias isBChar = among!(
    '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Whitespace, NUL, line breaks, both quote characters and the backslash.
alias isFlowScalarBreakSpace = among!(
    ' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029',
    '\'', '"', '\\');
74 
/// Exception type thrown by the scanner; it derives from MarkedYAMLException.
///
/// Constructors come from the MarkedExceptionCtors mixin.
///
/// See_Also: MarkedYAMLException
class ScannerException : MarkedYAMLException
{
    mixin MarkedExceptionCtors;
}
82 
83 /// Generates tokens from data provided by a Reader.
84 struct Scanner
85 {
86     private:
87         /// A simple key is a key that is not denoted by the '?' indicator.
88         /// For example:
89         ///   ---
90         ///   block simple key: value
91         ///   ? not a simple key:
92         ///   : { flow simple key: value }
93         /// We emit the KEY token before all keys, so when we find a potential simple
94         /// key, we try to locate the corresponding ':' indicator. Simple keys should be
95         /// limited to a single line and 1024 characters.
96         ///
97         /// 16 bytes on 64-bit.
98         static struct SimpleKey
99         {
100             /// Character index in reader where the key starts.
101             uint charIndex = uint.max;
102             /// Index of the key token from start (first token scanned being 0).
103             uint tokenIndex;
104             /// Line the key starts at.
105             uint line;
106             /// Column the key starts at.
107             ushort column;
108             /// Is this required to be a simple key?
109             bool required;
110             /// Is this struct "null" (invalid)?.
111             bool isNull;
112         }
113 
        /// Block chomping types: how trailing line breaks of a block scalar are treated.
        enum Chomping
        {
            /// Strip all trailing line breaks. '-' indicator.
            strip,
            /// Line break of the last line is preserved, others discarded. Default.
            clip,
            /// All trailing line breaks are preserved. '+' indicator.
            keep
        }
124 
        /// Reader used to read from a file/stream.
        Reader reader_;
        /// Are we done scanning? Set once STREAM-END has been queued.
        bool done_;

        /// Level of nesting in flow context. If 0, we're in block context.
        uint flowLevel_;
        /// Current indentation level (-1 before any block collection is open).
        int indent_ = -1;
        /// Past indentation levels. Used as a stack.
        Appender!(int[]) indents_;

        /// Processed tokens not yet emitted. Used as a queue.
        Queue!Token tokens_;

        /// Number of tokens already handed out, i.e. removed via popFront.
        uint tokensTaken_;

        /// Can a simple key start at the current position? A simple key may start:
        /// - at the beginning of the line, not counting indentation spaces
        ///       (in block context),
        /// - after '{', '[', ',' (in the flow context),
        /// - after '?', ':', '-' (in the block context).
        /// In the block context, this flag also signifies if a block collection
        /// may start at the current position.
        bool allowSimpleKey_ = true;

        /// Possible simple keys indexed by flow levels.
        SimpleKey[] possibleSimpleKeys_;
154 
155     public:
        /// Construct a Scanner using specified Reader.
        ///
        /// The STREAM-START token is queued right away, so the token queue is
        /// never empty directly after construction.
        this(Reader reader) @safe nothrow
        {
            reader_   = reader;
            fetchStreamStart();
        }
163 
        /// Advance to the next token: drop the token at the head of the queue and
        /// count it as taken.
        void popFront() @safe
        {
            // tokensTaken_ keeps SimpleKey.tokenIndex values comparable with
            // positions in the (shrinking) queue.
            ++tokensTaken_;
            tokens_.pop();
        }
170 
        /// Return the current token without removing it from the queue.
        ///
        /// Throws: an exception (via enforce) if there is no token left.
        const(Token) front() @safe
        {
            // Evaluating `empty` also scans ahead as far as needed.
            enforce(!empty, "No token left to peek");
            return tokens_.peek();
        }
177 
        /// Return whether there are any more tokens left.
        ///
        /// Not a passive query: tokens are fetched from the reader until
        /// needMoreTokens() is satisfied, then the queue is inspected.
        bool empty() @safe
        {
            while (needMoreTokens())
            {
                fetchToken();
            }
            return tokens_.empty;
        }
187 
188     private:
189         /// Most scanning error messages have the same format; so build them with this
190         /// function.
191         string expected(T)(string expected, T found)
192         {
193             return text("expected ", expected, ", but found ", found);
194         }
195 
196         /// Determine whether or not we need to fetch more tokens before peeking/getting a token.
197         bool needMoreTokens() @safe pure
198         {
199             if(done_)         { return false; }
200             if(tokens_.empty) { return true; }
201 
202             /// The current token may be a potential simple key, so we need to look further.
203             stalePossibleSimpleKeys();
204             return nextPossibleSimpleKey() == tokensTaken_;
205         }
206 
207         /// Fetch at token, adding it to tokens_.
208         void fetchToken() @safe
209         {
210             // Eat whitespaces and comments until we reach the next token.
211             scanToNextToken();
212 
213             // Remove obsolete possible simple keys.
214             stalePossibleSimpleKeys();
215 
216             // Compare current indentation and column. It may add some tokens
217             // and decrease the current indentation level.
218             unwindIndent(reader_.column);
219 
220             // Get the next character.
221             const dchar c = reader_.peekByte();
222 
223             // Fetch the token.
224             if(c == '\0')            { return fetchStreamEnd();     }
225             if(checkDirective())     { return fetchDirective();     }
226             if(checkDocumentStart()) { return fetchDocumentStart(); }
227             if(checkDocumentEnd())   { return fetchDocumentEnd();   }
228             // Order of the following checks is NOT significant.
229             switch(c)
230             {
231                 case '[':  return fetchFlowSequenceStart();
232                 case '{':  return fetchFlowMappingStart();
233                 case ']':  return fetchFlowSequenceEnd();
234                 case '}':  return fetchFlowMappingEnd();
235                 case ',':  return fetchFlowEntry();
236                 case '!':  return fetchTag();
237                 case '\'': return fetchSingle();
238                 case '\"': return fetchDouble();
239                 case '*':  return fetchAlias();
240                 case '&':  return fetchAnchor();
241                 case '?':  if(checkKey())        { return fetchKey();        } goto default;
242                 case ':':  if(checkValue())      { return fetchValue();      } goto default;
243                 case '-':  if(checkBlockEntry()) { return fetchBlockEntry(); } goto default;
244                 case '|':  if(flowLevel_ == 0)   { return fetchLiteral();    } break;
245                 case '>':  if(flowLevel_ == 0)   { return fetchFolded();     } break;
246                 default:   if(checkPlain())      { return fetchPlain();      }
247             }
248 
249             throw new ScannerException("While scanning for the next token, found character " ~
250                                        "\'%s\', index %s that cannot start any token"
251                                        .format(c, to!int(c)), reader_.mark);
252         }
253 
254 
255         /// Return the token number of the nearest possible simple key.
256         uint nextPossibleSimpleKey() @safe pure nothrow @nogc
257         {
258             uint minTokenNumber = uint.max;
259             foreach(k, ref simpleKey; possibleSimpleKeys_)
260             {
261                 if(simpleKey.isNull) { continue; }
262                 minTokenNumber = min(minTokenNumber, simpleKey.tokenIndex);
263             }
264             return minTokenNumber;
265         }
266 
267         /// Remove entries that are no longer possible simple keys.
268         ///
269         /// According to the YAML specification, simple keys
270         /// - should be limited to a single line,
271         /// - should be no longer than 1024 characters.
272         /// Disabling this will allow simple keys of any length and
273         /// height (may cause problems if indentation is broken though).
274         void stalePossibleSimpleKeys() @safe pure
275         {
276             foreach(level, ref key; possibleSimpleKeys_)
277             {
278                 if(key.isNull) { continue; }
279                 if(key.line != reader_.line || reader_.charIndex - key.charIndex > 1024)
280                 {
281                     enforce(!key.required,
282                             new ScannerException("While scanning a simple key",
283                                                  Mark(reader_.name, key.line, key.column),
284                                                  "could not find expected ':'", reader_.mark));
285                     key.isNull = true;
286                 }
287             }
288         }
289 
290         /// Check if the next token starts a possible simple key and if so, save its position.
291         ///
292         /// This function is called for ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
293         void savePossibleSimpleKey() @safe pure
294         {
295             // Check if a simple key is required at the current position.
296             const required = (flowLevel_ == 0 && indent_ == reader_.column);
297             assert(allowSimpleKey_ || !required, "A simple key is required only if it is " ~
298                    "the first token in the current line. Therefore it is always allowed.");
299 
300             if(!allowSimpleKey_) { return; }
301 
302             // The next token might be a simple key, so save its number and position.
303             removePossibleSimpleKey();
304             const tokenCount = tokensTaken_ + cast(uint)tokens_.length;
305 
306             const line   = reader_.line;
307             const column = reader_.column;
308             const key    = SimpleKey(cast(uint)reader_.charIndex, tokenCount, line,
309                                      cast(ushort)min(column, ushort.max), required);
310 
311             if(possibleSimpleKeys_.length <= flowLevel_)
312             {
313                 const oldLength = possibleSimpleKeys_.length;
314                 possibleSimpleKeys_.length = flowLevel_ + 1;
315                 //No need to initialize the last element, it's already done in the next line.
316                 possibleSimpleKeys_[oldLength .. flowLevel_] = SimpleKey.init;
317             }
318             possibleSimpleKeys_[flowLevel_] = key;
319         }
320 
321         /// Remove the saved possible key position at the current flow level.
322         void removePossibleSimpleKey() @safe pure
323         {
324             if(possibleSimpleKeys_.length <= flowLevel_) { return; }
325 
326             if(!possibleSimpleKeys_[flowLevel_].isNull)
327             {
328                 const key = possibleSimpleKeys_[flowLevel_];
329                 enforce(!key.required,
330                         new ScannerException("While scanning a simple key",
331                                              Mark(reader_.name, key.line, key.column),
332                                              "could not find expected ':'", reader_.mark));
333                 possibleSimpleKeys_[flowLevel_].isNull = true;
334             }
335         }
336 
337         /// Decrease indentation, removing entries in indents_.
338         ///
339         /// Params:  column = Current column in the file/stream.
340         void unwindIndent(const int column) @safe
341         {
342             if(flowLevel_ > 0)
343             {
344                 // In flow context, tokens should respect indentation.
345                 // The condition should be `indent >= column` according to the spec.
346                 // But this condition will prohibit intuitively correct
347                 // constructions such as
348                 // key : {
349                 // }
350 
351                 // In the flow context, indentation is ignored. We make the scanner less
352                 // restrictive than what the specification requires.
353                 // if(pedantic_ && flowLevel_ > 0 && indent_ > column)
354                 // {
355                 //     throw new ScannerException("Invalid intendation or unclosed '[' or '{'",
356                 //                                reader_.mark)
357                 // }
358                 return;
359             }
360 
361             // In block context, we may need to issue the BLOCK-END tokens.
362             while(indent_ > column)
363             {
364                 indent_ = indents_.data.back;
365                 assert(indents_.data.length);
366                 indents_.shrinkTo(indents_.data.length - 1);
367                 tokens_.push(blockEndToken(reader_.mark, reader_.mark));
368             }
369         }
370 
371         /// Increase indentation if needed.
372         ///
373         /// Params:  column = Current column in the file/stream.
374         ///
375         /// Returns: true if the indentation was increased, false otherwise.
376         bool addIndent(int column) @safe
377         {
378             if(indent_ >= column){return false;}
379             indents_ ~= indent_;
380             indent_ = column;
381             return true;
382         }
383 
384 
        /// Add STREAM-START token.
        ///
        /// The token starts and ends at the current reader mark and carries the
        /// reader's encoding.
        void fetchStreamStart() @safe nothrow
        {
            tokens_.push(streamStartToken(reader_.mark, reader_.mark, reader_.encoding));
        }
390 
        /// Add STREAM-END token and mark scanning as done.
        void fetchStreamEnd() @safe
        {
            // Set indentation to -1, closing every open block collection.
            unwindIndent(-1);
            // No simple key can be pending or start any more.
            removePossibleSimpleKey();
            allowSimpleKey_ = false;
            possibleSimpleKeys_.destroy;

            tokens_.push(streamEndToken(reader_.mark, reader_.mark));
            done_ = true;
        }
403 
404         /// Add DIRECTIVE token.
405         void fetchDirective() @safe
406         {
407             // Set intendation to -1 .
408             unwindIndent(-1);
409             // Reset simple keys.
410             removePossibleSimpleKey();
411             allowSimpleKey_ = false;
412 
413             auto directive = scanDirective();
414             tokens_.push(directive);
415         }
416 
417         /// Add DOCUMENT-START or DOCUMENT-END token.
418         void fetchDocumentIndicator(TokenID id)()
419             if(id == TokenID.documentStart || id == TokenID.documentEnd)
420         {
421             // Set indentation to -1 .
422             unwindIndent(-1);
423             // Reset simple keys. Note that there can't be a block collection after '---'.
424             removePossibleSimpleKey();
425             allowSimpleKey_ = false;
426 
427             Mark startMark = reader_.mark;
428             reader_.forward(3);
429             tokens_.push(simpleToken!id(startMark, reader_.mark));
430         }
431 
432         /// Aliases to add DOCUMENT-START or DOCUMENT-END token.
433         alias fetchDocumentStart = fetchDocumentIndicator!(TokenID.documentStart);
434         alias fetchDocumentEnd = fetchDocumentIndicator!(TokenID.documentEnd);
435 
436         /// Add FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
437         void fetchFlowCollectionStart(TokenID id)() @safe
438         {
439             // '[' and '{' may start a simple key.
440             savePossibleSimpleKey();
441             // Simple keys are allowed after '[' and '{'.
442             allowSimpleKey_ = true;
443             ++flowLevel_;
444 
445             Mark startMark = reader_.mark;
446             reader_.forward();
447             tokens_.push(simpleToken!id(startMark, reader_.mark));
448         }
449 
450         /// Aliases to add FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
451         alias fetchFlowSequenceStart = fetchFlowCollectionStart!(TokenID.flowSequenceStart);
452         alias fetchFlowMappingStart = fetchFlowCollectionStart!(TokenID.flowMappingStart);
453 
454         /// Add FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
455         void fetchFlowCollectionEnd(TokenID id)()
456         {
457             // Reset possible simple key on the current level.
458             removePossibleSimpleKey();
459             // No simple keys after ']' and '}'.
460             allowSimpleKey_ = false;
461             --flowLevel_;
462 
463             Mark startMark = reader_.mark;
464             reader_.forward();
465             tokens_.push(simpleToken!id(startMark, reader_.mark));
466         }
467 
468         /// Aliases to add FLOW-SEQUENCE-START or FLOW-MAPPING-START token/
469         alias fetchFlowSequenceEnd = fetchFlowCollectionEnd!(TokenID.flowSequenceEnd);
470         alias fetchFlowMappingEnd = fetchFlowCollectionEnd!(TokenID.flowMappingEnd);
471 
472         /// Add FLOW-ENTRY token;
473         void fetchFlowEntry() @safe
474         {
475             // Reset possible simple key on the current level.
476             removePossibleSimpleKey();
477             // Simple keys are allowed after ','.
478             allowSimpleKey_ = true;
479 
480             Mark startMark = reader_.mark;
481             reader_.forward();
482             tokens_.push(flowEntryToken(startMark, reader_.mark));
483         }
484 
485         /// Additional checks used in block context in fetchBlockEntry and fetchKey.
486         ///
487         /// Params:  type = String representing the token type we might need to add.
488         ///          id   = Token type we might need to add.
489         void blockChecks(string type, TokenID id)()
490         {
491             enum context = type ~ " keys are not allowed here";
492             // Are we allowed to start a key (not neccesarily a simple one)?
493             enforce(allowSimpleKey_, new ScannerException(context, reader_.mark));
494 
495             if(addIndent(reader_.column))
496             {
497                 tokens_.push(simpleToken!id(reader_.mark, reader_.mark));
498             }
499         }
500 
501         /// Add BLOCK-ENTRY token. Might add BLOCK-SEQUENCE-START in the process.
502         void fetchBlockEntry() @safe
503         {
504             if(flowLevel_ == 0) { blockChecks!("Sequence", TokenID.blockSequenceStart)(); }
505 
506             // It's an error for the block entry to occur in the flow context,
507             // but we let the parser detect this.
508 
509             // Reset possible simple key on the current level.
510             removePossibleSimpleKey();
511             // Simple keys are allowed after '-'.
512             allowSimpleKey_ = true;
513 
514             Mark startMark = reader_.mark;
515             reader_.forward();
516             tokens_.push(blockEntryToken(startMark, reader_.mark));
517         }
518 
519         /// Add KEY token. Might add BLOCK-MAPPING-START in the process.
520         void fetchKey() @safe
521         {
522             if(flowLevel_ == 0) { blockChecks!("Mapping", TokenID.blockMappingStart)(); }
523 
524             // Reset possible simple key on the current level.
525             removePossibleSimpleKey();
526             // Simple keys are allowed after '?' in the block context.
527             allowSimpleKey_ = (flowLevel_ == 0);
528 
529             Mark startMark = reader_.mark;
530             reader_.forward();
531             tokens_.push(keyToken(startMark, reader_.mark));
532         }
533 
534         /// Add VALUE token. Might add KEY and/or BLOCK-MAPPING-START in the process.
535         void fetchValue() @safe
536         {
537             //Do we determine a simple key?
538             if(possibleSimpleKeys_.length > flowLevel_ &&
539                !possibleSimpleKeys_[flowLevel_].isNull)
540             {
541                 const key = possibleSimpleKeys_[flowLevel_];
542                 possibleSimpleKeys_[flowLevel_].isNull = true;
543                 Mark keyMark = Mark(reader_.name, key.line, key.column);
544                 const idx = key.tokenIndex - tokensTaken_;
545 
546                 assert(idx >= 0);
547 
548                 // Add KEY.
549                 // Manually inserting since tokens are immutable (need linked list).
550                 tokens_.insert(keyToken(keyMark, keyMark), idx);
551 
552                 // If this key starts a new block mapping, we need to add BLOCK-MAPPING-START.
553                 if(flowLevel_ == 0 && addIndent(key.column))
554                 {
555                     tokens_.insert(blockMappingStartToken(keyMark, keyMark), idx);
556                 }
557 
558                 // There cannot be two simple keys in a row.
559                 allowSimpleKey_ = false;
560             }
561             // Part of a complex key
562             else
563             {
564                 // We can start a complex value if and only if we can start a simple key.
565                 enforce(flowLevel_ > 0 || allowSimpleKey_,
566                         new ScannerException("Mapping values are not allowed here", reader_.mark));
567 
568                 // If this value starts a new block mapping, we need to add
569                 // BLOCK-MAPPING-START. It'll be detected as an error later by the parser.
570                 if(flowLevel_ == 0 && addIndent(reader_.column))
571                 {
572                     tokens_.push(blockMappingStartToken(reader_.mark, reader_.mark));
573                 }
574 
575                 // Reset possible simple key on the current level.
576                 removePossibleSimpleKey();
577                 // Simple keys are allowed after ':' in the block context.
578                 allowSimpleKey_ = (flowLevel_ == 0);
579             }
580 
581             // Add VALUE.
582             Mark startMark = reader_.mark;
583             reader_.forward();
584             tokens_.push(valueToken(startMark, reader_.mark));
585         }
586 
587         /// Add ALIAS or ANCHOR token.
588         void fetchAnchor_(TokenID id)() @safe
589             if(id == TokenID.alias_ || id == TokenID.anchor)
590         {
591             // ALIAS/ANCHOR could be a simple key.
592             savePossibleSimpleKey();
593             // No simple keys after ALIAS/ANCHOR.
594             allowSimpleKey_ = false;
595 
596             auto anchor = scanAnchor(id);
597             tokens_.push(anchor);
598         }
599 
600         /// Aliases to add ALIAS or ANCHOR token.
601         alias fetchAlias = fetchAnchor_!(TokenID.alias_);
602         alias fetchAnchor = fetchAnchor_!(TokenID.anchor);
603 
604         /// Add TAG token.
605         void fetchTag() @safe
606         {
607             //TAG could start a simple key.
608             savePossibleSimpleKey();
609             //No simple keys after TAG.
610             allowSimpleKey_ = false;
611 
612             tokens_.push(scanTag());
613         }
614 
615         /// Add block SCALAR token.
616         void fetchBlockScalar(ScalarStyle style)() @safe
617             if(style == ScalarStyle.literal || style == ScalarStyle.folded)
618         {
619             // Reset possible simple key on the current level.
620             removePossibleSimpleKey();
621             // A simple key may follow a block scalar.
622             allowSimpleKey_ = true;
623 
624             auto blockScalar = scanBlockScalar(style);
625             tokens_.push(blockScalar);
626         }
627 
628         /// Aliases to add literal or folded block scalar.
629         alias fetchLiteral = fetchBlockScalar!(ScalarStyle.literal);
630         alias fetchFolded = fetchBlockScalar!(ScalarStyle.folded);
631 
632         /// Add quoted flow SCALAR token.
633         void fetchFlowScalar(ScalarStyle quotes)()
634         {
635             // A flow scalar could be a simple key.
636             savePossibleSimpleKey();
637             // No simple keys after flow scalars.
638             allowSimpleKey_ = false;
639 
640             // Scan and add SCALAR.
641             auto scalar = scanFlowScalar(quotes);
642             tokens_.push(scalar);
643         }
644 
645         /// Aliases to add single or double quoted block scalar.
646         alias fetchSingle = fetchFlowScalar!(ScalarStyle.singleQuoted);
647         alias fetchDouble = fetchFlowScalar!(ScalarStyle.doubleQuoted);
648 
649         /// Add plain SCALAR token.
650         void fetchPlain() @safe
651         {
652             // A plain scalar could be a simple key
653             savePossibleSimpleKey();
654             // No simple keys after plain scalars. But note that scanPlain() will
655             // change this flag if the scan is finished at the beginning of the line.
656             allowSimpleKey_ = false;
657             auto plain = scanPlain();
658 
659             // Scan and add SCALAR. May change allowSimpleKey_
660             tokens_.push(plain);
661         }
662 
663     pure:
664 
665         ///Check if the next token is DIRECTIVE:        ^ '%' ...
666         bool checkDirective() @safe
667         {
668             return reader_.peekByte() == '%' && reader_.column == 0;
669         }
670 
671         /// Check if the next token is DOCUMENT-START:   ^ '---' (' '|'\n')
672         bool checkDocumentStart() @safe
673         {
674             // Check one char first, then all 3, to prevent reading outside the buffer.
675             return reader_.column     == 0     &&
676                    reader_.peekByte() == '-'   &&
677                    reader_.prefix(3)  == "---" &&
678                    reader_.peek(3).isWhiteSpace;
679         }
680 
681         /// Check if the next token is DOCUMENT-END:     ^ '...' (' '|'\n')
682         bool checkDocumentEnd() @safe
683         {
684             // Check one char first, then all 3, to prevent reading outside the buffer.
685             return reader_.column     == 0     &&
686                    reader_.peekByte() == '.'   &&
687                    reader_.prefix(3)  == "..." &&
688                    reader_.peek(3).isWhiteSpace;
689         }
690 
691         /// Check if the next token is BLOCK-ENTRY:      '-' (' '|'\n')
692         bool checkBlockEntry() @safe
693         {
694             return !!reader_.peek(1).isWhiteSpace;
695         }
696 
697         /// Check if the next token is KEY(flow context):    '?'
698         ///
699         /// or KEY(block context):   '?' (' '|'\n')
700         bool checkKey() @safe
701         {
702             return (flowLevel_ > 0 || reader_.peek(1).isWhiteSpace);
703         }
704 
705         /// Check if the next token is VALUE(flow context):  ':'
706         ///
707         /// or VALUE(block context): ':' (' '|'\n')
708         bool checkValue() @safe
709         {
710             return flowLevel_ > 0 || reader_.peek(1).isWhiteSpace;
711         }
712 
713         /// Check if the next token is a plain scalar.
714         ///
715         /// A plain scalar may start with any non-space character except:
716         ///   '-', '?', ':', ',', '[', ']', '{', '}',
717         ///   '#', '&', '*', '!', '|', '>', '\'', '\"',
718         ///   '%', '@', '`'.
719         ///
720         /// It may also start with
721         ///   '-', '?', ':'
722         /// if it is followed by a non-space character.
723         ///
724         /// Note that we limit the last rule to the block context (except the
725         /// '-' character) because we want the flow context to be space
726         /// independent.
727         bool checkPlain() @safe
728         {
729             const c = reader_.peek();
730             if(!c.isNonScalarStartCharacter)
731             {
732                 return true;
733             }
734             return !reader_.peek(1).isWhiteSpace &&
735                    (c == '-' || (flowLevel_ == 0 && (c == '?' || c == ':')));
736         }
737 
738         /// Move to the next non-space character.
739         void findNextNonSpace() @safe
740         {
741             while(reader_.peekByte() == ' ') { reader_.forward(); }
742         }
743 
744         /// Scan a string of alphanumeric or "-_" characters.
745         ///
746         /// Assumes that the caller is building a slice in Reader, and puts the scanned
747         /// characters into that slice.
748         void scanAlphaNumericToSlice(string name)(const Mark startMark)
749         {
750             size_t length;
751             dchar c = reader_.peek();
752             while(c.isAlphaNum || c.among!('-', '_')) { c = reader_.peek(++length); }
753 
754             enforce(length > 0, new ScannerException("While scanning " ~ name,
755                 startMark, expected("alphanumeric, '-' or '_'", c), reader_.mark));
756 
757             reader_.sliceBuilder.write(reader_.get(length));
758         }
759 
760         /// Scan and throw away all characters until next line break.
761         void scanToNextBreak() @safe
762         {
763             while(!reader_.peek().isBreak) { reader_.forward(); }
764         }
765 
766         /// Scan all characters until next line break.
767         ///
768         /// Assumes that the caller is building a slice in Reader, and puts the scanned
769         /// characters into that slice.
770         void scanToNextBreakToSlice() @safe
771         {
772             uint length;
773             while(!reader_.peek(length).isBreak)
774             {
775                 ++length;
776             }
777             reader_.sliceBuilder.write(reader_.get(length));
778         }
779 
780 
781         /// Move to next token in the file/stream.
782         ///
783         /// We ignore spaces, line breaks and comments.
784         /// If we find a line break in the block context, we set
785         /// allowSimpleKey` on.
786         ///
787         /// We do not yet support BOM inside the stream as the
788         /// specification requires. Any such mark will be considered as a part
789         /// of the document.
790         void scanToNextToken() @safe
791         {
792             // TODO(PyYAML): We need to make tab handling rules more sane. A good rule is:
793             //   Tabs cannot precede tokens
794             //   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
795             //   KEY(block), VALUE(block), BLOCK-ENTRY
796             // So the checking code is
797             //   if <TAB>:
798             //       allowSimpleKey_ = false
799             // We also need to add the check for `allowSimpleKey_ == true` to
800             // `unwindIndent` before issuing BLOCK-END.
801             // Scanners for block, flow, and plain scalars need to be modified.
802 
803             for(;;)
804             {
805                 //All whitespace in flow context is ignored, even whitespace
806                 // not allowed in other contexts
807                 if (flowLevel_ > 0)
808                 {
809                     while(reader_.peekByte().isNonLinebreakWhitespace) { reader_.forward(); }
810                 }
811                 else
812                 {
813                     findNextNonSpace();
814                 }
815                 if(reader_.peekByte() == '#') { scanToNextBreak(); }
816                 if(scanLineBreak() != '\0')
817                 {
818                     if(flowLevel_ == 0) { allowSimpleKey_ = true; }
819                 }
820                 else
821                 {
822                     break;
823                 }
824             }
825         }
826 
827         /// Scan directive token.
828         Token scanDirective() @safe
829         {
830             Mark startMark = reader_.mark;
831             // Skip the '%'.
832             reader_.forward();
833 
834             // Scan directive name
835             reader_.sliceBuilder.begin();
836             scanDirectiveNameToSlice(startMark);
837             const name = reader_.sliceBuilder.finish();
838 
839             reader_.sliceBuilder.begin();
840 
841             // Index where tag handle ends and suffix starts in a tag directive value.
842             uint tagHandleEnd = uint.max;
843             if(name == "YAML")     { scanYAMLDirectiveValueToSlice(startMark); }
844             else if(name == "TAG") { tagHandleEnd = scanTagDirectiveValueToSlice(startMark); }
845             char[] value = reader_.sliceBuilder.finish();
846 
847             Mark endMark = reader_.mark;
848 
849             DirectiveType directive;
850             if(name == "YAML")     { directive = DirectiveType.yaml; }
851             else if(name == "TAG") { directive = DirectiveType.tag; }
852             else
853             {
854                 directive = DirectiveType.reserved;
855                 scanToNextBreak();
856             }
857 
858             scanDirectiveIgnoredLine(startMark);
859 
860             return directiveToken(startMark, endMark, value, directive, tagHandleEnd);
861         }
862 
863         /// Scan name of a directive token.
864         ///
865         /// Assumes that the caller is building a slice in Reader, and puts the scanned
866         /// characters into that slice.
867         void scanDirectiveNameToSlice(const Mark startMark) @safe
868         {
869             // Scan directive name.
870             scanAlphaNumericToSlice!"a directive"(startMark);
871 
872             enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
873                 new ScannerException("While scanning a directive", startMark,
874                     expected("alphanumeric, '-' or '_'", reader_.peek()), reader_.mark));
875         }
876 
877         /// Scan value of a YAML directive token. Returns major, minor version separated by '.'.
878         ///
879         /// Assumes that the caller is building a slice in Reader, and puts the scanned
880         /// characters into that slice.
881         void scanYAMLDirectiveValueToSlice(const Mark startMark) @safe
882         {
883             findNextNonSpace();
884 
885             scanYAMLDirectiveNumberToSlice(startMark);
886 
887             enforce(reader_.peekByte() == '.',
888                 new ScannerException("While scanning a directive", startMark,
889                     expected("digit or '.'", reader_.peek()), reader_.mark));
890             // Skip the '.'.
891             reader_.forward();
892 
893             reader_.sliceBuilder.write('.');
894             scanYAMLDirectiveNumberToSlice(startMark);
895 
896             enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
897                 new ScannerException("While scanning a directive", startMark,
898                     expected("digit or '.'", reader_.peek()), reader_.mark));
899         }
900 
901         /// Scan a number from a YAML directive.
902         ///
903         /// Assumes that the caller is building a slice in Reader, and puts the scanned
904         /// characters into that slice.
905         void scanYAMLDirectiveNumberToSlice(const Mark startMark) @safe
906         {
907             enforce(isDigit(reader_.peek()),
908                 new ScannerException("While scanning a directive", startMark,
909                     expected("digit", reader_.peek()), reader_.mark));
910 
911             // Already found the first digit in the enforce(), so set length to 1.
912             uint length = 1;
913             while(reader_.peek(length).isDigit) { ++length; }
914 
915             reader_.sliceBuilder.write(reader_.get(length));
916         }
917 
918         /// Scan value of a tag directive.
919         ///
920         /// Assumes that the caller is building a slice in Reader, and puts the scanned
921         /// characters into that slice.
922         ///
923         /// Returns: Length of tag handle (which is before tag prefix) in scanned data
924         uint scanTagDirectiveValueToSlice(const Mark startMark) @safe
925         {
926             findNextNonSpace();
927             const startLength = reader_.sliceBuilder.length;
928             scanTagDirectiveHandleToSlice(startMark);
929             const handleLength = cast(uint)(reader_.sliceBuilder.length  - startLength);
930             findNextNonSpace();
931             scanTagDirectivePrefixToSlice(startMark);
932 
933             return handleLength;
934         }
935 
936         /// Scan handle of a tag directive.
937         ///
938         /// Assumes that the caller is building a slice in Reader, and puts the scanned
939         /// characters into that slice.
940         void scanTagDirectiveHandleToSlice(const Mark startMark) @safe
941         {
942             scanTagHandleToSlice!"directive"(startMark);
943             enforce(reader_.peekByte() == ' ',
944                 new ScannerException("While scanning a directive handle", startMark,
945                     expected("' '", reader_.peek()), reader_.mark));
946         }
947 
948         /// Scan prefix of a tag directive.
949         ///
950         /// Assumes that the caller is building a slice in Reader, and puts the scanned
951         /// characters into that slice.
952         void scanTagDirectivePrefixToSlice(const Mark startMark) @safe
953         {
954             scanTagURIToSlice!"directive"(startMark);
955             enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
956                 new ScannerException("While scanning a directive prefix", startMark,
957                     expected("' '", reader_.peek()), reader_.mark));
958         }
959 
960         /// Scan (and ignore) ignored line after a directive.
961         void scanDirectiveIgnoredLine(const Mark startMark) @safe
962         {
963             findNextNonSpace();
964             if(reader_.peekByte() == '#') { scanToNextBreak(); }
965             enforce(reader_.peek().isBreak,
966                 new ScannerException("While scanning a directive", startMark,
967                       expected("comment or a line break", reader_.peek()), reader_.mark));
968             scanLineBreak();
969         }
970 
971 
972         /// Scan an alias or an anchor.
973         ///
974         /// The specification does not restrict characters for anchors and
975         /// aliases. This may lead to problems, for instance, the document:
976         ///   [ *alias, value ]
977         /// can be interpteted in two ways, as
978         ///   [ "value" ]
979         /// and
980         ///   [ *alias , "value" ]
981         /// Therefore we restrict aliases to ASCII alphanumeric characters.
982         Token scanAnchor(const TokenID id) @safe
983         {
984             const startMark = reader_.mark;
985             const dchar i = reader_.get();
986 
987             reader_.sliceBuilder.begin();
988             if(i == '*') { scanAlphaNumericToSlice!"an alias"(startMark); }
989             else         { scanAlphaNumericToSlice!"an anchor"(startMark); }
990             // On error, value is discarded as we return immediately
991             char[] value = reader_.sliceBuilder.finish();
992 
993             enum anchorCtx = "While scanning an anchor";
994             enum aliasCtx  = "While scanning an alias";
995             enforce(reader_.peek().isWhiteSpace ||
996                 reader_.peekByte().among!('?', ':', ',', ']', '}', '%', '@'),
997                 new ScannerException(i == '*' ? aliasCtx : anchorCtx, startMark,
998                     expected("alphanumeric, '-' or '_'", reader_.peek()), reader_.mark));
999 
1000             if(id == TokenID.alias_)
1001             {
1002                 return aliasToken(startMark, reader_.mark, value);
1003             }
1004             if(id == TokenID.anchor)
1005             {
1006                 return anchorToken(startMark, reader_.mark, value);
1007             }
1008             assert(false, "This code should never be reached");
1009         }
1010 
1011         /// Scan a tag token.
1012         Token scanTag() @safe
1013         {
1014             const startMark = reader_.mark;
1015             dchar c = reader_.peek(1);
1016 
1017             reader_.sliceBuilder.begin();
1018             scope(failure) { reader_.sliceBuilder.finish(); }
1019             // Index where tag handle ends and tag suffix starts in the tag value
1020             // (slice) we will produce.
1021             uint handleEnd;
1022 
1023             if(c == '<')
1024             {
1025                 reader_.forward(2);
1026 
1027                 handleEnd = 0;
1028                 scanTagURIToSlice!"tag"(startMark);
1029                 enforce(reader_.peekByte() == '>',
1030                     new ScannerException("While scanning a tag", startMark,
1031                         expected("'>'", reader_.peek()), reader_.mark));
1032                 reader_.forward();
1033             }
1034             else if(c.isWhiteSpace)
1035             {
1036                 reader_.forward();
1037                 handleEnd = 0;
1038                 reader_.sliceBuilder.write('!');
1039             }
1040             else
1041             {
1042                 uint length = 1;
1043                 bool useHandle;
1044 
1045                 while(!c.isBreakOrSpace)
1046                 {
1047                     if(c == '!')
1048                     {
1049                         useHandle = true;
1050                         break;
1051                     }
1052                     ++length;
1053                     c = reader_.peek(length);
1054                 }
1055 
1056                 if(useHandle)
1057                 {
1058                     scanTagHandleToSlice!"tag"(startMark);
1059                     handleEnd = cast(uint)reader_.sliceBuilder.length;
1060                 }
1061                 else
1062                 {
1063                     reader_.forward();
1064                     reader_.sliceBuilder.write('!');
1065                     handleEnd = cast(uint)reader_.sliceBuilder.length;
1066                 }
1067 
1068                 scanTagURIToSlice!"tag"(startMark);
1069             }
1070 
1071             enforce(reader_.peek().isBreakOrSpace,
1072                 new ScannerException("While scanning a tag", startMark, expected("' '", reader_.peek()),
1073                     reader_.mark));
1074 
1075             char[] slice = reader_.sliceBuilder.finish();
1076             return tagToken(startMark, reader_.mark, slice, handleEnd);
1077         }
1078 
        /// Scan a block scalar token with specified style.
        ///
        /// Reads the header (chomping/indentation indicators and the ignored rest
        /// of the header line), determines the content indentation, then reads
        /// content lines into the Reader's slice for as long as the reader column
        /// equals that indentation. Trailing line breaks are handled according to
        /// the chomping indicator.
        ///
        /// Params:
        ///
        /// style = Style of the scalar. Line folding is only applied when this is
        ///         ScalarStyle.folded; the value is also passed to the produced token.
        ///
        /// Returns: The scalar token built by scalarToken().
        Token scanBlockScalar(const ScalarStyle style) @safe
        {
            const startMark = reader_.mark;

            // Scan the header.
            reader_.forward();

            const indicators = scanBlockScalarIndicators(startMark);

            const chomping   = indicators[0];
            // int.min if no indentation indicator was present.
            const increment  = indicators[1];
            scanBlockScalarIgnoredLine(startMark);

            // Determine the indentation level and go to the first non-empty line.
            Mark endMark;
            uint indent = max(1, indent_ + 1);

            reader_.sliceBuilder.begin();
            alias Transaction = SliceBuilder.Transaction;
            // Used to strip the last line breaks written to the slice at the end of the
            // scalar, which may be needed based on chomping.
            Transaction breaksTransaction = Transaction(&reader_.sliceBuilder);
            // Read the first indentation/line breaks before the scalar.
            size_t startLen = reader_.sliceBuilder.length;
            if(increment == int.min)
            {
                // No explicit indicator: detect the indentation from the leading lines.
                auto indentation = scanBlockScalarIndentationToSlice();
                endMark = indentation[1];
                indent  = max(indent, indentation[0]);
            }
            else
            {
                // Explicit indicator: the indentation is the minimum indent computed
                // above plus (increment - 1).
                indent += increment - 1;
                endMark = scanBlockScalarBreaksToSlice(indent);
            }

            // int.max means there's no line break (int.max is outside UTF-32).
            dchar lineBreak = cast(dchar)int.max;

            // Scan the inner part of the block scalar.
            while(reader_.column == indent && reader_.peekByte() != '\0')
            {
                // The breaks scanned before this line are followed by more content,
                // so they are part of the scalar: keep them.
                breaksTransaction.commit();
                const bool leadingNonSpace = !reader_.peekByte().among!(' ', '\t');
                // This is where the 'interesting' non-whitespace data gets read.
                scanToNextBreakToSlice();
                lineBreak = scanLineBreak();


                // This transaction serves to rollback data read in the
                // scanBlockScalarBreaksToSlice() call.
                breaksTransaction = Transaction(&reader_.sliceBuilder);
                startLen = reader_.sliceBuilder.length;
                // The line breaks should actually be written _after_ the if() block
                // below. We work around that by inserting the line break (or folding
                // space) at position startLen, i.e. before the breaks scanned here.
                endMark = scanBlockScalarBreaksToSlice(indent);

                // This will not run during the last iteration (see the if() vs the
                // while()), hence breaksTransaction rollback (which happens after this
                // loop) will never roll back data written in this if() block.
                if(reader_.column == indent && reader_.peekByte() != '\0')
                {
                    // Unfortunately, folding rules are ambiguous.

                    // This is the folding according to the specification:
                    if(style == ScalarStyle.folded && lineBreak == '\n' &&
                       leadingNonSpace && !reader_.peekByte().among!(' ', '\t'))
                    {
                        // No breaks were scanned; no need to insert the space in the
                        // middle of slice.
                        if(startLen == reader_.sliceBuilder.length)
                        {
                            reader_.sliceBuilder.write(' ');
                        }
                    }
                    else
                    {
                        // We need to insert in the middle of the slice in case any line
                        // breaks were scanned.
                        reader_.sliceBuilder.insert(lineBreak, startLen);
                    }

                    ////this is Clark Evans's interpretation (also in the spec
                    ////examples):
                    //
                    //if(style == ScalarStyle.folded && lineBreak == '\n')
                    //{
                    //    if(startLen == endLen)
                    //    {
                    //        if(!" \t"d.canFind(reader_.peekByte()))
                    //        {
                    //            reader_.sliceBuilder.write(' ');
                    //        }
                    //        else
                    //        {
                    //            chunks ~= lineBreak;
                    //        }
                    //    }
                    //}
                    //else
                    //{
                    //    reader_.sliceBuilder.insertBack(lineBreak, endLen - startLen);
                    //}
                }
                else
                {
                    break;
                }
            }

            // If chomping is Keep, we keep (commit) the last scanned line breaks
            // (which are at the end of the scalar). Otherwise we remove them (end the
            // transaction).
            if(chomping == Chomping.keep)  { breaksTransaction.commit(); }
            else                           { breaksTransaction.end(); }
            // lineBreak is still int.max if no content line was read at all.
            if(chomping != Chomping.strip && lineBreak != int.max)
            {
                // If chomping is Keep, we keep the line break but the first line break
                // that isn't stripped (since chomping isn't Strip in this branch) must
                // be inserted _before_ the other line breaks.
                if(chomping == Chomping.keep)
                {
                    reader_.sliceBuilder.insert(lineBreak, startLen);
                }
                // If chomping is not Keep, breaksTransaction was cancelled so we can
                // directly write the first line break (as it isn't stripped - chomping
                // is not Strip)
                else
                {
                    reader_.sliceBuilder.write(lineBreak);
                }
            }

            char[] slice = reader_.sliceBuilder.finish();
            return scalarToken(startMark, endMark, slice, style);
        }
1216 
1217         /// Scan chomping and indentation indicators of a scalar token.
1218         Tuple!(Chomping, int) scanBlockScalarIndicators(const Mark startMark) @safe
1219         {
1220             auto chomping = Chomping.clip;
1221             int increment = int.min;
1222             dchar c       = reader_.peek();
1223 
1224             /// Indicators can be in any order.
1225             if(getChomping(c, chomping))
1226             {
1227                 getIncrement(c, increment, startMark);
1228             }
1229             else
1230             {
1231                 const gotIncrement = getIncrement(c, increment, startMark);
1232                 if(gotIncrement) { getChomping(c, chomping); }
1233             }
1234 
1235             enforce(c.among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
1236                 new ScannerException("While scanning a block scalar", startMark,
1237                 expected("chomping or indentation indicator", c), reader_.mark));
1238 
1239             return tuple(chomping, increment);
1240         }
1241 
1242         /// Get chomping indicator, if detected. Return false otherwise.
1243         ///
1244         /// Used in scanBlockScalarIndicators.
1245         ///
1246         /// Params:
1247         ///
1248         /// c        = The character that may be a chomping indicator.
1249         /// chomping = Write the chomping value here, if detected.
1250         bool getChomping(ref dchar c, ref Chomping chomping) @safe
1251         {
1252             if(!c.among!('+', '-')) { return false; }
1253             chomping = c == '+' ? Chomping.keep : Chomping.strip;
1254             reader_.forward();
1255             c = reader_.peek();
1256             return true;
1257         }
1258 
1259         /// Get increment indicator, if detected. Return false otherwise.
1260         ///
1261         /// Used in scanBlockScalarIndicators.
1262         ///
1263         /// Params:
1264         ///
1265         /// c         = The character that may be an increment indicator.
1266         ///             If an increment indicator is detected, this will be updated to
1267         ///             the next character in the Reader.
1268         /// increment = Write the increment value here, if detected.
1269         /// startMark = Mark for error messages.
1270         bool getIncrement(ref dchar c, ref int increment, const Mark startMark) @safe
1271         {
1272             if(!c.isDigit) { return false; }
1273             // Convert a digit to integer.
1274             increment = c - '0';
1275             assert(increment < 10 && increment >= 0, "Digit has invalid value");
1276 
1277             enforce(increment > 0,
1278                 new ScannerException("While scanning a block scalar", startMark,
1279                     expected("indentation indicator in range 1-9", "0"), reader_.mark));
1280 
1281             reader_.forward();
1282             c = reader_.peek();
1283             return true;
1284         }
1285 
1286         /// Scan (and ignore) ignored line in a block scalar.
1287         void scanBlockScalarIgnoredLine(const Mark startMark) @safe
1288         {
1289             findNextNonSpace();
1290             if(reader_.peekByte()== '#') { scanToNextBreak(); }
1291 
1292             enforce(reader_.peek().isBreak,
1293                 new ScannerException("While scanning a block scalar", startMark,
1294                     expected("comment or line break", reader_.peek()), reader_.mark));
1295 
1296             scanLineBreak();
1297         }
1298 
1299         /// Scan indentation in a block scalar, returning line breaks, max indent and end mark.
1300         ///
1301         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1302         /// characters into that slice.
1303         Tuple!(uint, Mark) scanBlockScalarIndentationToSlice() @safe
1304         {
1305             uint maxIndent;
1306             Mark endMark = reader_.mark;
1307 
1308             while(reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029'))
1309             {
1310                 if(reader_.peekByte() != ' ')
1311                 {
1312                     reader_.sliceBuilder.write(scanLineBreak());
1313                     endMark = reader_.mark;
1314                     continue;
1315                 }
1316                 reader_.forward();
1317                 maxIndent = max(reader_.column, maxIndent);
1318             }
1319 
1320             return tuple(maxIndent, endMark);
1321         }
1322 
1323         /// Scan line breaks at lower or specified indentation in a block scalar.
1324         ///
1325         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1326         /// characters into that slice.
1327         Mark scanBlockScalarBreaksToSlice(const uint indent) @safe
1328         {
1329             Mark endMark = reader_.mark;
1330 
1331             for(;;)
1332             {
1333                 while(reader_.column < indent && reader_.peekByte() == ' ') { reader_.forward(); }
1334                 if(!reader_.peek().among!('\n', '\r', '\u0085', '\u2028', '\u2029'))  { break; }
1335                 reader_.sliceBuilder.write(scanLineBreak());
1336                 endMark = reader_.mark;
1337             }
1338 
1339             return endMark;
1340         }
1341 
1342         /// Scan a qouted flow scalar token with specified quotes.
1343         Token scanFlowScalar(const ScalarStyle quotes) @safe
1344         {
1345             const startMark = reader_.mark;
1346             const quote     = reader_.get();
1347 
1348             reader_.sliceBuilder.begin();
1349 
1350             scanFlowScalarNonSpacesToSlice(quotes, startMark);
1351 
1352             while(reader_.peek() != quote)
1353             {
1354                 scanFlowScalarSpacesToSlice(startMark);
1355                 scanFlowScalarNonSpacesToSlice(quotes, startMark);
1356             }
1357             reader_.forward();
1358 
1359             auto slice = reader_.sliceBuilder.finish();
1360             return scalarToken(startMark, reader_.mark, slice, quotes);
1361         }
1362 
1363         /// Scan nonspace characters in a flow scalar.
1364         ///
1365         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1366         /// characters into that slice.
1367         void scanFlowScalarNonSpacesToSlice(const ScalarStyle quotes, const Mark startMark)
1368             @safe
1369         {
1370             for(;;)
1371             {
1372                 dchar c = reader_.peek();
1373 
1374                 size_t numCodePoints;
1375                 while(!reader_.peek(numCodePoints).isFlowScalarBreakSpace) { ++numCodePoints; }
1376 
1377                 if (numCodePoints > 0) { reader_.sliceBuilder.write(reader_.get(numCodePoints)); }
1378 
1379                 c = reader_.peek();
1380                 if(quotes == ScalarStyle.singleQuoted && c == '\'' && reader_.peek(1) == '\'')
1381                 {
1382                     reader_.forward(2);
1383                     reader_.sliceBuilder.write('\'');
1384                 }
1385                 else if((quotes == ScalarStyle.doubleQuoted && c == '\'') ||
1386                         (quotes == ScalarStyle.singleQuoted && c.among!('"', '\\')))
1387                 {
1388                     reader_.forward();
1389                     reader_.sliceBuilder.write(c);
1390                 }
1391                 else if(quotes == ScalarStyle.doubleQuoted && c == '\\')
1392                 {
1393                     reader_.forward();
1394                     c = reader_.peek();
1395                     if(c.among!(escapes))
1396                     {
1397                         reader_.forward();
1398                         // Escaping has been moved to Parser as it can't be done in
1399                         // place (in a slice) in case of '\P' and '\L' (very uncommon,
1400                         // but we don't want to break the spec)
1401                         char[2] escapeSequence = ['\\', cast(char)c];
1402                         reader_.sliceBuilder.write(escapeSequence);
1403                     }
1404                     else if(c.among!(escapeHexCodeList))
1405                     {
1406                         const hexLength = dyaml.escapes.escapeHexLength(c);
1407                         reader_.forward();
1408 
1409                         foreach(i; 0 .. hexLength) {
1410                             enforce(reader_.peek(i).isHexDigit,
1411                                 new ScannerException("While scanning a double quoted scalar", startMark,
1412                                     expected("escape sequence of hexadecimal numbers",
1413                                         reader_.peek(i)), reader_.mark));
1414                         }
1415                         char[] hex = reader_.get(hexLength);
1416 
1417                         enforce((hex.length > 0) && (hex.length <= 8),
1418                             new ScannerException("While scanning a double quoted scalar", startMark,
1419                                   "overflow when parsing an escape sequence of " ~
1420                                   "hexadecimal numbers.", reader_.mark));
1421 
1422                         char[2] escapeStart = ['\\', cast(char) c];
1423                         reader_.sliceBuilder.write(escapeStart);
1424                         reader_.sliceBuilder.write(hex);
1425 
1426                     }
1427                     else if(c.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
1428                     {
1429                         scanLineBreak();
1430                         scanFlowScalarBreaksToSlice(startMark);
1431                     }
1432                     else
1433                     {
1434                         throw new ScannerException("While scanning a double quoted scalar", startMark,
1435                               text("found unsupported escape character ", c),
1436                               reader_.mark);
1437                     }
1438                 }
1439                 else { return; }
1440             }
1441         }
1442 
1443         /// Scan space characters in a flow scalar.
1444         ///
1445         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1446         /// spaces into that slice.
1447         void scanFlowScalarSpacesToSlice(const Mark startMark) @safe
1448         {
1449             // Increase length as long as we see whitespace.
1450             size_t length;
1451             while(reader_.peekByte(length).among!(' ', '\t')) { ++length; }
1452             auto whitespaces = reader_.prefixBytes(length);
1453 
1454             // Can check the last byte without striding because '\0' is ASCII
1455             const c = reader_.peek(length);
1456             enforce(c != '\0',
1457                 new ScannerException("While scanning a quoted scalar", startMark,
1458                     "found unexpected end of buffer", reader_.mark));
1459 
1460             // Spaces not followed by a line break.
1461             if(!c.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
1462             {
1463                 reader_.forward(length);
1464                 reader_.sliceBuilder.write(whitespaces);
1465                 return;
1466             }
1467 
1468             // There's a line break after the spaces.
1469             reader_.forward(length);
1470             const lineBreak = scanLineBreak();
1471 
1472             if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }
1473 
1474             // If we have extra line breaks after the first, scan them into the
1475             // slice.
1476             const bool extraBreaks = scanFlowScalarBreaksToSlice(startMark);
1477 
1478             // No extra breaks, one normal line break. Replace it with a space.
1479             if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
1480         }
1481 
1482         /// Scan line breaks in a flow scalar.
1483         ///
1484         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1485         /// line breaks into that slice.
1486         bool scanFlowScalarBreaksToSlice(const Mark startMark) @safe
1487         {
1488             // True if at least one line break was found.
1489             bool anyBreaks;
1490             for(;;)
1491             {
1492                 // Instead of checking indentation, we check for document separators.
1493                 const prefix = reader_.prefix(3);
1494                 enforce(!(prefix == "---" || prefix == "...") ||
1495                     !reader_.peek(3).isWhiteSpace,
1496                     new ScannerException("While scanning a quoted scalar", startMark,
1497                         "found unexpected document separator", reader_.mark));
1498 
1499                 // Skip any whitespaces.
1500                 while(reader_.peekByte().among!(' ', '\t')) { reader_.forward(); }
1501 
1502                 // Encountered a non-whitespace non-linebreak character, so we're done.
1503                 if(!reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029')) { break; }
1504 
1505                 const lineBreak = scanLineBreak();
1506                 anyBreaks = true;
1507                 reader_.sliceBuilder.write(lineBreak);
1508             }
1509             return anyBreaks;
1510         }
1511 
1512         /// Scan plain scalar token (no block, no quotes).
1513         Token scanPlain() @safe
1514         {
1515             // We keep track of the allowSimpleKey_ flag here.
1516             // Indentation rules are loosed for the flow context
1517             const startMark = reader_.mark;
1518             Mark endMark = startMark;
1519             const indent = indent_ + 1;
1520 
1521             // We allow zero indentation for scalars, but then we need to check for
1522             // document separators at the beginning of the line.
1523             // if(indent == 0) { indent = 1; }
1524 
1525             reader_.sliceBuilder.begin();
1526 
1527             alias Transaction = SliceBuilder.Transaction;
1528             Transaction spacesTransaction;
1529             // Stop at a comment.
1530             while(reader_.peekByte() != '#')
1531             {
1532                 // Scan the entire plain scalar.
1533                 size_t length;
1534                 dchar c = reader_.peek(length);
1535                 for(;;)
1536                 {
1537                     const cNext = reader_.peek(length + 1);
1538                     if(c.isWhiteSpace ||
1539                        (flowLevel_ == 0 && c == ':' && cNext.isWhiteSpace) ||
1540                        (flowLevel_ > 0 && c.among!(',', ':', '?', '[', ']', '{', '}')))
1541                     {
1542                         break;
1543                     }
1544                     ++length;
1545                     c = cNext;
1546                 }
1547 
1548                 // It's not clear what we should do with ':' in the flow context.
1549                 enforce(flowLevel_ == 0 || c != ':' ||
1550                    reader_.peek(length + 1).isWhiteSpace ||
1551                    reader_.peek(length + 1).among!(',', '[', ']', '{', '}'),
1552                     new ScannerException("While scanning a plain scalar", startMark,
1553                         "found unexpected ':' . Please check " ~
1554                         "http://pyyaml.org/wiki/YAMLColonInFlowContext for details.",
1555                         reader_.mark));
1556 
1557                 if(length == 0) { break; }
1558 
1559                 allowSimpleKey_ = false;
1560 
1561                 reader_.sliceBuilder.write(reader_.get(length));
1562 
1563                 endMark = reader_.mark;
1564 
1565                 spacesTransaction.commit();
1566                 spacesTransaction = Transaction(&reader_.sliceBuilder);
1567 
1568                 const startLength = reader_.sliceBuilder.length;
1569                 scanPlainSpacesToSlice();
1570                 if(startLength == reader_.sliceBuilder.length ||
1571                    (flowLevel_ == 0 && reader_.column < indent))
1572                 {
1573                     break;
1574                 }
1575             }
1576 
1577             spacesTransaction.end();
1578             char[] slice = reader_.sliceBuilder.finish();
1579 
1580             return scalarToken(startMark, endMark, slice, ScalarStyle.plain);
1581         }
1582 
1583         /// Scan spaces in a plain scalar.
1584         ///
1585         /// Assumes that the caller is building a slice in Reader, and puts the spaces
1586         /// into that slice.
1587         void scanPlainSpacesToSlice() @safe
1588         {
1589             // The specification is really confusing about tabs in plain scalars.
1590             // We just forbid them completely. Do not use tabs in YAML!
1591 
1592             // Get as many plain spaces as there are.
1593             size_t length;
1594             while(reader_.peekByte(length) == ' ') { ++length; }
1595             char[] whitespaces = reader_.prefixBytes(length);
1596             reader_.forward(length);
1597 
1598             const dchar c = reader_.peek();
1599             if(!c.isNSChar)
1600             {
1601                 // We have spaces, but no newline.
1602                 if(whitespaces.length > 0) { reader_.sliceBuilder.write(whitespaces); }
1603                 return;
1604             }
1605 
1606             // Newline after the spaces (if any)
1607             const lineBreak = scanLineBreak();
1608             allowSimpleKey_ = true;
1609 
1610             static bool end(Reader reader_) @safe pure
1611             {
1612                 const prefix = reader_.prefix(3);
1613                 return ("---" == prefix || "..." == prefix)
1614                         && reader_.peek(3).among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');
1615             }
1616 
1617             if(end(reader_)) { return; }
1618 
1619             bool extraBreaks;
1620 
1621             alias Transaction = SliceBuilder.Transaction;
1622             auto transaction = Transaction(&reader_.sliceBuilder);
1623             if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }
1624             while(reader_.peek().isNSChar)
1625             {
1626                 if(reader_.peekByte() == ' ') { reader_.forward(); }
1627                 else
1628                 {
1629                     const lBreak = scanLineBreak();
1630                     extraBreaks  = true;
1631                     reader_.sliceBuilder.write(lBreak);
1632 
1633                     if(end(reader_)) { return; }
1634                 }
1635             }
1636             transaction.commit();
1637 
1638             // No line breaks, only a space.
1639             if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
1640         }
1641 
1642         /// Scan handle of a tag token.
1643         ///
1644         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1645         /// characters into that slice.
1646         void scanTagHandleToSlice(string name)(const Mark startMark)
1647         {
1648             dchar c = reader_.peek();
1649             enum contextMsg = "While scanning a " ~ name;
1650             enforce(c == '!',
1651                 new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark));
1652 
1653             uint length = 1;
1654             c = reader_.peek(length);
1655             if(c != ' ')
1656             {
1657                 while(c.isAlphaNum || c.among!('-', '_'))
1658                 {
1659                     ++length;
1660                     c = reader_.peek(length);
1661                 }
1662                 enforce(c == '!',
1663                     new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark));
1664                 ++length;
1665             }
1666 
1667             reader_.sliceBuilder.write(reader_.get(length));
1668         }
1669 
1670         /// Scan URI in a tag token.
1671         ///
1672         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1673         /// characters into that slice.
1674         void scanTagURIToSlice(string name)(const Mark startMark)
1675         {
1676             // Note: we do not check if URI is well-formed.
1677             dchar c = reader_.peek();
1678             const startLen = reader_.sliceBuilder.length;
1679             {
1680                 uint length;
1681                 while(c.isAlphaNum || c.isURIChar)
1682                 {
1683                     if(c == '%')
1684                     {
1685                         auto chars = reader_.get(length);
1686                         reader_.sliceBuilder.write(chars);
1687                         length = 0;
1688                         scanURIEscapesToSlice!name(startMark);
1689                     }
1690                     else { ++length; }
1691                     c = reader_.peek(length);
1692                 }
1693                 if(length > 0)
1694                 {
1695                     auto chars = reader_.get(length);
1696                     reader_.sliceBuilder.write(chars);
1697                     length = 0;
1698                 }
1699             }
1700             // OK if we scanned something, error otherwise.
1701             enum contextMsg = "While parsing a " ~ name;
1702             enforce(reader_.sliceBuilder.length > startLen,
1703                 new ScannerException(contextMsg, startMark, expected("URI", c), reader_.mark));
1704         }
1705 
1706         // Not @nogc yet because std.utf.decode is not @nogc
1707         /// Scan URI escape sequences.
1708         ///
1709         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1710         /// characters into that slice.
1711         void scanURIEscapesToSlice(string name)(const Mark startMark)
1712         {
1713             import core.exception : UnicodeException;
1714             // URI escapes encode a UTF-8 string. We store UTF-8 code units here for
1715             // decoding into UTF-32.
1716             Appender!string buffer;
1717 
1718 
1719             enum contextMsg = "While scanning a " ~ name;
1720             while(reader_.peekByte() == '%')
1721             {
1722                 reader_.forward();
1723                 char[2] nextByte = [reader_.peekByte(), reader_.peekByte(1)];
1724 
1725                 enforce(nextByte[0].isHexDigit && nextByte[1].isHexDigit,
1726                     new ScannerException(contextMsg, startMark,
1727                         expected("URI escape sequence of 2 hexadecimal " ~
1728                             "numbers", nextByte), reader_.mark));
1729 
1730                 buffer ~= nextByte[].to!ubyte(16);
1731 
1732                 reader_.forward(2);
1733             }
1734             try
1735             {
1736                 foreach (dchar chr; buffer.data)
1737                 {
1738                     reader_.sliceBuilder.write(chr);
1739                 }
1740             }
1741             catch (UnicodeException)
1742             {
1743                 throw new ScannerException(contextMsg, startMark,
1744                         "Invalid UTF-8 data encoded in URI escape sequence",
1745                         reader_.mark);
1746             }
1747         }
1748 
1749 
1750         /// Scan a line break, if any.
1751         ///
1752         /// Transforms:
1753         ///   '\r\n'      :   '\n'
1754         ///   '\r'        :   '\n'
1755         ///   '\n'        :   '\n'
1756         ///   '\u0085'    :   '\n'
1757         ///   '\u2028'    :   '\u2028'
1758         ///   '\u2029     :   '\u2029'
1759         ///   no break    :   '\0'
1760         dchar scanLineBreak() @safe
1761         {
1762             // Fast path for ASCII line breaks.
1763             const b = reader_.peekByte();
1764             if(b < 0x80)
1765             {
1766                 if(b == '\n' || b == '\r')
1767                 {
1768                     if(reader_.prefix(2) == "\r\n") { reader_.forward(2); }
1769                     else { reader_.forward(); }
1770                     return '\n';
1771                 }
1772                 return '\0';
1773             }
1774 
1775             const c = reader_.peek();
1776             if(c == '\x85')
1777             {
1778                 reader_.forward();
1779                 return '\n';
1780             }
1781             if(c == '\u2028' || c == '\u2029')
1782             {
1783                 reader_.forward();
1784                 return c;
1785             }
1786             return '\0';
1787         }
1788 }