1 
2 //          Copyright Ferdinand Majerech 2011-2014.
3 // Distributed under the Boost Software License, Version 1.0.
4 //    (See accompanying file LICENSE_1_0.txt or copy at
5 //          http://www.boost.org/LICENSE_1_0.txt)
6 
7 /// YAML scanner.
8 /// Code based on PyYAML: http://www.pyyaml.org
9 module dyaml.scanner;
10 
11 
12 import core.stdc.string;
13 
14 import std.algorithm;
15 import std.array;
16 import std.conv;
17 import std.ascii : isAlphaNum, isDigit, isHexDigit;
18 import std.exception;
19 import std.string;
20 import std.typecons;
21 import std.traits : Unqual;
22 import std.utf;
23 
24 import dyaml.escapes;
25 import dyaml.exception;
26 import dyaml.queue;
27 import dyaml.reader;
28 import dyaml.style;
29 import dyaml.token;
30 
31 package:
32 /// Scanner produces tokens of the following types:
33 /// STREAM-START
34 /// STREAM-END
35 /// DIRECTIVE(name, value)
36 /// DOCUMENT-START
37 /// DOCUMENT-END
38 /// BLOCK-SEQUENCE-START
39 /// BLOCK-MAPPING-START
40 /// BLOCK-END
41 /// FLOW-SEQUENCE-START
42 /// FLOW-MAPPING-START
43 /// FLOW-SEQUENCE-END
44 /// FLOW-MAPPING-END
45 /// BLOCK-ENTRY
46 /// FLOW-ENTRY
47 /// KEY
48 /// VALUE
49 /// ALIAS(value)
50 /// ANCHOR(value)
51 /// TAG(value)
52 /// SCALAR(value, plain, style)
53 
// Character-class predicates used throughout the scanner.
//
// Each `among!` instantiation is a callable returning the 1-based index of the
// matched candidate, or 0 on no match — i.e. usable directly as a boolean.

/// Line break or NUL (end of input).
alias isBreak = among!('\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Space, line break or NUL.
alias isBreakOrSpace = among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Space, tab, line break or NUL.
alias isWhiteSpace = among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Space or tab only (no line breaks).
alias isNonLinebreakWhitespace = among!(' ', '\t');

/// Characters that can never begin a plain scalar (see checkPlain for the
/// exceptions granted to '-', '?' and ':').
alias isNonScalarStartCharacter = among!('-', '?', ':', ',', '[', ']', '{', '}',
    '#', '&', '*', '!', '|', '>', '\'', '"', '%', '@', '`', ' ', '\t', '\0', '\n',
    '\r', '\u0085', '\u2028', '\u2029');

/// Punctuation permitted (besides alphanumerics) inside a tag URI.
alias isURIChar = among!('-', ';', '/', '?', ':', '@', '&', '=', '+', '$', ',',
    '_', '.', '!', '~', '*', '\'', '(', ')', '[', ']', '%');

/// Space or line break.
alias isNSChar = among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Line break only.
alias isBChar = among!('\n', '\r', '\u0085', '\u2028', '\u2029');

/// Characters that terminate or need special handling in a flow scalar run.
alias isFlowScalarBreakSpace = among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029', '\'', '"', '\\');

/// Valid anchor/alias name character: printable, non-whitespace, and not a
/// flow indicator or BOM.
alias isNSAnchorName = c => !c.isWhiteSpace && !c.among!('[', ']', '{', '}', ',', '\uFEFF');
76 
/// Marked exception thrown at scanner errors.
///
/// See_Also: MarkedYAMLException
class ScannerException : MarkedYAMLException
{
    // Inherits constructors taking a message plus one or two stream Marks.
    mixin MarkedExceptionCtors;
}
84 
85 /// Generates tokens from data provided by a Reader.
86 struct Scanner
87 {
88     private:
        /// A simple key is a key that is not denoted by the '?' indicator.
        /// For example:
        ///   ---
        ///   block simple key: value
        ///   ? not a simple key:
        ///   : { flow simple key: value }
        /// We emit the KEY token before all keys, so when we find a potential simple
        /// key, we try to locate the corresponding ':' indicator. Simple keys should be
        /// limited to a single line and 1024 characters.
        ///
        /// 16 bytes on 64-bit.
        static struct SimpleKey
        {
            /// Character index in reader where the key starts.
            uint charIndex = uint.max;
            /// Index of the key token from start (first token scanned being 0).
            uint tokenIndex;
            /// Line the key starts at.
            uint line;
            /// Column the key starts at (saturated to ushort.max when saving).
            ushort column;
            /// Is this required to be a simple key?
            bool required;
            /// Is this struct "null" (invalid)?.
            bool isNull;
        }
115 
        /// Block scalar chomping types (controls trailing-newline handling).
        enum Chomping
        {
            /// Strip all trailing line breaks. '-' indicator.
            strip,
            /// Line break of the last line is preserved, others discarded. Default.
            clip,
            /// All trailing line breaks are preserved. '+' indicator.
            keep
        }
126 
        /// Reader used to read from a file/stream.
        Reader reader_;
        /// Are we done scanning?
        bool done_;

        /// Level of nesting in flow context. If 0, we're in block context.
        uint flowLevel_;
        /// Current indentation level (column). -1 until a block is opened.
        int indent_ = -1;
        /// Past indentation levels. Used as a stack.
        Appender!(int[]) indents_;

        /// Processed tokens not yet emitted. Used as a queue.
        Queue!Token tokens_;

        /// Number of tokens already consumed by the caller (via popFront).
        uint tokensTaken_;

        /// Can a simple key start at the current position? A simple key may start:
        /// - at the beginning of the line, not counting indentation spaces
        ///       (in block context),
        /// - after '{', '[', ',' (in the flow context),
        /// - after '?', ':', '-' (in the block context).
        /// In the block context, this flag also signifies if a block collection
        /// may start at the current position.
        bool allowSimpleKey_ = true;

        /// Possible simple keys indexed by flow levels.
        SimpleKey[] possibleSimpleKeys_;
156 
157     public:
        /// Construct a Scanner using specified Reader.
        this(Reader reader) @safe nothrow
        {
            reader_   = reader;
            // Queue the STREAM-START token immediately so the first call to
            // front()/empty() has something to work from.
            fetchStreamStart();
        }
165 
166         /// Advance to the next token
167         void popFront() @safe
168         {
169             ++tokensTaken_;
170             tokens_.pop();
171         }
172 
173         /// Return the current token
174         const(Token) front() @safe
175         {
176             enforce(!empty, "No token left to peek");
177             return tokens_.peek();
178         }
179 
        /// Return whether there are any more tokens left.
        ///
        /// Note: this eagerly scans ahead (via fetchToken) until enough tokens
        /// are queued to answer definitively, so it may throw a
        /// ScannerException on malformed input.
        bool empty() @safe
        {
            while (needMoreTokens())
            {
                fetchToken();
            }
            return tokens_.empty;
        }
189 
        /// Set file name (used in error-message Marks).
        void name(string name) @safe pure nothrow @nogc
        {
            reader_.name = name;
        }
195 
196     private:
197         /// Most scanning error messages have the same format; so build them with this
198         /// function.
199         string expected(T)(string expected, T found)
200         {
201             return text("expected ", expected, ", but found ", found);
202         }
203 
        /// Determine whether or not we need to fetch more tokens before peeking/getting a token.
        bool needMoreTokens() @safe pure
        {
            if(done_)         { return false; }
            if(tokens_.empty) { return true; }

            /// The current token may be a potential simple key, so we need to look further.
            stalePossibleSimpleKeys();
            // Keep scanning while the next token to be handed out is still a
            // simple-key candidate (its ':' has not been located yet).
            return nextPossibleSimpleKey() == tokensTaken_;
        }
214 
        /// Fetch a token, adding it to tokens_.
        ///
        /// Dispatches on the next character in the stream to the appropriate
        /// fetch* method. Throws ScannerException if no token can start here.
        void fetchToken() @safe
        {
            // Eat whitespaces and comments until we reach the next token.
            scanToNextToken();

            // Remove obsolete possible simple keys.
            stalePossibleSimpleKeys();

            // Compare current indentation and column. It may add some tokens
            // and decrease the current indentation level.
            unwindIndent(reader_.column);

            // Get the next character.
            // Note: only a single byte is peeked here; any multi-byte
            // character falls through to the default/checkPlain branch.
            const dchar c = reader_.peekByte();

            // Fetch the token.
            if(c == '\0')            { return fetchStreamEnd();     }
            if(checkDirective())     { return fetchDirective();     }
            if(checkDocumentStart()) { return fetchDocumentStart(); }
            if(checkDocumentEnd())   { return fetchDocumentEnd();   }
            // Order of the following checks is NOT significant.
            switch(c)
            {
                case '[':  return fetchFlowSequenceStart();
                case '{':  return fetchFlowMappingStart();
                case ']':  return fetchFlowSequenceEnd();
                case '}':  return fetchFlowMappingEnd();
                case ',':  return fetchFlowEntry();
                case '!':  return fetchTag();
                case '\'': return fetchSingle();
                case '\"': return fetchDouble();
                case '*':  return fetchAlias();
                case '&':  return fetchAnchor();
                // '?', ':' and '-' are only indicators when followed by
                // whitespace (in block context); otherwise they may start a
                // plain scalar, handled by the default branch.
                case '?':  if(checkKey())        { return fetchKey();        } goto default;
                case ':':  if(checkValue())      { return fetchValue();      } goto default;
                case '-':  if(checkBlockEntry()) { return fetchBlockEntry(); } goto default;
                // Block scalars exist only in block context.
                case '|':  if(flowLevel_ == 0)   { return fetchLiteral();    } break;
                case '>':  if(flowLevel_ == 0)   { return fetchFolded();     } break;
                default:   if(checkPlain())      { return fetchPlain();      }
            }

            throw new ScannerException("While scanning for the next token, found character " ~
                                       "\'%s\', index %s that cannot start any token"
                                       .format(c, to!int(c)), reader_.mark);
        }
261 
262 
263         /// Return the token number of the nearest possible simple key.
264         uint nextPossibleSimpleKey() @safe pure nothrow @nogc
265         {
266             uint minTokenNumber = uint.max;
267             foreach(k, ref simpleKey; possibleSimpleKeys_)
268             {
269                 if(simpleKey.isNull) { continue; }
270                 minTokenNumber = min(minTokenNumber, simpleKey.tokenIndex);
271             }
272             return minTokenNumber;
273         }
274 
275         /// Remove entries that are no longer possible simple keys.
276         ///
277         /// According to the YAML specification, simple keys
278         /// - should be limited to a single line,
279         /// - should be no longer than 1024 characters.
280         /// Disabling this will allow simple keys of any length and
281         /// height (may cause problems if indentation is broken though).
282         void stalePossibleSimpleKeys() @safe pure
283         {
284             foreach(level, ref key; possibleSimpleKeys_)
285             {
286                 if(key.isNull) { continue; }
287                 if(key.line != reader_.line || reader_.charIndex - key.charIndex > 1024)
288                 {
289                     enforce(!key.required,
290                             new ScannerException("While scanning a simple key",
291                                                  Mark(reader_.name, key.line, key.column),
292                                                  "could not find expected ':'", reader_.mark));
293                     key.isNull = true;
294                 }
295             }
296         }
297 
        /// Check if the next token starts a possible simple key and if so, save its position.
        ///
        /// This function is called for ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
        void savePossibleSimpleKey() @safe pure
        {
            // Check if a simple key is required at the current position.
            // (Only in block context, when the token starts exactly at the
            // current indentation level.)
            const required = (flowLevel_ == 0 && indent_ == reader_.column);
            assert(allowSimpleKey_ || !required, "A simple key is required only if it is " ~
                   "the first token in the current line. Therefore it is always allowed.");

            if(!allowSimpleKey_) { return; }

            // The next token might be a simple key, so save its number and position.
            // Any previously saved candidate at this flow level is discarded first.
            removePossibleSimpleKey();
            const tokenCount = tokensTaken_ + cast(uint)tokens_.length;

            const line   = reader_.line;
            const column = reader_.column;
            // Column is stored in a ushort; clamp to ushort.max to avoid overflow.
            const key    = SimpleKey(cast(uint)reader_.charIndex, tokenCount, line,
                                     cast(ushort)min(column, ushort.max), required);

            // Grow the per-flow-level candidate array on demand.
            if(possibleSimpleKeys_.length <= flowLevel_)
            {
                const oldLength = possibleSimpleKeys_.length;
                possibleSimpleKeys_.length = flowLevel_ + 1;
                //No need to initialize the last element, it's already done in the next line.
                possibleSimpleKeys_[oldLength .. flowLevel_] = SimpleKey.init;
            }
            possibleSimpleKeys_[flowLevel_] = key;
        }
328 
329         /// Remove the saved possible key position at the current flow level.
330         void removePossibleSimpleKey() @safe pure
331         {
332             if(possibleSimpleKeys_.length <= flowLevel_) { return; }
333 
334             if(!possibleSimpleKeys_[flowLevel_].isNull)
335             {
336                 const key = possibleSimpleKeys_[flowLevel_];
337                 enforce(!key.required,
338                         new ScannerException("While scanning a simple key",
339                                              Mark(reader_.name, key.line, key.column),
340                                              "could not find expected ':'", reader_.mark));
341                 possibleSimpleKeys_[flowLevel_].isNull = true;
342             }
343         }
344 
345         /// Decrease indentation, removing entries in indents_.
346         ///
347         /// Params:  column = Current column in the file/stream.
348         void unwindIndent(const int column) @safe
349         {
350             if(flowLevel_ > 0)
351             {
352                 // In flow context, tokens should respect indentation.
353                 // The condition should be `indent >= column` according to the spec.
354                 // But this condition will prohibit intuitively correct
355                 // constructions such as
356                 // key : {
357                 // }
358 
359                 // In the flow context, indentation is ignored. We make the scanner less
360                 // restrictive than what the specification requires.
361                 // if(pedantic_ && flowLevel_ > 0 && indent_ > column)
362                 // {
363                 //     throw new ScannerException("Invalid intendation or unclosed '[' or '{'",
364                 //                                reader_.mark)
365                 // }
366                 return;
367             }
368 
369             // In block context, we may need to issue the BLOCK-END tokens.
370             while(indent_ > column)
371             {
372                 indent_ = indents_.data.back;
373                 assert(indents_.data.length);
374                 indents_.shrinkTo(indents_.data.length - 1);
375                 tokens_.push(blockEndToken(reader_.mark, reader_.mark));
376             }
377         }
378 
379         /// Increase indentation if needed.
380         ///
381         /// Params:  column = Current column in the file/stream.
382         ///
383         /// Returns: true if the indentation was increased, false otherwise.
384         bool addIndent(int column) @safe
385         {
386             if(indent_ >= column){return false;}
387             indents_ ~= indent_;
388             indent_ = column;
389             return true;
390         }
391 
392 
393         /// Add STREAM-START token.
394         void fetchStreamStart() @safe nothrow
395         {
396             tokens_.push(streamStartToken(reader_.mark, reader_.mark, reader_.encoding));
397         }
398 
399         ///Add STREAM-END token.
400         void fetchStreamEnd() @safe
401         {
402             //Set intendation to -1 .
403             unwindIndent(-1);
404             removePossibleSimpleKey();
405             allowSimpleKey_ = false;
406             possibleSimpleKeys_.destroy;
407 
408             tokens_.push(streamEndToken(reader_.mark, reader_.mark));
409             done_ = true;
410         }
411 
412         /// Add DIRECTIVE token.
413         void fetchDirective() @safe
414         {
415             // Set intendation to -1 .
416             unwindIndent(-1);
417             // Reset simple keys.
418             removePossibleSimpleKey();
419             allowSimpleKey_ = false;
420 
421             auto directive = scanDirective();
422             tokens_.push(directive);
423         }
424 
425         /// Add DOCUMENT-START or DOCUMENT-END token.
426         void fetchDocumentIndicator(TokenID id)()
427             if(id == TokenID.documentStart || id == TokenID.documentEnd)
428         {
429             // Set indentation to -1 .
430             unwindIndent(-1);
431             // Reset simple keys. Note that there can't be a block collection after '---'.
432             removePossibleSimpleKey();
433             allowSimpleKey_ = false;
434 
435             Mark startMark = reader_.mark;
436             reader_.forward(3);
437             tokens_.push(simpleToken!id(startMark, reader_.mark));
438         }
439 
440         /// Aliases to add DOCUMENT-START or DOCUMENT-END token.
441         alias fetchDocumentStart = fetchDocumentIndicator!(TokenID.documentStart);
442         alias fetchDocumentEnd = fetchDocumentIndicator!(TokenID.documentEnd);
443 
444         /// Add FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
445         void fetchFlowCollectionStart(TokenID id)() @safe
446         {
447             // '[' and '{' may start a simple key.
448             savePossibleSimpleKey();
449             // Simple keys are allowed after '[' and '{'.
450             allowSimpleKey_ = true;
451             ++flowLevel_;
452 
453             Mark startMark = reader_.mark;
454             reader_.forward();
455             tokens_.push(simpleToken!id(startMark, reader_.mark));
456         }
457 
458         /// Aliases to add FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
459         alias fetchFlowSequenceStart = fetchFlowCollectionStart!(TokenID.flowSequenceStart);
460         alias fetchFlowMappingStart = fetchFlowCollectionStart!(TokenID.flowMappingStart);
461 
        /// Add FLOW-SEQUENCE-END or FLOW-MAPPING-END token.
        void fetchFlowCollectionEnd(TokenID id)()
        {
            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // No simple keys after ']' and '}'.
            allowSimpleKey_ = false;
            --flowLevel_;

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(simpleToken!id(startMark, reader_.mark));
        }

        /// Aliases to add FLOW-SEQUENCE-END or FLOW-MAPPING-END token.
        alias fetchFlowSequenceEnd = fetchFlowCollectionEnd!(TokenID.flowSequenceEnd);
        alias fetchFlowMappingEnd = fetchFlowCollectionEnd!(TokenID.flowMappingEnd);
479 
480         /// Add FLOW-ENTRY token;
481         void fetchFlowEntry() @safe
482         {
483             // Reset possible simple key on the current level.
484             removePossibleSimpleKey();
485             // Simple keys are allowed after ','.
486             allowSimpleKey_ = true;
487 
488             Mark startMark = reader_.mark;
489             reader_.forward();
490             tokens_.push(flowEntryToken(startMark, reader_.mark));
491         }
492 
        /// Additional checks used in block context in fetchBlockEntry and fetchKey.
        ///
        /// Verifies a key may start here and, if this position opens a new
        /// indentation level, pushes the corresponding block-start token.
        ///
        /// Params:  type = String representing the token type we might need to add.
        ///          id   = Token type we might need to add.
        void blockChecks(string type, TokenID id)()
        {
            enum context = type ~ " keys are not allowed here";
            // Are we allowed to start a key (not necessarily a simple one)?
            enforce(allowSimpleKey_, new ScannerException(context, reader_.mark));

            if(addIndent(reader_.column))
            {
                tokens_.push(simpleToken!id(reader_.mark, reader_.mark));
            }
        }
508 
509         /// Add BLOCK-ENTRY token. Might add BLOCK-SEQUENCE-START in the process.
510         void fetchBlockEntry() @safe
511         {
512             if(flowLevel_ == 0) { blockChecks!("Sequence", TokenID.blockSequenceStart)(); }
513 
514             // It's an error for the block entry to occur in the flow context,
515             // but we let the parser detect this.
516 
517             // Reset possible simple key on the current level.
518             removePossibleSimpleKey();
519             // Simple keys are allowed after '-'.
520             allowSimpleKey_ = true;
521 
522             Mark startMark = reader_.mark;
523             reader_.forward();
524             tokens_.push(blockEntryToken(startMark, reader_.mark));
525         }
526 
527         /// Add KEY token. Might add BLOCK-MAPPING-START in the process.
528         void fetchKey() @safe
529         {
530             if(flowLevel_ == 0) { blockChecks!("Mapping", TokenID.blockMappingStart)(); }
531 
532             // Reset possible simple key on the current level.
533             removePossibleSimpleKey();
534             // Simple keys are allowed after '?' in the block context.
535             allowSimpleKey_ = (flowLevel_ == 0);
536 
537             Mark startMark = reader_.mark;
538             reader_.forward();
539             tokens_.push(keyToken(startMark, reader_.mark));
540         }
541 
        /// Add VALUE token. Might add KEY and/or BLOCK-MAPPING-START in the process.
        void fetchValue() @safe
        {
            //Do we determine a simple key?
            if(possibleSimpleKeys_.length > flowLevel_ &&
               !possibleSimpleKeys_[flowLevel_].isNull)
            {
                const key = possibleSimpleKeys_[flowLevel_];
                possibleSimpleKeys_[flowLevel_].isNull = true;
                Mark keyMark = Mark(reader_.name, key.line, key.column);
                // Position of the saved key relative to the tokens still queued.
                const idx = key.tokenIndex - tokensTaken_;

                // NOTE(review): idx is unsigned, so this assert can never fire;
                // an underflow above would wrap instead of going negative.
                assert(idx >= 0);

                // Add KEY.
                // Manually inserting since tokens are immutable (need linked list).
                tokens_.insert(keyToken(keyMark, keyMark), idx);

                // If this key starts a new block mapping, we need to add BLOCK-MAPPING-START.
                // (Inserted at the same index, i.e. just before the KEY token.)
                if(flowLevel_ == 0 && addIndent(key.column))
                {
                    tokens_.insert(blockMappingStartToken(keyMark, keyMark), idx);
                }

                // There cannot be two simple keys in a row.
                allowSimpleKey_ = false;
            }
            // Part of a complex key
            else
            {
                // We can start a complex value if and only if we can start a simple key.
                enforce(flowLevel_ > 0 || allowSimpleKey_,
                        new ScannerException("Mapping values are not allowed here", reader_.mark));

                // If this value starts a new block mapping, we need to add
                // BLOCK-MAPPING-START. It'll be detected as an error later by the parser.
                if(flowLevel_ == 0 && addIndent(reader_.column))
                {
                    tokens_.push(blockMappingStartToken(reader_.mark, reader_.mark));
                }

                // Reset possible simple key on the current level.
                removePossibleSimpleKey();
                // Simple keys are allowed after ':' in the block context.
                allowSimpleKey_ = (flowLevel_ == 0);
            }

            // Add VALUE.
            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(valueToken(startMark, reader_.mark));
        }
594 
595         /// Add ALIAS or ANCHOR token.
596         void fetchAnchor_(TokenID id)() @safe
597             if(id == TokenID.alias_ || id == TokenID.anchor)
598         {
599             // ALIAS/ANCHOR could be a simple key.
600             savePossibleSimpleKey();
601             // No simple keys after ALIAS/ANCHOR.
602             allowSimpleKey_ = false;
603 
604             auto anchor = scanAnchor(id);
605             tokens_.push(anchor);
606         }
607 
608         /// Aliases to add ALIAS or ANCHOR token.
609         alias fetchAlias = fetchAnchor_!(TokenID.alias_);
610         alias fetchAnchor = fetchAnchor_!(TokenID.anchor);
611 
612         /// Add TAG token.
613         void fetchTag() @safe
614         {
615             //TAG could start a simple key.
616             savePossibleSimpleKey();
617             //No simple keys after TAG.
618             allowSimpleKey_ = false;
619 
620             tokens_.push(scanTag());
621         }
622 
623         /// Add block SCALAR token.
624         void fetchBlockScalar(ScalarStyle style)() @safe
625             if(style == ScalarStyle.literal || style == ScalarStyle.folded)
626         {
627             // Reset possible simple key on the current level.
628             removePossibleSimpleKey();
629             // A simple key may follow a block scalar.
630             allowSimpleKey_ = true;
631 
632             auto blockScalar = scanBlockScalar(style);
633             tokens_.push(blockScalar);
634         }
635 
636         /// Aliases to add literal or folded block scalar.
637         alias fetchLiteral = fetchBlockScalar!(ScalarStyle.literal);
638         alias fetchFolded = fetchBlockScalar!(ScalarStyle.folded);
639 
640         /// Add quoted flow SCALAR token.
641         void fetchFlowScalar(ScalarStyle quotes)()
642         {
643             // A flow scalar could be a simple key.
644             savePossibleSimpleKey();
645             // No simple keys after flow scalars.
646             allowSimpleKey_ = false;
647 
648             // Scan and add SCALAR.
649             auto scalar = scanFlowScalar(quotes);
650             tokens_.push(scalar);
651         }
652 
653         /// Aliases to add single or double quoted block scalar.
654         alias fetchSingle = fetchFlowScalar!(ScalarStyle.singleQuoted);
655         alias fetchDouble = fetchFlowScalar!(ScalarStyle.doubleQuoted);
656 
657         /// Add plain SCALAR token.
658         void fetchPlain() @safe
659         {
660             // A plain scalar could be a simple key
661             savePossibleSimpleKey();
662             // No simple keys after plain scalars. But note that scanPlain() will
663             // change this flag if the scan is finished at the beginning of the line.
664             allowSimpleKey_ = false;
665             auto plain = scanPlain();
666 
667             // Scan and add SCALAR. May change allowSimpleKey_
668             tokens_.push(plain);
669         }
670 
671     pure:
672 
673         ///Check if the next token is DIRECTIVE:        ^ '%' ...
674         bool checkDirective() @safe
675         {
676             return reader_.peekByte() == '%' && reader_.column == 0;
677         }
678 
679         /// Check if the next token is DOCUMENT-START:   ^ '---' (' '|'\n')
680         bool checkDocumentStart() @safe
681         {
682             // Check one char first, then all 3, to prevent reading outside the buffer.
683             return reader_.column     == 0     &&
684                    reader_.peekByte() == '-'   &&
685                    reader_.prefix(3)  == "---" &&
686                    reader_.peek(3).isWhiteSpace;
687         }
688 
689         /// Check if the next token is DOCUMENT-END:     ^ '...' (' '|'\n')
690         bool checkDocumentEnd() @safe
691         {
692             // Check one char first, then all 3, to prevent reading outside the buffer.
693             return reader_.column     == 0     &&
694                    reader_.peekByte() == '.'   &&
695                    reader_.prefix(3)  == "..." &&
696                    reader_.peek(3).isWhiteSpace;
697         }
698 
699         /// Check if the next token is BLOCK-ENTRY:      '-' (' '|'\n')
700         bool checkBlockEntry() @safe
701         {
702             return !!reader_.peek(1).isWhiteSpace;
703         }
704 
705         /// Check if the next token is KEY(flow context):    '?'
706         ///
707         /// or KEY(block context):   '?' (' '|'\n')
708         bool checkKey() @safe
709         {
710             return (flowLevel_ > 0 || reader_.peek(1).isWhiteSpace);
711         }
712 
713         /// Check if the next token is VALUE(flow context):  ':'
714         ///
715         /// or VALUE(block context): ':' (' '|'\n')
716         bool checkValue() @safe
717         {
718             return flowLevel_ > 0 || reader_.peek(1).isWhiteSpace;
719         }
720 
721         /// Check if the next token is a plain scalar.
722         ///
723         /// A plain scalar may start with any non-space character except:
724         ///   '-', '?', ':', ',', '[', ']', '{', '}',
725         ///   '#', '&', '*', '!', '|', '>', '\'', '\"',
726         ///   '%', '@', '`'.
727         ///
728         /// It may also start with
729         ///   '-', '?', ':'
730         /// if it is followed by a non-space character.
731         ///
732         /// Note that we limit the last rule to the block context (except the
733         /// '-' character) because we want the flow context to be space
734         /// independent.
735         bool checkPlain() @safe
736         {
737             const c = reader_.peek();
738             if(!c.isNonScalarStartCharacter)
739             {
740                 return true;
741             }
742             return !reader_.peek(1).isWhiteSpace &&
743                    (c == '-' || (flowLevel_ == 0 && (c == '?' || c == ':')));
744         }
745 
746         /// Move to the next non-space character.
747         void findNextNonSpace() @safe
748         {
749             while(reader_.peekByte() == ' ') { reader_.forward(); }
750         }
751 
752         /// Scan a string of alphanumeric or "-_" characters.
753         ///
754         /// Assumes that the caller is building a slice in Reader, and puts the scanned
755         /// characters into that slice.
756         void scanAlphaNumericToSlice(string name)(const Mark startMark)
757         {
758             size_t length;
759             dchar c = reader_.peek();
760             while(c.isAlphaNum || c.among!('-', '_')) { c = reader_.peek(++length); }
761 
762             enforce(length > 0, new ScannerException("While scanning " ~ name,
763                 startMark, expected("alphanumeric, '-' or '_'", c), reader_.mark));
764 
765             reader_.sliceBuilder.write(reader_.get(length));
766         }
767 
768         /// Scan a string.
769         ///
770         /// Assumes that the caller is building a slice in Reader, and puts the scanned
771         /// characters into that slice.
772         void scanAnchorAliasToSlice(const Mark startMark) @safe
773         {
774             size_t length;
775             dchar c = reader_.peek();
776             while (c.isNSAnchorName)
777             {
778                 c = reader_.peek(++length);
779             }
780 
781             enforce(length > 0, new ScannerException("While scanning an anchor or alias",
782                 startMark, expected("a printable character besides '[', ']', '{', '}' and ','", c), reader_.mark));
783 
784             reader_.sliceBuilder.write(reader_.get(length));
785         }
786 
787         /// Scan and throw away all characters until next line break.
788         void scanToNextBreak() @safe
789         {
790             while(!reader_.peek().isBreak) { reader_.forward(); }
791         }
792 
793         /// Scan all characters until next line break.
794         ///
795         /// Assumes that the caller is building a slice in Reader, and puts the scanned
796         /// characters into that slice.
797         void scanToNextBreakToSlice() @safe
798         {
799             uint length;
800             while(!reader_.peek(length).isBreak)
801             {
802                 ++length;
803             }
804             reader_.sliceBuilder.write(reader_.get(length));
805         }
806 
807 
808         /// Move to next token in the file/stream.
809         ///
810         /// We ignore spaces, line breaks and comments.
811         /// If we find a line break in the block context, we set
812         /// allowSimpleKey` on.
813         ///
814         /// We do not yet support BOM inside the stream as the
815         /// specification requires. Any such mark will be considered as a part
816         /// of the document.
817         void scanToNextToken() @safe
818         {
819             // TODO(PyYAML): We need to make tab handling rules more sane. A good rule is:
820             //   Tabs cannot precede tokens
821             //   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
822             //   KEY(block), VALUE(block), BLOCK-ENTRY
823             // So the checking code is
824             //   if <TAB>:
825             //       allowSimpleKey_ = false
826             // We also need to add the check for `allowSimpleKey_ == true` to
827             // `unwindIndent` before issuing BLOCK-END.
828             // Scanners for block, flow, and plain scalars need to be modified.
829 
830             for(;;)
831             {
832                 //All whitespace in flow context is ignored, even whitespace
833                 // not allowed in other contexts
834                 if (flowLevel_ > 0)
835                 {
836                     while(reader_.peekByte().isNonLinebreakWhitespace) { reader_.forward(); }
837                 }
838                 else
839                 {
840                     findNextNonSpace();
841                 }
842                 if(reader_.peekByte() == '#') { scanToNextBreak(); }
843                 if(scanLineBreak() != '\0')
844                 {
845                     if(flowLevel_ == 0) { allowSimpleKey_ = true; }
846                 }
847                 else
848                 {
849                     break;
850                 }
851             }
852         }
853 
854         /// Scan directive token.
855         Token scanDirective() @safe
856         {
857             Mark startMark = reader_.mark;
858             // Skip the '%'.
859             reader_.forward();
860 
861             // Scan directive name
862             reader_.sliceBuilder.begin();
863             scanDirectiveNameToSlice(startMark);
864             const name = reader_.sliceBuilder.finish();
865 
866             reader_.sliceBuilder.begin();
867 
868             // Index where tag handle ends and suffix starts in a tag directive value.
869             uint tagHandleEnd = uint.max;
870             if(name == "YAML")     { scanYAMLDirectiveValueToSlice(startMark); }
871             else if(name == "TAG") { tagHandleEnd = scanTagDirectiveValueToSlice(startMark); }
872             char[] value = reader_.sliceBuilder.finish();
873 
874             Mark endMark = reader_.mark;
875 
876             DirectiveType directive;
877             if(name == "YAML")     { directive = DirectiveType.yaml; }
878             else if(name == "TAG") { directive = DirectiveType.tag; }
879             else
880             {
881                 directive = DirectiveType.reserved;
882                 scanToNextBreak();
883             }
884 
885             scanDirectiveIgnoredLine(startMark);
886 
887             return directiveToken(startMark, endMark, value, directive, tagHandleEnd);
888         }
889 
890         /// Scan name of a directive token.
891         ///
892         /// Assumes that the caller is building a slice in Reader, and puts the scanned
893         /// characters into that slice.
894         void scanDirectiveNameToSlice(const Mark startMark) @safe
895         {
896             // Scan directive name.
897             scanAlphaNumericToSlice!"a directive"(startMark);
898 
899             enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
900                 new ScannerException("While scanning a directive", startMark,
901                     expected("alphanumeric, '-' or '_'", reader_.peek()), reader_.mark));
902         }
903 
904         /// Scan value of a YAML directive token. Returns major, minor version separated by '.'.
905         ///
906         /// Assumes that the caller is building a slice in Reader, and puts the scanned
907         /// characters into that slice.
908         void scanYAMLDirectiveValueToSlice(const Mark startMark) @safe
909         {
910             findNextNonSpace();
911 
912             scanYAMLDirectiveNumberToSlice(startMark);
913 
914             enforce(reader_.peekByte() == '.',
915                 new ScannerException("While scanning a directive", startMark,
916                     expected("digit or '.'", reader_.peek()), reader_.mark));
917             // Skip the '.'.
918             reader_.forward();
919 
920             reader_.sliceBuilder.write('.');
921             scanYAMLDirectiveNumberToSlice(startMark);
922 
923             enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
924                 new ScannerException("While scanning a directive", startMark,
925                     expected("digit or '.'", reader_.peek()), reader_.mark));
926         }
927 
928         /// Scan a number from a YAML directive.
929         ///
930         /// Assumes that the caller is building a slice in Reader, and puts the scanned
931         /// characters into that slice.
932         void scanYAMLDirectiveNumberToSlice(const Mark startMark) @safe
933         {
934             enforce(isDigit(reader_.peek()),
935                 new ScannerException("While scanning a directive", startMark,
936                     expected("digit", reader_.peek()), reader_.mark));
937 
938             // Already found the first digit in the enforce(), so set length to 1.
939             uint length = 1;
940             while(reader_.peek(length).isDigit) { ++length; }
941 
942             reader_.sliceBuilder.write(reader_.get(length));
943         }
944 
945         /// Scan value of a tag directive.
946         ///
947         /// Assumes that the caller is building a slice in Reader, and puts the scanned
948         /// characters into that slice.
949         ///
950         /// Returns: Length of tag handle (which is before tag prefix) in scanned data
951         uint scanTagDirectiveValueToSlice(const Mark startMark) @safe
952         {
953             findNextNonSpace();
954             const startLength = reader_.sliceBuilder.length;
955             scanTagDirectiveHandleToSlice(startMark);
956             const handleLength = cast(uint)(reader_.sliceBuilder.length  - startLength);
957             findNextNonSpace();
958             scanTagDirectivePrefixToSlice(startMark);
959 
960             return handleLength;
961         }
962 
963         /// Scan handle of a tag directive.
964         ///
965         /// Assumes that the caller is building a slice in Reader, and puts the scanned
966         /// characters into that slice.
967         void scanTagDirectiveHandleToSlice(const Mark startMark) @safe
968         {
969             scanTagHandleToSlice!"directive"(startMark);
970             enforce(reader_.peekByte() == ' ',
971                 new ScannerException("While scanning a directive handle", startMark,
972                     expected("' '", reader_.peek()), reader_.mark));
973         }
974 
975         /// Scan prefix of a tag directive.
976         ///
977         /// Assumes that the caller is building a slice in Reader, and puts the scanned
978         /// characters into that slice.
979         void scanTagDirectivePrefixToSlice(const Mark startMark) @safe
980         {
981             scanTagURIToSlice!"directive"(startMark);
982             enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
983                 new ScannerException("While scanning a directive prefix", startMark,
984                     expected("' '", reader_.peek()), reader_.mark));
985         }
986 
987         /// Scan (and ignore) ignored line after a directive.
988         void scanDirectiveIgnoredLine(const Mark startMark) @safe
989         {
990             findNextNonSpace();
991             if(reader_.peekByte() == '#') { scanToNextBreak(); }
992             enforce(reader_.peek().isBreak,
993                 new ScannerException("While scanning a directive", startMark,
994                       expected("comment or a line break", reader_.peek()), reader_.mark));
995             scanLineBreak();
996         }
997 
        /// Scan an alias or an anchor.
        ///
        /// The specification does not restrict characters for anchors and
        /// aliases. This may lead to problems, for instance, the document:
        ///   [ *alias, value ]
        /// can be interpreted in two ways, as
        ///   [ "value" ]
        /// and
        ///   [ *alias , "value" ]
        /// Therefore we restrict aliases to ASCII alphanumeric characters.
        ///
        /// Params: id = TokenID.alias_ or TokenID.anchor; determines which token
        ///              type is produced. Any other ID is a caller bug.
        Token scanAnchor(const TokenID id) @safe
        {
            const startMark = reader_.mark;
            reader_.forward(); // The */& character was only peeked, so we drop it now

            reader_.sliceBuilder.begin();
            scanAnchorAliasToSlice(startMark);
            // On error, value is discarded as we return immediately
            char[] value = reader_.sliceBuilder.finish();

            assert(!reader_.peek().isNSAnchorName, "Anchor/alias name not fully scanned");

            if(id == TokenID.alias_)
            {
                return aliasToken(startMark, reader_.mark, value);
            }
            if(id == TokenID.anchor)
            {
                return anchorToken(startMark, reader_.mark, value);
            }
            assert(false, "This code should never be reached");
        }
1031 
        /// Scan a tag token.
        ///
        /// Handles three tag forms:
        /// - verbatim:     '!<' URI '>'
        /// - with handle:  '!handle!suffix' or '!suffix'
        /// - non-specific: a lone '!' followed by whitespace
        Token scanTag() @safe
        {
            const startMark = reader_.mark;
            // Character right after the leading '!'.
            dchar c = reader_.peek(1);

            reader_.sliceBuilder.begin();
            scope(failure) { reader_.sliceBuilder.finish(); }
            // Index where tag handle ends and tag suffix starts in the tag value
            // (slice) we will produce.
            uint handleEnd;

            if(c == '<')
            {
                // Verbatim tag: everything up to '>' is the suffix; no handle.
                reader_.forward(2);

                handleEnd = 0;
                scanTagURIToSlice!"tag"(startMark);
                enforce(reader_.peekByte() == '>',
                    new ScannerException("While scanning a tag", startMark,
                        expected("'>'", reader_.peek()), reader_.mark));
                reader_.forward();
            }
            else if(c.isWhiteSpace)
            {
                // A lone '!' is the non-specific tag.
                reader_.forward();
                handleEnd = 0;
                reader_.sliceBuilder.write('!');
            }
            else
            {
                // Look ahead for a second '!' (before any break/space) to decide
                // between '!handle!suffix' and plain '!suffix'.
                uint length = 1;
                bool useHandle;

                while(!c.isBreakOrSpace)
                {
                    if(c == '!')
                    {
                        useHandle = true;
                        break;
                    }
                    ++length;
                    c = reader_.peek(length);
                }

                if(useHandle)
                {
                    // Named handle: scan '!handle!' into the slice.
                    scanTagHandleToSlice!"tag"(startMark);
                    handleEnd = cast(uint)reader_.sliceBuilder.length;
                }
                else
                {
                    // No second '!': the handle is the primary handle '!'.
                    reader_.forward();
                    reader_.sliceBuilder.write('!');
                    handleEnd = cast(uint)reader_.sliceBuilder.length;
                }

                // The suffix (a URI) follows the handle.
                scanTagURIToSlice!"tag"(startMark);
            }

            // A tag must be followed by whitespace or a line break.
            enforce(reader_.peek().isBreakOrSpace,
                new ScannerException("While scanning a tag", startMark, expected("' '", reader_.peek()),
                    reader_.mark));

            char[] slice = reader_.sliceBuilder.finish();
            return tagToken(startMark, reader_.mark, slice, handleEnd);
        }
1099 
        /// Scan a block scalar token with specified style (literal or folded).
        ///
        /// Params: style = ScalarStyle of the scalar being scanned ('|' or '>').
        Token scanBlockScalar(const ScalarStyle style) @safe
        {
            const startMark = reader_.mark;

            // Scan the header (skip the '|' or '>' indicator character).
            reader_.forward();

            // Chomping ('+'/'-') and indentation increment (1-9) indicators.
            const indicators = scanBlockScalarIndicators(startMark);

            const chomping   = indicators[0];
            const increment  = indicators[1];
            scanBlockScalarIgnoredLine(startMark);

            // Determine the indentation level and go to the first non-empty line.
            Mark endMark;
            uint indent = max(1, indent_ + 1);

            reader_.sliceBuilder.begin();
            alias Transaction = SliceBuilder.Transaction;
            // Used to strip the last line breaks written to the slice at the end of the
            // scalar, which may be needed based on chomping.
            Transaction breaksTransaction = Transaction(&reader_.sliceBuilder);
            // Read the first indentation/line breaks before the scalar.
            size_t startLen = reader_.sliceBuilder.length;
            if(increment == int.min)
            {
                // No explicit indentation indicator: detect indent from content.
                auto indentation = scanBlockScalarIndentationToSlice();
                endMark = indentation[1];
                indent  = max(indent, indentation[0]);
            }
            else
            {
                // Explicit indicator: indent is relative to the parent block.
                indent += increment - 1;
                endMark = scanBlockScalarBreaksToSlice(indent);
            }

            // int.max means there's no line break (int.max is outside UTF-32).
            dchar lineBreak = cast(dchar)int.max;

            // Scan the inner part of the block scalar.
            while(reader_.column == indent && reader_.peekByte() != '\0')
            {
                breaksTransaction.commit();
                const bool leadingNonSpace = !reader_.peekByte().among!(' ', '\t');
                // This is where the 'interesting' non-whitespace data gets read.
                scanToNextBreakToSlice();
                lineBreak = scanLineBreak();


                // This transaction serves to rollback data read in the
                // scanBlockScalarBreaksToSlice() call.
                breaksTransaction = Transaction(&reader_.sliceBuilder);
                startLen = reader_.sliceBuilder.length;
                // The line breaks should actually be written _after_ the if() block
                // below. We work around that by inserting
                endMark = scanBlockScalarBreaksToSlice(indent);

                // This will not run during the last iteration (see the if() vs the
                // while()), hence breaksTransaction rollback (which happens after this
                // loop) will never roll back data written in this if() block.
                if(reader_.column == indent && reader_.peekByte() != '\0')
                {
                    // Unfortunately, folding rules are ambiguous.

                    // This is the folding according to the specification:
                    if(style == ScalarStyle.folded && lineBreak == '\n' &&
                       leadingNonSpace && !reader_.peekByte().among!(' ', '\t'))
                    {
                        // No breaks were scanned; no need to insert the space in the
                        // middle of slice.
                        if(startLen == reader_.sliceBuilder.length)
                        {
                            reader_.sliceBuilder.write(' ');
                        }
                    }
                    else
                    {
                        // We need to insert in the middle of the slice in case any line
                        // breaks were scanned.
                        reader_.sliceBuilder.insert(lineBreak, startLen);
                    }

                    ////this is Clark Evans's interpretation (also in the spec
                    ////examples):
                    //
                    //if(style == ScalarStyle.folded && lineBreak == '\n')
                    //{
                    //    if(startLen == endLen)
                    //    {
                    //        if(!" \t"d.canFind(reader_.peekByte()))
                    //        {
                    //            reader_.sliceBuilder.write(' ');
                    //        }
                    //        else
                    //        {
                    //            chunks ~= lineBreak;
                    //        }
                    //    }
                    //}
                    //else
                    //{
                    //    reader_.sliceBuilder.insertBack(lineBreak, endLen - startLen);
                    //}
                }
                else
                {
                    break;
                }
            }

            // If chomping is Keep, we keep (commit) the last scanned line breaks
            // (which are at the end of the scalar). Otherwise we remove them (end the
            // transaction).
            if(chomping == Chomping.keep)  { breaksTransaction.commit(); }
            else                           { breaksTransaction.end(); }
            if(chomping != Chomping.strip && lineBreak != int.max)
            {
                // If chomping is Keep, we keep the line break but the first line break
                // that isn't stripped (since chomping isn't Strip in this branch) must
                // be inserted _before_ the other line breaks.
                if(chomping == Chomping.keep)
                {
                    reader_.sliceBuilder.insert(lineBreak, startLen);
                }
                // If chomping is not Keep, breaksTransaction was cancelled so we can
                // directly write the first line break (as it isn't stripped - chomping
                // is not Strip)
                else
                {
                    reader_.sliceBuilder.write(lineBreak);
                }
            }

            char[] slice = reader_.sliceBuilder.finish();
            return scalarToken(startMark, endMark, slice, style);
        }
1237 
1238         /// Scan chomping and indentation indicators of a scalar token.
1239         Tuple!(Chomping, int) scanBlockScalarIndicators(const Mark startMark) @safe
1240         {
1241             auto chomping = Chomping.clip;
1242             int increment = int.min;
1243             dchar c       = reader_.peek();
1244 
1245             /// Indicators can be in any order.
1246             if(getChomping(c, chomping))
1247             {
1248                 getIncrement(c, increment, startMark);
1249             }
1250             else
1251             {
1252                 const gotIncrement = getIncrement(c, increment, startMark);
1253                 if(gotIncrement) { getChomping(c, chomping); }
1254             }
1255 
1256             enforce(c.among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
1257                 new ScannerException("While scanning a block scalar", startMark,
1258                 expected("chomping or indentation indicator", c), reader_.mark));
1259 
1260             return tuple(chomping, increment);
1261         }
1262 
1263         /// Get chomping indicator, if detected. Return false otherwise.
1264         ///
1265         /// Used in scanBlockScalarIndicators.
1266         ///
1267         /// Params:
1268         ///
1269         /// c        = The character that may be a chomping indicator.
1270         /// chomping = Write the chomping value here, if detected.
1271         bool getChomping(ref dchar c, ref Chomping chomping) @safe
1272         {
1273             if(!c.among!('+', '-')) { return false; }
1274             chomping = c == '+' ? Chomping.keep : Chomping.strip;
1275             reader_.forward();
1276             c = reader_.peek();
1277             return true;
1278         }
1279 
        /// Get increment indicator, if detected. Return false otherwise.
        ///
        /// Used in scanBlockScalarIndicators.
        ///
        /// Params:
        ///
        /// c         = The character that may be an increment indicator.
        ///             If an increment indicator is detected, this will be updated to
        ///             the next character in the Reader.
        /// increment = Write the increment value here, if detected.
        /// startMark = Mark for error messages.
        ///
        /// Throws: ScannerException if the indicator digit is '0' (valid range is 1-9).
        bool getIncrement(ref dchar c, ref int increment, const Mark startMark) @safe
        {
            if(!c.isDigit) { return false; }
            // Convert a digit to integer.
            increment = c - '0';
            assert(increment < 10 && increment >= 0, "Digit has invalid value");

            // '0' is not a valid indentation indicator.
            enforce(increment > 0,
                new ScannerException("While scanning a block scalar", startMark,
                    expected("indentation indicator in range 1-9", "0"), reader_.mark));

            reader_.forward();
            c = reader_.peek();
            return true;
        }
1306 
1307         /// Scan (and ignore) ignored line in a block scalar.
1308         void scanBlockScalarIgnoredLine(const Mark startMark) @safe
1309         {
1310             findNextNonSpace();
1311             if(reader_.peekByte()== '#') { scanToNextBreak(); }
1312 
1313             enforce(reader_.peek().isBreak,
1314                 new ScannerException("While scanning a block scalar", startMark,
1315                     expected("comment or line break", reader_.peek()), reader_.mark));
1316 
1317             scanLineBreak();
1318         }
1319 
1320         /// Scan indentation in a block scalar, returning line breaks, max indent and end mark.
1321         ///
1322         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1323         /// characters into that slice.
1324         Tuple!(uint, Mark) scanBlockScalarIndentationToSlice() @safe
1325         {
1326             uint maxIndent;
1327             Mark endMark = reader_.mark;
1328 
1329             while(reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029'))
1330             {
1331                 if(reader_.peekByte() != ' ')
1332                 {
1333                     reader_.sliceBuilder.write(scanLineBreak());
1334                     endMark = reader_.mark;
1335                     continue;
1336                 }
1337                 reader_.forward();
1338                 maxIndent = max(reader_.column, maxIndent);
1339             }
1340 
1341             return tuple(maxIndent, endMark);
1342         }
1343 
1344         /// Scan line breaks at lower or specified indentation in a block scalar.
1345         ///
1346         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1347         /// characters into that slice.
1348         Mark scanBlockScalarBreaksToSlice(const uint indent) @safe
1349         {
1350             Mark endMark = reader_.mark;
1351 
1352             for(;;)
1353             {
1354                 while(reader_.column < indent && reader_.peekByte() == ' ') { reader_.forward(); }
1355                 if(!reader_.peek().among!('\n', '\r', '\u0085', '\u2028', '\u2029'))  { break; }
1356                 reader_.sliceBuilder.write(scanLineBreak());
1357                 endMark = reader_.mark;
1358             }
1359 
1360             return endMark;
1361         }
1362 
1363         /// Scan a qouted flow scalar token with specified quotes.
1364         Token scanFlowScalar(const ScalarStyle quotes) @safe
1365         {
1366             const startMark = reader_.mark;
1367             const quote     = reader_.get();
1368 
1369             reader_.sliceBuilder.begin();
1370 
1371             scanFlowScalarNonSpacesToSlice(quotes, startMark);
1372 
1373             while(reader_.peek() != quote)
1374             {
1375                 scanFlowScalarSpacesToSlice(startMark);
1376                 scanFlowScalarNonSpacesToSlice(quotes, startMark);
1377             }
1378             reader_.forward();
1379 
1380             auto slice = reader_.sliceBuilder.finish();
1381             return scalarToken(startMark, reader_.mark, slice, quotes);
1382         }
1383 
1384         /// Scan nonspace characters in a flow scalar.
1385         ///
1386         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1387         /// characters into that slice.
1388         void scanFlowScalarNonSpacesToSlice(const ScalarStyle quotes, const Mark startMark)
1389             @safe
1390         {
1391             for(;;)
1392             {
1393                 dchar c = reader_.peek();
1394 
1395                 size_t numCodePoints;
1396                 while(!reader_.peek(numCodePoints).isFlowScalarBreakSpace) { ++numCodePoints; }
1397 
1398                 if (numCodePoints > 0) { reader_.sliceBuilder.write(reader_.get(numCodePoints)); }
1399 
1400                 c = reader_.peek();
1401                 if(quotes == ScalarStyle.singleQuoted && c == '\'' && reader_.peek(1) == '\'')
1402                 {
1403                     reader_.forward(2);
1404                     reader_.sliceBuilder.write('\'');
1405                 }
1406                 else if((quotes == ScalarStyle.doubleQuoted && c == '\'') ||
1407                         (quotes == ScalarStyle.singleQuoted && c.among!('"', '\\')))
1408                 {
1409                     reader_.forward();
1410                     reader_.sliceBuilder.write(c);
1411                 }
1412                 else if(quotes == ScalarStyle.doubleQuoted && c == '\\')
1413                 {
1414                     reader_.forward();
1415                     c = reader_.peek();
1416                     if(c.among!(escapes))
1417                     {
1418                         reader_.forward();
1419                         // Escaping has been moved to Parser as it can't be done in
1420                         // place (in a slice) in case of '\P' and '\L' (very uncommon,
1421                         // but we don't want to break the spec)
1422                         char[2] escapeSequence = ['\\', cast(char)c];
1423                         reader_.sliceBuilder.write(escapeSequence);
1424                     }
1425                     else if(c.among!(escapeHexCodeList))
1426                     {
1427                         const hexLength = dyaml.escapes.escapeHexLength(c);
1428                         reader_.forward();
1429 
1430                         foreach(i; 0 .. hexLength) {
1431                             enforce(reader_.peek(i).isHexDigit,
1432                                 new ScannerException("While scanning a double quoted scalar", startMark,
1433                                     expected("escape sequence of hexadecimal numbers",
1434                                         reader_.peek(i)), reader_.mark));
1435                         }
1436                         char[] hex = reader_.get(hexLength);
1437 
1438                         enforce((hex.length > 0) && (hex.length <= 8),
1439                             new ScannerException("While scanning a double quoted scalar", startMark,
1440                                   "overflow when parsing an escape sequence of " ~
1441                                   "hexadecimal numbers.", reader_.mark));
1442 
1443                         char[2] escapeStart = ['\\', cast(char) c];
1444                         reader_.sliceBuilder.write(escapeStart);
1445                         reader_.sliceBuilder.write(hex);
1446 
1447                     }
1448                     else if(c.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
1449                     {
1450                         scanLineBreak();
1451                         scanFlowScalarBreaksToSlice(startMark);
1452                     }
1453                     else
1454                     {
1455                         throw new ScannerException("While scanning a double quoted scalar", startMark,
1456                               text("found unsupported escape character ", c),
1457                               reader_.mark);
1458                     }
1459                 }
1460                 else { return; }
1461             }
1462         }
1463 
        /// Scan space characters in a flow scalar.
        ///
        /// Folds whitespace per YAML flow-scalar rules: spaces not followed by a
        /// line break are kept; a single '\n' folds to a space; other breaks and
        /// multiple breaks are written as breaks.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// spaces into that slice.
        void scanFlowScalarSpacesToSlice(const Mark startMark) @safe
        {
            // Increase length as long as we see whitespace.
            size_t length;
            while(reader_.peekByte(length).among!(' ', '\t')) { ++length; }
            // NOTE(review): this slice refers to the Reader's buffer and is written
            // out only after forward() below — presumably forward() does not
            // invalidate it; confirm against Reader.prefixBytes.
            auto whitespaces = reader_.prefixBytes(length);

            // Can check the last byte without striding because '\0' is ASCII
            const c = reader_.peek(length);
            enforce(c != '\0',
                new ScannerException("While scanning a quoted scalar", startMark,
                    "found unexpected end of buffer", reader_.mark));

            // Spaces not followed by a line break.
            if(!c.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
            {
                reader_.forward(length);
                reader_.sliceBuilder.write(whitespaces);
                return;
            }

            // There's a line break after the spaces; the spaces are folded away.
            reader_.forward(length);
            const lineBreak = scanLineBreak();

            // A non-'\n' break is written verbatim.
            if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }

            // If we have extra line breaks after the first, scan them into the
            // slice.
            const bool extraBreaks = scanFlowScalarBreaksToSlice(startMark);

            // No extra breaks, one normal line break. Replace it with a space.
            if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
        }
1502 
        /// Scan line breaks in a flow scalar.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// line breaks into that slice.
        ///
        /// Returns: true if at least one line break was scanned, false otherwise.
        ///
        /// Throws: ScannerException if a document separator ("---" or "...")
        ///         followed by whitespace is found inside the quoted scalar.
        bool scanFlowScalarBreaksToSlice(const Mark startMark) @safe
        {
            // True if at least one line break was found.
            bool anyBreaks;
            for(;;)
            {
                // Instead of checking indentation, we check for document separators.
                const prefix = reader_.prefix(3);
                enforce(!(prefix == "---" || prefix == "...") ||
                    !reader_.peek(3).isWhiteSpace,
                    new ScannerException("While scanning a quoted scalar", startMark,
                        "found unexpected document separator", reader_.mark));

                // Skip any whitespaces.
                while(reader_.peekByte().among!(' ', '\t')) { reader_.forward(); }

                // Encountered a non-whitespace non-linebreak character, so we're done.
                if(!reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029')) { break; }

                const lineBreak = scanLineBreak();
                anyBreaks = true;
                reader_.sliceBuilder.write(lineBreak);
            }
            return anyBreaks;
        }
1532 
        /// Scan plain scalar token (no block, no quotes).
        ///
        /// Accumulates the scalar's text in the Reader's slice builder, chunk by
        /// chunk, folding inter-chunk whitespace via scanPlainSpacesToSlice(),
        /// and returns a SCALAR token with plain style.
        Token scanPlain() @safe
        {
            // We keep track of the allowSimpleKey_ flag here.
            // Indentation rules are loosed for the flow context
            const startMark = reader_.mark;
            Mark endMark = startMark;
            // In block context the scalar must be indented past the current level.
            const indent = indent_ + 1;

            // We allow zero indentation for scalars, but then we need to check for
            // document separators at the beginning of the line.
            // if(indent == 0) { indent = 1; }

            reader_.sliceBuilder.begin();

            alias Transaction = SliceBuilder.Transaction;
            // Holds whitespace scanned after the most recent chunk; only committed
            // once another chunk follows, so trailing whitespace is not kept.
            Transaction spacesTransaction;
            // Stop at a comment.
            while(reader_.peekByte() != '#')
            {
                // Scan the entire plain scalar.
                size_t length;
                dchar c = reader_.peek(length);
                for(;;)
                {
                    const cNext = reader_.peek(length + 1);
                    // The chunk ends at whitespace, at ": " (block context only),
                    // or at any flow indicator when inside a flow collection.
                    if(c.isWhiteSpace ||
                       (flowLevel_ == 0 && c == ':' && cNext.isWhiteSpace) ||
                       (flowLevel_ > 0 && c.among!(',', ':', '?', '[', ']', '{', '}')))
                    {
                        break;
                    }
                    ++length;
                    c = cNext;
                }

                // It's not clear what we should do with ':' in the flow context.
                enforce(flowLevel_ == 0 || c != ':' ||
                   reader_.peek(length + 1).isWhiteSpace ||
                   reader_.peek(length + 1).among!(',', '[', ']', '{', '}'),
                    new ScannerException("While scanning a plain scalar", startMark,
                        "found unexpected ':' . Please check " ~
                        "http://pyyaml.org/wiki/YAMLColonInFlowContext for details.",
                        reader_.mark));

                if(length == 0) { break; }

                // Scalar content was consumed; a simple key can no longer start here.
                allowSimpleKey_ = false;

                reader_.sliceBuilder.write(reader_.get(length));

                endMark = reader_.mark;

                // Keep the whitespace scanned before this chunk, then open a new
                // transaction for the whitespace that follows it.
                spacesTransaction.commit();
                spacesTransaction = Transaction(&reader_.sliceBuilder);

                const startLength = reader_.sliceBuilder.length;
                scanPlainSpacesToSlice();
                // Stop when no whitespace was consumed (scalar ended), or when in
                // block context the next line is not indented deeply enough.
                if(startLength == reader_.sliceBuilder.length ||
                   (flowLevel_ == 0 && reader_.column < indent))
                {
                    break;
                }
            }

            // end() without commit() — the last uncommitted whitespace run is
            // presumably reverted by the Transaction destructor semantics; confirm
            // against SliceBuilder.Transaction.
            spacesTransaction.end();
            char[] slice = reader_.sliceBuilder.finish();

            return scalarToken(startMark, endMark, slice, ScalarStyle.plain);
        }
1603 
        /// Scan spaces in a plain scalar.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the spaces
        /// into that slice.
        ///
        /// Implements line folding for plain scalars: a lone '\n' break becomes a
        /// single space, while other/extra breaks are written literally. Returns
        /// early when a document separator ("---"/"...") terminates the scalar.
        void scanPlainSpacesToSlice() @safe
        {
            // The specification is really confusing about tabs in plain scalars.
            // We just forbid them completely. Do not use tabs in YAML!

            // Get as many plain spaces as there are.
            size_t length;
            while(reader_.peekByte(length) == ' ') { ++length; }
            char[] whitespaces = reader_.prefixBytes(length);
            reader_.forward(length);

            const dchar c = reader_.peek();
            if(!c.isNSChar)
            {
                // We have spaces, but no newline.
                if(whitespaces.length > 0) { reader_.sliceBuilder.write(whitespaces); }
                return;
            }

            // Newline after the spaces (if any)
            const lineBreak = scanLineBreak();
            // After a line break a simple key may start again.
            allowSimpleKey_ = true;

            // True when the reader is positioned at a document separator
            // ("---" or "...") followed by whitespace or end of buffer.
            static bool end(Reader reader_) @safe pure
            {
                const prefix = reader_.prefix(3);
                return ("---" == prefix || "..." == prefix)
                        && reader_.peek(3).among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');
            }

            if(end(reader_)) { return; }

            bool extraBreaks;

            alias Transaction = SliceBuilder.Transaction;
            // The early return inside the loop below skips commit(), so breaks
            // written here are presumably reverted when a document separator is
            // found mid-scan — see SliceBuilder.Transaction.
            auto transaction = Transaction(&reader_.sliceBuilder);
            if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }
            while(reader_.peek().isNSChar)
            {
                if(reader_.peekByte() == ' ') { reader_.forward(); }
                else
                {
                    const lBreak = scanLineBreak();
                    extraBreaks  = true;
                    reader_.sliceBuilder.write(lBreak);

                    if(end(reader_)) { return; }
                }
            }
            transaction.commit();

            // No line breaks, only a space.
            if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
        }
1662 
1663         /// Scan handle of a tag token.
1664         ///
1665         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1666         /// characters into that slice.
1667         void scanTagHandleToSlice(string name)(const Mark startMark)
1668         {
1669             dchar c = reader_.peek();
1670             enum contextMsg = "While scanning a " ~ name;
1671             enforce(c == '!',
1672                 new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark));
1673 
1674             uint length = 1;
1675             c = reader_.peek(length);
1676             if(c != ' ')
1677             {
1678                 while(c.isAlphaNum || c.among!('-', '_'))
1679                 {
1680                     ++length;
1681                     c = reader_.peek(length);
1682                 }
1683                 enforce(c == '!',
1684                     new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark));
1685                 ++length;
1686             }
1687 
1688             reader_.sliceBuilder.write(reader_.get(length));
1689         }
1690 
1691         /// Scan URI in a tag token.
1692         ///
1693         /// Assumes that the caller is building a slice in Reader, and puts the scanned
1694         /// characters into that slice.
1695         void scanTagURIToSlice(string name)(const Mark startMark)
1696         {
1697             // Note: we do not check if URI is well-formed.
1698             dchar c = reader_.peek();
1699             const startLen = reader_.sliceBuilder.length;
1700             {
1701                 uint length;
1702                 while(c.isAlphaNum || c.isURIChar)
1703                 {
1704                     if(c == '%')
1705                     {
1706                         auto chars = reader_.get(length);
1707                         reader_.sliceBuilder.write(chars);
1708                         length = 0;
1709                         scanURIEscapesToSlice!name(startMark);
1710                     }
1711                     else { ++length; }
1712                     c = reader_.peek(length);
1713                 }
1714                 if(length > 0)
1715                 {
1716                     auto chars = reader_.get(length);
1717                     reader_.sliceBuilder.write(chars);
1718                     length = 0;
1719                 }
1720             }
1721             // OK if we scanned something, error otherwise.
1722             enum contextMsg = "While parsing a " ~ name;
1723             enforce(reader_.sliceBuilder.length > startLen,
1724                 new ScannerException(contextMsg, startMark, expected("URI", c), reader_.mark));
1725         }
1726 
        // Not @nogc yet because std.utf.decode is not @nogc
        /// Scan URI escape sequences.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        ///
        /// Each consecutive '%XX' sequence contributes one UTF-8 code unit; once
        /// the run of escapes ends, the accumulated bytes are decoded to UTF-32
        /// and written to the slice.
        ///
        /// Throws: ScannerException on a malformed escape sequence or when the
        ///         escaped bytes do not form valid UTF-8.
        void scanURIEscapesToSlice(string name)(const Mark startMark)
        {
            import core.exception : UnicodeException;
            // URI escapes encode a UTF-8 string. We store UTF-8 code units here for
            // decoding into UTF-32.
            Appender!string buffer;


            enum contextMsg = "While scanning a " ~ name;
            while(reader_.peekByte() == '%')
            {
                reader_.forward();
                // The two hex digits following the '%'.
                char[2] nextByte = [reader_.peekByte(), reader_.peekByte(1)];

                enforce(nextByte[0].isHexDigit && nextByte[1].isHexDigit,
                    new ScannerException(contextMsg, startMark,
                        expected("URI escape sequence of 2 hexadecimal " ~
                            "numbers", nextByte), reader_.mark));

                // Parse the hex pair into a single byte and append it to the buffer.
                buffer ~= nextByte[].to!ubyte(16);

                reader_.forward(2);
            }
            try
            {
                // foreach over a char string auto-decodes to dchar; invalid UTF-8
                // makes the iteration throw, which is translated below.
                foreach (dchar chr; buffer.data)
                {
                    reader_.sliceBuilder.write(chr);
                }
            }
            catch (UnicodeException)
            {
                throw new ScannerException(contextMsg, startMark,
                        "Invalid UTF-8 data encoded in URI escape sequence",
                        reader_.mark);
            }
        }
1769 
1770 
1771         /// Scan a line break, if any.
1772         ///
1773         /// Transforms:
1774         ///   '\r\n'      :   '\n'
1775         ///   '\r'        :   '\n'
1776         ///   '\n'        :   '\n'
1777         ///   '\u0085'    :   '\n'
1778         ///   '\u2028'    :   '\u2028'
1779         ///   '\u2029     :   '\u2029'
1780         ///   no break    :   '\0'
1781         dchar scanLineBreak() @safe
1782         {
1783             // Fast path for ASCII line breaks.
1784             const b = reader_.peekByte();
1785             if(b < 0x80)
1786             {
1787                 if(b == '\n' || b == '\r')
1788                 {
1789                     if(reader_.prefix(2) == "\r\n") { reader_.forward(2); }
1790                     else { reader_.forward(); }
1791                     return '\n';
1792                 }
1793                 return '\0';
1794             }
1795 
1796             const c = reader_.peek();
1797             if(c == '\x85')
1798             {
1799                 reader_.forward();
1800                 return '\n';
1801             }
1802             if(c == '\u2028' || c == '\u2029')
1803             {
1804                 reader_.forward();
1805                 return c;
1806             }
1807             return '\0';
1808         }
1809 }