// Copyright Ferdinand Majerech 2011-2014.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)

/// YAML scanner.
/// Code based on PyYAML: http://www.pyyaml.org
module dyaml.scanner;


import core.stdc.string;

import std.algorithm;
import std.array;
import std.conv;
import std.ascii : isAlphaNum, isDigit, isHexDigit;
import std.exception;
import std.string;
import std.typecons;
import std.traits : Unqual;
import std.utf;

import dyaml.escapes;
import dyaml.exception;
import dyaml.queue;
import dyaml.reader;
import dyaml.style;
import dyaml.token;

package:
/// Scanner produces tokens of the following types:
/// STREAM-START
/// STREAM-END
/// DIRECTIVE(name, value)
/// DOCUMENT-START
/// DOCUMENT-END
/// BLOCK-SEQUENCE-START
/// BLOCK-MAPPING-START
/// BLOCK-END
/// FLOW-SEQUENCE-START
/// FLOW-MAPPING-START
/// FLOW-SEQUENCE-END
/// FLOW-MAPPING-END
/// BLOCK-ENTRY
/// FLOW-ENTRY
/// KEY
/// VALUE
/// ALIAS(value)
/// ANCHOR(value)
/// TAG(value)
/// SCALAR(value, plain, style)

// Character classification predicates. Each is a std.algorithm.among
// instantiation, so `c.isBreak` yields a non-zero value (the 1-based match
// index) when c is in the set, and 0 otherwise.

/// Line break or end of stream ('\0').
alias isBreak = among!('\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Space, line break, or end of stream.
alias isBreakOrSpace = among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Any whitespace (space/tab), line break, or end of stream.
alias isWhiteSpace = among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Whitespace that does not terminate a line (space or tab).
alias isNonLinebreakWhitespace = among!(' ', '\t');

/// Characters that can never begin a plain scalar.
alias isNonScalarStartCharacter = among!('-', '?', ':', ',', '[', ']', '{', '}',
    '#', '&', '*', '!', '|', '>', '\'', '"', '%', '@', '`', ' ', '\t', '\0', '\n',
    '\r', '\u0085', '\u2028', '\u2029');

/// Punctuation characters permitted in a tag URI (alphanumerics are handled separately).
alias isURIChar = among!('-', ';', '/', '?', ':', '@', '&', '=', '+', '$', ',',
    '_', '.', '!', '~', '*', '\'', '(', ')', '[', ']', '%');

/// Space or line break (note: does NOT include tab or '\0').
alias isNSChar = among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029');
/// Line break characters only (no space/tab/'\0').
alias isBChar = among!('\n', '\r', '\u0085', '\u2028', '\u2029');

/// Characters that terminate or need special handling inside a flow scalar.
alias isFlowScalarBreakSpace = among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029', '\'', '"', '\\');

/// True for characters allowed in an anchor/alias name.
alias isNSAnchorName = c => !c.isWhiteSpace && !c.among!('[', ']', '{', '}', ',', '\uFEFF');

/// Marked exception thrown at scanner errors.
///
/// See_Also: MarkedYAMLException
class ScannerException : MarkedYAMLException
{
    mixin MarkedExceptionCtors;
}

/// Generates tokens from data provided by a Reader.
struct Scanner
{
    private:
        /// A simple key is a key that is not denoted by the '?' indicator.
        /// For example:
        ///   ---
        ///   block simple key: value
        ///   ? not a simple key:
        ///   : { flow simple key: value }
        /// We emit the KEY token before all keys, so when we find a potential simple
        /// key, we try to locate the corresponding ':' indicator. Simple keys should be
        /// limited to a single line and 1024 characters.
        ///
        /// 16 bytes on 64-bit.
        static struct SimpleKey
        {
            /// Character index in reader where the key starts.
            uint charIndex = uint.max;
            /// Index of the key token from start (first token scanned being 0).
            uint tokenIndex;
            /// Line the key starts at.
            uint line;
            /// Column the key starts at.
            ushort column;
            /// Is this required to be a simple key?
            bool required;
            /// Is this struct "null" (invalid)?
            bool isNull;
        }

        /// Block chomping types.
        enum Chomping
        {
            /// Strip all trailing line breaks. '-' indicator.
            strip,
            /// Line break of the last line is preserved, others discarded. Default.
            clip,
            /// All trailing line breaks are preserved. '+' indicator.
            keep
        }

        /// Reader used to read from a file/stream.
        Reader reader_;
        /// Are we done scanning?
        bool done_;

        /// Level of nesting in flow context. If 0, we're in block context.
        uint flowLevel_;
        /// Current indentation level.
        int indent_ = -1;
        /// Past indentation levels. Used as a stack.
        Appender!(int[]) indents_;

        /// Processed tokens not yet emitted. Used as a queue.
        Queue!Token tokens_;

        /// Number of tokens emitted through the getToken method.
        uint tokensTaken_;

        /// Can a simple key start at the current position? A simple key may start:
        /// - at the beginning of the line, not counting indentation spaces
        ///   (in block context),
        /// - after '{', '[', ',' (in the flow context),
        /// - after '?', ':', '-' (in the block context).
        /// In the block context, this flag also signifies if a block collection
        /// may start at the current position.
        bool allowSimpleKey_ = true;

        /// Possible simple keys indexed by flow levels.
        SimpleKey[] possibleSimpleKeys_;

    public:
        /// Construct a Scanner using specified Reader.
        this(Reader reader) @safe nothrow
        {
            reader_ = reader;
            fetchStreamStart();
        }

        /// Advance to the next token.
        void popFront() @safe
        {
            ++tokensTaken_;
            tokens_.pop();
        }

        /// Return the current token, but do not remove it from the queue.
        const(Token) front() @safe
        {
            enforce(!empty, "No token left to peek");
            return tokens_.peek();
        }

        /// Return whether there are any more tokens left.
        bool empty() @safe
        {
            while (needMoreTokens())
            {
                fetchToken();
            }
            return tokens_.empty;
        }

        /// Set file name.
        void name(string name) @safe pure nothrow @nogc
        {
            reader_.name = name;
        }

    private:
        /// Most scanning error messages have the same format; so build them with this
        /// function.
        string expected(T)(string expected, T found)
        {
            return text("expected ", expected, ", but found ", found);
        }

        /// Determine whether or not we need to fetch more tokens before peeking/getting a token.
        bool needMoreTokens() @safe pure
        {
            if(done_)         { return false; }
            if(tokens_.empty) { return true; }

            // The current token may be a potential simple key, so we need to look further.
            stalePossibleSimpleKeys();
            return nextPossibleSimpleKey() == tokensTaken_;
        }

        /// Fetch a token, adding it to tokens_.
        void fetchToken() @safe
        {
            // Eat whitespaces and comments until we reach the next token.
            scanToNextToken();

            // Remove obsolete possible simple keys.
            stalePossibleSimpleKeys();

            // Compare current indentation and column. It may add some tokens
            // and decrease the current indentation level.
            unwindIndent(reader_.column);

            // Get the next character.
            const dchar c = reader_.peekByte();

            // Fetch the token.
            if(c == '\0')            { return fetchStreamEnd();     }
            if(checkDirective())     { return fetchDirective();     }
            if(checkDocumentStart()) { return fetchDocumentStart(); }
            if(checkDocumentEnd())   { return fetchDocumentEnd();   }
            // Order of the following checks is NOT significant.
            switch(c)
            {
                case '[':  return fetchFlowSequenceStart();
                case '{':  return fetchFlowMappingStart();
                case ']':  return fetchFlowSequenceEnd();
                case '}':  return fetchFlowMappingEnd();
                case ',':  return fetchFlowEntry();
                case '!':  return fetchTag();
                case '\'': return fetchSingle();
                case '\"': return fetchDouble();
                case '*':  return fetchAlias();
                case '&':  return fetchAnchor();
                case '?':  if(checkKey())        { return fetchKey();        } goto default;
                case ':':  if(checkValue())      { return fetchValue();      } goto default;
                case '-':  if(checkBlockEntry()) { return fetchBlockEntry(); } goto default;
                case '|':  if(flowLevel_ == 0)   { return fetchLiteral();    } break;
                case '>':  if(flowLevel_ == 0)   { return fetchFolded();     } break;
                default:   if(checkPlain())      { return fetchPlain();      }
            }

            throw new ScannerException("While scanning for the next token, found character " ~
                                       "\'%s\', index %s that cannot start any token"
                                       .format(c, to!int(c)), reader_.mark);
        }


        /// Return the token number of the nearest possible simple key.
        uint nextPossibleSimpleKey() @safe pure nothrow @nogc
        {
            uint minTokenNumber = uint.max;
            foreach(k, ref simpleKey; possibleSimpleKeys_)
            {
                if(simpleKey.isNull) { continue; }
                minTokenNumber = min(minTokenNumber, simpleKey.tokenIndex);
            }
            return minTokenNumber;
        }

        /// Remove entries that are no longer possible simple keys.
        ///
        /// According to the YAML specification, simple keys
        /// - should be limited to a single line,
        /// - should be no longer than 1024 characters.
        /// Disabling this will allow simple keys of any length and
        /// height (may cause problems if indentation is broken though).
        void stalePossibleSimpleKeys() @safe pure
        {
            foreach(level, ref key; possibleSimpleKeys_)
            {
                if(key.isNull) { continue; }
                // A simple key is stale once we leave its line or scan more than
                // 1024 characters past its start.
                if(key.line != reader_.line || reader_.charIndex - key.charIndex > 1024)
                {
                    // A *required* key that went stale without finding ':' is an error.
                    enforce(!key.required,
                            new ScannerException("While scanning a simple key",
                                                 Mark(reader_.name, key.line, key.column),
                                                 "could not find expected ':'", reader_.mark));
                    key.isNull = true;
                }
            }
        }

        /// Check if the next token starts a possible simple key and if so, save its position.
        ///
        /// This function is called for ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
        void savePossibleSimpleKey() @safe pure
        {
            // Check if a simple key is required at the current position.
            const required = (flowLevel_ == 0 && indent_ == reader_.column);
            assert(allowSimpleKey_ || !required, "A simple key is required only if it is " ~
                   "the first token in the current line. Therefore it is always allowed.");

            if(!allowSimpleKey_) { return; }

            // The next token might be a simple key, so save its number and position.
            removePossibleSimpleKey();
            const tokenCount = tokensTaken_ + cast(uint)tokens_.length;

            const line   = reader_.line;
            const column = reader_.column;
            const key    = SimpleKey(cast(uint)reader_.charIndex, tokenCount, line,
                                     cast(ushort)min(column, ushort.max), required);

            if(possibleSimpleKeys_.length <= flowLevel_)
            {
                const oldLength = possibleSimpleKeys_.length;
                possibleSimpleKeys_.length = flowLevel_ + 1;
                //No need to initialize the last element, it's already done in the next line.
                possibleSimpleKeys_[oldLength .. flowLevel_] = SimpleKey.init;
            }
            possibleSimpleKeys_[flowLevel_] = key;
        }

        /// Remove the saved possible key position at the current flow level.
330 void removePossibleSimpleKey() @safe pure 331 { 332 if(possibleSimpleKeys_.length <= flowLevel_) { return; } 333 334 if(!possibleSimpleKeys_[flowLevel_].isNull) 335 { 336 const key = possibleSimpleKeys_[flowLevel_]; 337 enforce(!key.required, 338 new ScannerException("While scanning a simple key", 339 Mark(reader_.name, key.line, key.column), 340 "could not find expected ':'", reader_.mark)); 341 possibleSimpleKeys_[flowLevel_].isNull = true; 342 } 343 } 344 345 /// Decrease indentation, removing entries in indents_. 346 /// 347 /// Params: column = Current column in the file/stream. 348 void unwindIndent(const int column) @safe 349 { 350 if(flowLevel_ > 0) 351 { 352 // In flow context, tokens should respect indentation. 353 // The condition should be `indent >= column` according to the spec. 354 // But this condition will prohibit intuitively correct 355 // constructions such as 356 // key : { 357 // } 358 359 // In the flow context, indentation is ignored. We make the scanner less 360 // restrictive than what the specification requires. 361 // if(pedantic_ && flowLevel_ > 0 && indent_ > column) 362 // { 363 // throw new ScannerException("Invalid intendation or unclosed '[' or '{'", 364 // reader_.mark) 365 // } 366 return; 367 } 368 369 // In block context, we may need to issue the BLOCK-END tokens. 370 while(indent_ > column) 371 { 372 indent_ = indents_.data.back; 373 assert(indents_.data.length); 374 indents_.shrinkTo(indents_.data.length - 1); 375 tokens_.push(blockEndToken(reader_.mark, reader_.mark)); 376 } 377 } 378 379 /// Increase indentation if needed. 380 /// 381 /// Params: column = Current column in the file/stream. 382 /// 383 /// Returns: true if the indentation was increased, false otherwise. 384 bool addIndent(int column) @safe 385 { 386 if(indent_ >= column){return false;} 387 indents_ ~= indent_; 388 indent_ = column; 389 return true; 390 } 391 392 393 /// Add STREAM-START token. 
        void fetchStreamStart() @safe nothrow
        {
            tokens_.push(streamStartToken(reader_.mark, reader_.mark, reader_.encoding));
        }

        /// Add STREAM-END token.
        void fetchStreamEnd() @safe
        {
            // Set indentation to -1.
            unwindIndent(-1);
            removePossibleSimpleKey();
            allowSimpleKey_ = false;
            possibleSimpleKeys_.destroy;

            tokens_.push(streamEndToken(reader_.mark, reader_.mark));
            done_ = true;
        }

        /// Add DIRECTIVE token.
        void fetchDirective() @safe
        {
            // Set indentation to -1.
            unwindIndent(-1);
            // Reset simple keys.
            removePossibleSimpleKey();
            allowSimpleKey_ = false;

            auto directive = scanDirective();
            tokens_.push(directive);
        }

        /// Add DOCUMENT-START or DOCUMENT-END token.
        void fetchDocumentIndicator(TokenID id)()
            if(id == TokenID.documentStart || id == TokenID.documentEnd)
        {
            // Set indentation to -1.
            unwindIndent(-1);
            // Reset simple keys. Note that there can't be a block collection after '---'.
            removePossibleSimpleKey();
            allowSimpleKey_ = false;

            Mark startMark = reader_.mark;
            // Skip the 3-character indicator ('---' or '...').
            reader_.forward(3);
            tokens_.push(simpleToken!id(startMark, reader_.mark));
        }

        /// Aliases to add DOCUMENT-START or DOCUMENT-END token.
        alias fetchDocumentStart = fetchDocumentIndicator!(TokenID.documentStart);
        alias fetchDocumentEnd = fetchDocumentIndicator!(TokenID.documentEnd);

        /// Add FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
        void fetchFlowCollectionStart(TokenID id)() @safe
        {
            // '[' and '{' may start a simple key.
            savePossibleSimpleKey();
            // Simple keys are allowed after '[' and '{'.
            allowSimpleKey_ = true;
            ++flowLevel_;

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(simpleToken!id(startMark, reader_.mark));
        }

        /// Aliases to add FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
        alias fetchFlowSequenceStart = fetchFlowCollectionStart!(TokenID.flowSequenceStart);
        alias fetchFlowMappingStart = fetchFlowCollectionStart!(TokenID.flowMappingStart);

        /// Add FLOW-SEQUENCE-END or FLOW-MAPPING-END token.
        void fetchFlowCollectionEnd(TokenID id)()
        {
            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // No simple keys after ']' and '}'.
            allowSimpleKey_ = false;
            --flowLevel_;

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(simpleToken!id(startMark, reader_.mark));
        }

        /// Aliases to add FLOW-SEQUENCE-END or FLOW-MAPPING-END token.
        alias fetchFlowSequenceEnd = fetchFlowCollectionEnd!(TokenID.flowSequenceEnd);
        alias fetchFlowMappingEnd = fetchFlowCollectionEnd!(TokenID.flowMappingEnd);

        /// Add FLOW-ENTRY token.
        void fetchFlowEntry() @safe
        {
            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // Simple keys are allowed after ','.
            allowSimpleKey_ = true;

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(flowEntryToken(startMark, reader_.mark));
        }

        /// Additional checks used in block context in fetchBlockEntry and fetchKey.
        ///
        /// Params: type = String representing the token type we might need to add.
        ///         id   = Token type we might need to add.
        void blockChecks(string type, TokenID id)()
        {
            enum context = type ~ " keys are not allowed here";
            // Are we allowed to start a key (not necessarily a simple one)?
            enforce(allowSimpleKey_, new ScannerException(context, reader_.mark));

            if(addIndent(reader_.column))
            {
                tokens_.push(simpleToken!id(reader_.mark, reader_.mark));
            }
        }

        /// Add BLOCK-ENTRY token. Might add BLOCK-SEQUENCE-START in the process.
        void fetchBlockEntry() @safe
        {
            if(flowLevel_ == 0) { blockChecks!("Sequence", TokenID.blockSequenceStart)(); }

            // It's an error for the block entry to occur in the flow context,
            // but we let the parser detect this.

            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // Simple keys are allowed after '-'.
            allowSimpleKey_ = true;

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(blockEntryToken(startMark, reader_.mark));
        }

        /// Add KEY token. Might add BLOCK-MAPPING-START in the process.
        void fetchKey() @safe
        {
            if(flowLevel_ == 0) { blockChecks!("Mapping", TokenID.blockMappingStart)(); }

            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // Simple keys are allowed after '?' in the block context.
            allowSimpleKey_ = (flowLevel_ == 0);

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(keyToken(startMark, reader_.mark));
        }

        /// Add VALUE token. Might add KEY and/or BLOCK-MAPPING-START in the process.
        void fetchValue() @safe
        {
            // Do we determine a simple key?
            if(possibleSimpleKeys_.length > flowLevel_ &&
               !possibleSimpleKeys_[flowLevel_].isNull)
            {
                const key = possibleSimpleKeys_[flowLevel_];
                possibleSimpleKeys_[flowLevel_].isNull = true;
                Mark keyMark = Mark(reader_.name, key.line, key.column);
                const idx = key.tokenIndex - tokensTaken_;

                assert(idx >= 0);

                // Add KEY.
                // Manually inserting since tokens are immutable (need linked list).
                tokens_.insert(keyToken(keyMark, keyMark), idx);

                // If this key starts a new block mapping, we need to add BLOCK-MAPPING-START.
                if(flowLevel_ == 0 && addIndent(key.column))
                {
                    tokens_.insert(blockMappingStartToken(keyMark, keyMark), idx);
                }

                // There cannot be two simple keys in a row.
                allowSimpleKey_ = false;
            }
            // Part of a complex key
            else
            {
                // We can start a complex value if and only if we can start a simple key.
                enforce(flowLevel_ > 0 || allowSimpleKey_,
                        new ScannerException("Mapping values are not allowed here", reader_.mark));

                // If this value starts a new block mapping, we need to add
                // BLOCK-MAPPING-START. It'll be detected as an error later by the parser.
                if(flowLevel_ == 0 && addIndent(reader_.column))
                {
                    tokens_.push(blockMappingStartToken(reader_.mark, reader_.mark));
                }

                // Reset possible simple key on the current level.
                removePossibleSimpleKey();
                // Simple keys are allowed after ':' in the block context.
                allowSimpleKey_ = (flowLevel_ == 0);
            }

            // Add VALUE.
            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(valueToken(startMark, reader_.mark));
        }

        /// Add ALIAS or ANCHOR token.
        void fetchAnchor_(TokenID id)() @safe
            if(id == TokenID.alias_ || id == TokenID.anchor)
        {
            // ALIAS/ANCHOR could be a simple key.
            savePossibleSimpleKey();
            // No simple keys after ALIAS/ANCHOR.
            allowSimpleKey_ = false;

            auto anchor = scanAnchor(id);
            tokens_.push(anchor);
        }

        /// Aliases to add ALIAS or ANCHOR token.
        alias fetchAlias = fetchAnchor_!(TokenID.alias_);
        alias fetchAnchor = fetchAnchor_!(TokenID.anchor);

        /// Add TAG token.
        void fetchTag() @safe
        {
            // TAG could start a simple key.
            savePossibleSimpleKey();
            // No simple keys after TAG.
            allowSimpleKey_ = false;

            tokens_.push(scanTag());
        }

        /// Add block SCALAR token.
        void fetchBlockScalar(ScalarStyle style)() @safe
            if(style == ScalarStyle.literal || style == ScalarStyle.folded)
        {
            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // A simple key may follow a block scalar.
            allowSimpleKey_ = true;

            auto blockScalar = scanBlockScalar(style);
            tokens_.push(blockScalar);
        }

        /// Aliases to add literal or folded block scalar.
        alias fetchLiteral = fetchBlockScalar!(ScalarStyle.literal);
        alias fetchFolded = fetchBlockScalar!(ScalarStyle.folded);

        /// Add quoted flow SCALAR token.
        void fetchFlowScalar(ScalarStyle quotes)()
        {
            // A flow scalar could be a simple key.
            savePossibleSimpleKey();
            // No simple keys after flow scalars.
            allowSimpleKey_ = false;

            // Scan and add SCALAR.
            auto scalar = scanFlowScalar(quotes);
            tokens_.push(scalar);
        }

        /// Aliases to add single or double quoted block scalar.
        alias fetchSingle = fetchFlowScalar!(ScalarStyle.singleQuoted);
        alias fetchDouble = fetchFlowScalar!(ScalarStyle.doubleQuoted);

        /// Add plain SCALAR token.
        void fetchPlain() @safe
        {
            // A plain scalar could be a simple key.
            savePossibleSimpleKey();
            // No simple keys after plain scalars. But note that scanPlain() will
            // change this flag if the scan is finished at the beginning of the line.
            allowSimpleKey_ = false;
            // Scan and add SCALAR. May change allowSimpleKey_.
            auto plain = scanPlain();

            tokens_.push(plain);
        }

    pure:

        /// Check if the next token is DIRECTIVE:        ^ '%' ...
        bool checkDirective() @safe
        {
            return reader_.peekByte() == '%' && reader_.column == 0;
        }

        /// Check if the next token is DOCUMENT-START:   ^ '---' (' '|'\n')
        bool checkDocumentStart() @safe
        {
            // Check one char first, then all 3, to prevent reading outside the buffer.
            return reader_.column     == 0     &&
                   reader_.peekByte() == '-'   &&
                   reader_.prefix(3)  == "---" &&
                   reader_.peek(3).isWhiteSpace;
        }

        /// Check if the next token is DOCUMENT-END:     ^ '...' (' '|'\n')
        bool checkDocumentEnd() @safe
        {
            // Check one char first, then all 3, to prevent reading outside the buffer.
            return reader_.column     == 0     &&
                   reader_.peekByte() == '.'   &&
                   reader_.prefix(3)  == "..." &&
                   reader_.peek(3).isWhiteSpace;
        }

        /// Check if the next token is BLOCK-ENTRY:      '-' (' '|'\n')
        bool checkBlockEntry() @safe
        {
            return !!reader_.peek(1).isWhiteSpace;
        }

        /// Check if the next token is KEY(flow context):    '?'
        ///
        /// or KEY(block context):   '?' (' '|'\n')
        bool checkKey() @safe
        {
            return (flowLevel_ > 0 || reader_.peek(1).isWhiteSpace);
        }

        /// Check if the next token is VALUE(flow context):  ':'
        ///
        /// or VALUE(block context): ':' (' '|'\n')
        bool checkValue() @safe
        {
            return flowLevel_ > 0 || reader_.peek(1).isWhiteSpace;
        }

        /// Check if the next token is a plain scalar.
        ///
        /// A plain scalar may start with any non-space character except:
        ///   '-', '?', ':', ',', '[', ']', '{', '}',
        ///   '#', '&', '*', '!', '|', '>', '\'', '\"',
        ///   '%', '@', '`'.
        ///
        /// It may also start with
        ///   '-', '?', ':'
        /// if it is followed by a non-space character.
        ///
        /// Note that we limit the last rule to the block context (except the
        /// '-' character) because we want the flow context to be space
        /// independent.
        bool checkPlain() @safe
        {
            const c = reader_.peek();
            if(!c.isNonScalarStartCharacter)
            {
                return true;
            }
            return !reader_.peek(1).isWhiteSpace &&
                   (c == '-' || (flowLevel_ == 0 && (c == '?' || c == ':')));
        }

        /// Move to the next non-space character.
        void findNextNonSpace() @safe
        {
            while(reader_.peekByte() == ' ') { reader_.forward(); }
        }

        /// Scan a string of alphanumeric or "-_" characters.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanAlphaNumericToSlice(string name)(const Mark startMark)
        {
            size_t length;
            dchar c = reader_.peek();
            // Advance the lookahead cursor until a non-identifier character appears.
            while(c.isAlphaNum || c.among!('-', '_')) { c = reader_.peek(++length); }

            enforce(length > 0, new ScannerException("While scanning " ~ name,
                    startMark, expected("alphanumeric, '-' or '_'", c), reader_.mark));

            reader_.sliceBuilder.write(reader_.get(length));
        }

        /// Scan an anchor/alias name string.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanAnchorAliasToSlice(const Mark startMark) @safe
        {
            size_t length;
            dchar c = reader_.peek();
            while (c.isNSAnchorName)
            {
                c = reader_.peek(++length);
            }

            enforce(length > 0, new ScannerException("While scanning an anchor or alias",
                    startMark, expected("a printable character besides '[', ']', '{', '}' and ','", c), reader_.mark));

            reader_.sliceBuilder.write(reader_.get(length));
        }

        /// Scan and throw away all characters until next line break.
        void scanToNextBreak() @safe
        {
            while(!reader_.peek().isBreak) { reader_.forward(); }
        }

        /// Scan all characters until next line break.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanToNextBreakToSlice() @safe
        {
            uint length;
            while(!reader_.peek(length).isBreak)
            {
                ++length;
            }
            reader_.sliceBuilder.write(reader_.get(length));
        }


        /// Move to next token in the file/stream.
        ///
        /// We ignore spaces, line breaks and comments.
        /// If we find a line break in the block context, we set
        /// `allowSimpleKey_` on.
        ///
        /// We do not yet support BOM inside the stream as the
        /// specification requires. Any such mark will be considered as a part
        /// of the document.
        void scanToNextToken() @safe
        {
            // TODO(PyYAML): We need to make tab handling rules more sane. A good rule is:
            //   Tabs cannot precede tokens
            //   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
            //   KEY(block), VALUE(block), BLOCK-ENTRY
            // So the checking code is
            //   if <TAB>:
            //       allowSimpleKey_ = false
            // We also need to add the check for `allowSimpleKey_ == true` to
            // `unwindIndent` before issuing BLOCK-END.
            // Scanners for block, flow, and plain scalars need to be modified.

            for(;;)
            {
                // All whitespace in flow context is ignored, even whitespace
                // not allowed in other contexts.
                if (flowLevel_ > 0)
                {
                    while(reader_.peekByte().isNonLinebreakWhitespace) { reader_.forward(); }
                }
                else
                {
                    findNextNonSpace();
                }
                if(reader_.peekByte() == '#') { scanToNextBreak(); }
                if(scanLineBreak() != '\0')
                {
                    if(flowLevel_ == 0) { allowSimpleKey_ = true; }
                }
                else
                {
                    break;
                }
            }
        }

        /// Scan directive token.
        Token scanDirective() @safe
        {
            Mark startMark = reader_.mark;
            // Skip the '%'.
            reader_.forward();

            // Scan directive name.
            reader_.sliceBuilder.begin();
            scanDirectiveNameToSlice(startMark);
            const name = reader_.sliceBuilder.finish();

            reader_.sliceBuilder.begin();

            // Index where tag handle ends and suffix starts in a tag directive value.
            uint tagHandleEnd = uint.max;
            if(name == "YAML")     { scanYAMLDirectiveValueToSlice(startMark); }
            else if(name == "TAG") { tagHandleEnd = scanTagDirectiveValueToSlice(startMark); }
            char[] value = reader_.sliceBuilder.finish();

            Mark endMark = reader_.mark;

            DirectiveType directive;
            if(name == "YAML")     { directive = DirectiveType.yaml; }
            else if(name == "TAG") { directive = DirectiveType.tag; }
            else
            {
                // Unknown directives are preserved as 'reserved'; their value is skipped.
                directive = DirectiveType.reserved;
                scanToNextBreak();
            }

            scanDirectiveIgnoredLine(startMark);

            return directiveToken(startMark, endMark, value, directive, tagHandleEnd);
        }

        /// Scan name of a directive token.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanDirectiveNameToSlice(const Mark startMark) @safe
        {
            // Scan directive name.
            scanAlphaNumericToSlice!"a directive"(startMark);

            enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
                    new ScannerException("While scanning a directive", startMark,
                                         expected("alphanumeric, '-' or '_'", reader_.peek()), reader_.mark));
        }

        /// Scan value of a YAML directive token. Returns major, minor version separated by '.'.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanYAMLDirectiveValueToSlice(const Mark startMark) @safe
        {
            findNextNonSpace();

            scanYAMLDirectiveNumberToSlice(startMark);

            enforce(reader_.peekByte() == '.',
                    new ScannerException("While scanning a directive", startMark,
                                         expected("digit or '.'", reader_.peek()), reader_.mark));
            // Skip the '.'.
            reader_.forward();

            reader_.sliceBuilder.write('.');
            scanYAMLDirectiveNumberToSlice(startMark);

            enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
                    new ScannerException("While scanning a directive", startMark,
                                         expected("digit or '.'", reader_.peek()), reader_.mark));
        }

        /// Scan a number from a YAML directive.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanYAMLDirectiveNumberToSlice(const Mark startMark) @safe
        {
            enforce(isDigit(reader_.peek()),
                    new ScannerException("While scanning a directive", startMark,
                                         expected("digit", reader_.peek()), reader_.mark));

            // Already found the first digit in the enforce(), so set length to 1.
            uint length = 1;
            while(reader_.peek(length).isDigit) { ++length; }

            reader_.sliceBuilder.write(reader_.get(length));
        }

        /// Scan value of a tag directive.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        ///
        /// Returns: Length of tag handle (which is before tag prefix) in scanned data
        uint scanTagDirectiveValueToSlice(const Mark startMark) @safe
        {
            findNextNonSpace();
            const startLength = reader_.sliceBuilder.length;
            scanTagDirectiveHandleToSlice(startMark);
            const handleLength = cast(uint)(reader_.sliceBuilder.length - startLength);
            findNextNonSpace();
            scanTagDirectivePrefixToSlice(startMark);

            return handleLength;
        }

        /// Scan handle of a tag directive.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanTagDirectiveHandleToSlice(const Mark startMark) @safe
        {
            scanTagHandleToSlice!"directive"(startMark);
            enforce(reader_.peekByte() == ' ',
                    new ScannerException("While scanning a directive handle", startMark,
                                         expected("' '", reader_.peek()), reader_.mark));
        }

        /// Scan prefix of a tag directive.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanTagDirectivePrefixToSlice(const Mark startMark) @safe
        {
            scanTagURIToSlice!"directive"(startMark);
            enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
                    new ScannerException("While scanning a directive prefix", startMark,
                                         expected("' '", reader_.peek()), reader_.mark));
        }

        /// Scan (and ignore) ignored line after a directive.
        void scanDirectiveIgnoredLine(const Mark startMark) @safe
        {
            findNextNonSpace();
            if(reader_.peekByte() == '#') { scanToNextBreak(); }
            enforce(reader_.peek().isBreak,
                    new ScannerException("While scanning a directive", startMark,
                                         expected("comment or a line break", reader_.peek()), reader_.mark));
            scanLineBreak();
        }


        /// Scan an alias or an anchor.
        ///
        /// The specification does not restrict characters for anchors and
        /// aliases. This may lead to problems, for instance, the document:
        ///   [ *alias, value ]
        /// can be interpreted in two ways, as
        ///   [ "value" ]
        /// and
        ///   [ *alias , "value" ]
        /// Therefore we restrict aliases to ASCII alphanumeric characters.
        Token scanAnchor(const TokenID id) @safe
        {
            const startMark = reader_.mark;
            reader_.forward(); // The */& character was only peeked, so we drop it now

            reader_.sliceBuilder.begin();
            scanAnchorAliasToSlice(startMark);
            // On error, value is discarded as we return immediately
            char[] value = reader_.sliceBuilder.finish();

            assert(!reader_.peek().isNSAnchorName, "Anchor/alias name not fully scanned");

            if(id == TokenID.alias_)
            {
                return aliasToken(startMark, reader_.mark, value);
            }
            if(id == TokenID.anchor)
            {
                return anchorToken(startMark, reader_.mark, value);
            }
            assert(false, "This code should never be reached");
        }

        /// Scan a tag token.
        Token scanTag() @safe
        {
            const startMark = reader_.mark;
            // Character after the '!' that triggered this call.
            dchar c = reader_.peek(1);

            reader_.sliceBuilder.begin();
            scope(failure) { reader_.sliceBuilder.finish(); }
            // Index where tag handle ends and tag suffix starts in the tag value
            // (slice) we will produce.
            uint handleEnd;

            // Verbatim tag: !<...>
            if(c == '<')
            {
                reader_.forward(2);

                handleEnd = 0;
                scanTagURIToSlice!"tag"(startMark);
                enforce(reader_.peekByte() == '>',
                    new ScannerException("While scanning a tag", startMark,
                        expected("'>'", reader_.peek()), reader_.mark));
                reader_.forward();
            }
            // A lone '!' followed by whitespace: the non-specific tag.
            else if(c.isWhiteSpace)
            {
                reader_.forward();
                handleEnd = 0;
                reader_.sliceBuilder.write('!');
            }
            else
            {
                uint length = 1;
                bool useHandle;

                // Look ahead for a second '!', which would mean a named handle
                // (e.g. !handle!suffix) rather than the primary '!' handle.
                while(!c.isBreakOrSpace)
                {
                    if(c == '!')
                    {
                        useHandle = true;
                        break;
                    }
                    ++length;
                    c = reader_.peek(length);
                }

                if(useHandle)
                {
                    scanTagHandleToSlice!"tag"(startMark);
                    handleEnd = cast(uint)reader_.sliceBuilder.length;
                }
                else
                {
                    reader_.forward();
                    reader_.sliceBuilder.write('!');
                    handleEnd = cast(uint)reader_.sliceBuilder.length;
                }

                scanTagURIToSlice!"tag"(startMark);
            }

            enforce(reader_.peek().isBreakOrSpace,
                new ScannerException("While scanning a tag", startMark, expected("' '", reader_.peek()),
                    reader_.mark));

            char[] slice = reader_.sliceBuilder.finish();
            return tagToken(startMark, reader_.mark, slice, handleEnd);
        }

        /// Scan a block scalar token with specified style.
        Token scanBlockScalar(const ScalarStyle style) @safe
        {
            const startMark = reader_.mark;

            // Scan the header.
            reader_.forward();

            const indicators = scanBlockScalarIndicators(startMark);

            const chomping = indicators[0];
            const increment = indicators[1];
            scanBlockScalarIgnoredLine(startMark);

            // Determine the indentation level and go to the first non-empty line.
            Mark endMark;
            uint indent = max(1, indent_ + 1);

            reader_.sliceBuilder.begin();
            alias Transaction = SliceBuilder.Transaction;
            // Used to strip the last line breaks written to the slice at the end of the
            // scalar, which may be needed based on chomping.
            Transaction breaksTransaction = Transaction(&reader_.sliceBuilder);
            // Read the first indentation/line breaks before the scalar.
            size_t startLen = reader_.sliceBuilder.length;
            if(increment == int.min)
            {
                // No explicit indentation indicator: detect indentation from content.
                auto indentation = scanBlockScalarIndentationToSlice();
                endMark = indentation[1];
                indent = max(indent, indentation[0]);
            }
            else
            {
                indent += increment - 1;
                endMark = scanBlockScalarBreaksToSlice(indent);
            }

            // int.max means there's no line break (int.max is outside UTF-32).
            dchar lineBreak = cast(dchar)int.max;

            // Scan the inner part of the block scalar.
            while(reader_.column == indent && reader_.peekByte() != '\0')
            {
                breaksTransaction.commit();
                const bool leadingNonSpace = !reader_.peekByte().among!(' ', '\t');
                // This is where the 'interesting' non-whitespace data gets read.
                scanToNextBreakToSlice();
                lineBreak = scanLineBreak();


                // This transaction serves to rollback data read in the
                // scanBlockScalarBreaksToSlice() call.
                breaksTransaction = Transaction(&reader_.sliceBuilder);
                startLen = reader_.sliceBuilder.length;
                // The line breaks should actually be written _after_ the if() block
                // below. We work around that by inserting the break into the middle
                // of the slice (sliceBuilder.insert below) when needed.
                endMark = scanBlockScalarBreaksToSlice(indent);

                // This will not run during the last iteration (see the if() vs the
                // while()), hence breaksTransaction rollback (which happens after this
                // loop) will never roll back data written in this if() block.
                if(reader_.column == indent && reader_.peekByte() != '\0')
                {
                    // Unfortunately, folding rules are ambiguous.

                    // This is the folding according to the specification:
                    if(style == ScalarStyle.folded && lineBreak == '\n' &&
                       leadingNonSpace && !reader_.peekByte().among!(' ', '\t'))
                    {
                        // No breaks were scanned; no need to insert the space in the
                        // middle of slice.
                        if(startLen == reader_.sliceBuilder.length)
                        {
                            reader_.sliceBuilder.write(' ');
                        }
                    }
                    else
                    {
                        // We need to insert in the middle of the slice in case any line
                        // breaks were scanned.
                        reader_.sliceBuilder.insert(lineBreak, startLen);
                    }

                    ////this is Clark Evans's interpretation (also in the spec
                    ////examples):
                    //
                    //if(style == ScalarStyle.folded && lineBreak == '\n')
                    //{
                    //    if(startLen == endLen)
                    //    {
                    //        if(!" \t"d.canFind(reader_.peekByte()))
                    //        {
                    //            reader_.sliceBuilder.write(' ');
                    //        }
                    //        else
                    //        {
                    //            chunks ~= lineBreak;
                    //        }
                    //    }
                    //}
                    //else
                    //{
                    //    reader_.sliceBuilder.insertBack(lineBreak, endLen - startLen);
                    //}
                }
                else
                {
                    break;
                }
            }

            // If chomping is Keep, we keep (commit) the last scanned line breaks
            // (which are at the end of the scalar). Otherwise we remove them (end the
            // transaction).
            if(chomping == Chomping.keep) { breaksTransaction.commit(); }
            else { breaksTransaction.end(); }
            if(chomping != Chomping.strip && lineBreak != int.max)
            {
                // If chomping is Keep, we keep the line break but the first line break
                // that isn't stripped (since chomping isn't Strip in this branch) must
                // be inserted _before_ the other line breaks.
                if(chomping == Chomping.keep)
                {
                    reader_.sliceBuilder.insert(lineBreak, startLen);
                }
                // If chomping is not Keep, breaksTransaction was cancelled so we can
                // directly write the first line break (as it isn't stripped - chomping
                // is not Strip)
                else
                {
                    reader_.sliceBuilder.write(lineBreak);
                }
            }

            char[] slice = reader_.sliceBuilder.finish();
            return scalarToken(startMark, endMark, slice, style);
        }

        /// Scan chomping and indentation indicators of a scalar token.
        Tuple!(Chomping, int) scanBlockScalarIndicators(const Mark startMark) @safe
        {
            auto chomping = Chomping.clip;
            // int.min is the sentinel for "no indentation indicator given".
            int increment = int.min;
            dchar c = reader_.peek();

            /// Indicators can be in any order.
            if(getChomping(c, chomping))
            {
                getIncrement(c, increment, startMark);
            }
            else
            {
                const gotIncrement = getIncrement(c, increment, startMark);
                if(gotIncrement) { getChomping(c, chomping); }
            }

            enforce(c.among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
                new ScannerException("While scanning a block scalar", startMark,
                    expected("chomping or indentation indicator", c), reader_.mark));

            return tuple(chomping, increment);
        }

        /// Get chomping indicator, if detected. Return false otherwise.
        ///
        /// Used in scanBlockScalarIndicators.
        ///
        /// Params:
        ///
        /// c        = The character that may be a chomping indicator.
        /// chomping = Write the chomping value here, if detected.
        bool getChomping(ref dchar c, ref Chomping chomping) @safe
        {
            if(!c.among!('+', '-')) { return false; }
            // '+' keeps trailing line breaks, '-' strips them.
            chomping = c == '+' ? Chomping.keep : Chomping.strip;
            reader_.forward();
            c = reader_.peek();
            return true;
        }

        /// Get increment indicator, if detected. Return false otherwise.
        ///
        /// Used in scanBlockScalarIndicators.
        ///
        /// Params:
        ///
        /// c         = The character that may be an increment indicator.
        ///             If an increment indicator is detected, this will be updated to
        ///             the next character in the Reader.
        /// increment = Write the increment value here, if detected.
        /// startMark = Mark for error messages.
        bool getIncrement(ref dchar c, ref int increment, const Mark startMark) @safe
        {
            if(!c.isDigit) { return false; }
            // Convert a digit to integer.
            increment = c - '0';
            assert(increment < 10 && increment >= 0, "Digit has invalid value");

            // The indentation indicator must be in range 1-9; '0' is invalid.
            enforce(increment > 0,
                new ScannerException("While scanning a block scalar", startMark,
                    expected("indentation indicator in range 1-9", "0"), reader_.mark));

            reader_.forward();
            c = reader_.peek();
            return true;
        }

        /// Scan (and ignore) ignored line in a block scalar.
        void scanBlockScalarIgnoredLine(const Mark startMark) @safe
        {
            findNextNonSpace();
            // Only a comment may follow the block scalar header on the same line.
            if(reader_.peekByte()== '#') { scanToNextBreak(); }

            enforce(reader_.peek().isBreak,
                new ScannerException("While scanning a block scalar", startMark,
                    expected("comment or line break", reader_.peek()), reader_.mark));

            scanLineBreak();
        }

        /// Scan indentation in a block scalar, returning line breaks, max indent and end mark.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        Tuple!(uint, Mark) scanBlockScalarIndentationToSlice() @safe
        {
            uint maxIndent;
            Mark endMark = reader_.mark;

            while(reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029'))
            {
                if(reader_.peekByte() != ' ')
                {
                    // A line break; written to the slice, does not affect indentation.
                    reader_.sliceBuilder.write(scanLineBreak());
                    endMark = reader_.mark;
                    continue;
                }
                reader_.forward();
                maxIndent = max(reader_.column, maxIndent);
            }

            return tuple(maxIndent, endMark);
        }

        /// Scan line breaks at lower or specified indentation in a block scalar.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        Mark scanBlockScalarBreaksToSlice(const uint indent) @safe
        {
            Mark endMark = reader_.mark;

            for(;;)
            {
                // Skip leading spaces up to the scalar's indentation level.
                while(reader_.column < indent && reader_.peekByte() == ' ') { reader_.forward(); }
                if(!reader_.peek().among!('\n', '\r', '\u0085', '\u2028', '\u2029')) { break; }
                reader_.sliceBuilder.write(scanLineBreak());
                endMark = reader_.mark;
            }

            return endMark;
        }

        /// Scan a quoted flow scalar token with specified quotes.
        Token scanFlowScalar(const ScalarStyle quotes) @safe
        {
            const startMark = reader_.mark;
            // Consume the opening quote; the closing one must match it.
            const quote = reader_.get();

            reader_.sliceBuilder.begin();

            scanFlowScalarNonSpacesToSlice(quotes, startMark);

            while(reader_.peek() != quote)
            {
                scanFlowScalarSpacesToSlice(startMark);
                scanFlowScalarNonSpacesToSlice(quotes, startMark);
            }
            reader_.forward();

            auto slice = reader_.sliceBuilder.finish();
            return scalarToken(startMark, reader_.mark, slice, quotes);
        }

        /// Scan nonspace characters in a flow scalar.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanFlowScalarNonSpacesToSlice(const ScalarStyle quotes, const Mark startMark)
            @safe
        {
            for(;;)
            {
                dchar c = reader_.peek();

                // Copy the run of ordinary (non-break, non-space) characters verbatim.
                size_t numCodePoints;
                while(!reader_.peek(numCodePoints).isFlowScalarBreakSpace) { ++numCodePoints; }

                if (numCodePoints > 0) { reader_.sliceBuilder.write(reader_.get(numCodePoints)); }

                c = reader_.peek();
                // '' inside a single quoted scalar is an escaped single quote.
                if(quotes == ScalarStyle.singleQuoted && c == '\'' && reader_.peek(1) == '\'')
                {
                    reader_.forward(2);
                    reader_.sliceBuilder.write('\'');
                }
                else if((quotes == ScalarStyle.doubleQuoted && c == '\'') ||
                        (quotes == ScalarStyle.singleQuoted && c.among!('"', '\\')))
                {
                    reader_.forward();
                    reader_.sliceBuilder.write(c);
                }
                else if(quotes == ScalarStyle.doubleQuoted && c == '\\')
                {
                    reader_.forward();
                    c = reader_.peek();
                    if(c.among!(escapes))
                    {
                        reader_.forward();
                        // Escaping has been moved to Parser as it can't be done in
                        // place (in a slice) in case of '\P' and '\L' (very uncommon,
                        // but we don't want to break the spec)
                        char[2] escapeSequence = ['\\', cast(char)c];
                        reader_.sliceBuilder.write(escapeSequence);
                    }
                    else if(c.among!(escapeHexCodeList))
                    {
                        // \x, \u or \U escape: fixed number of hex digits follows.
                        const hexLength = dyaml.escapes.escapeHexLength(c);
                        reader_.forward();

                        foreach(i; 0 .. hexLength) {
                            enforce(reader_.peek(i).isHexDigit,
                                new ScannerException("While scanning a double quoted scalar", startMark,
                                    expected("escape sequence of hexadecimal numbers",
                                        reader_.peek(i)), reader_.mark));
                        }
                        char[] hex = reader_.get(hexLength);

                        enforce((hex.length > 0) && (hex.length <= 8),
                            new ScannerException("While scanning a double quoted scalar", startMark,
                                "overflow when parsing an escape sequence of " ~
                                "hexadecimal numbers.", reader_.mark));

                        char[2] escapeStart = ['\\', cast(char) c];
                        reader_.sliceBuilder.write(escapeStart);
                        reader_.sliceBuilder.write(hex);

                    }
                    else if(c.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
                    {
                        // Escaped line break: line continuation, the break is eaten.
                        scanLineBreak();
                        scanFlowScalarBreaksToSlice(startMark);
                    }
                    else
                    {
                        throw new ScannerException("While scanning a double quoted scalar", startMark,
                            text("found unsupported escape character ", c),
                            reader_.mark);
                    }
                }
                else { return; }
            }
        }

        /// Scan space characters in a flow scalar.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// spaces into that slice.
        void scanFlowScalarSpacesToSlice(const Mark startMark) @safe
        {
            // Increase length as long as we see whitespace.
            size_t length;
            while(reader_.peekByte(length).among!(' ', '\t')) { ++length; }
            auto whitespaces = reader_.prefixBytes(length);

            // Can check the last byte without striding because '\0' is ASCII
            const c = reader_.peek(length);
            enforce(c != '\0',
                new ScannerException("While scanning a quoted scalar", startMark,
                    "found unexpected end of buffer", reader_.mark));

            // Spaces not followed by a line break.
            if(!c.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
            {
                reader_.forward(length);
                reader_.sliceBuilder.write(whitespaces);
                return;
            }

            // There's a line break after the spaces.
            reader_.forward(length);
            const lineBreak = scanLineBreak();

            if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }

            // If we have extra line breaks after the first, scan them into the
            // slice.
            const bool extraBreaks = scanFlowScalarBreaksToSlice(startMark);

            // No extra breaks, one normal line break. Replace it with a space.
            if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
        }

        /// Scan line breaks in a flow scalar.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// line breaks into that slice.
        bool scanFlowScalarBreaksToSlice(const Mark startMark) @safe
        {
            // True if at least one line break was found.
            bool anyBreaks;
            for(;;)
            {
                // Instead of checking indentation, we check for document separators.
                const prefix = reader_.prefix(3);
                enforce(!(prefix == "---" || prefix == "...") ||
                        !reader_.peek(3).isWhiteSpace,
                    new ScannerException("While scanning a quoted scalar", startMark,
                        "found unexpected document separator", reader_.mark));

                // Skip any whitespaces.
                while(reader_.peekByte().among!(' ', '\t')) { reader_.forward(); }

                // Encountered a non-whitespace non-linebreak character, so we're done.
                if(!reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029')) { break; }

                const lineBreak = scanLineBreak();
                anyBreaks = true;
                reader_.sliceBuilder.write(lineBreak);
            }
            return anyBreaks;
        }

        /// Scan plain scalar token (no block, no quotes).
        Token scanPlain() @safe
        {
            // We keep track of the allowSimpleKey_ flag here.
            // Indentation rules are loosened for the flow context
            const startMark = reader_.mark;
            Mark endMark = startMark;
            const indent = indent_ + 1;

            // We allow zero indentation for scalars, but then we need to check for
            // document separators at the beginning of the line.
            // if(indent == 0) { indent = 1; }

            reader_.sliceBuilder.begin();

            alias Transaction = SliceBuilder.Transaction;
            // Holds trailing spaces so they can be rolled back if the scalar ends.
            Transaction spacesTransaction;
            // Stop at a comment.
            while(reader_.peekByte() != '#')
            {
                // Scan the entire plain scalar.
                size_t length;
                dchar c = reader_.peek(length);
                for(;;)
                {
                    const cNext = reader_.peek(length + 1);
                    if(c.isWhiteSpace ||
                       (flowLevel_ == 0 && c == ':' && cNext.isWhiteSpace) ||
                       (flowLevel_ > 0 && c.among!(',', ':', '?', '[', ']', '{', '}')))
                    {
                        break;
                    }
                    ++length;
                    c = cNext;
                }

                // It's not clear what we should do with ':' in the flow context.
                enforce(flowLevel_ == 0 || c != ':' ||
                   reader_.peek(length + 1).isWhiteSpace ||
                   reader_.peek(length + 1).among!(',', '[', ']', '{', '}'),
                    new ScannerException("While scanning a plain scalar", startMark,
                        "found unexpected ':' . Please check " ~
                        "http://pyyaml.org/wiki/YAMLColonInFlowContext for details.",
                        reader_.mark));

                if(length == 0) { break; }

                allowSimpleKey_ = false;

                reader_.sliceBuilder.write(reader_.get(length));

                endMark = reader_.mark;

                spacesTransaction.commit();
                spacesTransaction = Transaction(&reader_.sliceBuilder);

                const startLength = reader_.sliceBuilder.length;
                scanPlainSpacesToSlice();
                // Stop if no spaces/breaks followed, or (block context) if the next
                // line is less indented than the scalar.
                if(startLength == reader_.sliceBuilder.length ||
                   (flowLevel_ == 0 && reader_.column < indent))
                {
                    break;
                }
            }

            spacesTransaction.end();
            char[] slice = reader_.sliceBuilder.finish();

            return scalarToken(startMark, endMark, slice, ScalarStyle.plain);
        }

        /// Scan spaces in a plain scalar.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the spaces
        /// into that slice.
        void scanPlainSpacesToSlice() @safe
        {
            // The specification is really confusing about tabs in plain scalars.
            // We just forbid them completely. Do not use tabs in YAML!

            // Get as many plain spaces as there are.
            size_t length;
            while(reader_.peekByte(length) == ' ') { ++length; }
            char[] whitespaces = reader_.prefixBytes(length);
            reader_.forward(length);

            const dchar c = reader_.peek();
            if(!c.isNSChar)
            {
                // We have spaces, but no newline.
                if(whitespaces.length > 0) { reader_.sliceBuilder.write(whitespaces); }
                return;
            }

            // Newline after the spaces (if any)
            const lineBreak = scanLineBreak();
            allowSimpleKey_ = true;

            // True at a document separator ("---" or "...") followed by whitespace.
            static bool end(Reader reader_) @safe pure
            {
                const prefix = reader_.prefix(3);
                return ("---" == prefix || "..." == prefix)
                        && reader_.peek(3).among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');
            }

            if(end(reader_)) { return; }

            bool extraBreaks;

            alias Transaction = SliceBuilder.Transaction;
            auto transaction = Transaction(&reader_.sliceBuilder);
            if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }
            while(reader_.peek().isNSChar)
            {
                if(reader_.peekByte() == ' ') { reader_.forward(); }
                else
                {
                    const lBreak = scanLineBreak();
                    extraBreaks = true;
                    reader_.sliceBuilder.write(lBreak);

                    if(end(reader_)) { return; }
                }
            }
            transaction.commit();

            // No line breaks, only a space.
            if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
        }

        /// Scan handle of a tag token.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanTagHandleToSlice(string name)(const Mark startMark)
        {
            dchar c = reader_.peek();
            enum contextMsg = "While scanning a " ~ name;
            enforce(c == '!',
                new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark));

            uint length = 1;
            c = reader_.peek(length);
            if(c != ' ')
            {
                // Consume the handle name; a named handle must end with '!'.
                while(c.isAlphaNum || c.among!('-', '_'))
                {
                    ++length;
                    c = reader_.peek(length);
                }
                enforce(c == '!',
                    new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark));
                ++length;
            }

            reader_.sliceBuilder.write(reader_.get(length));
        }

        /// Scan URI in a tag token.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanTagURIToSlice(string name)(const Mark startMark)
        {
            // Note: we do not check if URI is well-formed.
            dchar c = reader_.peek();
            const startLen = reader_.sliceBuilder.length;
            {
                uint length;
                while(c.isAlphaNum || c.isURIChar)
                {
                    if(c == '%')
                    {
                        // Flush what we have so far, then decode the %XX escape(s)
                        // directly into the slice.
                        auto chars = reader_.get(length);
                        reader_.sliceBuilder.write(chars);
                        length = 0;
                        scanURIEscapesToSlice!name(startMark);
                    }
                    else { ++length; }
                    c = reader_.peek(length);
                }
                if(length > 0)
                {
                    auto chars = reader_.get(length);
                    reader_.sliceBuilder.write(chars);
                    length = 0;
                }
            }
            // OK if we scanned something, error otherwise.
            enum contextMsg = "While parsing a " ~ name;
            enforce(reader_.sliceBuilder.length > startLen,
                new ScannerException(contextMsg, startMark, expected("URI", c), reader_.mark));
        }

        // Not @nogc yet because std.utf.decode is not @nogc
        /// Scan URI escape sequences.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanURIEscapesToSlice(string name)(const Mark startMark)
        {
            import core.exception : UnicodeException;
            // URI escapes encode a UTF-8 string. We store UTF-8 code units here for
            // decoding into UTF-32.
1737 Appender!string buffer; 1738 1739 1740 enum contextMsg = "While scanning a " ~ name; 1741 while(reader_.peekByte() == '%') 1742 { 1743 reader_.forward(); 1744 char[2] nextByte = [reader_.peekByte(), reader_.peekByte(1)]; 1745 1746 enforce(nextByte[0].isHexDigit && nextByte[1].isHexDigit, 1747 new ScannerException(contextMsg, startMark, 1748 expected("URI escape sequence of 2 hexadecimal " ~ 1749 "numbers", nextByte), reader_.mark)); 1750 1751 buffer ~= nextByte[].to!ubyte(16); 1752 1753 reader_.forward(2); 1754 } 1755 try 1756 { 1757 foreach (dchar chr; buffer.data) 1758 { 1759 reader_.sliceBuilder.write(chr); 1760 } 1761 } 1762 catch (UnicodeException) 1763 { 1764 throw new ScannerException(contextMsg, startMark, 1765 "Invalid UTF-8 data encoded in URI escape sequence", 1766 reader_.mark); 1767 } 1768 } 1769 1770 1771 /// Scan a line break, if any. 1772 /// 1773 /// Transforms: 1774 /// '\r\n' : '\n' 1775 /// '\r' : '\n' 1776 /// '\n' : '\n' 1777 /// '\u0085' : '\n' 1778 /// '\u2028' : '\u2028' 1779 /// '\u2029 : '\u2029' 1780 /// no break : '\0' 1781 dchar scanLineBreak() @safe 1782 { 1783 // Fast path for ASCII line breaks. 1784 const b = reader_.peekByte(); 1785 if(b < 0x80) 1786 { 1787 if(b == '\n' || b == '\r') 1788 { 1789 if(reader_.prefix(2) == "\r\n") { reader_.forward(2); } 1790 else { reader_.forward(); } 1791 return '\n'; 1792 } 1793 return '\0'; 1794 } 1795 1796 const c = reader_.peek(); 1797 if(c == '\x85') 1798 { 1799 reader_.forward(); 1800 return '\n'; 1801 } 1802 if(c == '\u2028' || c == '\u2029') 1803 { 1804 reader_.forward(); 1805 return c; 1806 } 1807 return '\0'; 1808 } 1809 }