// Copyright Ferdinand Majerech 2011-2014.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//    http://www.boost.org/LICENSE_1_0.txt)

/// YAML scanner.
/// Code based on PyYAML: http://www.pyyaml.org
module dyaml.scanner;


import core.stdc.string;

import std.algorithm;
import std.array;
import std.conv;
import std.ascii : isAlphaNum, isDigit, isHexDigit;
import std.exception;
import std.string;
import std.typecons;
import std.traits : Unqual;
import std.utf;

import dyaml.escapes;
import dyaml.exception;
import dyaml.queue;
import dyaml.reader;
import dyaml.style;
import dyaml.token;

package:
/// Scanner produces tokens of the following types:
/// STREAM-START
/// STREAM-END
/// DIRECTIVE(name, value)
/// DOCUMENT-START
/// DOCUMENT-END
/// BLOCK-SEQUENCE-START
/// BLOCK-MAPPING-START
/// BLOCK-END
/// FLOW-SEQUENCE-START
/// FLOW-MAPPING-START
/// FLOW-SEQUENCE-END
/// FLOW-MAPPING-END
/// BLOCK-ENTRY
/// FLOW-ENTRY
/// KEY
/// VALUE
/// ALIAS(value)
/// ANCHOR(value)
/// TAG(value)
/// SCALAR(value, plain, style)

/// Matches NUL or any YAML line break character.
alias isBreak = among!('\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Matches a space, NUL, or any YAML line break character.
alias isBreakOrSpace = among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Matches any whitespace: space, tab, NUL, or a YAML line break.
alias isWhiteSpace = among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Matches whitespace that is not a line break (space or tab).
alias isNonLinebreakWhitespace = among!(' ', '\t');

/// Matches characters that cannot start a plain scalar
/// (indicators, quotes, whitespace and line breaks).
alias isNonScalarStartCharacter = among!('-', '?', ':', ',', '[', ']', '{', '}',
    '#', '&', '*', '!', '|', '>', '\'', '"', '%', '@', '`', ' ', '\t', '\0', '\n',
    '\r', '\u0085', '\u2028', '\u2029');

/// Matches punctuation characters permitted in a tag/directive URI.
alias isURIChar = among!('-', ';', '/', '?', ':', '@', '&', '=', '+', '$', ',',
    '_', '.', '!', '~', '*', '\'', '(', ')', '[', ']', '%');

/// Matches a space or a YAML line break.
/// NOTE(review): despite the name, this does NOT match the spec's ns-char set;
/// it matches space/break characters. Confirm intent against call sites.
alias isNSChar = among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029');
/// Matches any YAML line break character (b-char in the spec).
alias isBChar = among!('\n', '\r', '\u0085', '\u2028', '\u2029');

/// Matches characters that terminate or need special handling inside a flow
/// scalar: whitespace, line breaks, quotes and backslash.
alias isFlowScalarBreakSpace = among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029', '\'', '"', '\\');

/// Marked exception thrown at scanner errors.
///
/// See_Also: MarkedYAMLException
class ScannerException : MarkedYAMLException
{
    mixin MarkedExceptionCtors;
}

/// Generates tokens from data provided by a Reader.
struct Scanner
{
    private:
        /// A simple key is a key that is not denoted by the '?' indicator.
        /// For example:
        ///   ---
        ///   block simple key: value
        ///   ? not a simple key:
        ///   : { flow simple key: value }
        /// We emit the KEY token before all keys, so when we find a potential simple
        /// key, we try to locate the corresponding ':' indicator. Simple keys should be
        /// limited to a single line and 1024 characters.
        ///
        /// 16 bytes on 64-bit.
        static struct SimpleKey
        {
            /// Character index in reader where the key starts.
            uint charIndex = uint.max;
            /// Index of the key token from start (first token scanned being 0).
            uint tokenIndex;
            /// Line the key starts at.
            uint line;
            /// Column the key starts at.
            ushort column;
            /// Is this required to be a simple key?
            bool required;
            /// Is this struct "null" (invalid)?.
            bool isNull;
        }

        /// Block chomping types.
        enum Chomping
        {
            /// Strip all trailing line breaks. '-' indicator.
            strip,
            /// Line break of the last line is preserved, others discarded. Default.
            clip,
            /// All trailing line breaks are preserved. '+' indicator.
            keep
        }

        /// Reader used to read from a file/stream.
        Reader reader_;
        /// Are we done scanning?
        bool done_;

        /// Level of nesting in flow context. If 0, we're in block context.
        uint flowLevel_;
        /// Current indentation level.
        int indent_ = -1;
        /// Past indentation levels. Used as a stack.
        Appender!(int[]) indents_;

        /// Processed tokens not yet emitted. Used as a queue.
        Queue!Token tokens_;

        /// Number of tokens emitted through the getToken method.
        uint tokensTaken_;

        /// Can a simple key start at the current position? A simple key may start:
        /// - at the beginning of the line, not counting indentation spaces
        ///   (in block context),
        /// - after '{', '[', ',' (in the flow context),
        /// - after '?', ':', '-' (in the block context).
        /// In the block context, this flag also signifies if a block collection
        /// may start at the current position.
        bool allowSimpleKey_ = true;

        /// Possible simple keys indexed by flow levels.
        SimpleKey[] possibleSimpleKeys_;

    public:
        /// Construct a Scanner using specified Reader.
        this(Reader reader) @safe nothrow
        {
            reader_ = reader;
            // Queue the initial STREAM-START token so front()/empty() work immediately.
            fetchStreamStart();
        }

        /// Advance to the next token
        void popFront() @safe
        {
            ++tokensTaken_;
            tokens_.pop();
        }

        /// Return the current token, but do not remove it from the queue.
        const(Token) front() @safe
        {
            enforce(!empty, "No token left to peek");
            return tokens_.peek();
        }

        /// Return whether there are any more tokens left.
        bool empty() @safe
        {
            // Scan ahead until either the stream ends or enough tokens are
            // queued to answer the question.
            while (needMoreTokens())
            {
                fetchToken();
            }
            return tokens_.empty;
        }

        /// Set file name.
        void name(string name) @safe pure nothrow @nogc
        {
            reader_.name = name;
        }

    private:
        /// Most scanning error messages have the same format; so build them with this
        /// function.
        string expected(T)(string expected, T found)
        {
            return text("expected ", expected, ", but found ", found);
        }

        /// Determine whether or not we need to fetch more tokens before peeking/getting a token.
        bool needMoreTokens() @safe pure
        {
            if(done_)         { return false; }
            if(tokens_.empty) { return true; }

            // The current token may be a potential simple key, so we need to look further.
            stalePossibleSimpleKeys();
            return nextPossibleSimpleKey() == tokensTaken_;
        }

        /// Fetch a token, adding it to tokens_.
        void fetchToken() @safe
        {
            // Eat whitespaces and comments until we reach the next token.
            scanToNextToken();

            // Remove obsolete possible simple keys.
            stalePossibleSimpleKeys();

            // Compare current indentation and column. It may add some tokens
            // and decrease the current indentation level.
            unwindIndent(reader_.column);

            // Get the next character.
            const dchar c = reader_.peekByte();

            // Fetch the token.
            if(c == '\0')            { return fetchStreamEnd();     }
            if(checkDirective())     { return fetchDirective();     }
            if(checkDocumentStart()) { return fetchDocumentStart(); }
            if(checkDocumentEnd())   { return fetchDocumentEnd();   }
            // Order of the following checks is NOT significant.
            switch(c)
            {
                case '[':  return fetchFlowSequenceStart();
                case '{':  return fetchFlowMappingStart();
                case ']':  return fetchFlowSequenceEnd();
                case '}':  return fetchFlowMappingEnd();
                case ',':  return fetchFlowEntry();
                case '!':  return fetchTag();
                case '\'': return fetchSingle();
                case '\"': return fetchDouble();
                case '*':  return fetchAlias();
                case '&':  return fetchAnchor();
                case '?':  if(checkKey())        { return fetchKey();        } goto default;
                case ':':  if(checkValue())      { return fetchValue();      } goto default;
                case '-':  if(checkBlockEntry()) { return fetchBlockEntry(); } goto default;
                case '|':  if(flowLevel_ == 0)   { return fetchLiteral();    } break;
                case '>':  if(flowLevel_ == 0)   { return fetchFolded();     } break;
                default:   if(checkPlain())      { return fetchPlain();      }
            }

            throw new ScannerException("While scanning for the next token, found character " ~
                                       "\'%s\', index %s that cannot start any token"
                                       .format(c, to!int(c)), reader_.mark);
        }


        /// Return the token number of the nearest possible simple key.
        uint nextPossibleSimpleKey() @safe pure nothrow @nogc
        {
            uint minTokenNumber = uint.max;
            foreach(k, ref simpleKey; possibleSimpleKeys_)
            {
                if(simpleKey.isNull) { continue; }
                minTokenNumber = min(minTokenNumber, simpleKey.tokenIndex);
            }
            return minTokenNumber;
        }

        /// Remove entries that are no longer possible simple keys.
        ///
        /// According to the YAML specification, simple keys
        /// - should be limited to a single line,
        /// - should be no longer than 1024 characters.
        /// Disabling this will allow simple keys of any length and
        /// height (may cause problems if indentation is broken though).
        void stalePossibleSimpleKeys() @safe pure
        {
            foreach(level, ref key; possibleSimpleKeys_)
            {
                if(key.isNull) { continue; }
                // A simple key must fit on one line and start within 1024
                // characters of the current position; otherwise invalidate it.
                if(key.line != reader_.line || reader_.charIndex - key.charIndex > 1024)
                {
                    // A required key whose ':' was never found is a scanner error.
                    enforce(!key.required,
                            new ScannerException("While scanning a simple key",
                                                 Mark(reader_.name, key.line, key.column),
                                                 "could not find expected ':'", reader_.mark));
                    key.isNull = true;
                }
            }
        }

        /// Check if the next token starts a possible simple key and if so, save its position.
        ///
        /// This function is called for ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
        void savePossibleSimpleKey() @safe pure
        {
            // Check if a simple key is required at the current position.
            const required = (flowLevel_ == 0 && indent_ == reader_.column);
            assert(allowSimpleKey_ || !required, "A simple key is required only if it is " ~
                   "the first token in the current line. Therefore it is always allowed.");

            if(!allowSimpleKey_) { return; }

            // The next token might be a simple key, so save its number and position.
            removePossibleSimpleKey();
            const tokenCount = tokensTaken_ + cast(uint)tokens_.length;

            const line   = reader_.line;
            const column = reader_.column;
            const key    = SimpleKey(cast(uint)reader_.charIndex, tokenCount, line,
                                     cast(ushort)min(column, ushort.max), required);

            // Grow the per-flow-level array on demand.
            if(possibleSimpleKeys_.length <= flowLevel_)
            {
                const oldLength = possibleSimpleKeys_.length;
                possibleSimpleKeys_.length = flowLevel_ + 1;
                //No need to initialize the last element, it's already done in the next line.
                possibleSimpleKeys_[oldLength .. flowLevel_] = SimpleKey.init;
            }
            possibleSimpleKeys_[flowLevel_] = key;
        }

        /// Remove the saved possible key position at the current flow level.
328 void removePossibleSimpleKey() @safe pure 329 { 330 if(possibleSimpleKeys_.length <= flowLevel_) { return; } 331 332 if(!possibleSimpleKeys_[flowLevel_].isNull) 333 { 334 const key = possibleSimpleKeys_[flowLevel_]; 335 enforce(!key.required, 336 new ScannerException("While scanning a simple key", 337 Mark(reader_.name, key.line, key.column), 338 "could not find expected ':'", reader_.mark)); 339 possibleSimpleKeys_[flowLevel_].isNull = true; 340 } 341 } 342 343 /// Decrease indentation, removing entries in indents_. 344 /// 345 /// Params: column = Current column in the file/stream. 346 void unwindIndent(const int column) @safe 347 { 348 if(flowLevel_ > 0) 349 { 350 // In flow context, tokens should respect indentation. 351 // The condition should be `indent >= column` according to the spec. 352 // But this condition will prohibit intuitively correct 353 // constructions such as 354 // key : { 355 // } 356 357 // In the flow context, indentation is ignored. We make the scanner less 358 // restrictive than what the specification requires. 359 // if(pedantic_ && flowLevel_ > 0 && indent_ > column) 360 // { 361 // throw new ScannerException("Invalid intendation or unclosed '[' or '{'", 362 // reader_.mark) 363 // } 364 return; 365 } 366 367 // In block context, we may need to issue the BLOCK-END tokens. 368 while(indent_ > column) 369 { 370 indent_ = indents_.data.back; 371 assert(indents_.data.length); 372 indents_.shrinkTo(indents_.data.length - 1); 373 tokens_.push(blockEndToken(reader_.mark, reader_.mark)); 374 } 375 } 376 377 /// Increase indentation if needed. 378 /// 379 /// Params: column = Current column in the file/stream. 380 /// 381 /// Returns: true if the indentation was increased, false otherwise. 382 bool addIndent(int column) @safe 383 { 384 if(indent_ >= column){return false;} 385 indents_ ~= indent_; 386 indent_ = column; 387 return true; 388 } 389 390 391 /// Add STREAM-START token. 
        void fetchStreamStart() @safe nothrow
        {
            tokens_.push(streamStartToken(reader_.mark, reader_.mark, reader_.encoding));
        }

        /// Add STREAM-END token.
        void fetchStreamEnd() @safe
        {
            // Set indentation to -1.
            unwindIndent(-1);
            removePossibleSimpleKey();
            allowSimpleKey_ = false;
            possibleSimpleKeys_.destroy;

            tokens_.push(streamEndToken(reader_.mark, reader_.mark));
            done_ = true;
        }

        /// Add DIRECTIVE token.
        void fetchDirective() @safe
        {
            // Set indentation to -1.
            unwindIndent(-1);
            // Reset simple keys.
            removePossibleSimpleKey();
            allowSimpleKey_ = false;

            auto directive = scanDirective();
            tokens_.push(directive);
        }

        /// Add DOCUMENT-START or DOCUMENT-END token.
        void fetchDocumentIndicator(TokenID id)()
            if(id == TokenID.documentStart || id == TokenID.documentEnd)
        {
            // Set indentation to -1.
            unwindIndent(-1);
            // Reset simple keys. Note that there can't be a block collection after '---'.
            removePossibleSimpleKey();
            allowSimpleKey_ = false;

            Mark startMark = reader_.mark;
            // Skip the 3-character indicator ('---' or '...').
            reader_.forward(3);
            tokens_.push(simpleToken!id(startMark, reader_.mark));
        }

        /// Aliases to add DOCUMENT-START or DOCUMENT-END token.
        alias fetchDocumentStart = fetchDocumentIndicator!(TokenID.documentStart);
        alias fetchDocumentEnd = fetchDocumentIndicator!(TokenID.documentEnd);

        /// Add FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
        void fetchFlowCollectionStart(TokenID id)() @safe
        {
            // '[' and '{' may start a simple key.
            savePossibleSimpleKey();
            // Simple keys are allowed after '[' and '{'.
            allowSimpleKey_ = true;
            ++flowLevel_;

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(simpleToken!id(startMark, reader_.mark));
        }

        /// Aliases to add FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
        alias fetchFlowSequenceStart = fetchFlowCollectionStart!(TokenID.flowSequenceStart);
        alias fetchFlowMappingStart = fetchFlowCollectionStart!(TokenID.flowMappingStart);

        /// Add FLOW-SEQUENCE-END or FLOW-MAPPING-END token.
        void fetchFlowCollectionEnd(TokenID id)()
        {
            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // No simple keys after ']' and '}'.
            allowSimpleKey_ = false;
            --flowLevel_;

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(simpleToken!id(startMark, reader_.mark));
        }

        /// Aliases to add FLOW-SEQUENCE-END or FLOW-MAPPING-END token.
        alias fetchFlowSequenceEnd = fetchFlowCollectionEnd!(TokenID.flowSequenceEnd);
        alias fetchFlowMappingEnd = fetchFlowCollectionEnd!(TokenID.flowMappingEnd);

        /// Add FLOW-ENTRY token.
        void fetchFlowEntry() @safe
        {
            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // Simple keys are allowed after ','.
            allowSimpleKey_ = true;

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(flowEntryToken(startMark, reader_.mark));
        }

        /// Additional checks used in block context in fetchBlockEntry and fetchKey.
        ///
        /// Params: type = String representing the token type we might need to add.
        ///         id   = Token type we might need to add.
        void blockChecks(string type, TokenID id)()
        {
            enum context = type ~ " keys are not allowed here";
            // Are we allowed to start a key (not necessarily a simple one)?
            enforce(allowSimpleKey_, new ScannerException(context, reader_.mark));

            if(addIndent(reader_.column))
            {
                tokens_.push(simpleToken!id(reader_.mark, reader_.mark));
            }
        }

        /// Add BLOCK-ENTRY token. Might add BLOCK-SEQUENCE-START in the process.
        void fetchBlockEntry() @safe
        {
            if(flowLevel_ == 0) { blockChecks!("Sequence", TokenID.blockSequenceStart)(); }

            // It's an error for the block entry to occur in the flow context,
            // but we let the parser detect this.

            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // Simple keys are allowed after '-'.
            allowSimpleKey_ = true;

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(blockEntryToken(startMark, reader_.mark));
        }

        /// Add KEY token. Might add BLOCK-MAPPING-START in the process.
        void fetchKey() @safe
        {
            if(flowLevel_ == 0) { blockChecks!("Mapping", TokenID.blockMappingStart)(); }

            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // Simple keys are allowed after '?' in the block context.
            allowSimpleKey_ = (flowLevel_ == 0);

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(keyToken(startMark, reader_.mark));
        }

        /// Add VALUE token. Might add KEY and/or BLOCK-MAPPING-START in the process.
        void fetchValue() @safe
        {
            //Do we determine a simple key?
            if(possibleSimpleKeys_.length > flowLevel_ &&
               !possibleSimpleKeys_[flowLevel_].isNull)
            {
                const key = possibleSimpleKeys_[flowLevel_];
                possibleSimpleKeys_[flowLevel_].isNull = true;
                Mark keyMark = Mark(reader_.name, key.line, key.column);
                const idx = key.tokenIndex - tokensTaken_;

                // NOTE(review): idx is unsigned, so this assert can never fail;
                // an underflow in the subtraction above would wrap instead.
                // Confirm the invariant key.tokenIndex >= tokensTaken_.
                assert(idx >= 0);

                // Add KEY.
                // Manually inserting since tokens are immutable (need linked list).
                tokens_.insert(keyToken(keyMark, keyMark), idx);

                // If this key starts a new block mapping, we need to add BLOCK-MAPPING-START.
                if(flowLevel_ == 0 && addIndent(key.column))
                {
                    tokens_.insert(blockMappingStartToken(keyMark, keyMark), idx);
                }

                // There cannot be two simple keys in a row.
                allowSimpleKey_ = false;
            }
            // Part of a complex key
            else
            {
                // We can start a complex value if and only if we can start a simple key.
                enforce(flowLevel_ > 0 || allowSimpleKey_,
                        new ScannerException("Mapping values are not allowed here", reader_.mark));

                // If this value starts a new block mapping, we need to add
                // BLOCK-MAPPING-START. It'll be detected as an error later by the parser.
                if(flowLevel_ == 0 && addIndent(reader_.column))
                {
                    tokens_.push(blockMappingStartToken(reader_.mark, reader_.mark));
                }

                // Reset possible simple key on the current level.
                removePossibleSimpleKey();
                // Simple keys are allowed after ':' in the block context.
                allowSimpleKey_ = (flowLevel_ == 0);
            }

            // Add VALUE.
            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(valueToken(startMark, reader_.mark));
        }

        /// Add ALIAS or ANCHOR token.
        void fetchAnchor_(TokenID id)() @safe
            if(id == TokenID.alias_ || id == TokenID.anchor)
        {
            // ALIAS/ANCHOR could be a simple key.
            savePossibleSimpleKey();
            // No simple keys after ALIAS/ANCHOR.
            allowSimpleKey_ = false;

            auto anchor = scanAnchor(id);
            tokens_.push(anchor);
        }

        /// Aliases to add ALIAS or ANCHOR token.
        alias fetchAlias = fetchAnchor_!(TokenID.alias_);
        alias fetchAnchor = fetchAnchor_!(TokenID.anchor);

        /// Add TAG token.
        void fetchTag() @safe
        {
            //TAG could start a simple key.
            savePossibleSimpleKey();
            //No simple keys after TAG.
            allowSimpleKey_ = false;

            tokens_.push(scanTag());
        }

        /// Add block SCALAR token.
        void fetchBlockScalar(ScalarStyle style)() @safe
            if(style == ScalarStyle.literal || style == ScalarStyle.folded)
        {
            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // A simple key may follow a block scalar.
            allowSimpleKey_ = true;

            auto blockScalar = scanBlockScalar(style);
            tokens_.push(blockScalar);
        }

        /// Aliases to add literal or folded block scalar.
        alias fetchLiteral = fetchBlockScalar!(ScalarStyle.literal);
        alias fetchFolded = fetchBlockScalar!(ScalarStyle.folded);

        /// Add quoted flow SCALAR token.
        void fetchFlowScalar(ScalarStyle quotes)()
        {
            // A flow scalar could be a simple key.
            savePossibleSimpleKey();
            // No simple keys after flow scalars.
            allowSimpleKey_ = false;

            // Scan and add SCALAR.
            auto scalar = scanFlowScalar(quotes);
            tokens_.push(scalar);
        }

        /// Aliases to add single or double quoted flow scalar.
        alias fetchSingle = fetchFlowScalar!(ScalarStyle.singleQuoted);
        alias fetchDouble = fetchFlowScalar!(ScalarStyle.doubleQuoted);

        /// Add plain SCALAR token.
        void fetchPlain() @safe
        {
            // A plain scalar could be a simple key
            savePossibleSimpleKey();
            // No simple keys after plain scalars. But note that scanPlain() will
            // change this flag if the scan is finished at the beginning of the line.
            allowSimpleKey_ = false;
            auto plain = scanPlain();

            // Scan and add SCALAR. May change allowSimpleKey_
            tokens_.push(plain);
        }

    pure:

        ///Check if the next token is DIRECTIVE:        ^ '%' ...
        bool checkDirective() @safe
        {
            return reader_.peekByte() == '%' && reader_.column == 0;
        }

        /// Check if the next token is DOCUMENT-START:  ^ '---' (' '|'\n')
        bool checkDocumentStart() @safe
        {
            // Check one char first, then all 3, to prevent reading outside the buffer.
            return reader_.column == 0 &&
                   reader_.peekByte() == '-' &&
                   reader_.prefix(3) == "---" &&
                   reader_.peek(3).isWhiteSpace;
        }

        /// Check if the next token is DOCUMENT-END:    ^ '...' (' '|'\n')
        bool checkDocumentEnd() @safe
        {
            // Check one char first, then all 3, to prevent reading outside the buffer.
            return reader_.column == 0 &&
                   reader_.peekByte() == '.' &&
                   reader_.prefix(3) == "..." &&
                   reader_.peek(3).isWhiteSpace;
        }

        /// Check if the next token is BLOCK-ENTRY:     '-' (' '|'\n')
        bool checkBlockEntry() @safe
        {
            return !!reader_.peek(1).isWhiteSpace;
        }

        /// Check if the next token is KEY(flow context):   '?'
        ///
        /// or KEY(block context):   '?' (' '|'\n')
        bool checkKey() @safe
        {
            return (flowLevel_ > 0 || reader_.peek(1).isWhiteSpace);
        }

        /// Check if the next token is VALUE(flow context): ':'
        ///
        /// or VALUE(block context): ':' (' '|'\n')
        bool checkValue() @safe
        {
            return flowLevel_ > 0 || reader_.peek(1).isWhiteSpace;
        }

        /// Check if the next token is a plain scalar.
        ///
        /// A plain scalar may start with any non-space character except:
        ///   '-', '?', ':', ',', '[', ']', '{', '}',
        ///   '#', '&', '*', '!', '|', '>', '\'', '\"',
        ///   '%', '@', '`'.
        ///
        /// It may also start with
        ///   '-', '?', ':'
        /// if it is followed by a non-space character.
        ///
        /// Note that we limit the last rule to the block context (except the
        /// '-' character) because we want the flow context to be space
        /// independent.
        bool checkPlain() @safe
        {
            const c = reader_.peek();
            if(!c.isNonScalarStartCharacter)
            {
                return true;
            }
            return !reader_.peek(1).isWhiteSpace &&
                   (c == '-' || (flowLevel_ == 0 && (c == '?' || c == ':')));
        }

        /// Move to the next non-space character.
        void findNextNonSpace() @safe
        {
            while(reader_.peekByte() == ' ') { reader_.forward(); }
        }

        /// Scan a string of alphanumeric or "-_" characters.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanAlphaNumericToSlice(string name)(const Mark startMark)
        {
            size_t length;
            dchar c = reader_.peek();
            while(c.isAlphaNum || c.among!('-', '_')) { c = reader_.peek(++length); }

            enforce(length > 0, new ScannerException("While scanning " ~ name,
                    startMark, expected("alphanumeric, '-' or '_'", c), reader_.mark));

            reader_.sliceBuilder.write(reader_.get(length));
        }

        /// Scan and throw away all characters until next line break.
        void scanToNextBreak() @safe
        {
            while(!reader_.peek().isBreak) { reader_.forward(); }
        }

        /// Scan all characters until next line break.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanToNextBreakToSlice() @safe
        {
            uint length;
            while(!reader_.peek(length).isBreak)
            {
                ++length;
            }
            reader_.sliceBuilder.write(reader_.get(length));
        }


        /// Move to next token in the file/stream.
        ///
        /// We ignore spaces, line breaks and comments.
        /// If we find a line break in the block context, we set
        /// `allowSimpleKey_` on.
        ///
        /// We do not yet support BOM inside the stream as the
        /// specification requires. Any such mark will be considered as a part
        /// of the document.
        void scanToNextToken() @safe
        {
            // TODO(PyYAML): We need to make tab handling rules more sane. A good rule is:
            //   Tabs cannot precede tokens
            //   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
            //   KEY(block), VALUE(block), BLOCK-ENTRY
            // So the checking code is
            //   if <TAB>:
            //       allowSimpleKey_ = false
            // We also need to add the check for `allowSimpleKey_ == true` to
            // `unwindIndent` before issuing BLOCK-END.
            // Scanners for block, flow, and plain scalars need to be modified.

            for(;;)
            {
                //All whitespace in flow context is ignored, even whitespace
                // not allowed in other contexts
                if (flowLevel_ > 0)
                {
                    while(reader_.peekByte().isNonLinebreakWhitespace) { reader_.forward(); }
                }
                else
                {
                    findNextNonSpace();
                }
                if(reader_.peekByte() == '#') { scanToNextBreak(); }
                if(scanLineBreak() != '\0')
                {
                    if(flowLevel_ == 0) { allowSimpleKey_ = true; }
                }
                else
                {
                    break;
                }
            }
        }

        /// Scan directive token.
        Token scanDirective() @safe
        {
            Mark startMark = reader_.mark;
            // Skip the '%'.
            reader_.forward();

            // Scan directive name
            reader_.sliceBuilder.begin();
            scanDirectiveNameToSlice(startMark);
            const name = reader_.sliceBuilder.finish();

            reader_.sliceBuilder.begin();

            // Index where tag handle ends and suffix starts in a tag directive value.
            uint tagHandleEnd = uint.max;
            if(name == "YAML")     { scanYAMLDirectiveValueToSlice(startMark); }
            else if(name == "TAG") { tagHandleEnd = scanTagDirectiveValueToSlice(startMark); }
            char[] value = reader_.sliceBuilder.finish();

            Mark endMark = reader_.mark;

            DirectiveType directive;
            if(name == "YAML")     { directive = DirectiveType.yaml; }
            else if(name == "TAG") { directive = DirectiveType.tag; }
            else
            {
                // Unknown directive: skip its arguments.
                directive = DirectiveType.reserved;
                scanToNextBreak();
            }

            scanDirectiveIgnoredLine(startMark);

            return directiveToken(startMark, endMark, value, directive, tagHandleEnd);
        }

        /// Scan name of a directive token.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanDirectiveNameToSlice(const Mark startMark) @safe
        {
            // Scan directive name.
            scanAlphaNumericToSlice!"a directive"(startMark);

            // The name must be terminated by a space or a line break.
            // NOTE(review): the error message reuses the "alphanumeric, '-' or '_'"
            // wording although the check here is for a terminator character.
            enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
                    new ScannerException("While scanning a directive", startMark,
                                         expected("alphanumeric, '-' or '_'", reader_.peek()), reader_.mark));
        }

        /// Scan value of a YAML directive token. Returns major, minor version separated by '.'.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanYAMLDirectiveValueToSlice(const Mark startMark) @safe
        {
            findNextNonSpace();

            scanYAMLDirectiveNumberToSlice(startMark);

            enforce(reader_.peekByte() == '.',
                    new ScannerException("While scanning a directive", startMark,
                                         expected("digit or '.'", reader_.peek()), reader_.mark));
            // Skip the '.'.
            reader_.forward();

            reader_.sliceBuilder.write('.');
            scanYAMLDirectiveNumberToSlice(startMark);

            // The version number must be terminated by a space or a line break.
            enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
                    new ScannerException("While scanning a directive", startMark,
                                         expected("digit or '.'", reader_.peek()), reader_.mark));
        }

        /// Scan a number from a YAML directive.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanYAMLDirectiveNumberToSlice(const Mark startMark) @safe
        {
            enforce(isDigit(reader_.peek()),
                    new ScannerException("While scanning a directive", startMark,
                                         expected("digit", reader_.peek()), reader_.mark));

            // Already found the first digit in the enforce(), so set length to 1.
            uint length = 1;
            while(reader_.peek(length).isDigit) { ++length; }

            reader_.sliceBuilder.write(reader_.get(length));
        }

        /// Scan value of a tag directive.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        ///
        /// Returns: Length of tag handle (which is before tag prefix) in scanned data
        uint scanTagDirectiveValueToSlice(const Mark startMark) @safe
        {
            findNextNonSpace();
            const startLength = reader_.sliceBuilder.length;
            scanTagDirectiveHandleToSlice(startMark);
            const handleLength = cast(uint)(reader_.sliceBuilder.length - startLength);
            findNextNonSpace();
            scanTagDirectivePrefixToSlice(startMark);

            return handleLength;
        }

        /// Scan handle of a tag directive.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanTagDirectiveHandleToSlice(const Mark startMark) @safe
        {
            scanTagHandleToSlice!"directive"(startMark);
            // The handle must be followed by a space separating it from the prefix.
            enforce(reader_.peekByte() == ' ',
                    new ScannerException("While scanning a directive handle", startMark,
                                         expected("' '", reader_.peek()), reader_.mark));
        }

        /// Scan prefix of a tag directive.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanTagDirectivePrefixToSlice(const Mark startMark) @safe
        {
            scanTagURIToSlice!"directive"(startMark);
            // The prefix must be terminated by a space or a line break.
            enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
                    new ScannerException("While scanning a directive prefix", startMark,
                                         expected("' '", reader_.peek()), reader_.mark));
        }

        /// Scan (and ignore) ignored line after a directive.
        void scanDirectiveIgnoredLine(const Mark startMark) @safe
        {
            findNextNonSpace();
            if(reader_.peekByte() == '#') { scanToNextBreak(); }
            enforce(reader_.peek().isBreak,
                    new ScannerException("While scanning a directive", startMark,
                                         expected("comment or a line break", reader_.peek()),
                                         reader_.mark));
            scanLineBreak();
        }


        /// Scan an alias or an anchor.
        ///
        /// The specification does not restrict characters for anchors and
        /// aliases.
        /// This may lead to problems, for instance, the document:
        ///   [ *alias, value ]
        /// can be interpreted in two ways, as
        ///   [ "value" ]
        /// and
        ///   [ *alias , "value" ]
        /// Therefore we restrict aliases to ASCII alphanumeric characters.
        Token scanAnchor(const TokenID id) @safe
        {
            const startMark = reader_.mark;
            // Consume the leading '*' (alias) or '&' (anchor) indicator.
            const dchar i = reader_.get();

            reader_.sliceBuilder.begin();
            if(i == '*') { scanAlphaNumericToSlice!"an alias"(startMark); }
            else         { scanAlphaNumericToSlice!"an anchor"(startMark); }
            // On error, value is discarded as we return immediately
            char[] value = reader_.sliceBuilder.finish();

            enum anchorCtx = "While scanning an anchor";
            enum aliasCtx  = "While scanning an alias";
            // The name must be followed by whitespace or flow punctuation.
            enforce(reader_.peek().isWhiteSpace ||
                    reader_.peekByte().among!('?', ':', ',', ']', '}', '%', '@'),
                    new ScannerException(i == '*' ? aliasCtx : anchorCtx, startMark,
                                         expected("alphanumeric, '-' or '_'", reader_.peek()), reader_.mark));

            if(id == TokenID.alias_)
            {
                return aliasToken(startMark, reader_.mark, value);
            }
            if(id == TokenID.anchor)
            {
                return anchorToken(startMark, reader_.mark, value);
            }
            assert(false, "This code should never be reached");
        }

        /// Scan a tag token.
        Token scanTag() @safe
        {
            const startMark = reader_.mark;
            dchar c = reader_.peek(1);

            reader_.sliceBuilder.begin();
            scope(failure) { reader_.sliceBuilder.finish(); }
            // Index where tag handle ends and tag suffix starts in the tag value
            // (slice) we will produce.
            uint handleEnd;

            if(c == '<')
            {
                // Verbatim tag: '!<' URI '>'.
                reader_.forward(2);

                handleEnd = 0;
                scanTagURIToSlice!"tag"(startMark);
                enforce(reader_.peekByte() == '>',
                        new ScannerException("While scanning a tag", startMark,
                                             expected("'>'", reader_.peek()), reader_.mark));
                reader_.forward();
            }
            else if(c.isWhiteSpace)
            {
                // Non-specific tag: a lone '!'.
                reader_.forward();
                handleEnd = 0;
                reader_.sliceBuilder.write('!');
            }
            else
            {
                // Shorthand tag: look ahead for a second '!' to decide whether
                // there is a named handle ('!handle!suffix') or just '!suffix'.
                uint length = 1;
                bool useHandle;

                while(!c.isBreakOrSpace)
                {
                    if(c == '!')
                    {
                        useHandle = true;
                        break;
                    }
                    ++length;
                    c = reader_.peek(length);
                }

                if(useHandle)
                {
                    scanTagHandleToSlice!"tag"(startMark);
                    handleEnd = cast(uint)reader_.sliceBuilder.length;
                }
                else
                {
                    reader_.forward();
                    reader_.sliceBuilder.write('!');
                    handleEnd = cast(uint)reader_.sliceBuilder.length;
                }

                scanTagURIToSlice!"tag"(startMark);
            }

            // The tag must be followed by a space or a line break.
            enforce(reader_.peek().isBreakOrSpace,
                    new ScannerException("While scanning a tag", startMark, expected("' '", reader_.peek()),
                                         reader_.mark));

            char[] slice = reader_.sliceBuilder.finish();
            return tagToken(startMark, reader_.mark, slice, handleEnd);
        }

        /// Scan a block scalar token with specified style.
        Token scanBlockScalar(const ScalarStyle style) @safe
        {
            const startMark = reader_.mark;

            // Scan the header.
            reader_.forward();

            const indicators = scanBlockScalarIndicators(startMark);

            const chomping = indicators[0];
            const increment = indicators[1];
            scanBlockScalarIgnoredLine(startMark);

            // Determine the indentation level and go to the first non-empty line.
1100 Mark endMark; 1101 uint indent = max(1, indent_ + 1); 1102 1103 reader_.sliceBuilder.begin(); 1104 alias Transaction = SliceBuilder.Transaction; 1105 // Used to strip the last line breaks written to the slice at the end of the 1106 // scalar, which may be needed based on chomping. 1107 Transaction breaksTransaction = Transaction(&reader_.sliceBuilder); 1108 // Read the first indentation/line breaks before the scalar. 1109 size_t startLen = reader_.sliceBuilder.length; 1110 if(increment == int.min) 1111 { 1112 auto indentation = scanBlockScalarIndentationToSlice(); 1113 endMark = indentation[1]; 1114 indent = max(indent, indentation[0]); 1115 } 1116 else 1117 { 1118 indent += increment - 1; 1119 endMark = scanBlockScalarBreaksToSlice(indent); 1120 } 1121 1122 // int.max means there's no line break (int.max is outside UTF-32). 1123 dchar lineBreak = cast(dchar)int.max; 1124 1125 // Scan the inner part of the block scalar. 1126 while(reader_.column == indent && reader_.peekByte() != '\0') 1127 { 1128 breaksTransaction.commit(); 1129 const bool leadingNonSpace = !reader_.peekByte().among!(' ', '\t'); 1130 // This is where the 'interesting' non-whitespace data gets read. 1131 scanToNextBreakToSlice(); 1132 lineBreak = scanLineBreak(); 1133 1134 1135 // This transaction serves to rollback data read in the 1136 // scanBlockScalarBreaksToSlice() call. 1137 breaksTransaction = Transaction(&reader_.sliceBuilder); 1138 startLen = reader_.sliceBuilder.length; 1139 // The line breaks should actually be written _after_ the if() block 1140 // below. We work around that by inserting 1141 endMark = scanBlockScalarBreaksToSlice(indent); 1142 1143 // This will not run during the last iteration (see the if() vs the 1144 // while()), hence breaksTransaction rollback (which happens after this 1145 // loop) will never roll back data written in this if() block. 1146 if(reader_.column == indent && reader_.peekByte() != '\0') 1147 { 1148 // Unfortunately, folding rules are ambiguous. 
1149 1150 // This is the folding according to the specification: 1151 if(style == ScalarStyle.folded && lineBreak == '\n' && 1152 leadingNonSpace && !reader_.peekByte().among!(' ', '\t')) 1153 { 1154 // No breaks were scanned; no need to insert the space in the 1155 // middle of slice. 1156 if(startLen == reader_.sliceBuilder.length) 1157 { 1158 reader_.sliceBuilder.write(' '); 1159 } 1160 } 1161 else 1162 { 1163 // We need to insert in the middle of the slice in case any line 1164 // breaks were scanned. 1165 reader_.sliceBuilder.insert(lineBreak, startLen); 1166 } 1167 1168 ////this is Clark Evans's interpretation (also in the spec 1169 ////examples): 1170 // 1171 //if(style == ScalarStyle.folded && lineBreak == '\n') 1172 //{ 1173 // if(startLen == endLen) 1174 // { 1175 // if(!" \t"d.canFind(reader_.peekByte())) 1176 // { 1177 // reader_.sliceBuilder.write(' '); 1178 // } 1179 // else 1180 // { 1181 // chunks ~= lineBreak; 1182 // } 1183 // } 1184 //} 1185 //else 1186 //{ 1187 // reader_.sliceBuilder.insertBack(lineBreak, endLen - startLen); 1188 //} 1189 } 1190 else 1191 { 1192 break; 1193 } 1194 } 1195 1196 // If chompint is Keep, we keep (commit) the last scanned line breaks 1197 // (which are at the end of the scalar). Otherwise re remove them (end the 1198 // transaction). 1199 if(chomping == Chomping.keep) { breaksTransaction.commit(); } 1200 else { breaksTransaction.end(); } 1201 if(chomping != Chomping.strip && lineBreak != int.max) 1202 { 1203 // If chomping is Keep, we keep the line break but the first line break 1204 // that isn't stripped (since chomping isn't Strip in this branch) must 1205 // be inserted _before_ the other line breaks. 
1206 if(chomping == Chomping.keep) 1207 { 1208 reader_.sliceBuilder.insert(lineBreak, startLen); 1209 } 1210 // If chomping is not Keep, breaksTransaction was cancelled so we can 1211 // directly write the first line break (as it isn't stripped - chomping 1212 // is not Strip) 1213 else 1214 { 1215 reader_.sliceBuilder.write(lineBreak); 1216 } 1217 } 1218 1219 char[] slice = reader_.sliceBuilder.finish(); 1220 return scalarToken(startMark, endMark, slice, style); 1221 } 1222 1223 /// Scan chomping and indentation indicators of a scalar token. 1224 Tuple!(Chomping, int) scanBlockScalarIndicators(const Mark startMark) @safe 1225 { 1226 auto chomping = Chomping.clip; 1227 int increment = int.min; 1228 dchar c = reader_.peek(); 1229 1230 /// Indicators can be in any order. 1231 if(getChomping(c, chomping)) 1232 { 1233 getIncrement(c, increment, startMark); 1234 } 1235 else 1236 { 1237 const gotIncrement = getIncrement(c, increment, startMark); 1238 if(gotIncrement) { getChomping(c, chomping); } 1239 } 1240 1241 enforce(c.among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'), 1242 new ScannerException("While scanning a block scalar", startMark, 1243 expected("chomping or indentation indicator", c), reader_.mark)); 1244 1245 return tuple(chomping, increment); 1246 } 1247 1248 /// Get chomping indicator, if detected. Return false otherwise. 1249 /// 1250 /// Used in scanBlockScalarIndicators. 1251 /// 1252 /// Params: 1253 /// 1254 /// c = The character that may be a chomping indicator. 1255 /// chomping = Write the chomping value here, if detected. 1256 bool getChomping(ref dchar c, ref Chomping chomping) @safe 1257 { 1258 if(!c.among!('+', '-')) { return false; } 1259 chomping = c == '+' ? Chomping.keep : Chomping.strip; 1260 reader_.forward(); 1261 c = reader_.peek(); 1262 return true; 1263 } 1264 1265 /// Get increment indicator, if detected. Return false otherwise. 1266 /// 1267 /// Used in scanBlockScalarIndicators. 
///
/// Params:
///
/// c         = The character that may be an increment indicator.
///             If an increment indicator is detected, this will be updated to
///             the next character in the Reader.
/// increment = Write the increment value here, if detected.
/// startMark = Mark for error messages.
///
/// Throws: ScannerException if the indicator digit is 0 (the valid range is
///         1-9).
bool getIncrement(ref dchar c, ref int increment, const Mark startMark) @safe
{
    if(!c.isDigit) { return false; }
    // Convert a digit to integer.
    increment = c - '0';
    assert(increment < 10 && increment >= 0, "Digit has invalid value");

    // '0' is not a valid indentation indicator.
    enforce(increment > 0,
        new ScannerException("While scanning a block scalar", startMark,
            expected("indentation indicator in range 1-9", "0"), reader_.mark));

    reader_.forward();
    c = reader_.peek();
    return true;
}

/// Scan (and ignore) ignored line in a block scalar.
///
/// After the block scalar header only whitespace and an optional comment may
/// appear before the line break.
///
/// Throws: ScannerException if anything else is found on the line.
void scanBlockScalarIgnoredLine(const Mark startMark) @safe
{
    findNextNonSpace();
    if(reader_.peekByte()== '#') { scanToNextBreak(); }

    enforce(reader_.peek().isBreak,
        new ScannerException("While scanning a block scalar", startMark,
            expected("comment or line break", reader_.peek()), reader_.mark));

    scanLineBreak();
}

/// Scan indentation in a block scalar, returning line breaks, max indent and end mark.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice.
/// Returns: Tuple of the maximum indentation (column) seen and the mark after
///          the last line break scanned.
Tuple!(uint, Mark) scanBlockScalarIndentationToSlice() @safe
{
    uint maxIndent;
    Mark endMark = reader_.mark;

    while(reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029'))
    {
        if(reader_.peekByte() != ' ')
        {
            // A line break: write it to the slice and restart indentation
            // measurement on the next line.
            reader_.sliceBuilder.write(scanLineBreak());
            endMark = reader_.mark;
            continue;
        }
        reader_.forward();
        maxIndent = max(reader_.column, maxIndent);
    }

    return tuple(maxIndent, endMark);
}

/// Scan line breaks at lower or specified indentation in a block scalar.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice.
///
/// Params:  indent = Indentation (column) up to which leading spaces are
///                   skipped on each line.
///
/// Returns: Mark after the last line break scanned.
Mark scanBlockScalarBreaksToSlice(const uint indent) @safe
{
    Mark endMark = reader_.mark;

    for(;;)
    {
        // Skip indentation spaces, then consume the line break, if any.
        while(reader_.column < indent && reader_.peekByte() == ' ') { reader_.forward(); }
        if(!reader_.peek().among!('\n', '\r', '\u0085', '\u2028', '\u2029')) { break; }
        reader_.sliceBuilder.write(scanLineBreak());
        endMark = reader_.mark;
    }

    return endMark;
}

/// Scan a quoted flow scalar token with specified quotes.
///
/// Params:  quotes = ScalarStyle.singleQuoted or ScalarStyle.doubleQuoted.
///
/// Returns: The scanned SCALAR token.
Token scanFlowScalar(const ScalarStyle quotes) @safe
{
    const startMark = reader_.mark;
    const quote = reader_.get();

    reader_.sliceBuilder.begin();

    scanFlowScalarNonSpacesToSlice(quotes, startMark);

    // Alternate between non-space and space runs until the closing quote.
    // (Unexpected end of input is detected in scanFlowScalarSpacesToSlice.)
    while(reader_.peek() != quote)
    {
        scanFlowScalarSpacesToSlice(startMark);
        scanFlowScalarNonSpacesToSlice(quotes, startMark);
    }
    reader_.forward();

    auto slice = reader_.sliceBuilder.finish();
    return scalarToken(startMark, reader_.mark, slice, quotes);
}

/// Scan nonspace characters in a flow scalar.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice.
/// Params:  quotes    = Style of the scalar being scanned (single or double
///                      quoted); determines which escapes are recognized.
///          startMark = Mark of the beginning of the scalar, for errors.
void scanFlowScalarNonSpacesToSlice(const ScalarStyle quotes, const Mark startMark)
    @safe
{
    for(;;)
    {
        dchar c = reader_.peek();

        // Copy a run of ordinary characters in one go.
        size_t numCodePoints;
        while(!reader_.peek(numCodePoints).isFlowScalarBreakSpace) { ++numCodePoints; }

        if (numCodePoints > 0) { reader_.sliceBuilder.write(reader_.get(numCodePoints)); }

        c = reader_.peek();
        if(quotes == ScalarStyle.singleQuoted && c == '\'' && reader_.peek(1) == '\'')
        {
            // '' inside a single quoted scalar is an escaped single quote.
            reader_.forward(2);
            reader_.sliceBuilder.write('\'');
        }
        else if((quotes == ScalarStyle.doubleQuoted && c == '\'') ||
                (quotes == ScalarStyle.singleQuoted && c.among!('"', '\\')))
        {
            // These characters have no special meaning in the current style;
            // copy them through verbatim.
            reader_.forward();
            reader_.sliceBuilder.write(c);
        }
        else if(quotes == ScalarStyle.doubleQuoted && c == '\\')
        {
            reader_.forward();
            c = reader_.peek();
            if(c.among!(escapes))
            {
                reader_.forward();
                // Escaping has been moved to Parser as it can't be done in
                // place (in a slice) in case of '\P' and '\L' (very uncommon,
                // but we don't want to break the spec)
                char[2] escapeSequence = ['\\', cast(char)c];
                reader_.sliceBuilder.write(escapeSequence);
            }
            else if(c.among!(escapeHexCodeList))
            {
                // \x, \u or \U escape: a fixed number of hex digits follows.
                const hexLength = dyaml.escapes.escapeHexLength(c);
                reader_.forward();

                foreach(i; 0 .. hexLength) {
                    enforce(reader_.peek(i).isHexDigit,
                        new ScannerException("While scanning a double quoted scalar", startMark,
                            expected("escape sequence of hexadecimal numbers",
                                reader_.peek(i)), reader_.mark));
                }
                char[] hex = reader_.get(hexLength);

                enforce((hex.length > 0) && (hex.length <= 8),
                    new ScannerException("While scanning a double quoted scalar", startMark,
                        "overflow when parsing an escape sequence of " ~
                        "hexadecimal numbers.", reader_.mark));

                // Like above, the escape is passed through for the Parser to
                // resolve.
                char[2] escapeStart = ['\\', cast(char) c];
                reader_.sliceBuilder.write(escapeStart);
                reader_.sliceBuilder.write(hex);

            }
            else if(c.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
            {
                // Escaped line break: the break itself is elided.
                scanLineBreak();
                scanFlowScalarBreaksToSlice(startMark);
            }
            else
            {
                throw new ScannerException("While scanning a double quoted scalar", startMark,
                    text("found unsupported escape character ", c),
                    reader_.mark);
            }
        }
        else { return; }
    }
}

/// Scan space characters in a flow scalar.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// spaces into that slice.
///
/// Throws: ScannerException on unexpected end of buffer inside the scalar.
void scanFlowScalarSpacesToSlice(const Mark startMark) @safe
{
    // Increase length as long as we see whitespace.
    size_t length;
    while(reader_.peekByte(length).among!(' ', '\t')) { ++length; }
    auto whitespaces = reader_.prefixBytes(length);

    // Can check the last byte without striding because '\0' is ASCII
    const c = reader_.peek(length);
    enforce(c != '\0',
        new ScannerException("While scanning a quoted scalar", startMark,
            "found unexpected end of buffer", reader_.mark));

    // Spaces not followed by a line break.
    if(!c.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
    {
        reader_.forward(length);
        reader_.sliceBuilder.write(whitespaces);
        return;
    }

    // There's a line break after the spaces.
    reader_.forward(length);
    const lineBreak = scanLineBreak();

    // A non-'\n' break is preserved; a single '\n' folds to a space below.
    if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }

    // If we have extra line breaks after the first, scan them into the
    // slice.
    const bool extraBreaks = scanFlowScalarBreaksToSlice(startMark);

    // No extra breaks, one normal line break. Replace it with a space.
    if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
}

/// Scan line breaks in a flow scalar.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// line breaks into that slice.
///
/// Returns: true if at least one line break was scanned, false otherwise.
///
/// Throws:  ScannerException if a document separator ("---"/"...") is found
///          inside the scalar.
bool scanFlowScalarBreaksToSlice(const Mark startMark) @safe
{
    // True if at least one line break was found.
    bool anyBreaks;
    for(;;)
    {
        // Instead of checking indentation, we check for document separators.
        const prefix = reader_.prefix(3);
        enforce(!(prefix == "---" || prefix == "...") ||
            !reader_.peek(3).isWhiteSpace,
            new ScannerException("While scanning a quoted scalar", startMark,
                "found unexpected document separator", reader_.mark));

        // Skip any whitespaces.
        while(reader_.peekByte().among!(' ', '\t')) { reader_.forward(); }

        // Encountered a non-whitespace non-linebreak character, so we're done.
        if(!reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029')) { break; }

        const lineBreak = scanLineBreak();
        anyBreaks = true;
        reader_.sliceBuilder.write(lineBreak);
    }
    return anyBreaks;
}

/// Scan plain scalar token (no block, no quotes).
///
/// Returns: The scanned SCALAR token with ScalarStyle.plain.
Token scanPlain() @safe
{
    // We keep track of the allowSimpleKey_ flag here.
    // Indentation rules are loosened for the flow context
    const startMark = reader_.mark;
    Mark endMark = startMark;
    const indent = indent_ + 1;

    // We allow zero indentation for scalars, but then we need to check for
    // document separators at the beginning of the line.
    // if(indent == 0) { indent = 1; }

    reader_.sliceBuilder.begin();

    alias Transaction = SliceBuilder.Transaction;
    Transaction spacesTransaction;
    // Stop at a comment.
    while(reader_.peekByte() != '#')
    {
        // Scan the entire plain scalar.
        size_t length;
        dchar c = reader_.peek(length);
        for(;;)
        {
            const cNext = reader_.peek(length + 1);
            // A plain scalar run ends at whitespace, at ": " in block
            // context, or at any flow indicator in flow context.
            if(c.isWhiteSpace ||
               (flowLevel_ == 0 && c == ':' && cNext.isWhiteSpace) ||
               (flowLevel_ > 0 && c.among!(',', ':', '?', '[', ']', '{', '}')))
            {
                break;
            }
            ++length;
            c = cNext;
        }

        // It's not clear what we should do with ':' in the flow context.
        enforce(flowLevel_ == 0 || c != ':' ||
            reader_.peek(length + 1).isWhiteSpace ||
            reader_.peek(length + 1).among!(',', '[', ']', '{', '}'),
            new ScannerException("While scanning a plain scalar", startMark,
                "found unexpected ':' . Please check " ~
                "http://pyyaml.org/wiki/YAMLColonInFlowContext for details.",
                reader_.mark));

        if(length == 0) { break; }

        allowSimpleKey_ = false;

        reader_.sliceBuilder.write(reader_.get(length));

        endMark = reader_.mark;

        spacesTransaction.commit();
        spacesTransaction = Transaction(&reader_.sliceBuilder);

        const startLength = reader_.sliceBuilder.length;
        scanPlainSpacesToSlice();
        // Stop if no spaces/breaks followed, or if the next line is not
        // indented enough (block context only).
        if(startLength == reader_.sliceBuilder.length ||
           (flowLevel_ == 0 && reader_.column < indent))
        {
            break;
        }
    }

    // Roll back trailing whitespace that is not part of the scalar.
    spacesTransaction.end();
    char[] slice = reader_.sliceBuilder.finish();

    return scalarToken(startMark, endMark, slice, ScalarStyle.plain);
}

/// Scan spaces in a plain scalar.
///
/// Assumes that the caller is building a slice in Reader, and puts the spaces
/// into that slice.
void scanPlainSpacesToSlice() @safe
{
    // The specification is really confusing about tabs in plain scalars.
    // We just forbid them completely. Do not use tabs in YAML!

    // Get as many plain spaces as there are.
    size_t length;
    while(reader_.peekByte(length) == ' ') { ++length; }
    char[] whitespaces = reader_.prefixBytes(length);
    reader_.forward(length);

    const dchar c = reader_.peek();
    if(!c.isNSChar)
    {
        // We have spaces, but no newline.
        if(whitespaces.length > 0) { reader_.sliceBuilder.write(whitespaces); }
        return;
    }

    // Newline after the spaces (if any)
    const lineBreak = scanLineBreak();
    // A line break after a plain scalar may start a new simple key.
    allowSimpleKey_ = true;

    // True if the reader is at a document separator ("---" or "...")
    // followed by whitespace.
    static bool end(Reader reader_) @safe pure
    {
        const prefix = reader_.prefix(3);
        return ("---" == prefix || "..." == prefix)
                && reader_.peek(3).among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');
    }

    if(end(reader_)) { return; }

    bool extraBreaks;

    alias Transaction = SliceBuilder.Transaction;
    auto transaction = Transaction(&reader_.sliceBuilder);
    if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }
    while(reader_.peek().isNSChar)
    {
        if(reader_.peekByte() == ' ') { reader_.forward(); }
        else
        {
            const lBreak = scanLineBreak();
            extraBreaks = true;
            reader_.sliceBuilder.write(lBreak);

            // On a document separator, the transaction is rolled back
            // (never committed), discarding the breaks written above.
            if(end(reader_)) { return; }
        }
    }
    transaction.commit();

    // No line breaks, only a space.
    if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
}

/// Scan handle of a tag token.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice.
///
/// Params:  name      = Compile-time context name ("tag" or "directive") used
///                      in error messages.
///          startMark = Mark of the beginning of the construct, for errors.
///
/// Throws:  ScannerException if the handle does not start (and, when longer
///          than one character, end) with '!'.
void scanTagHandleToSlice(string name)(const Mark startMark)
{
    dchar c = reader_.peek();
    enum contextMsg = "While scanning a " ~ name;
    enforce(c == '!',
        new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark));

    uint length = 1;
    c = reader_.peek(length);
    if(c != ' ')
    {
        // Handle body may contain alphanumerics, '-' and '_', and must be
        // closed by another '!'.
        while(c.isAlphaNum || c.among!('-', '_'))
        {
            ++length;
            c = reader_.peek(length);
        }
        enforce(c == '!',
            new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark));
        ++length;
    }

    reader_.sliceBuilder.write(reader_.get(length));
}

/// Scan URI in a tag token.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice.
///
/// Params:  name      = Compile-time context name ("tag" or "directive") used
///                      in error messages.
///          startMark = Mark of the beginning of the construct, for errors.
///
/// Throws:  ScannerException if no URI characters were scanned.
void scanTagURIToSlice(string name)(const Mark startMark)
{
    // Note: we do not check if URI is well-formed.
    dchar c = reader_.peek();
    const startLen = reader_.sliceBuilder.length;
    {
        uint length;
        while(c.isAlphaNum || c.isURIChar)
        {
            if(c == '%')
            {
                // Flush the pending run, then decode the %XX escape(s).
                auto chars = reader_.get(length);
                reader_.sliceBuilder.write(chars);
                length = 0;
                scanURIEscapesToSlice!name(startMark);
            }
            else { ++length; }
            c = reader_.peek(length);
        }
        if(length > 0)
        {
            auto chars = reader_.get(length);
            reader_.sliceBuilder.write(chars);
            length = 0;
        }
    }
    // OK if we scanned something, error otherwise.
    enum contextMsg = "While parsing a " ~ name;
    enforce(reader_.sliceBuilder.length > startLen,
        new ScannerException(contextMsg, startMark, expected("URI", c), reader_.mark));
}

// Not @nogc yet because std.utf.decode is not @nogc
/// Scan URI escape sequences.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice.
///
/// Params:  name      = Compile-time context name ("tag" or "directive") used
///                      in error messages.
///          startMark = Mark of the beginning of the construct, for errors.
///
/// Throws:  ScannerException on a malformed %XX sequence or if the decoded
///          bytes are not valid UTF-8.
void scanURIEscapesToSlice(string name)(const Mark startMark)
{
    import core.exception : UnicodeException;
    // URI escapes encode a UTF-8 string. We store UTF-8 code units here for
    // decoding into UTF-32.
    Appender!string buffer;


    enum contextMsg = "While scanning a " ~ name;
    while(reader_.peekByte() == '%')
    {
        reader_.forward();
        char[2] nextByte = [reader_.peekByte(), reader_.peekByte(1)];

        enforce(nextByte[0].isHexDigit && nextByte[1].isHexDigit,
            new ScannerException(contextMsg, startMark,
                expected("URI escape sequence of 2 hexadecimal " ~
                    "numbers", nextByte), reader_.mark));

        // Parse the two hex digits into one byte.
        buffer ~= nextByte[].to!ubyte(16);

        reader_.forward(2);
    }
    try
    {
        // Re-decode the accumulated UTF-8 bytes as code points.
        foreach (dchar chr; buffer.data)
        {
            reader_.sliceBuilder.write(chr);
        }
    }
    catch (UnicodeException)
    {
        throw new ScannerException(contextMsg, startMark,
                "Invalid UTF-8 data encoded in URI escape sequence",
                reader_.mark);
    }
}


/// Scan a line break, if any.
///
/// Transforms:
///   '\r\n'      :   '\n'
///   '\r'        :   '\n'
///   '\n'        :   '\n'
///   '\u0085'    :   '\n'
///   '\u2028'    :   '\u2028'
///   '\u2029     :   '\u2029'
///   no break    :   '\0'
dchar scanLineBreak() @safe
{
    // Fast path for ASCII line breaks.
    const b = reader_.peekByte();
    if(b < 0x80)
    {
        if(b == '\n' || b == '\r')
        {
            // CRLF is consumed as a single break.
            if(reader_.prefix(2) == "\r\n") { reader_.forward(2); }
            else { reader_.forward(); }
            return '\n';
        }
        return '\0';
    }

    // Non-ASCII breaks: NEL (U+0085) normalizes to '\n'; LS/PS pass through.
    const c = reader_.peek();
    if(c == '\x85')
    {
        reader_.forward();
        return '\n';
    }
    if(c == '\u2028' || c == '\u2029')
    {
        reader_.forward();
        return c;
    }
    return '\0';
}
}