// Copyright Ferdinand Majerech 2011-2014.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)

/// YAML scanner.
/// Code based on PyYAML: http://www.pyyaml.org
module dyaml.scanner;


import core.stdc.string;

import std.algorithm;
import std.array;
import std.conv;
import std.ascii : isAlphaNum, isDigit, isHexDigit;
import std.exception;
import std.string;
import std.typecons;
import std.traits : Unqual;
import std.utf;

import dyaml.escapes;
import dyaml.exception;
import dyaml.queue;
import dyaml.reader;
import dyaml.style;
import dyaml.token;

package:
/// Scanner produces tokens of the following types:
/// STREAM-START
/// STREAM-END
/// DIRECTIVE(name, value)
/// DOCUMENT-START
/// DOCUMENT-END
/// BLOCK-SEQUENCE-START
/// BLOCK-MAPPING-START
/// BLOCK-END
/// FLOW-SEQUENCE-START
/// FLOW-MAPPING-START
/// FLOW-SEQUENCE-END
/// FLOW-MAPPING-END
/// BLOCK-ENTRY
/// FLOW-ENTRY
/// KEY
/// VALUE
/// ALIAS(value)
/// ANCHOR(value)
/// TAG(value)
/// SCALAR(value, plain, style)

/// Characters that terminate a line (or the stream, via '\0').
alias isBreak = among!('\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

alias isBreakOrSpace = among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

alias isWhiteSpace = among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

alias isNonScalarStartCharacter = among!('-', '?', ':', ',', '[', ']', '{', '}',
    '#', '&', '*', '!', '|', '>', '\'', '"', '%', '@', '`', ' ', '\t', '\0', '\n',
    '\r', '\u0085', '\u2028', '\u2029');

alias isURIChar = among!('-', ';', '/', '?', ':', '@', '&', '=', '+', '$', ',',
    '_', '.', '!', '~', '*', '\'', '(', ')', '[', ']', '%');

alias isNSChar = among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029');

alias isBChar = among!('\n', '\r', '\u0085', '\u2028', '\u2029');

alias isFlowScalarBreakSpace = among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029', '\'', '"', '\\');

/// Marked exception thrown at scanner errors.
///
/// See_Also: MarkedYAMLException
class ScannerException : MarkedYAMLException
{
    mixin MarkedExceptionCtors;
}

/// Generates tokens from data provided by a Reader.
struct Scanner
{
    private:
        /// A simple key is a key that is not denoted by the '?' indicator.
        /// For example:
        ///   ---
        ///   block simple key: value
        ///   ? not a simple key:
        ///   : { flow simple key: value }
        /// We emit the KEY token before all keys, so when we find a potential simple
        /// key, we try to locate the corresponding ':' indicator. Simple keys should be
        /// limited to a single line and 1024 characters.
        ///
        /// 16 bytes on 64-bit.
        static struct SimpleKey
        {
            /// Character index in reader where the key starts.
            uint charIndex = uint.max;
            /// Index of the key token from start (first token scanned being 0).
            uint tokenIndex;
            /// Line the key starts at.
            uint line;
            /// Column the key starts at.
            ushort column;
            /// Is this required to be a simple key?
            bool required;
            /// Is this struct "null" (invalid)?
            bool isNull;
        }

        /// Block chomping types.
        enum Chomping
        {
            /// Strip all trailing line breaks. '-' indicator.
            strip,
            /// Line break of the last line is preserved, others discarded. Default.
            clip,
            /// All trailing line breaks are preserved. '+' indicator.
            keep
        }

        /// Reader used to read from a file/stream.
        Reader reader_;
        /// Are we done scanning?
        bool done_;

        /// Level of nesting in flow context. If 0, we're in block context.
        uint flowLevel_;
        /// Current indentation level.
        int indent_ = -1;
        /// Past indentation levels. Used as a stack.
        Appender!(int[]) indents_;

        /// Processed tokens not yet emitted.
/// Used as a queue.
        Queue!Token tokens_;

        /// Number of tokens emitted through the getToken method.
        uint tokensTaken_;

        /// Can a simple key start at the current position? A simple key may start:
        /// - at the beginning of the line, not counting indentation spaces
        ///   (in block context),
        /// - after '{', '[', ',' (in the flow context),
        /// - after '?', ':', '-' (in the block context).
        /// In the block context, this flag also signifies if a block collection
        /// may start at the current position.
        bool allowSimpleKey_ = true;

        /// Possible simple keys indexed by flow levels.
        SimpleKey[] possibleSimpleKeys_;

    public:
        /// Construct a Scanner using specified Reader.
        this(Reader reader) @safe nothrow
        {
            reader_ = reader;
            fetchStreamStart();
        }

        /// Advance to the next token.
        void popFront() @safe
        {
            ++tokensTaken_;
            tokens_.pop();
        }

        /// Return the current token, without removing it from the queue.
        const(Token) front() @safe
        {
            enforce(!empty, "No token left to peek");
            return tokens_.peek();
        }

        /// Return whether there are any more tokens left.
        bool empty() @safe
        {
            while (needMoreTokens())
            {
                fetchToken();
            }
            return tokens_.empty;
        }

    private:
        /// Most scanning error messages have the same format; so build them with this
        /// function.
        string expected(T)(string expected, T found)
        {
            return text("expected ", expected, ", but found ", found);
        }

        /// Determine whether or not we need to fetch more tokens before peeking/getting a token.
        bool needMoreTokens() @safe pure
        {
            if(done_)         { return false; }
            if(tokens_.empty) { return true; }

            /// The current token may be a potential simple key, so we need to look further.
            stalePossibleSimpleKeys();
            return nextPossibleSimpleKey() == tokensTaken_;
        }

        /// Fetch a token, adding it to tokens_.
        void fetchToken() @safe
        {
            // Eat whitespaces and comments until we reach the next token.
            scanToNextToken();

            // Remove obsolete possible simple keys.
            stalePossibleSimpleKeys();

            // Compare current indentation and column. It may add some tokens
            // and decrease the current indentation level.
            unwindIndent(reader_.column);

            // Get the next character.
            const dchar c = reader_.peekByte();

            // Fetch the token.
            if(c == '\0')            { return fetchStreamEnd();     }
            if(checkDirective())     { return fetchDirective();     }
            if(checkDocumentStart()) { return fetchDocumentStart(); }
            if(checkDocumentEnd())   { return fetchDocumentEnd();   }
            // Order of the following checks is NOT significant.
            switch(c)
            {
                case '[':  return fetchFlowSequenceStart();
                case '{':  return fetchFlowMappingStart();
                case ']':  return fetchFlowSequenceEnd();
                case '}':  return fetchFlowMappingEnd();
                case ',':  return fetchFlowEntry();
                case '!':  return fetchTag();
                case '\'': return fetchSingle();
                case '\"': return fetchDouble();
                case '*':  return fetchAlias();
                case '&':  return fetchAnchor();
                case '?':  if(checkKey())        { return fetchKey();        } goto default;
                case ':':  if(checkValue())      { return fetchValue();      } goto default;
                case '-':  if(checkBlockEntry()) { return fetchBlockEntry(); } goto default;
                case '|':  if(flowLevel_ == 0)   { return fetchLiteral();    } break;
                case '>':  if(flowLevel_ == 0)   { return fetchFolded();     } break;
                default:   if(checkPlain())      { return fetchPlain();      }
            }

            throw new ScannerException("While scanning for the next token, found character " ~
                                       "\'%s\', index %s that cannot start any token"
                                       .format(c, to!int(c)), reader_.mark);
        }


        /// Return the token number of the nearest possible simple key.
uint nextPossibleSimpleKey() @safe pure nothrow @nogc
        {
            uint minTokenNumber = uint.max;
            foreach(ref simpleKey; possibleSimpleKeys_)
            {
                if(simpleKey.isNull) { continue; }
                minTokenNumber = min(minTokenNumber, simpleKey.tokenIndex);
            }
            return minTokenNumber;
        }

        /// Remove entries that are no longer possible simple keys.
        ///
        /// According to the YAML specification, simple keys
        /// - should be limited to a single line,
        /// - should be no longer than 1024 characters.
        /// Disabling this will allow simple keys of any length and
        /// height (may cause problems if indentation is broken though).
        void stalePossibleSimpleKeys() @safe pure
        {
            foreach(ref key; possibleSimpleKeys_)
            {
                if(key.isNull) { continue; }
                if(key.line != reader_.line || reader_.charIndex - key.charIndex > 1024)
                {
                    enforce(!key.required,
                            new ScannerException("While scanning a simple key",
                                                 Mark(key.line, key.column),
                                                 "could not find expected ':'", reader_.mark));
                    key.isNull = true;
                }
            }
        }

        /// Check if the next token starts a possible simple key and if so, save its position.
        ///
        /// This function is called for ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
        void savePossibleSimpleKey() @safe pure
        {
            // Check if a simple key is required at the current position.
            const required = (flowLevel_ == 0 && indent_ == reader_.column);
            assert(allowSimpleKey_ || !required, "A simple key is required only if it is " ~
                   "the first token in the current line. Therefore it is always allowed.");

            if(!allowSimpleKey_) { return; }

            // The next token might be a simple key, so save its number and position.
            removePossibleSimpleKey();
            const tokenCount = tokensTaken_ + cast(uint)tokens_.length;

            const line   = reader_.line;
            const column = reader_.column;
            const key    = SimpleKey(cast(uint)reader_.charIndex, tokenCount, line,
                                     cast(ushort)min(column, ushort.max), required);

            if(possibleSimpleKeys_.length <= flowLevel_)
            {
                const oldLength = possibleSimpleKeys_.length;
                possibleSimpleKeys_.length = flowLevel_ + 1;
                // No need to initialize the last element; it is assigned below.
                possibleSimpleKeys_[oldLength .. flowLevel_] = SimpleKey.init;
            }
            possibleSimpleKeys_[flowLevel_] = key;
        }

        /// Remove the saved possible key position at the current flow level.
        void removePossibleSimpleKey() @safe pure
        {
            if(possibleSimpleKeys_.length <= flowLevel_) { return; }

            if(!possibleSimpleKeys_[flowLevel_].isNull)
            {
                const key = possibleSimpleKeys_[flowLevel_];
                enforce(!key.required,
                        new ScannerException("While scanning a simple key",
                                             Mark(key.line, key.column),
                                             "could not find expected ':'", reader_.mark));
                possibleSimpleKeys_[flowLevel_].isNull = true;
            }
        }

        /// Decrease indentation, removing entries in indents_.
        ///
        /// Params: column = Current column in the file/stream.
        void unwindIndent(const int column) @safe
        {
            if(flowLevel_ > 0)
            {
                // In flow context, tokens should respect indentation.
                // The condition should be `indent >= column` according to the spec.
                // But this condition will prohibit intuitively correct
                // constructions such as
                // key : {
                // }

                // In the flow context, indentation is ignored. We make the scanner less
                // restrictive than what the specification requires.
                // if(pedantic_ && flowLevel_ > 0 && indent_ > column)
                // {
                //     throw new ScannerException("Invalid indentation or unclosed '[' or '{'",
                //                                reader_.mark)
                // }
                return;
            }

            // In block context, we may need to issue the BLOCK-END tokens.
            while(indent_ > column)
            {
                // Check the stack is non-empty *before* reading its back element.
                assert(indents_.data.length);
                indent_ = indents_.data.back;
                indents_.shrinkTo(indents_.data.length - 1);
                tokens_.push(blockEndToken(reader_.mark, reader_.mark));
            }
        }

        /// Increase indentation if needed.
        ///
        /// Params: column = Current column in the file/stream.
        ///
        /// Returns: true if the indentation was increased, false otherwise.
        bool addIndent(int column) @safe
        {
            if(indent_ >= column) { return false; }
            indents_ ~= indent_;
            indent_ = column;
            return true;
        }


        /// Add STREAM-START token.
        void fetchStreamStart() @safe nothrow
        {
            tokens_.push(streamStartToken(reader_.mark, reader_.mark, reader_.encoding));
        }

        /// Add STREAM-END token.
        void fetchStreamEnd() @safe
        {
            // Set indentation to -1.
            unwindIndent(-1);
            removePossibleSimpleKey();
            allowSimpleKey_ = false;
            possibleSimpleKeys_.destroy;

            tokens_.push(streamEndToken(reader_.mark, reader_.mark));
            done_ = true;
        }

        /// Add DIRECTIVE token.
        void fetchDirective() @safe
        {
            // Set indentation to -1.
            unwindIndent(-1);
            // Reset simple keys.
            removePossibleSimpleKey();
            allowSimpleKey_ = false;

            auto directive = scanDirective();
            tokens_.push(directive);
        }

        /// Add DOCUMENT-START or DOCUMENT-END token.
        void fetchDocumentIndicator(TokenID id)()
            if(id == TokenID.documentStart || id == TokenID.documentEnd)
        {
            // Set indentation to -1.
            unwindIndent(-1);
            // Reset simple keys. Note that there can't be a block collection after '---'.
            removePossibleSimpleKey();
            allowSimpleKey_ = false;

            Mark startMark = reader_.mark;
            reader_.forward(3);
            tokens_.push(simpleToken!id(startMark, reader_.mark));
        }

        /// Aliases to add DOCUMENT-START or DOCUMENT-END token.
        alias fetchDocumentStart = fetchDocumentIndicator!(TokenID.documentStart);
        alias fetchDocumentEnd = fetchDocumentIndicator!(TokenID.documentEnd);

        /// Add FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
        void fetchFlowCollectionStart(TokenID id)() @safe
        {
            // '[' and '{' may start a simple key.
            savePossibleSimpleKey();
            // Simple keys are allowed after '[' and '{'.
            allowSimpleKey_ = true;
            ++flowLevel_;

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(simpleToken!id(startMark, reader_.mark));
        }

        /// Aliases to add FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
        alias fetchFlowSequenceStart = fetchFlowCollectionStart!(TokenID.flowSequenceStart);
        alias fetchFlowMappingStart = fetchFlowCollectionStart!(TokenID.flowMappingStart);

        /// Add FLOW-SEQUENCE-END or FLOW-MAPPING-END token.
        void fetchFlowCollectionEnd(TokenID id)()
        {
            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // No simple keys after ']' and '}'.
            allowSimpleKey_ = false;
            --flowLevel_;

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(simpleToken!id(startMark, reader_.mark));
        }

        /// Aliases to add FLOW-SEQUENCE-END or FLOW-MAPPING-END token.
        alias fetchFlowSequenceEnd = fetchFlowCollectionEnd!(TokenID.flowSequenceEnd);
        alias fetchFlowMappingEnd = fetchFlowCollectionEnd!(TokenID.flowMappingEnd);

        /// Add FLOW-ENTRY token.
        void fetchFlowEntry() @safe
        {
            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // Simple keys are allowed after ','.
            allowSimpleKey_ = true;

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(flowEntryToken(startMark, reader_.mark));
        }

        /// Additional checks used in block context in fetchBlockEntry and fetchKey.
        ///
        /// Params: type = String representing the token type we might need to add.
        ///         id   = Token type we might need to add.
        void blockChecks(string type, TokenID id)()
        {
            enum context = type ~ " keys are not allowed here";
            // Are we allowed to start a key (not necessarily a simple one)?
            enforce(allowSimpleKey_, new ScannerException(context, reader_.mark));

            if(addIndent(reader_.column))
            {
                tokens_.push(simpleToken!id(reader_.mark, reader_.mark));
            }
        }

        /// Add BLOCK-ENTRY token. Might add BLOCK-SEQUENCE-START in the process.
        void fetchBlockEntry() @safe
        {
            if(flowLevel_ == 0) { blockChecks!("Sequence", TokenID.blockSequenceStart)(); }

            // It's an error for the block entry to occur in the flow context,
            // but we let the parser detect this.

            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // Simple keys are allowed after '-'.
            allowSimpleKey_ = true;

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(blockEntryToken(startMark, reader_.mark));
        }

        /// Add KEY token. Might add BLOCK-MAPPING-START in the process.
        void fetchKey() @safe
        {
            if(flowLevel_ == 0) { blockChecks!("Mapping", TokenID.blockMappingStart)(); }

            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // Simple keys are allowed after '?' in the block context.
            allowSimpleKey_ = (flowLevel_ == 0);

            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(keyToken(startMark, reader_.mark));
        }

        /// Add VALUE token. Might add KEY and/or BLOCK-MAPPING-START in the process.
void fetchValue() @safe
        {
            // Do we determine a simple key?
            if(possibleSimpleKeys_.length > flowLevel_ &&
               !possibleSimpleKeys_[flowLevel_].isNull)
            {
                const key = possibleSimpleKeys_[flowLevel_];
                possibleSimpleKeys_[flowLevel_].isNull = true;
                Mark keyMark = Mark(key.line, key.column);
                const idx = key.tokenIndex - tokensTaken_;

                assert(idx >= 0);

                // Add KEY.
                // Manually inserting since tokens are immutable (need linked list).
                tokens_.insert(keyToken(keyMark, keyMark), idx);

                // If this key starts a new block mapping, we need to add BLOCK-MAPPING-START.
                if(flowLevel_ == 0 && addIndent(key.column))
                {
                    tokens_.insert(blockMappingStartToken(keyMark, keyMark), idx);
                }

                // There cannot be two simple keys in a row.
                allowSimpleKey_ = false;
            }
            // Part of a complex key
            else
            {
                // We can start a complex value if and only if we can start a simple key.
                enforce(flowLevel_ > 0 || allowSimpleKey_,
                        new ScannerException("Mapping values are not allowed here", reader_.mark));

                // If this value starts a new block mapping, we need to add
                // BLOCK-MAPPING-START. It'll be detected as an error later by the parser.
                if(flowLevel_ == 0 && addIndent(reader_.column))
                {
                    tokens_.push(blockMappingStartToken(reader_.mark, reader_.mark));
                }

                // Reset possible simple key on the current level.
                removePossibleSimpleKey();
                // Simple keys are allowed after ':' in the block context.
                allowSimpleKey_ = (flowLevel_ == 0);
            }

            // Add VALUE.
            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(valueToken(startMark, reader_.mark));
        }

        /// Add ALIAS or ANCHOR token.
        void fetchAnchor_(TokenID id)() @safe
            if(id == TokenID.alias_ || id == TokenID.anchor)
        {
            // ALIAS/ANCHOR could be a simple key.
            savePossibleSimpleKey();
            // No simple keys after ALIAS/ANCHOR.
            allowSimpleKey_ = false;

            auto anchor = scanAnchor(id);
            tokens_.push(anchor);
        }

        /// Aliases to add ALIAS or ANCHOR token.
        alias fetchAlias = fetchAnchor_!(TokenID.alias_);
        alias fetchAnchor = fetchAnchor_!(TokenID.anchor);

        /// Add TAG token.
        void fetchTag() @safe
        {
            // TAG could start a simple key.
            savePossibleSimpleKey();
            // No simple keys after TAG.
            allowSimpleKey_ = false;

            tokens_.push(scanTag());
        }

        /// Add block SCALAR token.
        void fetchBlockScalar(ScalarStyle style)() @safe
            if(style == ScalarStyle.literal || style == ScalarStyle.folded)
        {
            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // A simple key may follow a block scalar.
            allowSimpleKey_ = true;

            auto blockScalar = scanBlockScalar(style);
            tokens_.push(blockScalar);
        }

        /// Aliases to add literal or folded block scalar.
        alias fetchLiteral = fetchBlockScalar!(ScalarStyle.literal);
        alias fetchFolded = fetchBlockScalar!(ScalarStyle.folded);

        /// Add quoted flow SCALAR token.
        void fetchFlowScalar(ScalarStyle quotes)()
        {
            // A flow scalar could be a simple key.
            savePossibleSimpleKey();
            // No simple keys after flow scalars.
            allowSimpleKey_ = false;

            // Scan and add SCALAR.
            auto scalar = scanFlowScalar(quotes);
            tokens_.push(scalar);
        }

        /// Aliases to add single or double quoted flow scalar.
        alias fetchSingle = fetchFlowScalar!(ScalarStyle.singleQuoted);
        alias fetchDouble = fetchFlowScalar!(ScalarStyle.doubleQuoted);

        /// Add plain SCALAR token.
        void fetchPlain() @safe
        {
            // A plain scalar could be a simple key
            savePossibleSimpleKey();
            // No simple keys after plain scalars. But note that scanPlain() will
            // change this flag if the scan is finished at the beginning of the line.
            allowSimpleKey_ = false;
            auto plain = scanPlain();

            // Scan and add SCALAR. May change allowSimpleKey_
            tokens_.push(plain);
        }

    pure:

        /// Check if the next token is DIRECTIVE:        ^ '%' ...
        bool checkDirective() @safe
        {
            return reader_.peekByte() == '%' && reader_.column == 0;
        }

        /// Check if the next token is DOCUMENT-START:   ^ '---' (' '|'\n')
        bool checkDocumentStart() @safe
        {
            // Check one char first, then all 3, to prevent reading outside the buffer.
            return reader_.column == 0 &&
                   reader_.peekByte() == '-' &&
                   reader_.prefix(3) == "---" &&
                   reader_.peek(3).isWhiteSpace;
        }

        /// Check if the next token is DOCUMENT-END:     ^ '...' (' '|'\n')
        bool checkDocumentEnd() @safe
        {
            // Check one char first, then all 3, to prevent reading outside the buffer.
            return reader_.column == 0 &&
                   reader_.peekByte() == '.' &&
                   reader_.prefix(3) == "..." &&
                   reader_.peek(3).isWhiteSpace;
        }

        /// Check if the next token is BLOCK-ENTRY:      '-' (' '|'\n')
        bool checkBlockEntry() @safe
        {
            return !!reader_.peek(1).isWhiteSpace;
        }

        /// Check if the next token is KEY(flow context):    '?'
        ///
        /// or KEY(block context):   '?' (' '|'\n')
        bool checkKey() @safe
        {
            return (flowLevel_ > 0 || reader_.peek(1).isWhiteSpace);
        }

        /// Check if the next token is VALUE(flow context):  ':'
        ///
        /// or VALUE(block context): ':' (' '|'\n')
        bool checkValue() @safe
        {
            return flowLevel_ > 0 || reader_.peek(1).isWhiteSpace;
        }

        /// Check if the next token is a plain scalar.
        ///
        /// A plain scalar may start with any non-space character except:
        ///   '-', '?', ':', ',', '[', ']', '{', '}',
        ///   '#', '&', '*', '!', '|', '>', '\'', '\"',
        ///   '%', '@', '`'.
        ///
        /// It may also start with
        ///   '-', '?', ':'
        /// if it is followed by a non-space character.
        ///
        /// Note that we limit the last rule to the block context (except the
        /// '-' character) because we want the flow context to be space
        /// independent.
        bool checkPlain() @safe
        {
            const c = reader_.peek();
            if(!c.isNonScalarStartCharacter)
            {
                return true;
            }
            return !reader_.peek(1).isWhiteSpace &&
                   (c == '-' || (flowLevel_ == 0 && (c == '?' || c == ':')));
        }

        /// Move to the next non-space character.
        void findNextNonSpace() @safe
        {
            while(reader_.peekByte() == ' ') { reader_.forward(); }
        }

        /// Scan a string of alphanumeric or "-_" characters.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanAlphaNumericToSlice(string name)(const Mark startMark)
        {
            size_t length;
            dchar c = reader_.peek();
            while(c.isAlphaNum || c.among!('-', '_')) { c = reader_.peek(++length); }

            enforce(length > 0, new ScannerException("While scanning " ~ name,
                    startMark, expected("alphanumeric, '-' or '_'", c), reader_.mark));

            reader_.sliceBuilder.write(reader_.get(length));
        }

        /// Scan and throw away all characters until next line break.
        void scanToNextBreak() @safe
        {
            while(!reader_.peek().isBreak) { reader_.forward(); }
        }

        /// Scan all characters until next line break.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanToNextBreakToSlice() @safe
        {
            uint length;
            while(!reader_.peek(length).isBreak)
            {
                ++length;
            }
            reader_.sliceBuilder.write(reader_.get(length));
        }


        /// Move to next token in the file/stream.
        ///
        /// We ignore spaces, line breaks and comments.
        /// If we find a line break in the block context, we set
        /// `allowSimpleKey_` on.
        ///
        /// We do not yet support BOM inside the stream as the
        /// specification requires. Any such mark will be considered as a part
        /// of the document.
        void scanToNextToken() @safe
        {
            // TODO(PyYAML): We need to make tab handling rules more sane. A good rule is:
            //   Tabs cannot precede tokens
            //   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
            //   KEY(block), VALUE(block), BLOCK-ENTRY
            // So the checking code is
            //   if <TAB>:
            //       allowSimpleKey_ = false
            // We also need to add the check for `allowSimpleKey_ == true` to
            // `unwindIndent` before issuing BLOCK-END.
            // Scanners for block, flow, and plain scalars need to be modified.

            for(;;)
            {
                findNextNonSpace();

                if(reader_.peekByte() == '#') { scanToNextBreak(); }
                if(scanLineBreak() != '\0')
                {
                    if(flowLevel_ == 0) { allowSimpleKey_ = true; }
                }
                else
                {
                    break;
                }
            }
        }

        /// Scan directive token.
        Token scanDirective() @safe
        {
            Mark startMark = reader_.mark;
            // Skip the '%'.
            reader_.forward();

            // Scan directive name
            reader_.sliceBuilder.begin();
            scanDirectiveNameToSlice(startMark);
            const name = reader_.sliceBuilder.finish();

            reader_.sliceBuilder.begin();

            // Index where tag handle ends and suffix starts in a tag directive value.
            uint tagHandleEnd = uint.max;
            if(name == "YAML")     { scanYAMLDirectiveValueToSlice(startMark); }
            else if(name == "TAG") { tagHandleEnd = scanTagDirectiveValueToSlice(startMark); }
            char[] value = reader_.sliceBuilder.finish();

            Mark endMark = reader_.mark;

            DirectiveType directive;
            if(name == "YAML")     { directive = DirectiveType.yaml; }
            else if(name == "TAG") { directive = DirectiveType.tag; }
            else
            {
                directive = DirectiveType.reserved;
                scanToNextBreak();
            }

            scanDirectiveIgnoredLine(startMark);

            return directiveToken(startMark, endMark, value, directive, tagHandleEnd);
        }

        /// Scan name of a directive token.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanDirectiveNameToSlice(const Mark startMark) @safe
        {
            // Scan directive name.
            scanAlphaNumericToSlice!"a directive"(startMark);

            enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
                    new ScannerException("While scanning a directive", startMark,
                                         expected("alphanumeric, '-' or '_'", reader_.peek()),
                                         reader_.mark));
        }

        /// Scan value of a YAML directive token. Returns major, minor version separated by '.'.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanYAMLDirectiveValueToSlice(const Mark startMark) @safe
        {
            findNextNonSpace();

            scanYAMLDirectiveNumberToSlice(startMark);

            enforce(reader_.peekByte() == '.',
                    new ScannerException("While scanning a directive", startMark,
                                         expected("digit or '.'", reader_.peek()), reader_.mark));
            // Skip the '.'.
            reader_.forward();

            reader_.sliceBuilder.write('.');
            scanYAMLDirectiveNumberToSlice(startMark);

            enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
                    new ScannerException("While scanning a directive", startMark,
                                         expected("digit or '.'", reader_.peek()), reader_.mark));
        }

        /// Scan a number from a YAML directive.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanYAMLDirectiveNumberToSlice(const Mark startMark) @safe
        {
            enforce(isDigit(reader_.peek()),
                    new ScannerException("While scanning a directive", startMark,
                                         expected("digit", reader_.peek()), reader_.mark));

            // Already found the first digit in the enforce(), so set length to 1.
            uint length = 1;
            while(reader_.peek(length).isDigit) { ++length; }

            reader_.sliceBuilder.write(reader_.get(length));
        }

        /// Scan value of a tag directive.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        ///
        /// Returns: Length of tag handle (which is before tag prefix) in scanned data
        uint scanTagDirectiveValueToSlice(const Mark startMark) @safe
        {
            findNextNonSpace();
            const startLength = reader_.sliceBuilder.length;
            scanTagDirectiveHandleToSlice(startMark);
            const handleLength = cast(uint)(reader_.sliceBuilder.length - startLength);
            findNextNonSpace();
            scanTagDirectivePrefixToSlice(startMark);

            return handleLength;
        }

        /// Scan handle of a tag directive.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
930 void scanTagDirectiveHandleToSlice(const Mark startMark) @safe 931 { 932 scanTagHandleToSlice!"directive"(startMark); 933 enforce(reader_.peekByte() == ' ', 934 new ScannerException("While scanning a directive handle", startMark, 935 expected("' '", reader_.peek()), reader_.mark)); 936 } 937 938 /// Scan prefix of a tag directive. 939 /// 940 /// Assumes that the caller is building a slice in Reader, and puts the scanned 941 /// characters into that slice. 942 void scanTagDirectivePrefixToSlice(const Mark startMark) @safe 943 { 944 scanTagURIToSlice!"directive"(startMark); 945 enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'), 946 new ScannerException("While scanning a directive prefix", startMark, 947 expected("' '", reader_.peek()), reader_.mark)); 948 } 949 950 /// Scan (and ignore) ignored line after a directive. 951 void scanDirectiveIgnoredLine(const Mark startMark) @safe 952 { 953 findNextNonSpace(); 954 if(reader_.peekByte() == '#') { scanToNextBreak(); } 955 enforce(reader_.peek().isBreak, 956 new ScannerException("While scanning a directive", startMark, 957 expected("comment or a line break", reader_.peek()), reader_.mark)); 958 scanLineBreak(); 959 } 960 961 962 /// Scan an alias or an anchor. 963 /// 964 /// The specification does not restrict characters for anchors and 965 /// aliases. This may lead to problems, for instance, the document: 966 /// [ *alias, value ] 967 /// can be interpteted in two ways, as 968 /// [ "value" ] 969 /// and 970 /// [ *alias , "value" ] 971 /// Therefore we restrict aliases to ASCII alphanumeric characters. 
    Token scanAnchor(const TokenID id) @safe
    {
        const startMark = reader_.mark;
        // '*' introduces an alias, '&' an anchor.
        const dchar i = reader_.get();

        reader_.sliceBuilder.begin();
        if(i == '*') { scanAlphaNumericToSlice!"an alias"(startMark); }
        else         { scanAlphaNumericToSlice!"an anchor"(startMark); }
        // On error, value is discarded as we return immediately
        char[] value = reader_.sliceBuilder.finish();

        enum anchorCtx = "While scanning an anchor";
        enum aliasCtx  = "While scanning an alias";
        // The name must be terminated by whitespace or a flow indicator.
        enforce(reader_.peek().isWhiteSpace ||
            reader_.peekByte().among!('?', ':', ',', ']', '}', '%', '@'),
            new ScannerException(i == '*' ? aliasCtx : anchorCtx, startMark,
                expected("alphanumeric, '-' or '_'", reader_.peek()), reader_.mark));

        if(id == TokenID.alias_)
        {
            return aliasToken(startMark, reader_.mark, value);
        }
        if(id == TokenID.anchor)
        {
            return anchorToken(startMark, reader_.mark, value);
        }
        assert(false, "This code should never be reached");
    }

    /// Scan a tag token.
    Token scanTag() @safe
    {
        const startMark = reader_.mark;
        dchar c = reader_.peek(1);

        reader_.sliceBuilder.begin();
        scope(failure) { reader_.sliceBuilder.finish(); }
        // Index where tag handle ends and tag suffix starts in the tag value
        // (slice) we will produce.
        uint handleEnd;

        if(c == '<')
        {
            // Verbatim tag: '!<...>'.
            reader_.forward(2);

            handleEnd = 0;
            scanTagURIToSlice!"tag"(startMark);
            enforce(reader_.peekByte() == '>',
                new ScannerException("While scanning a tag", startMark,
                    expected("'>'", reader_.peek()), reader_.mark));
            reader_.forward();
        }
        else if(c.isWhiteSpace)
        {
            // Lone '!': the non-specific tag.
            reader_.forward();
            handleEnd = 0;
            reader_.sliceBuilder.write('!');
        }
        else
        {
            uint length = 1;
            bool useHandle;

            // Look ahead for a second '!' to decide whether this is a
            // named handle ('!handle!suffix') or a primary handle ('!suffix').
            while(!c.isBreakOrSpace)
            {
                if(c == '!')
                {
                    useHandle = true;
                    break;
                }
                ++length;
                c = reader_.peek(length);
            }

            if(useHandle)
            {
                scanTagHandleToSlice!"tag"(startMark);
                handleEnd = cast(uint)reader_.sliceBuilder.length;
            }
            else
            {
                reader_.forward();
                reader_.sliceBuilder.write('!');
                handleEnd = cast(uint)reader_.sliceBuilder.length;
            }

            scanTagURIToSlice!"tag"(startMark);
        }

        // A tag must be terminated by a space, NUL, or a line break.
        enforce(reader_.peek().isBreakOrSpace,
            new ScannerException("While scanning a tag", startMark, expected("' '", reader_.peek()),
                reader_.mark));

        char[] slice = reader_.sliceBuilder.finish();
        return tagToken(startMark, reader_.mark, slice, handleEnd);
    }

    /// Scan a block scalar token with specified style.
    Token scanBlockScalar(const ScalarStyle style) @safe
    {
        const startMark = reader_.mark;

        // Scan the header.
        reader_.forward();

        const indicators = scanBlockScalarIndicators(startMark);

        const chomping  = indicators[0];
        const increment = indicators[1];
        scanBlockScalarIgnoredLine(startMark);

        // Determine the indentation level and go to the first non-empty line.
        Mark endMark;
        uint indent = max(1, indent_ + 1);

        reader_.sliceBuilder.begin();
        alias Transaction = SliceBuilder.Transaction;
        // Used to strip the last line breaks written to the slice at the end of the
        // scalar, which may be needed based on chomping.
        Transaction breaksTransaction = Transaction(&reader_.sliceBuilder);
        // Read the first indentation/line breaks before the scalar.
        size_t startLen = reader_.sliceBuilder.length;
        if(increment == int.min)
        {
            // No explicit indentation indicator: auto-detect from content.
            auto indentation = scanBlockScalarIndentationToSlice();
            endMark = indentation[1];
            indent  = max(indent, indentation[0]);
        }
        else
        {
            indent += increment - 1;
            endMark = scanBlockScalarBreaksToSlice(indent);
        }

        // int.max means there's no line break (int.max is outside UTF-32).
        dchar lineBreak = cast(dchar)int.max;

        // Scan the inner part of the block scalar.
        while(reader_.column == indent && reader_.peekByte() != '\0')
        {
            breaksTransaction.commit();
            const bool leadingNonSpace = !reader_.peekByte().among!(' ', '\t');
            // This is where the 'interesting' non-whitespace data gets read.
            scanToNextBreakToSlice();
            lineBreak = scanLineBreak();


            // This transaction serves to rollback data read in the
            // scanBlockScalarBreaksToSlice() call.
            breaksTransaction = Transaction(&reader_.sliceBuilder);
            startLen = reader_.sliceBuilder.length;
            // The line breaks should actually be written _after_ the if() block
            // below. We work around that by inserting the line break at startLen
            // (the position remembered just above) when needed.
            endMark = scanBlockScalarBreaksToSlice(indent);

            // This will not run during the last iteration (see the if() vs the
            // while()), hence breaksTransaction rollback (which happens after this
            // loop) will never roll back data written in this if() block.
            if(reader_.column == indent && reader_.peekByte() != '\0')
            {
                // Unfortunately, folding rules are ambiguous.

                // This is the folding according to the specification:
                if(style == ScalarStyle.folded && lineBreak == '\n' &&
                   leadingNonSpace && !reader_.peekByte().among!(' ', '\t'))
                {
                    // No breaks were scanned; no need to insert the space in the
                    // middle of slice.
                    if(startLen == reader_.sliceBuilder.length)
                    {
                        reader_.sliceBuilder.write(' ');
                    }
                }
                else
                {
                    // We need to insert in the middle of the slice in case any line
                    // breaks were scanned.
                    reader_.sliceBuilder.insert(lineBreak, startLen);
                }

                ////this is Clark Evans's interpretation (also in the spec
                ////examples):
                //
                //if(style == ScalarStyle.folded && lineBreak == '\n')
                //{
                //    if(startLen == endLen)
                //    {
                //        if(!" \t"d.canFind(reader_.peekByte()))
                //        {
                //            reader_.sliceBuilder.write(' ');
                //        }
                //        else
                //        {
                //            chunks ~= lineBreak;
                //        }
                //    }
                //}
                //else
                //{
                //    reader_.sliceBuilder.insertBack(lineBreak, endLen - startLen);
                //}
            }
            else
            {
                break;
            }
        }

        // If chomping is Keep, we keep (commit) the last scanned line breaks
        // (which are at the end of the scalar). Otherwise we remove them (end the
        // transaction).
        if(chomping == Chomping.keep) { breaksTransaction.commit(); }
        else                          { breaksTransaction.end(); }
        if(chomping != Chomping.strip && lineBreak != int.max)
        {
            // If chomping is Keep, we keep the line break but the first line break
            // that isn't stripped (since chomping isn't Strip in this branch) must
            // be inserted _before_ the other line breaks.
            if(chomping == Chomping.keep)
            {
                reader_.sliceBuilder.insert(lineBreak, startLen);
            }
            // If chomping is not Keep, breaksTransaction was cancelled so we can
            // directly write the first line break (as it isn't stripped - chomping
            // is not Strip)
            else
            {
                reader_.sliceBuilder.write(lineBreak);
            }
        }

        char[] slice = reader_.sliceBuilder.finish();
        return scalarToken(startMark, endMark, slice, style);
    }

    /// Scan chomping and indentation indicators of a scalar token.
    Tuple!(Chomping, int) scanBlockScalarIndicators(const Mark startMark) @safe
    {
        auto chomping = Chomping.clip;
        // int.min means "no indentation indicator given".
        int increment = int.min;
        dchar c       = reader_.peek();

        /// Indicators can be in any order.
        if(getChomping(c, chomping))
        {
            getIncrement(c, increment, startMark);
        }
        else
        {
            const gotIncrement = getIncrement(c, increment, startMark);
            if(gotIncrement) { getChomping(c, chomping); }
        }

        // The indicators must be followed by a space, NUL, or a line break.
        enforce(c.among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
            new ScannerException("While scanning a block scalar", startMark,
                expected("chomping or indentation indicator", c), reader_.mark));

        return tuple(chomping, increment);
    }

    /// Get chomping indicator, if detected. Return false otherwise.
    ///
    /// Used in scanBlockScalarIndicators.
    ///
    /// Params:
    ///
    /// c        = The character that may be a chomping indicator.
    /// chomping = Write the chomping value here, if detected.
    bool getChomping(ref dchar c, ref Chomping chomping) @safe
    {
        if(!c.among!('+', '-')) { return false; }
        // '+' keeps trailing line breaks, '-' strips them.
        chomping = c == '+' ? Chomping.keep : Chomping.strip;
        reader_.forward();
        c = reader_.peek();
        return true;
    }

    /// Get increment indicator, if detected. Return false otherwise.
    ///
    /// Used in scanBlockScalarIndicators.
1252 /// 1253 /// Params: 1254 /// 1255 /// c = The character that may be an increment indicator. 1256 /// If an increment indicator is detected, this will be updated to 1257 /// the next character in the Reader. 1258 /// increment = Write the increment value here, if detected. 1259 /// startMark = Mark for error messages. 1260 bool getIncrement(ref dchar c, ref int increment, const Mark startMark) @safe 1261 { 1262 if(!c.isDigit) { return false; } 1263 // Convert a digit to integer. 1264 increment = c - '0'; 1265 assert(increment < 10 && increment >= 0, "Digit has invalid value"); 1266 1267 enforce(increment > 0, 1268 new ScannerException("While scanning a block scalar", startMark, 1269 expected("indentation indicator in range 1-9", "0"), reader_.mark)); 1270 1271 reader_.forward(); 1272 c = reader_.peek(); 1273 return true; 1274 } 1275 1276 /// Scan (and ignore) ignored line in a block scalar. 1277 void scanBlockScalarIgnoredLine(const Mark startMark) @safe 1278 { 1279 findNextNonSpace(); 1280 if(reader_.peekByte()== '#') { scanToNextBreak(); } 1281 1282 enforce(reader_.peek().isBreak, 1283 new ScannerException("While scanning a block scalar", startMark, 1284 expected("comment or line break", reader_.peek()), reader_.mark)); 1285 1286 scanLineBreak(); 1287 } 1288 1289 /// Scan indentation in a block scalar, returning line breaks, max indent and end mark. 1290 /// 1291 /// Assumes that the caller is building a slice in Reader, and puts the scanned 1292 /// characters into that slice. 
1293 Tuple!(uint, Mark) scanBlockScalarIndentationToSlice() @safe 1294 { 1295 uint maxIndent; 1296 Mark endMark = reader_.mark; 1297 1298 while(reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029')) 1299 { 1300 if(reader_.peekByte() != ' ') 1301 { 1302 reader_.sliceBuilder.write(scanLineBreak()); 1303 endMark = reader_.mark; 1304 continue; 1305 } 1306 reader_.forward(); 1307 maxIndent = max(reader_.column, maxIndent); 1308 } 1309 1310 return tuple(maxIndent, endMark); 1311 } 1312 1313 /// Scan line breaks at lower or specified indentation in a block scalar. 1314 /// 1315 /// Assumes that the caller is building a slice in Reader, and puts the scanned 1316 /// characters into that slice. 1317 Mark scanBlockScalarBreaksToSlice(const uint indent) @safe 1318 { 1319 Mark endMark = reader_.mark; 1320 1321 for(;;) 1322 { 1323 while(reader_.column < indent && reader_.peekByte() == ' ') { reader_.forward(); } 1324 if(!reader_.peek().among!('\n', '\r', '\u0085', '\u2028', '\u2029')) { break; } 1325 reader_.sliceBuilder.write(scanLineBreak()); 1326 endMark = reader_.mark; 1327 } 1328 1329 return endMark; 1330 } 1331 1332 /// Scan a qouted flow scalar token with specified quotes. 1333 Token scanFlowScalar(const ScalarStyle quotes) @safe 1334 { 1335 const startMark = reader_.mark; 1336 const quote = reader_.get(); 1337 1338 reader_.sliceBuilder.begin(); 1339 1340 scanFlowScalarNonSpacesToSlice(quotes, startMark); 1341 1342 while(reader_.peek() != quote) 1343 { 1344 scanFlowScalarSpacesToSlice(startMark); 1345 scanFlowScalarNonSpacesToSlice(quotes, startMark); 1346 } 1347 reader_.forward(); 1348 1349 auto slice = reader_.sliceBuilder.finish(); 1350 return scalarToken(startMark, reader_.mark, slice, quotes); 1351 } 1352 1353 /// Scan nonspace characters in a flow scalar. 1354 /// 1355 /// Assumes that the caller is building a slice in Reader, and puts the scanned 1356 /// characters into that slice. 
    void scanFlowScalarNonSpacesToSlice(const ScalarStyle quotes, const Mark startMark)
        @safe
    {
        for(;;)
        {
            dchar c = reader_.peek();

            size_t numCodePoints;
            // This is an optimized way of writing:
            // while(!search.canFind(reader_.peek(numCodePoints))) { ++numCodePoints; }
            outer: for(size_t oldSliceLength;;)
            {
                // This will not necessarily make slice 32 chars longer, as not all
                // code points are 1 char.
                const char[] slice = reader_.slice(numCodePoints + 32);
                // If the slice did not grow, the Reader ran out of data.
                enforce(slice.length != oldSliceLength,
                    new ScannerException("While reading a flow scalar", startMark,
                        "reached end of file", reader_.mark));

                for(size_t i = oldSliceLength; i < slice.length;)
                {
                    // slice is UTF-8 - need to decode
                    const ch = slice[i] < 0x80 ? slice[i++] : decode(slice, i);
                    if(ch.isFlowScalarBreakSpace) { break outer; }
                    ++numCodePoints;
                }
                oldSliceLength = slice.length;
            }

            // Copy the run of ordinary characters into the slice.
            reader_.sliceBuilder.write(reader_.get(numCodePoints));

            c = reader_.peek();
            if(quotes == ScalarStyle.singleQuoted && c == '\'' && reader_.peek(1) == '\'')
            {
                // '' inside a single-quoted scalar is an escaped single quote.
                reader_.forward(2);
                reader_.sliceBuilder.write('\'');
            }
            else if((quotes == ScalarStyle.doubleQuoted && c == '\'') ||
                    (quotes == ScalarStyle.singleQuoted && c.among!('"', '\\')))
            {
                // These characters have no special meaning in this quoting style.
                reader_.forward();
                reader_.sliceBuilder.write(c);
            }
            else if(quotes == ScalarStyle.doubleQuoted && c == '\\')
            {
                // Backslash escape sequence in a double-quoted scalar.
                reader_.forward();
                c = reader_.peek();
                if(c.among!(escapes))
                {
                    reader_.forward();
                    // Escaping has been moved to Parser as it can't be done in
                    // place (in a slice) in case of '\P' and '\L' (very uncommon,
                    // but we don't want to break the spec)
                    char[2] escapeSequence = ['\\', cast(char)c];
                    reader_.sliceBuilder.write(escapeSequence);
                }
                else if(c.among!(escapeHexCodeList))
                {
                    // \x, \u or \U: fixed-length hexadecimal escape.
                    const hexLength = dyaml.escapes.escapeHexLength(c);
                    reader_.forward();

                    foreach(i; 0 .. hexLength) {
                        enforce(reader_.peek(i).isHexDigit,
                            new ScannerException("While scanning a double quoted scalar", startMark,
                                expected("escape sequence of hexadecimal numbers",
                                    reader_.peek(i)), reader_.mark));
                    }
                    char[] hex = reader_.get(hexLength);

                    enforce((hex.length > 0) && (hex.length <= 8),
                        new ScannerException("While scanning a double quoted scalar", startMark,
                            "overflow when parsing an escape sequence of " ~
                            "hexadecimal numbers.", reader_.mark));

                    // Like above, decoding the escape is left to the Parser;
                    // we write the raw escape sequence into the slice.
                    char[2] escapeStart = ['\\', cast(char) c];
                    reader_.sliceBuilder.write(escapeStart);
                    reader_.sliceBuilder.write(hex);

                }
                else if(c.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
                {
                    // Escaped line break: line continuation.
                    scanLineBreak();
                    scanFlowScalarBreaksToSlice(startMark);
                }
                else
                {
                    throw new ScannerException("While scanning a double quoted scalar", startMark,
                        text("found unsupported escape character ", c),
                        reader_.mark);
                }
            }
            else { return; }
        }
    }

    /// Scan space characters in a flow scalar.
    ///
    /// Assumes that the caller is building a slice in Reader, and puts the scanned
    /// spaces into that slice.
    void scanFlowScalarSpacesToSlice(const Mark startMark) @safe
    {
        // Increase length as long as we see whitespace.
        size_t length;
        while(reader_.peekByte(length).among!(' ', '\t')) { ++length; }
        auto whitespaces = reader_.prefixBytes(length);

        // Can check the last byte without striding because '\0' is ASCII
        const c = reader_.peek(length);
        enforce(c != '\0',
            new ScannerException("While scanning a quoted scalar", startMark,
                "found unexpected end of buffer", reader_.mark));

        // Spaces not followed by a line break.
        if(!c.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
        {
            reader_.forward(length);
            reader_.sliceBuilder.write(whitespaces);
            return;
        }

        // There's a line break after the spaces.
        reader_.forward(length);
        const lineBreak = scanLineBreak();

        if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }

        // If we have extra line breaks after the first, scan them into the
        // slice.
        const bool extraBreaks = scanFlowScalarBreaksToSlice(startMark);

        // No extra breaks, one normal line break. Replace it with a space.
        if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
    }

    /// Scan line breaks in a flow scalar.
    ///
    /// Assumes that the caller is building a slice in Reader, and puts the scanned
    /// line breaks into that slice.
    bool scanFlowScalarBreaksToSlice(const Mark startMark) @safe
    {
        // True if at least one line break was found.
        bool anyBreaks;
        for(;;)
        {
            // Instead of checking indentation, we check for document separators.
            const prefix = reader_.prefix(3);
            enforce(!(prefix == "---" || prefix == "...") ||
                !reader_.peek(3).isWhiteSpace,
                new ScannerException("While scanning a quoted scalar", startMark,
                    "found unexpected document separator", reader_.mark));

            // Skip any whitespaces.
            while(reader_.peekByte().among!(' ', '\t')) { reader_.forward(); }

            // Encountered a non-whitespace non-linebreak character, so we're done.
            if(!reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029')) { break; }

            const lineBreak = scanLineBreak();
            anyBreaks = true;
            reader_.sliceBuilder.write(lineBreak);
        }
        return anyBreaks;
    }

    /// Scan plain scalar token (no block, no quotes).
    Token scanPlain() @safe
    {
        // We keep track of the allowSimpleKey_ flag here.
        // Indentation rules are loosened for the flow context
        const startMark = reader_.mark;
        Mark endMark = startMark;
        const indent = indent_ + 1;

        // We allow zero indentation for scalars, but then we need to check for
        // document separators at the beginning of the line.
        // if(indent == 0) { indent = 1; }

        reader_.sliceBuilder.begin();

        alias Transaction = SliceBuilder.Transaction;
        Transaction spacesTransaction;
        // Stop at a comment.
        while(reader_.peekByte() != '#')
        {
            // Scan the entire plain scalar.
            size_t length;
            dchar c = void;
            // Moved the if() out of the loop for optimization.
            if(flowLevel_ == 0)
            {
                // In the block context a plain scalar ends at whitespace or
                // at ': ' (colon followed by whitespace).
                c = reader_.peek(length);
                for(;;)
                {
                    const cNext = reader_.peek(length + 1);
                    if(c.isWhiteSpace ||
                       (c == ':' && cNext.isWhiteSpace))
                    {
                        break;
                    }
                    ++length;
                    c = cNext;
                }
            }
            else
            {
                // In the flow context flow indicators also terminate the scalar.
                for(;;)
                {
                    c = reader_.peek(length);
                    if(c.isWhiteSpace || c.among!(',', ':', '?', '[', ']', '{', '}'))
                    {
                        break;
                    }
                    ++length;
                }
            }

            // It's not clear what we should do with ':' in the flow context.
            enforce(flowLevel_ == 0 || c != ':' ||
               reader_.peek(length + 1).isWhiteSpace ||
               reader_.peek(length + 1).among!(',', '[', ']', '{', '}'),
                new ScannerException("While scanning a plain scalar", startMark,
                    "found unexpected ':' . Please check " ~
                    "http://pyyaml.org/wiki/YAMLColonInFlowContext for details.",
                    reader_.mark));

            if(length == 0) { break; }

            allowSimpleKey_ = false;

            reader_.sliceBuilder.write(reader_.get(length));

            endMark = reader_.mark;

            spacesTransaction.commit();
            spacesTransaction = Transaction(&reader_.sliceBuilder);

            const startLength = reader_.sliceBuilder.length;
            scanPlainSpacesToSlice();
            // Stop if no spaces were consumed (no continuation) or if the
            // next line is not indented deeply enough (block context only).
            if(startLength == reader_.sliceBuilder.length ||
               (flowLevel_ == 0 && reader_.column < indent))
            {
                break;
            }
        }

        spacesTransaction.end();
        char[] slice = reader_.sliceBuilder.finish();

        return scalarToken(startMark, endMark, slice, ScalarStyle.plain);
    }

    /// Scan spaces in a plain scalar.
    ///
    /// Assumes that the caller is building a slice in Reader, and puts the spaces
    /// into that slice.
    void scanPlainSpacesToSlice() @safe
    {
        // The specification is really confusing about tabs in plain scalars.
        // We just forbid them completely. Do not use tabs in YAML!

        // Get as many plain spaces as there are.
        size_t length;
        while(reader_.peekByte(length) == ' ') { ++length; }
        char[] whitespaces = reader_.prefixBytes(length);
        reader_.forward(length);

        const dchar c = reader_.peek();
        if(!c.isNSChar)
        {
            // We have spaces, but no newline.
            if(whitespaces.length > 0) { reader_.sliceBuilder.write(whitespaces); }
            return;
        }

        // Newline after the spaces (if any)
        const lineBreak = scanLineBreak();
        allowSimpleKey_ = true;

        // True when a document separator ('---' or '...') starts the next line.
        static bool end(Reader reader_) @safe pure
        {
            const prefix = reader_.prefix(3);
            return ("---" == prefix || "..." == prefix)
                    && reader_.peek(3).among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');
        }

        if(end(reader_)) { return; }

        bool extraBreaks;

        alias Transaction = SliceBuilder.Transaction;
        auto transaction = Transaction(&reader_.sliceBuilder);
        if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }
        while(reader_.peek().isNSChar)
        {
            if(reader_.peekByte() == ' ') { reader_.forward(); }
            else
            {
                const lBreak = scanLineBreak();
                extraBreaks  = true;
                reader_.sliceBuilder.write(lBreak);

                // A document separator ends the scalar; the transaction is
                // not committed, discarding the breaks written above.
                if(end(reader_)) { return; }
            }
        }
        transaction.commit();

        // No line breaks, only a space.
        if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
    }

    /// Scan handle of a tag token.
    ///
    /// Assumes that the caller is building a slice in Reader, and puts the scanned
    /// characters into that slice.
    void scanTagHandleToSlice(string name)(const Mark startMark)
    {
        dchar c = reader_.peek();
        enum contextMsg = "While scanning a " ~ name;
        // A handle always starts with '!'.
        enforce(c == '!',
            new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark));

        uint length = 1;
        c = reader_.peek(length);
        if(c != ' ')
        {
            // A named handle ('!name!') must end with a second '!'.
            while(c.isAlphaNum || c.among!('-', '_'))
            {
                ++length;
                c = reader_.peek(length);
            }
            enforce(c == '!',
                new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark));
            ++length;
        }

        reader_.sliceBuilder.write(reader_.get(length));
    }

    /// Scan URI in a tag token.
    ///
    /// Assumes that the caller is building a slice in Reader, and puts the scanned
    /// characters into that slice.
    void scanTagURIToSlice(string name)(const Mark startMark)
    {
        // Note: we do not check if URI is well-formed.
        dchar c = reader_.peek();
        const startLen = reader_.sliceBuilder.length;
        {
            uint length;
            while(c.isAlphaNum || c.isURIChar)
            {
                if(c == '%')
                {
                    // Flush the plain characters read so far, then decode
                    // the %XX escape sequence(s).
                    auto chars = reader_.get(length);
                    reader_.sliceBuilder.write(chars);
                    length = 0;
                    scanURIEscapesToSlice!name(startMark);
                }
                else { ++length; }
                c = reader_.peek(length);
            }
            if(length > 0)
            {
                auto chars = reader_.get(length);
                reader_.sliceBuilder.write(chars);
                length = 0;
            }
        }
        // OK if we scanned something, error otherwise.
        enum contextMsg = "While parsing a " ~ name;
        enforce(reader_.sliceBuilder.length > startLen,
            new ScannerException(contextMsg, startMark, expected("URI", c), reader_.mark));
    }

    // Not @nogc yet because std.utf.decode is not @nogc
    /// Scan URI escape sequences.
    ///
    /// Assumes that the caller is building a slice in Reader, and puts the scanned
    /// characters into that slice.
    void scanURIEscapesToSlice(string name)(const Mark startMark)
    {
        import core.exception : UnicodeException;
        // URI escapes encode a UTF-8 string. We store UTF-8 code units here for
        // decoding into UTF-32.
        Appender!string buffer;


        enum contextMsg = "While scanning a " ~ name;
        while(reader_.peekByte() == '%')
        {
            reader_.forward();
            // The two hex digits of a %XX escape.
            char[2] nextByte = [reader_.peekByte(), reader_.peekByte(1)];

            enforce(nextByte[0].isHexDigit && nextByte[1].isHexDigit,
                new ScannerException(contextMsg, startMark,
                    expected("URI escape sequence of 2 hexadecimal " ~
                        "numbers", nextByte), reader_.mark));

            buffer ~= nextByte[].to!ubyte(16);

            reader_.forward(2);
        }
        try
        {
            // foreach decodes the accumulated UTF-8 into code points;
            // invalid sequences throw UnicodeException.
            foreach (dchar chr; buffer.data)
            {
                reader_.sliceBuilder.write(chr);
            }
        }
        catch (UnicodeException)
        {
            throw new ScannerException(contextMsg, startMark,
                    "Invalid UTF-8 data encoded in URI escape sequence",
                    reader_.mark);
        }
    }


    /// Scan a line break, if any.
    ///
    /// Transforms:
    ///   '\r\n'   : '\n'
    ///   '\r'     : '\n'
    ///   '\n'     : '\n'
    ///   '\u0085' : '\n'
    ///   '\u2028' : '\u2028'
    ///   '\u2029' : '\u2029'
    ///   no break : '\0'
    dchar scanLineBreak() @safe
    {
        // Fast path for ASCII line breaks.
        const b = reader_.peekByte();
        if(b < 0x80)
        {
            if(b == '\n' || b == '\r')
            {
                // '\r\n' is consumed as a single break.
                if(reader_.prefix(2) == "\r\n") { reader_.forward(2); }
                else { reader_.forward(); }
                return '\n';
            }
            return '\0';
        }

        const c = reader_.peek();
        if(c == '\x85')
        {
            reader_.forward();
            return '\n';
        }
        if(c == '\u2028' || c == '\u2029')
        {
            reader_.forward();
            return c;
        }
        return '\0';
    }
}