
// Copyright Ferdinand Majerech 2011-2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

/// YAML scanner.
/// Code based on PyYAML: http://www.pyyaml.org
module dyaml.scanner;


// Fixed: these two imports had a doubled '.' ("core.stdc..string",
// "std..string"), which is a syntax error in D.
import core.stdc.string;

import std.algorithm;
import std.array;
import std.conv;
import std.ascii : isAlphaNum, isDigit, isHexDigit;
import std.exception;
import std.string;
import std.typecons;
import std.traits : Unqual;
import std.utf;

import dyaml.escapes;
import dyaml.exception;
import dyaml.queue;
import dyaml.reader;
import dyaml.style;
import dyaml.token;

package:
/// Scanner produces tokens of the following types:
/// STREAM-START
/// STREAM-END
/// DIRECTIVE(name, value)
/// DOCUMENT-START
/// DOCUMENT-END
/// BLOCK-SEQUENCE-START
/// BLOCK-MAPPING-START
/// BLOCK-END
/// FLOW-SEQUENCE-START
/// FLOW-MAPPING-START
/// FLOW-SEQUENCE-END
/// FLOW-MAPPING-END
/// BLOCK-ENTRY
/// FLOW-ENTRY
/// KEY
/// VALUE
/// ALIAS(value)
/// ANCHOR(value)
/// TAG(value)
/// SCALAR(value, plain, style)

// Character-class predicates built on std.algorithm.among; each returns a
// non-zero index when the character matches, usable as a boolean.

alias isBreak = among!('\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

alias isBreakOrSpace = among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

alias isWhiteSpace = among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

alias isNonLinebreakWhitespace = among!(' ', '\t');

alias isNonScalarStartCharacter = among!('-', '?', ':', ',', '[', ']', '{', '}',
    '#', '&', '*', '!', '|', '>', '\'', '"', '%', '@', '`', ' ', '\t', '\0', '\n',
    '\r', '\u0085', '\u2028', '\u2029');

alias isURIChar = among!('-', ';', '/', '?', ':', '@', '&', '=', '+', '$', ',',
    '_', '.', '!', '~', '*', '\'', '(', ')', '[', ']', '%');

alias isNSChar = among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029');

alias isBChar = among!('\n', '\r', '\u0085', '\u2028', '\u2029');

alias isFlowScalarBreakSpace = among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029', '\'', '"', '\\');

/// Marked exception thrown at scanner errors.
///
/// See_Also: MarkedYAMLException
class ScannerException : MarkedYAMLException
{
    mixin MarkedExceptionCtors;
}

/// Generates tokens from data provided by a Reader.
struct Scanner
{
    private:
        /// A simple key is a key that is not denoted by the '?' indicator.
        /// For example:
        ///   ---
        ///   block simple key: value
        ///   ? not a simple key:
        ///   : { flow simple key: value }
        /// We emit the KEY token before all keys, so when we find a potential simple
        /// key, we try to locate the corresponding ':' indicator. Simple keys should be
        /// limited to a single line and 1024 characters.
        ///
        /// 16 bytes on 64-bit.
        static struct SimpleKey
        {
            /// Character index in reader where the key starts.
            uint charIndex = uint.max;
            /// Index of the key token from start (first token scanned being 0).
            uint tokenIndex;
            /// Line the key starts at.
            uint line;
            /// Column the key starts at.
            ushort column;
            /// Is this required to be a simple key?
            bool required;
            /// Is this struct "null" (invalid)?.
            bool isNull;
        }

        /// Block chomping types.
        enum Chomping
        {
            /// Strip all trailing line breaks. '-' indicator.
            strip,
            /// Line break of the last line is preserved, others discarded. Default.
            clip,
            /// All trailing line breaks are preserved. '+' indicator.
            keep
        }

        /// Reader used to read from a file/stream.
        Reader reader_;
        /// Are we done scanning?
        bool done_;

        /// Level of nesting in flow context. If 0, we're in block context.
        uint flowLevel_;
        /// Current indentation level. -1 until the first indent is established.
        int indent_ = -1;
        /// Past indentation levels. Used as a stack.
        Appender!(int[]) indents_;

        /// Processed tokens not yet emitted. Used as a queue.
        Queue!Token tokens_;

        /// Number of tokens emitted through the getToken method.
        uint tokensTaken_;

        /// Can a simple key start at the current position? A simple key may start:
        /// - at the beginning of the line, not counting indentation spaces
        ///   (in block context),
        /// - after '{', '[', ',' (in the flow context),
        /// - after '?', ':', '-' (in the block context).
        /// In the block context, this flag also signifies if a block collection
        /// may start at the current position.
        bool allowSimpleKey_ = true;

        /// Possible simple keys indexed by flow levels.
        SimpleKey[] possibleSimpleKeys_;

    public:
        /// Construct a Scanner using specified Reader.
        this(Reader reader) @safe nothrow
        {
            // Queue the STREAM-START token immediately so front() has
            // something to return before any explicit fetch.
            reader_ = reader;
            fetchStreamStart();
        }

        /// Advance to the next token
        void popFront() @safe
        {
            ++tokensTaken_;
            tokens_.pop();
        }

        /// Return the current token, without removing it from the queue.
        const(Token) front() @safe
        {
            // empty() has the side effect of fetching tokens until one is available.
            enforce(!empty, "No token left to peek");
            return tokens_.peek();
        }

        /// Return whether there are any more tokens left.
        bool empty() @safe
        {
            while (needMoreTokens())
            {
                fetchToken();
            }
            return tokens_.empty;
        }

    private:
        /// Most scanning error messages have the same format; so build them with this
        /// function.
        string expected(T)(string expected, T found)
        {
            return text("expected ", expected, ", but found ", found);
        }

        /// Determine whether or not we need to fetch more tokens before peeking/getting a token.
        bool needMoreTokens() @safe pure
        {
            if(done_) { return false; }
            if(tokens_.empty) { return true; }

            /// The current token may be a potential simple key, so we need to look further.
            stalePossibleSimpleKeys();
            return nextPossibleSimpleKey() == tokensTaken_;
        }

        /// Fetch a token, adding it to tokens_.
        void fetchToken() @safe
        {
            // Eat whitespaces and comments until we reach the next token.
            scanToNextToken();

            // Remove obsolete possible simple keys.
            stalePossibleSimpleKeys();

            // Compare current indentation and column. It may add some tokens
            // and decrease the current indentation level.
            unwindIndent(reader_.column);

            // Get the next character.
            const dchar c = reader_.peekByte();

            // Fetch the token.
            if(c == '\0')            { return fetchStreamEnd();     }
            if(checkDirective())     { return fetchDirective();     }
            if(checkDocumentStart()) { return fetchDocumentStart(); }
            if(checkDocumentEnd())   { return fetchDocumentEnd();   }
            // Order of the following checks is NOT significant.
            switch(c)
            {
                case '[':  return fetchFlowSequenceStart();
                case '{':  return fetchFlowMappingStart();
                case ']':  return fetchFlowSequenceEnd();
                case '}':  return fetchFlowMappingEnd();
                case ',':  return fetchFlowEntry();
                case '!':  return fetchTag();
                case '\'': return fetchSingle();
                case '\"': return fetchDouble();
                case '*':  return fetchAlias();
                case '&':  return fetchAnchor();
                case '?':  if(checkKey())        { return fetchKey();        } goto default;
                case ':':  if(checkValue())      { return fetchValue();      } goto default;
                case '-':  if(checkBlockEntry()) { return fetchBlockEntry(); } goto default;
                case '|':  if(flowLevel_ == 0)   { return fetchLiteral();    } break;
                case '>':  if(flowLevel_ == 0)   { return fetchFolded();     } break;
                default:   if(checkPlain())      { return fetchPlain();      }
            }

            throw new ScannerException("While scanning for the next token, found character " ~
                "\'%s\', index %s that cannot start any token"
                .format(c, to!int(c)), reader_.mark);
        }


        /// Return the token number of the nearest possible simple key.
256 uint nextPossibleSimpleKey() @safe pure nothrow @nogc 257 { 258 uint minTokenNumber = uint.max; 259 foreach(k, ref simpleKey; possibleSimpleKeys_) 260 { 261 if(simpleKey.isNull) { continue; } 262 minTokenNumber = min(minTokenNumber, simpleKey.tokenIndex); 263 } 264 return minTokenNumber; 265 } 266 267 /// Remove entries that are no longer possible simple keys. 268 /// 269 /// According to the YAML specification, simple keys 270 /// - should be limited to a single line, 271 /// - should be no longer than 1024 characters. 272 /// Disabling this will allow simple keys of any length and 273 /// height (may cause problems if indentation is broken though). 274 void stalePossibleSimpleKeys() @safe pure 275 { 276 foreach(level, ref key; possibleSimpleKeys_) 277 { 278 if(key.isNull) { continue; } 279 if(key.line != reader_.line || reader_.charIndex - key.charIndex > 1024) 280 { 281 enforce(!key.required, 282 new ScannerException("While scanning a simple key", 283 Mark(reader_.name, key.line, key.column), 284 "could not find expected ':'", reader_.mark)); 285 key.isNull = true; 286 } 287 } 288 } 289 290 /// Check if the next token starts a possible simple key and if so, save its position. 291 /// 292 /// This function is called for ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'. 293 void savePossibleSimpleKey() @safe pure 294 { 295 // Check if a simple key is required at the current position. 296 const required = (flowLevel_ == 0 && indent_ == reader_.column); 297 assert(allowSimpleKey_ || !required, "A simple key is required only if it is " ~ 298 "the first token in the current line. Therefore it is always allowed."); 299 300 if(!allowSimpleKey_) { return; } 301 302 // The next token might be a simple key, so save its number and position. 
303 removePossibleSimpleKey(); 304 const tokenCount = tokensTaken_ + cast(uint)tokens_.length; 305 306 const line = reader_.line; 307 const column = reader_.column; 308 const key = SimpleKey(cast(uint)reader_.charIndex, tokenCount, line, 309 cast(ushort)min(column, ushort.max), required); 310 311 if(possibleSimpleKeys_.length <= flowLevel_) 312 { 313 const oldLength = possibleSimpleKeys_.length; 314 possibleSimpleKeys_.length = flowLevel_ + 1; 315 //No need to initialize the last element, it's already done in the next line. 316 possibleSimpleKeys_[oldLength .. flowLevel_] = SimpleKey.init; 317 } 318 possibleSimpleKeys_[flowLevel_] = key; 319 } 320 321 /// Remove the saved possible key position at the current flow level. 322 void removePossibleSimpleKey() @safe pure 323 { 324 if(possibleSimpleKeys_.length <= flowLevel_) { return; } 325 326 if(!possibleSimpleKeys_[flowLevel_].isNull) 327 { 328 const key = possibleSimpleKeys_[flowLevel_]; 329 enforce(!key.required, 330 new ScannerException("While scanning a simple key", 331 Mark(reader_.name, key.line, key.column), 332 "could not find expected ':'", reader_.mark)); 333 possibleSimpleKeys_[flowLevel_].isNull = true; 334 } 335 } 336 337 /// Decrease indentation, removing entries in indents_. 338 /// 339 /// Params: column = Current column in the file/stream. 340 void unwindIndent(const int column) @safe 341 { 342 if(flowLevel_ > 0) 343 { 344 // In flow context, tokens should respect indentation. 345 // The condition should be `indent >= column` according to the spec. 346 // But this condition will prohibit intuitively correct 347 // constructions such as 348 // key : { 349 // } 350 351 // In the flow context, indentation is ignored. We make the scanner less 352 // restrictive than what the specification requires. 
353 // if(pedantic_ && flowLevel_ > 0 && indent_ > column) 354 // { 355 // throw new ScannerException("Invalid intendation or unclosed '[' or '{'", 356 // reader_.mark) 357 // } 358 return; 359 } 360 361 // In block context, we may need to issue the BLOCK-END tokens. 362 while(indent_ > column) 363 { 364 indent_ = indents_.data.back; 365 assert(indents_.data.length); 366 indents_.shrinkTo(indents_.data.length - 1); 367 tokens_.push(blockEndToken(reader_.mark, reader_.mark)); 368 } 369 } 370 371 /// Increase indentation if needed. 372 /// 373 /// Params: column = Current column in the file/stream. 374 /// 375 /// Returns: true if the indentation was increased, false otherwise. 376 bool addIndent(int column) @safe 377 { 378 if(indent_ >= column){return false;} 379 indents_ ~= indent_; 380 indent_ = column; 381 return true; 382 } 383 384 385 /// Add STREAM-START token. 386 void fetchStreamStart() @safe nothrow 387 { 388 tokens_.push(streamStartToken(reader_.mark, reader_.mark, reader_.encoding)); 389 } 390 391 ///Add STREAM-END token. 392 void fetchStreamEnd() @safe 393 { 394 //Set intendation to -1 . 395 unwindIndent(-1); 396 removePossibleSimpleKey(); 397 allowSimpleKey_ = false; 398 possibleSimpleKeys_.destroy; 399 400 tokens_.push(streamEndToken(reader_.mark, reader_.mark)); 401 done_ = true; 402 } 403 404 /// Add DIRECTIVE token. 405 void fetchDirective() @safe 406 { 407 // Set intendation to -1 . 408 unwindIndent(-1); 409 // Reset simple keys. 410 removePossibleSimpleKey(); 411 allowSimpleKey_ = false; 412 413 auto directive = scanDirective(); 414 tokens_.push(directive); 415 } 416 417 /// Add DOCUMENT-START or DOCUMENT-END token. 418 void fetchDocumentIndicator(TokenID id)() 419 if(id == TokenID.documentStart || id == TokenID.documentEnd) 420 { 421 // Set indentation to -1 . 422 unwindIndent(-1); 423 // Reset simple keys. Note that there can't be a block collection after '---'. 
424 removePossibleSimpleKey(); 425 allowSimpleKey_ = false; 426 427 Mark startMark = reader_.mark; 428 reader_.forward(3); 429 tokens_.push(simpleToken!id(startMark, reader_.mark)); 430 } 431 432 /// Aliases to add DOCUMENT-START or DOCUMENT-END token. 433 alias fetchDocumentStart = fetchDocumentIndicator!(TokenID.documentStart); 434 alias fetchDocumentEnd = fetchDocumentIndicator!(TokenID.documentEnd); 435 436 /// Add FLOW-SEQUENCE-START or FLOW-MAPPING-START token. 437 void fetchFlowCollectionStart(TokenID id)() @safe 438 { 439 // '[' and '{' may start a simple key. 440 savePossibleSimpleKey(); 441 // Simple keys are allowed after '[' and '{'. 442 allowSimpleKey_ = true; 443 ++flowLevel_; 444 445 Mark startMark = reader_.mark; 446 reader_.forward(); 447 tokens_.push(simpleToken!id(startMark, reader_.mark)); 448 } 449 450 /// Aliases to add FLOW-SEQUENCE-START or FLOW-MAPPING-START token. 451 alias fetchFlowSequenceStart = fetchFlowCollectionStart!(TokenID.flowSequenceStart); 452 alias fetchFlowMappingStart = fetchFlowCollectionStart!(TokenID.flowMappingStart); 453 454 /// Add FLOW-SEQUENCE-START or FLOW-MAPPING-START token. 455 void fetchFlowCollectionEnd(TokenID id)() 456 { 457 // Reset possible simple key on the current level. 458 removePossibleSimpleKey(); 459 // No simple keys after ']' and '}'. 460 allowSimpleKey_ = false; 461 --flowLevel_; 462 463 Mark startMark = reader_.mark; 464 reader_.forward(); 465 tokens_.push(simpleToken!id(startMark, reader_.mark)); 466 } 467 468 /// Aliases to add FLOW-SEQUENCE-START or FLOW-MAPPING-START token/ 469 alias fetchFlowSequenceEnd = fetchFlowCollectionEnd!(TokenID.flowSequenceEnd); 470 alias fetchFlowMappingEnd = fetchFlowCollectionEnd!(TokenID.flowMappingEnd); 471 472 /// Add FLOW-ENTRY token; 473 void fetchFlowEntry() @safe 474 { 475 // Reset possible simple key on the current level. 476 removePossibleSimpleKey(); 477 // Simple keys are allowed after ','. 
478 allowSimpleKey_ = true; 479 480 Mark startMark = reader_.mark; 481 reader_.forward(); 482 tokens_.push(flowEntryToken(startMark, reader_.mark)); 483 } 484 485 /// Additional checks used in block context in fetchBlockEntry and fetchKey. 486 /// 487 /// Params: type = String representing the token type we might need to add. 488 /// id = Token type we might need to add. 489 void blockChecks(string type, TokenID id)() 490 { 491 enum context = type ~ " keys are not allowed here"; 492 // Are we allowed to start a key (not neccesarily a simple one)? 493 enforce(allowSimpleKey_, new ScannerException(context, reader_.mark)); 494 495 if(addIndent(reader_.column)) 496 { 497 tokens_.push(simpleToken!id(reader_.mark, reader_.mark)); 498 } 499 } 500 501 /// Add BLOCK-ENTRY token. Might add BLOCK-SEQUENCE-START in the process. 502 void fetchBlockEntry() @safe 503 { 504 if(flowLevel_ == 0) { blockChecks!("Sequence", TokenID.blockSequenceStart)(); } 505 506 // It's an error for the block entry to occur in the flow context, 507 // but we let the parser detect this. 508 509 // Reset possible simple key on the current level. 510 removePossibleSimpleKey(); 511 // Simple keys are allowed after '-'. 512 allowSimpleKey_ = true; 513 514 Mark startMark = reader_.mark; 515 reader_.forward(); 516 tokens_.push(blockEntryToken(startMark, reader_.mark)); 517 } 518 519 /// Add KEY token. Might add BLOCK-MAPPING-START in the process. 520 void fetchKey() @safe 521 { 522 if(flowLevel_ == 0) { blockChecks!("Mapping", TokenID.blockMappingStart)(); } 523 524 // Reset possible simple key on the current level. 525 removePossibleSimpleKey(); 526 // Simple keys are allowed after '?' in the block context. 527 allowSimpleKey_ = (flowLevel_ == 0); 528 529 Mark startMark = reader_.mark; 530 reader_.forward(); 531 tokens_.push(keyToken(startMark, reader_.mark)); 532 } 533 534 /// Add VALUE token. Might add KEY and/or BLOCK-MAPPING-START in the process. 
        void fetchValue() @safe
        {
            // Do we determine a simple key?
            if(possibleSimpleKeys_.length > flowLevel_ &&
               !possibleSimpleKeys_[flowLevel_].isNull)
            {
                const key = possibleSimpleKeys_[flowLevel_];
                possibleSimpleKeys_[flowLevel_].isNull = true;
                Mark keyMark = Mark(reader_.name, key.line, key.column);
                const idx = key.tokenIndex - tokensTaken_;

                assert(idx >= 0);

                // Add KEY.
                // Manually inserting since tokens are immutable (need linked list).
                tokens_.insert(keyToken(keyMark, keyMark), idx);

                // If this key starts a new block mapping, we need to add BLOCK-MAPPING-START.
                if(flowLevel_ == 0 && addIndent(key.column))
                {
                    tokens_.insert(blockMappingStartToken(keyMark, keyMark), idx);
                }

                // There cannot be two simple keys in a row.
                allowSimpleKey_ = false;
            }
            // Part of a complex key
            else
            {
                // We can start a complex value if and only if we can start a simple key.
                enforce(flowLevel_ > 0 || allowSimpleKey_,
                    new ScannerException("Mapping values are not allowed here", reader_.mark));

                // If this value starts a new block mapping, we need to add
                // BLOCK-MAPPING-START. It'll be detected as an error later by the parser.
                if(flowLevel_ == 0 && addIndent(reader_.column))
                {
                    tokens_.push(blockMappingStartToken(reader_.mark, reader_.mark));
                }

                // Reset possible simple key on the current level.
                removePossibleSimpleKey();
                // Simple keys are allowed after ':' in the block context.
                allowSimpleKey_ = (flowLevel_ == 0);
            }

            // Add VALUE.
            Mark startMark = reader_.mark;
            reader_.forward();
            tokens_.push(valueToken(startMark, reader_.mark));
        }

        /// Add ALIAS or ANCHOR token.
        void fetchAnchor_(TokenID id)() @safe
            if(id == TokenID.alias_ || id == TokenID.anchor)
        {
            // ALIAS/ANCHOR could be a simple key.
            savePossibleSimpleKey();
            // No simple keys after ALIAS/ANCHOR.
            allowSimpleKey_ = false;

            auto anchor = scanAnchor(id);
            tokens_.push(anchor);
        }

        /// Aliases to add ALIAS or ANCHOR token.
        alias fetchAlias = fetchAnchor_!(TokenID.alias_);
        alias fetchAnchor = fetchAnchor_!(TokenID.anchor);

        /// Add TAG token.
        void fetchTag() @safe
        {
            // TAG could start a simple key.
            savePossibleSimpleKey();
            // No simple keys after TAG.
            allowSimpleKey_ = false;

            tokens_.push(scanTag());
        }

        /// Add block SCALAR token.
        void fetchBlockScalar(ScalarStyle style)() @safe
            if(style == ScalarStyle.literal || style == ScalarStyle.folded)
        {
            // Reset possible simple key on the current level.
            removePossibleSimpleKey();
            // A simple key may follow a block scalar.
            allowSimpleKey_ = true;

            auto blockScalar = scanBlockScalar(style);
            tokens_.push(blockScalar);
        }

        /// Aliases to add literal or folded block scalar.
        alias fetchLiteral = fetchBlockScalar!(ScalarStyle.literal);
        alias fetchFolded = fetchBlockScalar!(ScalarStyle.folded);

        /// Add quoted flow SCALAR token.
        void fetchFlowScalar(ScalarStyle quotes)()
        {
            // A flow scalar could be a simple key.
            savePossibleSimpleKey();
            // No simple keys after flow scalars.
            allowSimpleKey_ = false;

            // Scan and add SCALAR.
            auto scalar = scanFlowScalar(quotes);
            tokens_.push(scalar);
        }

        /// Aliases to add single or double quoted flow scalar.
        alias fetchSingle = fetchFlowScalar!(ScalarStyle.singleQuoted);
        alias fetchDouble = fetchFlowScalar!(ScalarStyle.doubleQuoted);

        /// Add plain SCALAR token.
        void fetchPlain() @safe
        {
            // A plain scalar could be a simple key
            savePossibleSimpleKey();
            // No simple keys after plain scalars. But note that scanPlain() will
            // change this flag if the scan is finished at the beginning of the line.
            allowSimpleKey_ = false;
            auto plain = scanPlain();

            // Scan and add SCALAR. May change allowSimpleKey_
            tokens_.push(plain);
        }

    pure:

        /// Check if the next token is DIRECTIVE:        ^ '%' ...
        bool checkDirective() @safe
        {
            return reader_.peekByte() == '%' && reader_.column == 0;
        }

        /// Check if the next token is DOCUMENT-START:   ^ '---' (' '|'\n')
        bool checkDocumentStart() @safe
        {
            // Check one char first, then all 3, to prevent reading outside the buffer.
            return reader_.column     == 0     &&
                   reader_.peekByte() == '-'   &&
                   reader_.prefix(3)  == "---" &&
                   reader_.peek(3).isWhiteSpace;
        }

        /// Check if the next token is DOCUMENT-END:     ^ '...' (' '|'\n')
        bool checkDocumentEnd() @safe
        {
            // Check one char first, then all 3, to prevent reading outside the buffer.
            return reader_.column     == 0     &&
                   reader_.peekByte() == '.'   &&
                   reader_.prefix(3)  == "..." &&
                   reader_.peek(3).isWhiteSpace;
        }

        /// Check if the next token is BLOCK-ENTRY:      '-' (' '|'\n')
        bool checkBlockEntry() @safe
        {
            return !!reader_.peek(1).isWhiteSpace;
        }

        /// Check if the next token is KEY(flow context):    '?'
        ///
        /// or KEY(block context):   '?' (' '|'\n')
        bool checkKey() @safe
        {
            return (flowLevel_ > 0 || reader_.peek(1).isWhiteSpace);
        }

        /// Check if the next token is VALUE(flow context):  ':'
        ///
        /// or VALUE(block context): ':' (' '|'\n')
        bool checkValue() @safe
        {
            return flowLevel_ > 0 || reader_.peek(1).isWhiteSpace;
        }

        /// Check if the next token is a plain scalar.
        ///
        /// A plain scalar may start with any non-space character except:
        ///   '-', '?', ':', ',', '[', ']', '{', '}',
        ///   '#', '&', '*', '!', '|', '>', '\'', '\"',
        ///   '%', '@', '`'.
        ///
        /// It may also start with
        ///   '-', '?', ':'
        /// if it is followed by a non-space character.
        ///
        /// Note that we limit the last rule to the block context (except the
        /// '-' character) because we want the flow context to be space
        /// independent.
        bool checkPlain() @safe
        {
            const c = reader_.peek();
            if(!c.isNonScalarStartCharacter)
            {
                return true;
            }
            return !reader_.peek(1).isWhiteSpace &&
                   (c == '-' || (flowLevel_ == 0 && (c == '?' || c == ':')));
        }

        /// Move to the next non-space character.
        void findNextNonSpace() @safe
        {
            while(reader_.peekByte() == ' ') { reader_.forward(); }
        }

        /// Scan a string of alphanumeric or "-_" characters.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanAlphaNumericToSlice(string name)(const Mark startMark)
        {
            size_t length;
            dchar c = reader_.peek();
            while(c.isAlphaNum || c.among!('-', '_')) { c = reader_.peek(++length); }

            enforce(length > 0, new ScannerException("While scanning " ~ name,
                startMark, expected("alphanumeric, '-' or '_'", c), reader_.mark));

            reader_.sliceBuilder.write(reader_.get(length));
        }

        /// Scan and throw away all characters until next line break.
        void scanToNextBreak() @safe
        {
            while(!reader_.peek().isBreak) { reader_.forward(); }
        }

        /// Scan all characters until next line break.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanToNextBreakToSlice() @safe
        {
            uint length;
            while(!reader_.peek(length).isBreak)
            {
                ++length;
            }
            reader_.sliceBuilder.write(reader_.get(length));
        }


        /// Move to next token in the file/stream.
        ///
        /// We ignore spaces, line breaks and comments.
        /// If we find a line break in the block context, we set
        /// `allowSimpleKey_` on.
        ///
        /// We do not yet support BOM inside the stream as the
        /// specification requires. Any such mark will be considered as a part
        /// of the document.
        void scanToNextToken() @safe
        {
            // TODO(PyYAML): We need to make tab handling rules more sane. A good rule is:
            //   Tabs cannot precede tokens
            //   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
            //   KEY(block), VALUE(block), BLOCK-ENTRY
            // So the checking code is
            //   if <TAB>:
            //       allowSimpleKey_ = false
            // We also need to add the check for `allowSimpleKey_ == true` to
            // `unwindIndent` before issuing BLOCK-END.
            // Scanners for block, flow, and plain scalars need to be modified.

            for(;;)
            {
                // All whitespace in flow context is ignored, even whitespace
                // not allowed in other contexts
                if (flowLevel_ > 0)
                {
                    while(reader_.peekByte().isNonLinebreakWhitespace) { reader_.forward(); }
                }
                else
                {
                    findNextNonSpace();
                }
                if(reader_.peekByte() == '#') { scanToNextBreak(); }
                if(scanLineBreak() != '\0')
                {
                    if(flowLevel_ == 0) { allowSimpleKey_ = true; }
                }
                else
                {
                    break;
                }
            }
        }

        /// Scan directive token.
        Token scanDirective() @safe
        {
            Mark startMark = reader_.mark;
            // Skip the '%'.
            reader_.forward();

            // Scan directive name
            reader_.sliceBuilder.begin();
            scanDirectiveNameToSlice(startMark);
            const name = reader_.sliceBuilder.finish();

            reader_.sliceBuilder.begin();

            // Index where tag handle ends and suffix starts in a tag directive value.
            uint tagHandleEnd = uint.max;
            if(name == "YAML")     { scanYAMLDirectiveValueToSlice(startMark); }
            else if(name == "TAG") { tagHandleEnd = scanTagDirectiveValueToSlice(startMark); }
            char[] value = reader_.sliceBuilder.finish();

            Mark endMark = reader_.mark;

            DirectiveType directive;
            if(name == "YAML")     { directive = DirectiveType.yaml; }
            else if(name == "TAG") { directive = DirectiveType.tag; }
            else
            {
                // Unknown directive: reserved type, skip the rest of the line.
                directive = DirectiveType.reserved;
                scanToNextBreak();
            }

            scanDirectiveIgnoredLine(startMark);

            return directiveToken(startMark, endMark, value, directive, tagHandleEnd);
        }

        /// Scan name of a directive token.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanDirectiveNameToSlice(const Mark startMark) @safe
        {
            // Scan directive name.
            scanAlphaNumericToSlice!"a directive"(startMark);

            enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
                new ScannerException("While scanning a directive", startMark,
                    expected("alphanumeric, '-' or '_'", reader_.peek()), reader_.mark));
        }

        /// Scan value of a YAML directive token. Returns major, minor version separated by '.'.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanYAMLDirectiveValueToSlice(const Mark startMark) @safe
        {
            findNextNonSpace();

            scanYAMLDirectiveNumberToSlice(startMark);

            enforce(reader_.peekByte() == '.',
                new ScannerException("While scanning a directive", startMark,
                    expected("digit or '.'", reader_.peek()), reader_.mark));
            // Skip the '.'.
            reader_.forward();

            reader_.sliceBuilder.write('.');
            scanYAMLDirectiveNumberToSlice(startMark);

            enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
                new ScannerException("While scanning a directive", startMark,
                    expected("digit or '.'", reader_.peek()), reader_.mark));
        }

        /// Scan a number from a YAML directive.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanYAMLDirectiveNumberToSlice(const Mark startMark) @safe
        {
            enforce(isDigit(reader_.peek()),
                new ScannerException("While scanning a directive", startMark,
                    expected("digit", reader_.peek()), reader_.mark));

            // Already found the first digit in the enforce(), so set length to 1.
            uint length = 1;
            while(reader_.peek(length).isDigit) { ++length; }

            reader_.sliceBuilder.write(reader_.get(length));
        }

        /// Scan value of a tag directive.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        ///
        /// Returns: Length of tag handle (which is before tag prefix) in scanned data
        uint scanTagDirectiveValueToSlice(const Mark startMark) @safe
        {
            findNextNonSpace();
            const startLength = reader_.sliceBuilder.length;
            scanTagDirectiveHandleToSlice(startMark);
            const handleLength = cast(uint)(reader_.sliceBuilder.length  - startLength);
            findNextNonSpace();
            scanTagDirectivePrefixToSlice(startMark);

            return handleLength;
        }

        /// Scan handle of a tag directive.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanTagDirectiveHandleToSlice(const Mark startMark) @safe
        {
            scanTagHandleToSlice!"directive"(startMark);
            enforce(reader_.peekByte() == ' ',
                new ScannerException("While scanning a directive handle", startMark,
                    expected("' '", reader_.peek()), reader_.mark));
        }

        /// Scan prefix of a tag directive.
        ///
        /// Assumes that the caller is building a slice in Reader, and puts the scanned
        /// characters into that slice.
        void scanTagDirectivePrefixToSlice(const Mark startMark) @safe
        {
            scanTagURIToSlice!"directive"(startMark);
            enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
                new ScannerException("While scanning a directive prefix", startMark,
                    expected("' '", reader_.peek()), reader_.mark));
        }

        /// Scan (and ignore) ignored line after a directive.
        void scanDirectiveIgnoredLine(const Mark startMark) @safe
        {
            findNextNonSpace();
            if(reader_.peekByte() == '#') { scanToNextBreak(); }
            enforce(reader_.peek().isBreak,
                new ScannerException("While scanning a directive", startMark,
                    expected("comment or a line break", reader_.peek()), reader_.mark));
            scanLineBreak();
        }


        /// Scan an alias or an anchor.
        ///
        /// The specification does not restrict characters for anchors and
        /// aliases. This may lead to problems, for instance, the document:
        ///   [ *alias, value ]
        /// can be interpreted in two ways, as
        ///   [ "value" ]
        /// and
        ///   [ *alias , "value" ]
        /// Therefore we restrict aliases to ASCII alphanumeric characters.
Token scanAnchor(const TokenID id) @safe
{
    const startMark = reader_.mark;
    // '*' introduces an alias, '&' an anchor; remember which we consumed.
    const dchar i = reader_.get();

    reader_.sliceBuilder.begin();
    if(i == '*') { scanAlphaNumericToSlice!"an alias"(startMark); }
    else { scanAlphaNumericToSlice!"an anchor"(startMark); }
    // On error, value is discarded as we return immediately
    char[] value = reader_.sliceBuilder.finish();

    enum anchorCtx = "While scanning an anchor";
    enum aliasCtx = "While scanning an alias";
    // The name must be terminated by whitespace or a flow indicator.
    enforce(reader_.peek().isWhiteSpace ||
        reader_.peekByte().among!('?', ':', ',', ']', '}', '%', '@'),
        new ScannerException(i == '*' ? aliasCtx : anchorCtx, startMark,
            expected("alphanumeric, '-' or '_'", reader_.peek()), reader_.mark));

    if(id == TokenID.alias_)
    {
        return aliasToken(startMark, reader_.mark, value);
    }
    if(id == TokenID.anchor)
    {
        return anchorToken(startMark, reader_.mark, value);
    }
    assert(false, "This code should never be reached");
}

/// Scan a tag token.
Token scanTag() @safe
{
    const startMark = reader_.mark;
    dchar c = reader_.peek(1);

    reader_.sliceBuilder.begin();
    scope(failure) { reader_.sliceBuilder.finish(); }
    // Index where tag handle ends and tag suffix starts in the tag value
    // (slice) we will produce.
    uint handleEnd;

    if(c == '<')
    {
        // Verbatim tag: '!<uri>'.
        reader_.forward(2);

        handleEnd = 0;
        scanTagURIToSlice!"tag"(startMark);
        enforce(reader_.peekByte() == '>',
            new ScannerException("While scanning a tag", startMark,
                expected("'>'", reader_.peek()), reader_.mark));
        reader_.forward();
    }
    else if(c.isWhiteSpace)
    {
        // Non-specific tag: a lone '!'.
        reader_.forward();
        handleEnd = 0;
        reader_.sliceBuilder.write('!');
    }
    else
    {
        // Shorthand tag: either '!handle!suffix' or '!suffix'.
        uint length = 1;
        bool useHandle;

        // Look ahead for a second '!' to decide which form this is.
        while(!c.isBreakOrSpace)
        {
            if(c == '!')
            {
                useHandle = true;
                break;
            }
            ++length;
            c = reader_.peek(length);
        }

        if(useHandle)
        {
            scanTagHandleToSlice!"tag"(startMark);
            handleEnd = cast(uint)reader_.sliceBuilder.length;
        }
        else
        {
            // No named handle; the implied handle is just '!'.
            reader_.forward();
            reader_.sliceBuilder.write('!');
            handleEnd = cast(uint)reader_.sliceBuilder.length;
        }

        scanTagURIToSlice!"tag"(startMark);
    }

    enforce(reader_.peek().isBreakOrSpace,
        new ScannerException("While scanning a tag", startMark, expected("' '", reader_.peek()),
            reader_.mark));

    char[] slice = reader_.sliceBuilder.finish();
    return tagToken(startMark, reader_.mark, slice, handleEnd);
}

/// Scan a block scalar token with specified style.
Token scanBlockScalar(const ScalarStyle style) @safe
{
    const startMark = reader_.mark;

    // Scan the header ('|' or '>' plus optional chomping/indent indicators).
    reader_.forward();

    const indicators = scanBlockScalarIndicators(startMark);

    const chomping = indicators[0];
    const increment = indicators[1];
    scanBlockScalarIgnoredLine(startMark);

    // Determine the indentation level and go to the first non-empty line.
    Mark endMark;
    uint indent = max(1, indent_ + 1);

    reader_.sliceBuilder.begin();
    alias Transaction = SliceBuilder.Transaction;
    // Used to strip the last line breaks written to the slice at the end of the
    // scalar, which may be needed based on chomping.
    Transaction breaksTransaction = Transaction(&reader_.sliceBuilder);
    // Read the first indentation/line breaks before the scalar.
    size_t startLen = reader_.sliceBuilder.length;
    if(increment == int.min)
    {
        // No explicit indentation indicator; auto-detect from content.
        auto indentation = scanBlockScalarIndentationToSlice();
        endMark = indentation[1];
        indent = max(indent, indentation[0]);
    }
    else
    {
        indent += increment - 1;
        endMark = scanBlockScalarBreaksToSlice(indent);
    }

    // int.max means there's no line break (int.max is outside UTF-32).
    dchar lineBreak = cast(dchar)int.max;

    // Scan the inner part of the block scalar.
    while(reader_.column == indent && reader_.peekByte() != '\0')
    {
        breaksTransaction.commit();
        const bool leadingNonSpace = !reader_.peekByte().among!(' ', '\t');
        // This is where the 'interesting' non-whitespace data gets read.
        scanToNextBreakToSlice();
        lineBreak = scanLineBreak();


        // This transaction serves to rollback data read in the
        // scanBlockScalarBreaksToSlice() call.
        breaksTransaction = Transaction(&reader_.sliceBuilder);
        startLen = reader_.sliceBuilder.length;
        // The line breaks should actually be written _after_ the if() block
        // below. We work around that by inserting
        endMark = scanBlockScalarBreaksToSlice(indent);

        // This will not run during the last iteration (see the if() vs the
        // while()), hence breaksTransaction rollback (which happens after this
        // loop) will never roll back data written in this if() block.
        if(reader_.column == indent && reader_.peekByte() != '\0')
        {
            // Unfortunately, folding rules are ambiguous.

            // This is the folding according to the specification:
            if(style == ScalarStyle.folded && lineBreak == '\n' &&
               leadingNonSpace && !reader_.peekByte().among!(' ', '\t'))
            {
                // No breaks were scanned; no need to insert the space in the
                // middle of slice.
                if(startLen == reader_.sliceBuilder.length)
                {
                    reader_.sliceBuilder.write(' ');
                }
            }
            else
            {
                // We need to insert in the middle of the slice in case any line
                // breaks were scanned.
                reader_.sliceBuilder.insert(lineBreak, startLen);
            }

            ////this is Clark Evans's interpretation (also in the spec
            ////examples):
            //
            //if(style == ScalarStyle.folded && lineBreak == '\n')
            //{
            //    if(startLen == endLen)
            //    {
            //        if(!" \t"d.canFind(reader_.peekByte()))
            //        {
            //            reader_.sliceBuilder.write(' ');
            //        }
            //        else
            //        {
            //            chunks ~= lineBreak;
            //        }
            //    }
            //}
            //else
            //{
            //    reader_.sliceBuilder.insertBack(lineBreak, endLen - startLen);
            //}
        }
        else
        {
            break;
        }
    }

    // If chomping is Keep, we keep (commit) the last scanned line breaks
    // (which are at the end of the scalar). Otherwise we remove them (end the
    // transaction).
    if(chomping == Chomping.keep) { breaksTransaction.commit(); }
    else { breaksTransaction.end(); }
    if(chomping != Chomping.strip && lineBreak != int.max)
    {
        // If chomping is Keep, we keep the line break but the first line break
        // that isn't stripped (since chomping isn't Strip in this branch) must
        // be inserted _before_ the other line breaks.
        if(chomping == Chomping.keep)
        {
            reader_.sliceBuilder.insert(lineBreak, startLen);
        }
        // If chomping is not Keep, breaksTransaction was cancelled so we can
        // directly write the first line break (as it isn't stripped - chomping
        // is not Strip)
        else
        {
            reader_.sliceBuilder.write(lineBreak);
        }
    }

    char[] slice = reader_.sliceBuilder.finish();
    return scalarToken(startMark, endMark, slice, style);
}

/// Scan chomping and indentation indicators of a scalar token.
///
/// Returns: a tuple of the chomping mode and the indentation increment
///          (int.min when no increment indicator was given).
Tuple!(Chomping, int) scanBlockScalarIndicators(const Mark startMark) @safe
{
    auto chomping = Chomping.clip;
    int increment = int.min;
    dchar c = reader_.peek();

    // Indicators can be in any order ('|1+' and '|+1' are both valid).
    if(getChomping(c, chomping))
    {
        getIncrement(c, increment, startMark);
    }
    else
    {
        const gotIncrement = getIncrement(c, increment, startMark);
        if(gotIncrement) { getChomping(c, chomping); }
    }

    // The header must be followed by a space, a break or end of input.
    enforce(c.among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
        new ScannerException("While scanning a block scalar", startMark,
            expected("chomping or indentation indicator", c), reader_.mark));

    return tuple(chomping, increment);
}

/// Get chomping indicator, if detected. Return false otherwise.
///
/// Used in scanBlockScalarIndicators.
///
/// Params:
///
/// c        = The character that may be a chomping indicator.
///            If one is detected, this is updated to the next character
///            in the Reader.
/// chomping = Write the chomping value here, if detected
///            ('+' means keep, '-' means strip).
bool getChomping(ref dchar c, ref Chomping chomping) @safe
{
    if(!c.among!('+', '-')) { return false; }
    chomping = c == '+' ? Chomping.keep : Chomping.strip;
    reader_.forward();
    c = reader_.peek();
    return true;
}

/// Get increment indicator, if detected. Return false otherwise.
///
/// Used in scanBlockScalarIndicators.
///
/// Params:
///
/// c         = The character that may be an increment indicator.
///             If an increment indicator is detected, this will be updated to
///             the next character in the Reader.
/// increment = Write the increment value here, if detected.
/// startMark = Mark for error messages.
bool getIncrement(ref dchar c, ref int increment, const Mark startMark) @safe
{
    if(!c.isDigit) { return false; }
    // Convert a digit to integer.
    increment = c - '0';
    assert(increment < 10 && increment >= 0, "Digit has invalid value");

    // '0' is not a valid indentation indicator; the range is 1-9.
    enforce(increment > 0,
        new ScannerException("While scanning a block scalar", startMark,
            expected("indentation indicator in range 1-9", "0"), reader_.mark));

    reader_.forward();
    c = reader_.peek();
    return true;
}

/// Scan (and ignore) ignored line in a block scalar.
void scanBlockScalarIgnoredLine(const Mark startMark) @safe
{
    findNextNonSpace();
    // Only a comment may follow the block scalar header on the same line.
    if(reader_.peekByte() == '#') { scanToNextBreak(); }

    enforce(reader_.peek().isBreak,
        new ScannerException("While scanning a block scalar", startMark,
            expected("comment or line break", reader_.peek()), reader_.mark));

    scanLineBreak();
}

/// Scan indentation in a block scalar, returning line breaks, max indent and end mark.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice.
Tuple!(uint, Mark) scanBlockScalarIndentationToSlice() @safe
{
    uint maxIndent;
    Mark endMark = reader_.mark;

    for(;;)
    {
        const dchar current = reader_.peek();
        if(current == ' ')
        {
            // Indentation space; track the deepest column reached.
            reader_.forward();
            maxIndent = max(reader_.column, maxIndent);
        }
        else if(current.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
        {
            // A break before any content; keep it in the slice.
            reader_.sliceBuilder.write(scanLineBreak());
            endMark = reader_.mark;
        }
        else
        {
            // First non-space, non-break character: indentation scan is done.
            break;
        }
    }

    return tuple(maxIndent, endMark);
}

/// Scan line breaks at lower or specified indentation in a block scalar.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice.
Mark scanBlockScalarBreaksToSlice(const uint indent) @safe
{
    Mark endMark = reader_.mark;

    for(;;)
    {
        // Skip indentation spaces, but never past the target column.
        while(reader_.column < indent && reader_.peekByte() == ' ')
        {
            reader_.forward();
        }
        const dchar current = reader_.peek();
        if(!current.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
        {
            break;
        }
        reader_.sliceBuilder.write(scanLineBreak());
        endMark = reader_.mark;
    }

    return endMark;
}

/// Scan a quoted flow scalar token with specified quotes.
Token scanFlowScalar(const ScalarStyle quotes) @safe
{
    const startMark = reader_.mark;
    // Consume the opening quote; remember it so we know when the scalar ends.
    const quote = reader_.get();

    reader_.sliceBuilder.begin();

    // Alternate between runs of non-space and space characters until we
    // reach the closing quote.
    scanFlowScalarNonSpacesToSlice(quotes, startMark);
    for(;;)
    {
        if(reader_.peek() == quote) { break; }
        scanFlowScalarSpacesToSlice(startMark);
        scanFlowScalarNonSpacesToSlice(quotes, startMark);
    }
    // Skip the closing quote.
    reader_.forward();

    auto scalar = reader_.sliceBuilder.finish();
    return scalarToken(startMark, reader_.mark, scalar, quotes);
}

/// Scan nonspace characters in a flow scalar.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice.
void scanFlowScalarNonSpacesToSlice(const ScalarStyle quotes, const Mark startMark)
    @safe
{
    for(;;)
    {
        dchar c = reader_.peek();

        // Consume a whole run of characters that can't end or break the
        // scalar in one go.
        size_t numCodePoints;
        while(!reader_.peek(numCodePoints).isFlowScalarBreakSpace) { ++numCodePoints; }

        if (numCodePoints > 0) { reader_.sliceBuilder.write(reader_.get(numCodePoints)); }

        c = reader_.peek();
        if(quotes == ScalarStyle.singleQuoted && c == '\'' && reader_.peek(1) == '\'')
        {
            // '' is the escaped ' inside a single quoted scalar.
            reader_.forward(2);
            reader_.sliceBuilder.write('\'');
        }
        else if((quotes == ScalarStyle.doubleQuoted && c == '\'') ||
                (quotes == ScalarStyle.singleQuoted && c.among!('"', '\\')))
        {
            // These characters are ordinary in the current quoting style.
            reader_.forward();
            reader_.sliceBuilder.write(c);
        }
        else if(quotes == ScalarStyle.doubleQuoted && c == '\\')
        {
            reader_.forward();
            c = reader_.peek();
            if(c.among!(escapes))
            {
                reader_.forward();
                // Escaping has been moved to Parser as it can't be done in
                // place (in a slice) in case of '\P' and '\L' (very uncommon,
                // but we don't want to break the spec)
                char[2] escapeSequence = ['\\', cast(char)c];
                reader_.sliceBuilder.write(escapeSequence);
            }
            else if(c.among!(escapeHexCodeList))
            {
                // '\x', '\u' or '\U': a fixed number of hex digits follows.
                const hexLength = dyaml.escapes.escapeHexLength(c);
                reader_.forward();

                foreach(i; 0 .. hexLength) {
                    enforce(reader_.peek(i).isHexDigit,
                        new ScannerException("While scanning a double quoted scalar", startMark,
                            expected("escape sequence of hexadecimal numbers",
                                reader_.peek(i)), reader_.mark));
                }
                char[] hex = reader_.get(hexLength);

                enforce((hex.length > 0) && (hex.length <= 8),
                    new ScannerException("While scanning a double quoted scalar", startMark,
                        "overflow when parsing an escape sequence of " ~
                        "hexadecimal numbers.", reader_.mark));

                // The escape sequence is stored verbatim; the Parser decodes it.
                char[2] escapeStart = ['\\', cast(char) c];
                reader_.sliceBuilder.write(escapeStart);
                reader_.sliceBuilder.write(hex);

            }
            else if(c.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
            {
                // Escaped line break: fold it and any following blank lines.
                scanLineBreak();
                scanFlowScalarBreaksToSlice(startMark);
            }
            else
            {
                throw new ScannerException("While scanning a double quoted scalar", startMark,
                    text("found unsupported escape character ", c),
                    reader_.mark);
            }
        }
        else { return; }
    }
}

/// Scan space characters in a flow scalar.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// spaces into that slice.
void scanFlowScalarSpacesToSlice(const Mark startMark) @safe
{
    // Increase length as long as we see whitespace.
    size_t length;
    while(reader_.peekByte(length).among!(' ', '\t')) { ++length; }
    auto whitespaces = reader_.prefixBytes(length);

    // Can check the last byte without striding because '\0' is ASCII
    const c = reader_.peek(length);
    enforce(c != '\0',
        new ScannerException("While scanning a quoted scalar", startMark,
            "found unexpected end of buffer", reader_.mark));

    // Spaces not followed by a line break: keep them verbatim.
    if(!c.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
    {
        reader_.forward(length);
        reader_.sliceBuilder.write(whitespaces);
        return;
    }

    // There's a line break after the spaces; the spaces are folded away.
    reader_.forward(length);
    const lineBreak = scanLineBreak();

    // A non-'\n' break is kept as-is; a single '\n' may fold to a space below.
    if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }

    // If we have extra line breaks after the first, scan them into the
    // slice.
    const bool extraBreaks = scanFlowScalarBreaksToSlice(startMark);

    // No extra breaks, one normal line break. Replace it with a space.
    if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
}

/// Scan line breaks in a flow scalar.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// line breaks into that slice.
///
/// Returns: true if at least one line break was scanned.
bool scanFlowScalarBreaksToSlice(const Mark startMark) @safe
{
    // True if at least one line break was found.
    bool anyBreaks;
    for(;;)
    {
        // Instead of checking indentation, we check for document separators:
        // '---' or '...' followed by whitespace ends a quoted scalar illegally.
        const prefix = reader_.prefix(3);
        enforce(!(prefix == "---" || prefix == "...") ||
            !reader_.peek(3).isWhiteSpace,
            new ScannerException("While scanning a quoted scalar", startMark,
                "found unexpected document separator", reader_.mark));

        // Skip any whitespaces.
        while(reader_.peekByte().among!(' ', '\t')) { reader_.forward(); }

        // Encountered a non-whitespace non-linebreak character, so we're done.
        if(!reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029')) { break; }

        const lineBreak = scanLineBreak();
        anyBreaks = true;
        reader_.sliceBuilder.write(lineBreak);
    }
    return anyBreaks;
}

/// Scan plain scalar token (no block, no quotes).
Token scanPlain() @safe
{
    // We keep track of the allowSimpleKey_ flag here.
    // Indentation rules are loosened for the flow context.
    const startMark = reader_.mark;
    Mark endMark = startMark;
    const indent = indent_ + 1;

    // We allow zero indentation for scalars, but then we need to check for
    // document separators at the beginning of the line.
    // if(indent == 0) { indent = 1; }

    reader_.sliceBuilder.begin();

    alias Transaction = SliceBuilder.Transaction;
    Transaction spacesTransaction;
    // Stop at a comment.
    while(reader_.peekByte() != '#')
    {
        // Scan the entire plain scalar.
        size_t length;
        dchar c = reader_.peek(length);
        // Find where this run of scalar content ends.
        for(;;)
        {
            const cNext = reader_.peek(length + 1);
            if(c.isWhiteSpace ||
               (flowLevel_ == 0 && c == ':' && cNext.isWhiteSpace) ||
               (flowLevel_ > 0 && c.among!(',', ':', '?', '[', ']', '{', '}')))
            {
                break;
            }
            ++length;
            c = cNext;
        }

        // It's not clear what we should do with ':' in the flow context.
        enforce(flowLevel_ == 0 || c != ':' ||
           reader_.peek(length + 1).isWhiteSpace ||
           reader_.peek(length + 1).among!(',', '[', ']', '{', '}'),
            new ScannerException("While scanning a plain scalar", startMark,
                "found unexpected ':' . Please check " ~
                "http://pyyaml.org/wiki/YAMLColonInFlowContext for details.",
                reader_.mark));

        if(length == 0) { break; }

        // A plain scalar can no longer start a simple key on this line.
        allowSimpleKey_ = false;

        reader_.sliceBuilder.write(reader_.get(length));

        endMark = reader_.mark;

        spacesTransaction.commit();
        spacesTransaction = Transaction(&reader_.sliceBuilder);

        const startLength = reader_.sliceBuilder.length;
        scanPlainSpacesToSlice();
        // Stop if no spaces were consumed, or if we dropped below the
        // required indentation in block context.
        if(startLength == reader_.sliceBuilder.length ||
           (flowLevel_ == 0 && reader_.column < indent))
        {
            break;
        }
    }

    // Trailing spaces/breaks are not part of the scalar; roll them back.
    spacesTransaction.end();
    char[] slice = reader_.sliceBuilder.finish();

    return scalarToken(startMark, endMark, slice, ScalarStyle.plain);
}

/// Scan spaces in a plain scalar.
///
/// Assumes that the caller is building a slice in Reader, and puts the spaces
/// into that slice.
void scanPlainSpacesToSlice() @safe
{
    // The specification is really confusing about tabs in plain scalars.
    // We just forbid them completely. Do not use tabs in YAML!

    // Get as many plain spaces as there are.
    size_t length;
    while(reader_.peekByte(length) == ' ') { ++length; }
    char[] whitespaces = reader_.prefixBytes(length);
    reader_.forward(length);

    const dchar c = reader_.peek();
    if(!c.isNSChar)
    {
        // We have spaces, but no newline.
        if(whitespaces.length > 0) { reader_.sliceBuilder.write(whitespaces); }
        return;
    }

    // Newline after the spaces (if any)
    const lineBreak = scanLineBreak();
    // A line break means a following scalar token could be a simple key.
    allowSimpleKey_ = true;

    // True when a '---'/'...' document separator (followed by whitespace)
    // starts at the current position, which terminates the plain scalar.
    static bool end(Reader reader_) @safe pure
    {
        const prefix = reader_.prefix(3);
        return ("---" == prefix || "..." == prefix)
                && reader_.peek(3).among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');
    }

    if(end(reader_)) { return; }

    bool extraBreaks;

    alias Transaction = SliceBuilder.Transaction;
    auto transaction = Transaction(&reader_.sliceBuilder);
    // A non-'\n' break is kept; a single '\n' may fold to a space below.
    if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }
    while(reader_.peek().isNSChar)
    {
        if(reader_.peekByte() == ' ') { reader_.forward(); }
        else
        {
            const lBreak = scanLineBreak();
            extraBreaks = true;
            reader_.sliceBuilder.write(lBreak);

            // Returning here rolls back the transaction, dropping the breaks.
            if(end(reader_)) { return; }
        }
    }
    transaction.commit();

    // No line breaks, only a space.
    if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
}

/// Scan handle of a tag token.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice.
///
/// Params:
///
/// name      = Context name ("tag" or "directive") compiled into error messages.
/// startMark = Mark for error messages.
void scanTagHandleToSlice(string name)(const Mark startMark)
{
    dchar c = reader_.peek();
    enum contextMsg = "While scanning a " ~ name;
    // A handle always starts with '!'.
    enforce(c == '!',
        new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark));

    uint length = 1;
    c = reader_.peek(length);
    if(c != ' ')
    {
        // A named handle ('!name!') must close with a second '!'.
        while(c.isAlphaNum || c.among!('-', '_'))
        {
            ++length;
            c = reader_.peek(length);
        }
        enforce(c == '!',
            new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark));
        ++length;
    }

    reader_.sliceBuilder.write(reader_.get(length));
}

/// Scan URI in a tag token.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice.
void scanTagURIToSlice(string name)(const Mark startMark)
{
    // Note: we do not check if URI is well-formed.
    dchar c = reader_.peek();
    const startLen = reader_.sliceBuilder.length;
    {
        uint length;
        while(c.isAlphaNum || c.isURIChar)
        {
            if(c == '%')
            {
                // Flush pending plain characters, then decode the %XX run.
                auto chars = reader_.get(length);
                reader_.sliceBuilder.write(chars);
                length = 0;
                scanURIEscapesToSlice!name(startMark);
            }
            else { ++length; }
            c = reader_.peek(length);
        }
        // Flush any remaining plain characters.
        if(length > 0)
        {
            auto chars = reader_.get(length);
            reader_.sliceBuilder.write(chars);
            length = 0;
        }
    }
    // OK if we scanned something, error otherwise.
    // NOTE(review): this message says "While parsing" where the other scan
    // functions say "While scanning" — possibly intentional; confirm before
    // changing, as it is a user-visible error string.
    enum contextMsg = "While parsing a " ~ name;
    enforce(reader_.sliceBuilder.length > startLen,
        new ScannerException(contextMsg, startMark, expected("URI", c), reader_.mark));
}

// Not @nogc yet because std.utf.decode is not @nogc
/// Scan URI escape sequences.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice.
void scanURIEscapesToSlice(string name)(const Mark startMark)
{
    import core.exception : UnicodeException;
    // URI escapes encode a UTF-8 string. We store UTF-8 code units here for
    // decoding into UTF-32.
    Appender!string buffer;


    enum contextMsg = "While scanning a " ~ name;
    while(reader_.peekByte() == '%')
    {
        reader_.forward();
        char[2] nextByte = [reader_.peekByte(), reader_.peekByte(1)];

        enforce(nextByte[0].isHexDigit && nextByte[1].isHexDigit,
            new ScannerException(contextMsg, startMark,
                expected("URI escape sequence of 2 hexadecimal " ~
                    "numbers", nextByte), reader_.mark));

        // Two hex digits form one UTF-8 code unit.
        buffer ~= nextByte[].to!ubyte(16);

        reader_.forward(2);
    }
    try
    {
        // Decoding validates that the accumulated bytes are well-formed UTF-8.
        foreach (dchar chr; buffer.data)
        {
            reader_.sliceBuilder.write(chr);
        }
    }
    catch (UnicodeException)
    {
        throw new ScannerException(contextMsg, startMark,
                "Invalid UTF-8 data encoded in URI escape sequence",
                reader_.mark);
    }
}


/// Scan a line break, if any.
///
/// Transforms:
///   '\r\n'   : '\n'
///   '\r'     : '\n'
///   '\n'     : '\n'
///   '\u0085' : '\n'
///   '\u2028' : '\u2028'
///   '\u2029' : '\u2029'
///   no break : '\0'
dchar scanLineBreak() @safe
{
    // Fast path for ASCII line breaks.
    const b = reader_.peekByte();
    if(b < 0x80)
    {
        if(b == '\n' || b == '\r')
        {
            // '\r\n' is consumed as a single break.
            if(reader_.prefix(2) == "\r\n") { reader_.forward(2); }
            else { reader_.forward(); }
            return '\n';
        }
        return '\0';
    }

    const c = reader_.peek();
    if(c == '\x85')
    {
        reader_.forward();
        return '\n';
    }
    if(c == '\u2028' || c == '\u2029')
    {
        reader_.forward();
        return c;
    }
    return '\0';
}
}