// Copyright Ferdinand Majerech 2011-2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

module dyaml.reader;


import core.stdc.stdlib;
import core.stdc.string;
import core.thread;

import std.algorithm;
import std.array;
import std.conv;
import std.exception;
import std.range;
import std.string;
import std.system;
import std.typecons;
import std.utf;

import tinyendian;

import dyaml.encoding;
import dyaml.exception;

alias isBreak = among!('\n', '\u0085', '\u2028', '\u2029');

package:


/// Exception thrown at Reader errors.
class ReaderException : YAMLException
{
    this(string msg, string file = __FILE__, size_t line = __LINE__)
        @safe pure nothrow
    {
        super("Reader error: " ~ msg, file, line);
    }
}

/// Provides an API to read characters from a UTF-8 buffer and build slices into that
/// buffer to avoid allocations (see SliceBuilder).
final class Reader
{
private:
    // Buffer of currently loaded characters.
    char[] buffer_;

    // Current position within buffer. Only data after this position can be read.
    size_t bufferOffset_;

    // Index of the current character in the buffer.
    size_t charIndex_;
    // Number of characters (code points) in buffer_.
    size_t characterCount_;

    // File name
    string name_;
    // Current line in file.
    uint line_;
    // Current column in file.
    uint column_;

    // Original Unicode encoding of the data.
    Encoding encoding_;

    version(unittest)
    {
        // Endianness of the input before it was converted (for testing)
        Endian endian_;
    }

    // The number of consecutive ASCII characters starting at bufferOffset_.
    //
    // Used to minimize UTF-8 decoding.
    size_t upcomingASCII_;

    // Index to buffer_ where the last decoded character starts.
    size_t lastDecodedBufferOffset_;
    // Offset, relative to charIndex_, of the last decoded character,
    // in code points, not chars.
    size_t lastDecodedCharOffset_;

public:
    /// Construct a Reader.
    ///
    /// Params:  buffer = Buffer with YAML data. This may be e.g. the entire
    ///                   contents of a file or a string. It $(B will) be modified by
    ///                   the Reader and other parts of D:YAML (D:YAML tries to
    ///                   reuse the buffer to minimize memory allocations).
    ///          name   = File name if the buffer is the contents of a file or
    ///                   `"<unknown>"` if the buffer is the contents of a string.
    ///
    /// Throws:  ReaderException on a UTF decoding error or if there are
    ///          nonprintable Unicode characters illegal in YAML.
    this(ubyte[] buffer, string name = "<unknown>") @safe pure
    {
        name_ = name;
        auto endianResult = fixUTFByteOrder(buffer);
        if(endianResult.bytesStripped > 0)
        {
            throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned " ~
                                      "to 2 or 4 bytes, respectively");
        }

        version(unittest) { endian_ = endianResult.endian; }
        encoding_ = endianResult.encoding;

        auto utf8Result = toUTF8(endianResult.array, endianResult.encoding);
        const msg = utf8Result.errorMessage;
        if(msg !is null)
        {
            throw new ReaderException("Error when converting to UTF-8: " ~ msg);
        }

        buffer_ = utf8Result.utf8;

        characterCount_ = utf8Result.characterCount;
        // Check that all characters in buffer are printable.
        enforce(isPrintableValidUTF8(buffer_),
                new ReaderException("Special unicode characters are not allowed"));

        this.sliceBuilder = SliceBuilder(this);
        checkASCII();
    }

    /// Get character at specified index relative to current position.
    ///
    /// Params:  index = Index of the character to get relative to current position
    ///                  in the buffer. Can point outside of the buffer; in that
    ///                  case, '\0' will be returned.
    ///
    /// Returns: Character at specified position or '\0' if outside of the buffer.
    ///
    // XXX removed; search for 'risky' to find why.
    // Throws:  ReaderException if trying to read past the end of the buffer.
    dchar peek(const size_t index) @safe pure
    {
        if(index < upcomingASCII_) { return buffer_[bufferOffset_ + index]; }
        if(characterCount_ <= charIndex_ + index)
        {
            // XXX This is risky; revert this if bugs are introduced. We rely on
            // the assumption that Reader only uses peek() to detect end of buffer.
            // The test suite passes.
            // Revert this case here and in other peek() versions if this causes
            // errors.
            // throw new ReaderException("Trying to read past the end of the buffer");
            return '\0';
        }

        // Optimized path for Scanner code that peeks chars in linear order to
        // determine the length of some sequence.
        if(index == lastDecodedCharOffset_)
        {
            ++lastDecodedCharOffset_;
            const char b = buffer_[lastDecodedBufferOffset_];
            // ASCII
            if(b < 0x80)
            {
                ++lastDecodedBufferOffset_;
                return b;
            }
            return decode(buffer_, lastDecodedBufferOffset_);
        }

        // 'Slow' path where we decode everything up to the requested character.
        const asciiToTake = min(upcomingASCII_, index);
        lastDecodedCharOffset_ = asciiToTake;
        lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;
        dchar d;
        while(lastDecodedCharOffset_ <= index)
        {
            d = decodeNext();
        }

        return d;
    }

    /// Optimized version of peek() for the case where peek index is 0.
    dchar peek() @safe pure
    {
        if(upcomingASCII_ > 0)            { return buffer_[bufferOffset_]; }
        if(characterCount_ <= charIndex_) { return '\0'; }

        lastDecodedCharOffset_ = 0;
        lastDecodedBufferOffset_ = bufferOffset_;
        return decodeNext();
    }

    /// Get byte at specified index relative to current position.
    ///
    /// Params:  index = Index of the byte to get relative to current position
    ///                  in the buffer. Can point outside of the buffer; in that
    ///                  case, '\0' will be returned.
    ///
    /// Returns: Byte at specified position or '\0' if outside of the buffer.
    char peekByte(const size_t index) @safe pure nothrow @nogc
    {
        return characterCount_ > (charIndex_ + index) ? buffer_[bufferOffset_ + index] : '\0';
    }

    /// Optimized version of peekByte() for the case where peek byte index is 0.
    char peekByte() @safe pure nothrow @nogc
    {
        return characterCount_ > charIndex_ ? buffer_[bufferOffset_] : '\0';
    }


    /// Get specified number of characters starting at current position.
    ///
    /// Note: This gets only a "view" into the internal buffer, which will be
    ///       invalidated after other Reader calls. Use SliceBuilder to build slices
    ///       for permanent use.
    ///
    /// Params:  length = Number of characters (code points, not bytes) to get. May
    ///                   reach past the end of the buffer; in that case the returned
    ///                   slice will be shorter.
    ///
    /// Returns: Characters starting at current position or an empty slice if out of bounds.
    char[] prefix(const size_t length) @safe pure
    {
        return slice(length);
    }

    /// Get specified number of bytes, not code points, starting at current position.
    ///
    /// Note: This gets only a "view" into the internal buffer, which will be
    ///       invalidated after other Reader calls. Use SliceBuilder to build slices
    ///       for permanent use.
    ///
    /// Params:  length = Number of bytes (not code points) to get. May NOT reach past
    ///                   the end of the buffer; should be used with peek() to avoid
    ///                   this.
    ///
    /// Returns: Bytes starting at current position.
    char[] prefixBytes(const size_t length) @safe pure nothrow @nogc
    in(length == 0 || bufferOffset_ + length <= buffer_.length, "prefixBytes out of bounds")
    {
        return buffer_[bufferOffset_ .. bufferOffset_ + length];
    }

    /// Get a slice view of the internal buffer, starting at the current position.
    ///
    /// Note: This gets only a "view" into the internal buffer,
    ///       which gets invalidated after other Reader calls.
    ///
    /// Params:  end = End of the slice relative to current position. May reach past
    ///                the end of the buffer; in that case the returned slice will
    ///                be shorter.
    ///
    /// Returns: Slice into the internal buffer or an empty slice if out of bounds.
    char[] slice(const size_t end) @safe pure
    {
        // Fast path in case the caller has already peek()ed all the way to end.
        if(end == lastDecodedCharOffset_)
        {
            return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
        }

        const asciiToTake = min(upcomingASCII_, end, buffer_.length);
        lastDecodedCharOffset_ = asciiToTake;
        lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;

        // 'Slow' path - decode everything up to end.
        while(lastDecodedCharOffset_ < end &&
              lastDecodedBufferOffset_ < buffer_.length)
        {
            decodeNext();
        }

        return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
    }

    /// Get the next character, moving buffer position beyond it.
    ///
    /// Returns: Next character.
    ///
    /// Throws:  ReaderException if trying to read past the end of the buffer
    ///          or if invalid data is read.
    dchar get() @safe pure
    {
        const result = peek();
        forward();
        return result;
    }

    /// Get specified number of characters, moving buffer position beyond them.
    ///
    /// Params:  length = Number of characters (code points, not bytes) to get.
    ///
    /// Returns: Characters starting at current position.
    char[] get(const size_t length) @safe pure
    {
        auto result = slice(length);
        forward(length);
        return result;
    }

    /// Move current position forward.
    ///
    /// Params:  length = Number of characters to move position forward.
    void forward(size_t length) @safe pure
    {
        while(length > 0)
        {
            auto asciiToTake = min(upcomingASCII_, length);
            charIndex_ += asciiToTake;
            length -= asciiToTake;
            upcomingASCII_ -= asciiToTake;

            for(; asciiToTake > 0; --asciiToTake)
            {
                const c = buffer_[bufferOffset_++];
                // c is ASCII, so we only need to check for ASCII line breaks.
                if(c == '\n' || (c == '\r' && buffer_[bufferOffset_] != '\n'))
                {
                    ++line_;
                    column_ = 0;
                    continue;
                }
                ++column_;
            }

            // If we have used up all upcoming ASCII chars, the next char is
            // non-ASCII even after this returns, so upcomingASCII_ doesn't need to
            // be updated - it's zero.
            if(length == 0) { break; }

            assert(upcomingASCII_ == 0,
                   "Running unicode handling code but we haven't run out of ASCII chars");
            assert(bufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            assert(buffer_[bufferOffset_] >= 0x80,
                   "ASCII must be handled by preceding code");

            ++charIndex_;
            const c = decode(buffer_, bufferOffset_);

            // New line. (can compare with '\n' without decoding since it's ASCII)
            if(c.isBreak || (c == '\r' && buffer_[bufferOffset_] != '\n'))
            {
                ++line_;
                column_ = 0;
            }
            else if(c != '\uFEFF') { ++column_; }
            --length;
            checkASCII();
        }

        lastDecodedBufferOffset_ = bufferOffset_;
        lastDecodedCharOffset_ = 0;
    }

    /// Move current position forward by one character.
    void forward() @safe pure
    {
        ++charIndex_;
        lastDecodedBufferOffset_ = bufferOffset_;
        lastDecodedCharOffset_ = 0;

        // ASCII
        if(upcomingASCII_ > 0)
        {
            --upcomingASCII_;
            const c = buffer_[bufferOffset_++];

            if(c == '\n' || (c == '\r' && buffer_[bufferOffset_] != '\n'))
            {
                ++line_;
                column_ = 0;
                return;
            }
            ++column_;
            return;
        }

        // UTF-8
        assert(bufferOffset_ < buffer_.length,
               "Attempted to decode past the end of YAML buffer");
        assert(buffer_[bufferOffset_] >= 0x80,
               "ASCII must be handled by preceding code");

        const c = decode(buffer_, bufferOffset_);

        // New line. (can compare with '\n' without decoding since it's ASCII)
        if(c.isBreak || (c == '\r' && buffer_[bufferOffset_] != '\n'))
        {
            ++line_;
            column_ = 0;
        }
        else if(c != '\uFEFF') { ++column_; }

        checkASCII();
    }

    /// Used to build slices of read data in Reader, to avoid allocations.
    SliceBuilder sliceBuilder;

    /// Get a Mark describing the current buffer position, used for error messages.
    Mark mark() const pure nothrow @nogc @safe { return Mark(name_, line_, column_); }

    /// Get file name.
    string name() const @safe pure nothrow @nogc { return name_; }

    /// Set file name.
    void name(string name) pure @safe nothrow @nogc { name_ = name; }

    /// Get current line number.
    uint line() const @safe pure nothrow @nogc { return line_; }

    /// Get current column number.
    uint column() const @safe pure nothrow @nogc { return column_; }

    /// Get index of the current character in the buffer.
    size_t charIndex() const @safe pure nothrow @nogc { return charIndex_; }

    /// Get encoding of the input buffer.
    Encoding encoding() const @safe pure nothrow @nogc { return encoding_; }

private:
    // Update upcomingASCII_ (should be called after forward()ing over a UTF-8 sequence).
    void checkASCII() @safe pure nothrow @nogc
    {
        upcomingASCII_ = countASCII(buffer_[bufferOffset_ .. $]);
    }

    // Decode the next character relative to
    // lastDecodedCharOffset_/lastDecodedBufferOffset_ and update them.
    //
    // Does not advance the buffer position. Used in peek() and slice().
    dchar decodeNext() @safe pure
    {
        assert(lastDecodedBufferOffset_ < buffer_.length,
               "Attempted to decode past the end of YAML buffer");
        const char b = buffer_[lastDecodedBufferOffset_];
        ++lastDecodedCharOffset_;
        // ASCII
        if(b < 0x80)
        {
            ++lastDecodedBufferOffset_;
            return b;
        }

        return decode(buffer_, lastDecodedBufferOffset_);
    }
}

/// Used to build slices of already read data in Reader buffer, avoiding allocations.
///
/// Usually these slices point to unchanged Reader data, but sometimes the data is
/// changed due to how YAML interprets certain characters/strings.
///
/// See begin() documentation.
struct SliceBuilder
{
private:
    // No copying by the user.
    @disable this(this);
    @disable void opAssign(ref SliceBuilder);

    // Reader this builder works in.
    Reader reader_;

    // Start of the slice in reader_.buffer_ (size_t.max while no slice is being built).
    size_t start_ = size_t.max;
    // End of the slice in reader_.buffer_ (size_t.max while no slice is being built).
    size_t end_ = size_t.max;

    // Stack of slice ends to revert to (see Transaction).
    //
    // Very few levels as we don't want arbitrarily nested transactions.
    size_t[4] endStack_;
    // The number of elements currently in endStack_.
    size_t endStackUsed_;

    @safe const pure nothrow @nogc invariant()
    {
        if(!inProgress) { return; }
        assert(end_ <= reader_.bufferOffset_, "Slice ends after buffer position");
        assert(start_ <= end_, "Slice start after slice end");
    }

    // Is a slice currently being built?
    bool inProgress() @safe const pure nothrow @nogc
    in(start_ == size_t.max ? end_ == size_t.max : end_ != size_t.max, "start_/end_ are not consistent")
    {
        return start_ != size_t.max;
    }

public:
    /// Begin building a slice.
    ///
    /// Only one slice can be built at any given time; before beginning a new slice,
    /// finish the previous one (if any).
    ///
    /// The slice starts at the current position in the Reader buffer. It can only be
    /// extended up to the current position in the buffer; Reader methods get() and
    /// forward() move the position. E.g. it is valid to extend a slice by write()-ing
    /// a string just returned by get() - but not one returned by prefix() unless the
    /// position has changed since the prefix() call.
    void begin() @safe pure nothrow @nogc
    in(!inProgress, "Beginning a slice while another slice is being built")
    in(endStackUsed_ == 0, "Slice stack not empty at slice begin")
    {
        start_ = reader_.bufferOffset_;
        end_ = reader_.bufferOffset_;
    }

    /// Finish building a slice and return it.
    ///
    /// Any Transactions on the slice must be committed or destroyed before the slice
    /// is finished.
    ///
    /// Returns a string; once a slice is finished, its contents are guaranteed not
    /// to change.
    char[] finish() @safe pure nothrow @nogc
    in(inProgress, "finish called without begin")
    in(endStackUsed_ == 0, "Finishing a slice with running transactions.")
    {
        auto result = reader_.buffer_[start_ .. end_];
        start_ = end_ = size_t.max;
        return result;
    }

    /// Write a string to the slice being built.
    ///
    /// Data can only be written up to the current position in the Reader buffer.
    ///
    /// If str is a string returned by a Reader method, and str starts right after the
    /// end of the slice being built, the slice is extended (trivial operation).
    ///
    /// See_Also: begin
    void write(scope char[] str) @safe pure nothrow @nogc
    {
        assert(inProgress, "write called without begin");
        assert(end_ <= reader_.bufferOffset_,
               "AT START: Slice ends after buffer position");

        // Nothing? Already done.
        if (str.length == 0) { return; }
        // If str starts at the end of the slice (is a string returned by a Reader
        // method), just extend the slice to contain str.
        if(&str[0] == &reader_.buffer_[end_])
        {
            end_ += str.length;
        }
        // Even if str does not start at the end of the slice, it still may be returned
        // by a Reader method and point to buffer. So we need to memmove.
        else
        {
            copy(str, reader_.buffer_[end_ .. end_ + str.length * char.sizeof]);
            end_ += str.length;
        }
    }

    /// Write a character to the slice being built.
    ///
    /// Data can only be written up to the current position in the Reader buffer.
    ///
    /// See_Also: begin
    void write(dchar c) @safe pure
    in(inProgress, "write called without begin")
    {
        if(c < 0x80)
        {
            reader_.buffer_[end_++] = cast(char)c;
            return;
        }

        // We need to encode a non-ASCII dchar into UTF-8
        char[4] encodeBuf;
        const bytes = encode(encodeBuf, c);
        reader_.buffer_[end_ .. end_ + bytes] = encodeBuf[0 .. bytes];
        end_ += bytes;
    }

    /// Insert a character at a specified position in the slice.
    ///
    /// Enlarges the slice by 1 char. Note that the slice can only extend up to the
    /// current position in the Reader buffer.
    ///
    /// Params:
    ///
    /// c        = The character to insert.
    /// position = Position to insert the character at in code units, not code points.
    ///            Must be less than slice length(); a previously returned length()
    ///            can be used.
    void insert(const dchar c, const size_t position) @safe pure
    in(inProgress, "insert called without begin")
    in(start_ + position <= end_, "Trying to insert after the end of the slice")
    {
        const point = start_ + position;
        const movedLength = end_ - point;

        // Encode c into UTF-8
        char[4] encodeBuf;
        if(c < 0x80) { encodeBuf[0] = cast(char)c; }
        const size_t bytes = c < 0x80 ? 1 : encode(encodeBuf, c);

        if(movedLength > 0)
        {
            copy(reader_.buffer_[point .. point + movedLength * char.sizeof],
                 reader_.buffer_[point + bytes .. point + bytes + movedLength * char.sizeof]);
        }
        reader_.buffer_[point .. point + bytes] = encodeBuf[0 .. bytes];
        end_ += bytes;
    }

    /// Get the current length of the slice.
    size_t length() @safe const pure nothrow @nogc
    {
        return end_ - start_;
    }

    /// A slice building transaction.
    ///
    /// Can be used to save and later revert to a slice state.
    struct Transaction
    {
    private:
        // The slice builder affected by the transaction.
        SliceBuilder* builder_;
        // Index of the return point of the transaction in SliceBuilder.endStack_.
        size_t stackLevel_;
        // True after commit() has been called.
        bool committed_;

    public:
        /// Begins a transaction on a SliceBuilder object.
        ///
        /// The transaction must end $(B after) any transactions created within the
        /// transaction but $(B before) the slice is finish()-ed.
        /// A transaction can be
        /// ended either by commit()-ing it or by reverting it through end().
        ///
        /// Saves the current state of a slice.
        this(SliceBuilder* builder) @safe pure nothrow @nogc
        {
            builder_ = builder;
            stackLevel_ = builder_.endStackUsed_;
            builder_.push();
        }

        /// Commit changes to the slice.
        ///
        /// Ends the transaction - can only be called once, and removes the possibility
        /// to revert slice state.
        ///
        /// Does nothing for a default-initialized transaction (the transaction has not
        /// been started yet).
        void commit() @safe pure nothrow @nogc
        in(!committed_, "Can't commit a transaction more than once")
        {
            if(builder_ is null) { return; }
            assert(builder_.endStackUsed_ == stackLevel_ + 1,
                   "Parent transactions don't fully contain child transactions");
            builder_.apply();
            committed_ = true;
        }

        /// End the transaction and revert it if it hasn't been committed yet.
        void end() @safe pure nothrow @nogc
        in(builder_ && builder_.endStackUsed_ == stackLevel_ + 1,
           "Parent transactions don't fully contain child transactions")
        {
            builder_.pop();
            builder_ = null;
        }

    }

private:
    // Push the current end of the slice so we can revert to it if needed.
    //
    // Used by Transaction.
    void push() @safe pure nothrow @nogc
    in(inProgress, "push called without begin")
    in(endStackUsed_ < endStack_.length, "Slice stack overflow")
    {
        endStack_[endStackUsed_++] = end_;
    }

    // Pop the current end of endStack_ and set the end of the slice to the popped
    // value, reverting changes since the old end was pushed.
    //
    // Used by Transaction.
    void pop() @safe pure nothrow @nogc
    in(inProgress, "pop called without begin")
    in(endStackUsed_ > 0, "Trying to pop an empty slice stack")
    {
        end_ = endStack_[--endStackUsed_];
    }

    // Pop the current end of endStack_, but keep the current end of the slice, applying
    // changes made since pushing the old end.
    //
    // Used by Transaction.
    void apply() @safe pure nothrow @nogc
    in(inProgress, "apply called without begin")
    in(endStackUsed_ > 0, "Trying to apply an empty slice stack")
    {
        --endStackUsed_;
    }
}


private:

// Convert a UTF-8/16/32 buffer to UTF-8, in-place if possible.
//
// Params:
//
// input    = Buffer with UTF-8/16/32 data to decode. May be overwritten by the
//            conversion, in which case the result will be a slice of this buffer.
// encoding = Encoding of input.
//
// Returns:
//
// A struct with the following members:
//
// $(D string errorMessage)   In case of an error, the error message is stored here. If
//                            there was no error, errorMessage is NULL. Always check
//                            this first.
// $(D char[] utf8)           input converted to UTF-8. May be a slice of input.
// $(D size_t characterCount) Number of characters (code points) in input.
auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;
        char[] utf8;
        size_t characterCount;
    }

    Result result;

    // Encode input into UTF-8 if it's encoded as UTF-16 or UTF-32.
    //
    // Params:
    //
    // input  = The input buffer to encode.
    // result = A Result struct to put encoded result and any error messages to.
    //
    // On error, result.errorMessage will be set.
    static void encode(C)(C[] input, ref Result result) @safe pure
    {
        // We can do UTF-32->UTF-8 in place because all UTF-8 sequences are 4 or
        // less bytes.
        static if(is(C == dchar))
        {
            char[4] encodeBuf;
            auto utf8 = cast(char[])input;
            auto length = 0;
            foreach(dchar c; input)
            {
                ++result.characterCount;
                // ASCII
                if(c < 0x80)
                {
                    utf8[length++] = cast(char)c;
                    continue;
                }

                std.utf.encode(encodeBuf, c);
                const bytes = codeLength!char(c);
                utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
                length += bytes;
            }
            result.utf8 = utf8[0 .. length];
        }
        // Unfortunately we can't do UTF-16 in place so we just use std.conv.to.
        else
        {
            result.characterCount = std.utf.count(input);
            result.utf8 = input.to!(char[]);
        }
    }

    try final switch(encoding)
    {
        case UTFEncoding.UTF_8:
            result.utf8 = cast(char[])input;
            result.utf8.validate();
            result.characterCount = std.utf.count(result.utf8);
            break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            encode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0, "UTF-32 buffer size must be a multiple of 4");
            encode(cast(dchar[])input, result);
            break;
    }
    catch(ConvException e) { result.errorMessage = e.msg; }
    catch(UTFException e)  { result.errorMessage = e.msg; }
    catch(Exception e)
    {
        assert(false, "Unexpected exception in encode(): " ~ e.msg);
    }

    return result;
}

/// Determine if all characters (code points, not bytes) in a string are printable.
bool isPrintableValidUTF8(const char[] chars) @safe pure
{
    import std.uni : isControl, isWhite;
    foreach (dchar chr; chars)
    {
        if (!chr.isValidDchar || (chr.isControl && !chr.isWhite))
        {
            return false;
        }
    }
    return true;
}

/// Counts the number of ASCII characters in buffer until the first non-ASCII code unit.
///
/// Used to determine how many characters we can process without decoding.
size_t countASCII(const(char)[] buffer) @safe pure nothrow @nogc
{
    return buffer.byCodeUnit.until!(x => x > 0x7F).walkLength;
}

// Unittests.
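
// A minimal usage sketch of the Reader/SliceBuilder API documented above.
// Illustrative only: the input string and the expected values are arbitrary
// examples and this test is not part of the original suite.
@system unittest
{
    auto reader = new Reader(cast(ubyte[])"hello world".dup);
    reader.sliceBuilder.begin();
    char[] chunk = reader.get(5);     // reads "hello" and moves the position past it
    reader.sliceBuilder.write(chunk); // chunk starts at the slice end, so this just extends the slice
    assert(reader.sliceBuilder.finish() == "hello");
}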

void testEndian(R)()
{
    void endian_test(ubyte[] data, Encoding encoding_expected, Endian endian_expected)
    {
        auto reader = new R(data);
        assert(reader.encoding == encoding_expected);
        assert(reader.endian_ == endian_expected);
    }
    ubyte[] little_endian_utf_16 = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] big_endian_utf_16 = [0xFE, 0xFF, 0x00, 0x7A];
    endian_test(little_endian_utf_16, Encoding.UTF_16, Endian.littleEndian);
    endian_test(big_endian_utf_16, Encoding.UTF_16, Endian.bigEndian);
}

void testPeekPrefixForward(R)()
{
    import std.encoding;
    ubyte[] data = bomTable[BOM.utf8].sequence ~ cast(ubyte[])"data";
    auto reader = new R(data);
    assert(reader.peek() == 'd');
    assert(reader.peek(1) == 'a');
    assert(reader.peek(2) == 't');
    assert(reader.peek(3) == 'a');
    assert(reader.peek(4) == '\0');
    assert(reader.prefix(4) == "data");
    // assert(reader.prefix(6) == "data\0");
    reader.forward(2);
    assert(reader.peek(1) == 'a');
    // assert(collectException(reader.peek(3)));
}

void testUTF(R)()
{
    import std.encoding;
    dchar[] data = cast(dchar[])"data";
    void utf_test(T)(T[] data, BOM bom)
    {
        ubyte[] bytes = bomTable[bom].sequence ~
                        (cast(ubyte[])data)[0 .. data.length * T.sizeof];
        auto reader = new R(bytes);
        assert(reader.peek() == 'd');
        assert(reader.peek(1) == 'a');
        assert(reader.peek(2) == 't');
        assert(reader.peek(3) == 'a');
    }
    utf_test!char(to!(char[])(data), BOM.utf8);
    utf_test!wchar(to!(wchar[])(data), endian == Endian.bigEndian ? BOM.utf16be : BOM.utf16le);
    utf_test(data, endian == Endian.bigEndian ? BOM.utf32be : BOM.utf32le);
}

void test1Byte(R)()
{
    ubyte[] data = [97];

    auto reader = new R(data);
    assert(reader.peek() == 'a');
    assert(reader.peek(1) == '\0');
    // assert(collectException(reader.peek(2)));
}

@system unittest
{
    testEndian!Reader();
    testPeekPrefixForward!Reader();
    testUTF!Reader();
    test1Byte!Reader();
}

// Issue 257 - https://github.com/dlang-community/D-YAML/issues/257
@safe unittest
{
    import dyaml.loader : Loader;
    auto yaml = "hello ";
    auto root = Loader.fromString(yaml).load();

    assert(root.isValid);
}
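
// A minimal sketch of SliceBuilder.Transaction usage, based on the API above.
// Illustrative only: the input and the expected values are arbitrary examples
// and this test is not part of the original suite.
@system unittest
{
    auto reader = new Reader(cast(ubyte[])"abcdef".dup);
    reader.sliceBuilder.begin();
    reader.sliceBuilder.write(reader.get(2));  // slice covers "ab"

    auto transaction = SliceBuilder.Transaction(&reader.sliceBuilder);
    reader.sliceBuilder.write(reader.get(2));  // tentatively extend the slice to "abcd"
    transaction.end();                         // never committed, so revert to "ab"

    assert(reader.sliceBuilder.finish() == "ab");
}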