// Copyright Ferdinand Majerech 2011-2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
//  http://www.boost.org/LICENSE_1_0.txt)

module dyaml.reader;


import core.stdc.stdlib;
import core.stdc.string;
import core.thread;

import std.algorithm;
import std.array;
import std.conv;
import std.exception;
import std.stdio;
import std.string;
import std.system;
import std.typecons;
import std.utf;

import tinyendian;

import dyaml.fastcharsearch;
import dyaml.encoding;
import dyaml.exception;
import dyaml.nogcutil;


package:


/// Exception thrown at Reader errors.
class ReaderException : YAMLException
{
    this(string msg, string file = __FILE__, int line = __LINE__)
        @safe pure nothrow
    {
        super("Reader error: " ~ msg, file, line);
    }
}

/// Provides an API to read characters from a UTF-8 buffer and build slices into that
/// buffer to avoid allocations (see SliceBuilder).
final class Reader
{
private:
    // Buffer of currently loaded characters.
    char[] buffer_ = null;

    // Current position within buffer. Only data after this position can be read.
    size_t bufferOffset_ = 0;

    // Index of the current character in the buffer.
    size_t charIndex_ = 0;
    // Number of characters (code points) in buffer_.
    size_t characterCount_ = 0;

    // Current line in file.
    uint line_;
    // Current column in file.
    uint column_;

    // Original Unicode encoding of the data.
    Encoding encoding_;

    version(unittest)
    {
        // Endianness of the input before it was converted (for testing).
        Endian endian_;
    }

    // The number of consecutive ASCII characters starting at bufferOffset_.
    //
    // Used to minimize UTF-8 decoding.
    size_t upcomingASCII_ = 0;

    // Index to buffer_ where the last decoded character starts.
    size_t lastDecodedBufferOffset_ = 0;
    // Offset, relative to charIndex_, of the last decoded character,
    // in code points, not chars.
    size_t lastDecodedCharOffset_ = 0;

public:
    /// Construct a Reader.
    ///
    /// Params:  buffer = Buffer with YAML data. This may be e.g. the entire contents
    ///                   of a file or a string. It $(B will) be modified by the Reader
    ///                   and other parts of D:YAML (D:YAML tries to reuse the buffer
    ///                   to minimize memory allocations).
    ///
    /// Throws:  ReaderException on a UTF decoding error or if there are
    ///          nonprintable Unicode characters illegal in YAML.
    this(ubyte[] buffer) @trusted pure //!nothrow
    {
        auto endianResult = fixUTFByteOrder(buffer);
        if(endianResult.bytesStripped > 0)
        {
            throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned " ~
                                      "to 2 or 4 bytes, respectively");
        }

        version(unittest) { endian_ = endianResult.endian; }
        encoding_ = endianResult.encoding;

        auto utf8Result = toUTF8(endianResult.array, endianResult.encoding);
        const msg = utf8Result.errorMessage;
        if(msg !is null)
        {
            throw new ReaderException("Error when converting to UTF-8: " ~ msg);
        }

        buffer_ = utf8Result.utf8;

        characterCount_ = utf8Result.characterCount;
        // Check that all characters in buffer are printable.
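        // ("Printable" here means TAB, LF, CR, bytes 0x20..0x7F, NEL (U+0085) and the
        // U+00A0..U+D7FF / U+E000..U+FFFD ranges; see isPrintableValidUTF8() below.)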
        enforce(isPrintableValidUTF8(buffer_),
                new ReaderException("Special unicode characters are not allowed"));

        this.sliceBuilder = SliceBuilder(this);
        checkASCII();
    }

pure nothrow @nogc:
    /// Get character at specified index relative to current position.
    ///
    /// Params:  index = Index of the character to get relative to current position
    ///                  in the buffer. Can point outside of the buffer; In that
    ///                  case, '\0' will be returned.
    ///
    /// Returns: Character at specified position or '\0' if outside of the buffer.
    ///
    // XXX removed; search for 'risky' to find why.
    // Throws:  ReaderException if trying to read past the end of the buffer.
    dchar peek(const size_t index) @safe
    {
        if(index < upcomingASCII_) { return buffer_[bufferOffset_ + index]; }
        if(characterCount_ <= charIndex_ + index)
        {
            // XXX This is risky; revert this if bugs are introduced. We rely on
            // the assumption that Reader only uses peek() to detect end of buffer.
            // The test suite passes.
            // Revert this case here and in other peek() versions if this causes
            // errors.
            // throw new ReaderException("Trying to read past the end of the buffer");
            return '\0';
        }

        // Optimized path for Scanner code that peeks chars in linear order to
        // determine the length of some sequence.
        if(index == lastDecodedCharOffset_)
        {
            ++lastDecodedCharOffset_;
            const char b = buffer_[lastDecodedBufferOffset_];
            // ASCII
            if(b < 0x80)
            {
                ++lastDecodedBufferOffset_;
                return b;
            }
            return decodeValidUTF8NoGC(buffer_, lastDecodedBufferOffset_);
        }

        // 'Slow' path where we decode everything up to the requested character.
        const asciiToTake = min(upcomingASCII_, index);
        lastDecodedCharOffset_   = asciiToTake;
        lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;
        dchar d;
        while(lastDecodedCharOffset_ <= index)
        {
            d = decodeNext();
        }

        return d;
    }

    /// Optimized version of peek() for the case where peek index is 0.
    dchar peek() @safe
    {
        if(upcomingASCII_ > 0)            { return buffer_[bufferOffset_]; }
        if(characterCount_ <= charIndex_) { return '\0'; }

        lastDecodedCharOffset_   = 0;
        lastDecodedBufferOffset_ = bufferOffset_;
        return decodeNext();
    }

    /// Get byte at specified index relative to current position.
    ///
    /// Params:  index = Index of the byte to get relative to current position
    ///                  in the buffer. Can point outside of the buffer; In that
    ///                  case, '\0' will be returned.
    ///
    /// Returns: Byte at specified position or '\0' if outside of the buffer.
    char peekByte(const size_t index) @safe
    {
        return characterCount_ > (charIndex_ + index) ? buffer_[bufferOffset_ + index] : '\0';
    }

    /// Optimized version of peekByte() for the case where peek byte index is 0.
    char peekByte() @safe
    {
        return characterCount_ > charIndex_ ? buffer_[bufferOffset_] : '\0';
    }


    /// Get specified number of characters starting at current position.
    ///
    /// Note: This gets only a "view" into the internal buffer, which will be
    ///       invalidated after other Reader calls. Use SliceBuilder to build slices
    ///       for permanent use.
    ///
    /// Params:  length = Number of characters (code points, not bytes) to get. May
    ///                   reach past the end of the buffer; in that case the returned
    ///                   slice will be shorter.
    ///
    /// Returns: Characters starting at current position or an empty slice if out of bounds.
    char[] prefix(const size_t length) @safe
    {
        return slice(length);
    }

    /// Get specified number of bytes, not code points, starting at current position.
    ///
    /// Note: This gets only a "view" into the internal buffer, which will be
    ///       invalidated after other Reader calls. Use SliceBuilder to build slices
    ///       for permanent use.
    ///
    /// Params:  length = Number of bytes (not code points) to get. May NOT reach past
    ///                   the end of the buffer; should be used with peek() to avoid
    ///                   this.
    ///
    /// Returns: Bytes starting at current position.
    char[] prefixBytes(const size_t length) @safe
    {
        assert(length == 0 || bufferOffset_ + length < buffer_.length,
               "prefixBytes out of bounds");
        return buffer_[bufferOffset_ .. bufferOffset_ + length];
    }

    /// Get a slice view of the internal buffer, starting at the current position.
    ///
    /// Note: This gets only a "view" into the internal buffer,
    ///       which gets invalidated after other Reader calls.
    ///
    /// Params:  end = End of the slice relative to current position. May reach past
    ///                the end of the buffer; in that case the returned slice will
    ///                be shorter.
    ///
    /// Returns: Slice into the internal buffer or an empty slice if out of bounds.
    char[] slice(const size_t end) @safe
    {
        // Fast path in case the caller has already peek()ed all the way to end.
        if(end == lastDecodedCharOffset_)
        {
            return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
        }

        const asciiToTake = min(upcomingASCII_, end, buffer_.length);
        lastDecodedCharOffset_   = asciiToTake;
        lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;

        // 'Slow' path - decode everything up to end.
        while(lastDecodedCharOffset_ < end &&
              lastDecodedBufferOffset_ < buffer_.length)
        {
            decodeNext();
        }

        return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
    }

    /// Get the next character, moving buffer position beyond it.
    ///
    /// Returns: Next character.
    ///
    /// Throws:  ReaderException if trying to read past the end of the buffer
    ///          or if invalid data is read.
    dchar get() @safe
    {
        const result = peek();
        forward();
        return result;
    }

    /// Get specified number of characters, moving buffer position beyond them.
    ///
    /// Params:  length = Number of characters (code points, not bytes) to get.
    ///
    /// Returns: Characters starting at current position.
    char[] get(const size_t length) @safe
    {
        auto result = slice(length);
        forward(length);
        return result;
    }

    /// Move current position forward.
    ///
    /// Params:  length = Number of characters to move position forward.
    void forward(size_t length) @safe
    {
        mixin FastCharSearch!"\n\u0085\u2028\u2029"d search;

        while(length > 0)
        {
            auto asciiToTake = min(upcomingASCII_, length);
            charIndex_     += asciiToTake;
            length         -= asciiToTake;
            upcomingASCII_ -= asciiToTake;

            for(; asciiToTake > 0; --asciiToTake)
            {
                const c = buffer_[bufferOffset_++];
                // c is ASCII, so we only need to check for ASCII line breaks.
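                // A '\r' counts as a line break only when not followed by '\n'; in a
                // "\r\n" pair the line is incremented on the following '\n' iteration,
                // so CRLF is treated as a single line break.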
                if(c == '\n' || (c == '\r' && buffer_[bufferOffset_] != '\n'))
                {
                    ++line_;
                    column_ = 0;
                    continue;
                }
                ++column_;
            }

            // If we have used up all upcoming ASCII chars, the next char is
            // non-ASCII even after this returns, so upcomingASCII_ doesn't need to
            // be updated - it's zero.
            if(length == 0) { break; }

            assert(upcomingASCII_ == 0,
                   "Running unicode handling code but we haven't run out of ASCII chars");
            assert(bufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            assert(buffer_[bufferOffset_] >= 0x80,
                   "ASCII must be handled by preceding code");

            ++charIndex_;
            const c = decodeValidUTF8NoGC(buffer_, bufferOffset_);

            // New line. (can compare with '\n' without decoding since it's ASCII)
            if(search.canFind(c) || (c == '\r' && buffer_[bufferOffset_] != '\n'))
            {
                ++line_;
                column_ = 0;
            }
            else if(c != '\uFEFF') { ++column_; }
            --length;
            checkASCII();
        }

        lastDecodedBufferOffset_ = bufferOffset_;
        lastDecodedCharOffset_   = 0;
    }

    /// Move current position forward by one character.
    void forward() @trusted
    {
        ++charIndex_;
        lastDecodedBufferOffset_ = bufferOffset_;
        lastDecodedCharOffset_   = 0;

        // ASCII
        if(upcomingASCII_ > 0)
        {
            --upcomingASCII_;
            const c = buffer_[bufferOffset_++];

            if(c == '\n' || (c == '\r' && buffer_[bufferOffset_] != '\n'))
            {
                ++line_;
                column_ = 0;
                return;
            }
            ++column_;
            return;
        }

        // UTF-8
        mixin FastCharSearch!"\n\u0085\u2028\u2029"d search;
        assert(bufferOffset_ < buffer_.length,
               "Attempted to decode past the end of YAML buffer");
        assert(buffer_[bufferOffset_] >= 0x80,
               "ASCII must be handled by preceding code");

        const c = decodeValidUTF8NoGC(buffer_, bufferOffset_);

        // New line. (can compare with '\n' without decoding since it's ASCII)
        if(search.canFind(c) || (c == '\r' && buffer_[bufferOffset_] != '\n'))
        {
            ++line_;
            column_ = 0;
        }
        else if(c != '\uFEFF') { ++column_; }

        checkASCII();
    }

    /// Used to build slices of read data in Reader, to avoid allocations.
    SliceBuilder sliceBuilder;

@safe pure nothrow @nogc:
    /// Get a string describing current buffer position, used for error messages.
    Mark mark() const { return Mark(line_, column_); }

    /// Get current line number.
    uint line() const { return line_; }

    /// Get current column number.
    uint column() const { return column_; }

    /// Get index of the current character in the buffer.
    size_t charIndex() const { return charIndex_; }

    /// Get encoding of the input buffer.
    Encoding encoding() const { return encoding_; }

private:
    // Update upcomingASCII_ (should be called after forward()ing over a UTF-8 sequence).
    void checkASCII()
    {
        upcomingASCII_ = countASCII(buffer_[bufferOffset_ .. $]);
    }

    // Decode the next character relative to
    // lastDecodedCharOffset_/lastDecodedBufferOffset_ and update them.
    //
    // Does not advance the buffer position. Used in peek() and slice().
    dchar decodeNext()
    {
        assert(lastDecodedBufferOffset_ < buffer_.length,
               "Attempted to decode past the end of YAML buffer");
        const char b = buffer_[lastDecodedBufferOffset_];
        ++lastDecodedCharOffset_;
        // ASCII
        if(b < 0x80)
        {
            ++lastDecodedBufferOffset_;
            return b;
        }

        return decodeValidUTF8NoGC(buffer_, lastDecodedBufferOffset_);
    }
}

/// Used to build slices of already read data in Reader buffer, avoiding allocations.
///
/// Usually these slices point to unchanged Reader data, but sometimes the data is
/// changed due to how YAML interprets certain characters/strings.
///
/// See begin() documentation.
struct SliceBuilder
{
pure nothrow @nogc:
private:
    // No copying by the user.
    @disable this(this);
    @disable void opAssign(ref SliceBuilder);

    // Reader this builder works in.
    Reader reader_;

    // Start of the slice in reader_.buffer_ (size_t.max while no slice is being built).
    size_t start_ = size_t.max;
    // End of the slice in reader_.buffer_ (size_t.max while no slice is being built).
    size_t end_ = size_t.max;

    // Stack of slice ends to revert to (see Transaction).
    //
    // Very few levels as we don't want arbitrarily nested transactions.
    size_t[4] endStack_;
    // The number of elements currently in endStack_.
    size_t endStackUsed_ = 0;

    @safe const invariant()
    {
        if(!inProgress) { return; }
        assert(end_ <= reader_.bufferOffset_, "Slice ends after buffer position");
        assert(start_ <= end_, "Slice start after slice end");
    }

    // Is a slice currently being built?
    bool inProgress() @safe const
    {
        assert(start_ == size_t.max ? end_ == size_t.max : end_ != size_t.max,
               "start_/end_ are not consistent");
        return start_ != size_t.max;
    }

public:
    /// Begin building a slice.
    ///
    /// Only one slice can be built at any given time; before beginning a new slice,
    /// finish the previous one (if any).
    ///
    /// The slice starts at the current position in the Reader buffer. It can only be
    /// extended up to the current position in the buffer; Reader methods get() and
    /// forward() move the position. E.g. it is valid to extend a slice by write()-ing
    /// a string just returned by get() - but not one returned by prefix() unless the
    /// position has changed since the prefix() call.
    void begin() @system
    {
        assert(!inProgress, "Beginning a slice while another slice is being built");
        assert(endStackUsed_ == 0, "Slice stack not empty at slice begin");

        start_ = reader_.bufferOffset_;
        end_   = reader_.bufferOffset_;
    }

    /// Finish building a slice and return it.
    ///
    /// Any Transactions on the slice must be committed or destroyed before the slice
    /// is finished.
    ///
    /// Returns a string; once a slice is finished it is definitive that its contents
    /// will not be changed.
    char[] finish() @system
    {
        assert(inProgress, "finish called without begin");
        assert(endStackUsed_ == 0, "Finishing a slice with running transactions.");

        auto result = reader_.buffer_[start_ .. end_];
        start_ = end_ = size_t.max;
        return result;
    }

    /// Write a string to the slice being built.
    ///
    /// Data can only be written up to the current position in the Reader buffer.
    ///
    /// If str is a string returned by a Reader method, and str starts right after the
    /// end of the slice being built, the slice is extended (trivial operation).
    ///
    /// See_Also: begin
    void write(char[] str) @system
    {
        assert(inProgress, "write called without begin");
        assert(end_ <= reader_.bufferOffset_,
               "AT START: Slice ends after buffer position");

        // If str starts at the end of the slice (is a string returned by a Reader
        // method), just extend the slice to contain str.
        if(str.ptr == reader_.buffer_.ptr + end_)
        {
            end_ += str.length;
        }
        // Even if str does not start at the end of the slice, it still may be returned
        // by a Reader method and point to buffer. So we need to memmove.
        else
        {
            core.stdc.string.memmove(reader_.buffer_.ptr + end_, cast(char*)str.ptr,
                                     str.length * char.sizeof);
            end_ += str.length;
        }
    }

    /// Write a character to the slice being built.
    ///
    /// Data can only be written up to the current position in the Reader buffer.
    ///
    /// See_Also: begin
    void write(dchar c) @system
    {
        assert(inProgress, "write called without begin");
        if(c < 0x80)
        {
            reader_.buffer_[end_++] = cast(char)c;
            return;
        }

        // We need to encode a non-ASCII dchar into UTF-8.
        char[4] encodeBuf;
        const bytes = encodeValidCharNoGC(encodeBuf, c);
        reader_.buffer_[end_ .. end_ + bytes] = encodeBuf[0 .. bytes];
        end_ += bytes;
    }

    /// Insert a character to a specified position in the slice.
    ///
    /// Enlarges the slice by 1 char. Note that the slice can only extend up to the
    /// current position in the Reader buffer.
    ///
    /// Params:
    ///
    /// c        = The character to insert.
    /// position = Position to insert the character at in code units, not code points.
    ///            Must be less than slice length(); a previously returned length()
    ///            can be used.
    void insert(const dchar c, const size_t position) @system
    {
        assert(inProgress, "insert called without begin");
        assert(start_ + position <= end_, "Trying to insert after the end of the slice");

        const point       = start_ + position;
        const movedLength = end_ - point;

        // Encode c into UTF-8.
        char[4] encodeBuf;
        if(c < 0x80) { encodeBuf[0] = cast(char)c; }
        const size_t bytes = c < 0x80 ? 1 : encodeValidCharNoGC(encodeBuf, c);

        if(movedLength > 0)
        {
            core.stdc.string.memmove(reader_.buffer_.ptr + point + bytes,
                                     reader_.buffer_.ptr + point,
                                     movedLength * char.sizeof);
        }
        reader_.buffer_[point .. point + bytes] = encodeBuf[0 .. bytes];
        end_ += bytes;
    }

    /// Get the current length of the slice.
    size_t length() @safe const
    {
        return end_ - start_;
    }

    /// A slice building transaction.
    ///
    /// Can be used to save and revert back to slice state.
    struct Transaction
    {
    @system pure nothrow @nogc:
    private:
        // The slice builder affected by the transaction.
        SliceBuilder* builder_ = null;
        // Index of the return point of the transaction in SliceBuilder.endStack_.
        size_t stackLevel_;
        // True after commit() has been called.
        bool committed_;

    public:
        /// Begins a transaction on a SliceBuilder object.
        ///
        /// The transaction must end $(B after) any transactions created within the
        /// transaction but $(B before) the slice is finish()-ed. A transaction can be
        /// ended either by commit()-ing or reverting through the destructor.
        ///
        /// Saves the current state of a slice.
        this(ref SliceBuilder builder)
        {
            builder_ = &builder;
            stackLevel_ = builder_.endStackUsed_;
            builder_.push();
        }

        /// Commit changes to the slice.
        ///
        /// Ends the transaction - can only be called once, and removes the possibility
        /// to revert slice state.
        ///
        /// Does nothing for a default-initialized transaction (the transaction has not
        /// been started yet).
        void commit()
        {
            assert(!committed_, "Can't commit a transaction more than once");

            if(builder_ is null) { return; }
            assert(builder_.endStackUsed_ == stackLevel_ + 1,
                   "Parent transactions don't fully contain child transactions");
            builder_.apply();
            committed_ = true;
        }

        /// Destroy the transaction and revert it if it hasn't been committed yet.
        ///
        /// Does nothing for a default-initialized transaction.
        ~this()
        {
            if(builder_ is null || committed_) { return; }
            assert(builder_.endStackUsed_ == stackLevel_ + 1,
                   "Parent transactions don't fully contain child transactions");
            builder_.pop();
            builder_ = null;
        }
    }

private:
    // Push the current end of the slice so we can revert to it if needed.
    //
    // Used by Transaction.
    void push() @system
    {
        assert(inProgress, "push called without begin");
        assert(endStackUsed_ < endStack_.length, "Slice stack overflow");
        endStack_[endStackUsed_++] = end_;
    }

    // Pop the current end of endStack_ and set the end of the slice to the popped
    // value, reverting changes since the old end was pushed.
    //
    // Used by Transaction.
    void pop() @system
    {
        assert(inProgress, "pop called without begin");
        assert(endStackUsed_ > 0, "Trying to pop an empty slice stack");
        end_ = endStack_[--endStackUsed_];
    }

    // Pop the current end of endStack_, but keep the current end of the slice, applying
    // changes made since pushing the old end.
    //
    // Used by Transaction.
    void apply() @system
    {
        assert(inProgress, "apply called without begin");
        assert(endStackUsed_ > 0, "Trying to apply an empty slice stack");
        --endStackUsed_;
    }
}


private:

// Convert a UTF-8/16/32 buffer to UTF-8, in-place if possible.
//
// Params:
//
// input    = Buffer with UTF-8/16/32 data to decode. May be overwritten by the
//            conversion, in which case the result will be a slice of this buffer.
// encoding = Encoding of input.
//
// Returns:
//
// A struct with the following members:
//
// $(D string errorMessage)   In case of an error, the error message is stored here. If
//                            there was no error, errorMessage is null. Always check
//                            this first.
// $(D char[] utf8)           input converted to UTF-8. May be a slice of input.
// $(D size_t characterCount) Number of characters (code points) in input.
auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;
        char[] utf8;
        size_t characterCount;
    }

    Result result;

    // Encode input into UTF-8 if it's encoded as UTF-16 or UTF-32.
    //
    // Params:
    //
    // input  = The input buffer to encode.
    // result = A Result struct to put the encoded result and any error messages to.
    //
    // On error, result.errorMessage will be set.
    static void encode(C)(C[] input, ref Result result) @safe pure
    {
        // We can do UTF-32->UTF-8 in place because all UTF-8 sequences are at most
        // 4 bytes.
        static if(is(C == dchar))
        {
            char[4] encodeBuf;
            auto utf8 = cast(char[])input;
            auto length = 0;
            foreach(dchar c; input)
            {
                ++result.characterCount;
                // ASCII
                if(c < 0x80)
                {
                    utf8[length++] = cast(char)c;
                    continue;
                }

                const encodeResult = encodeCharNoGC!(No.validated)(encodeBuf, c);
                if(encodeResult.errorMessage !is null)
                {
                    result.errorMessage = encodeResult.errorMessage;
                    return;
                }
                const bytes = encodeResult.bytes;
                utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
                length += bytes;
            }
            result.utf8 = utf8[0 .. length];
        }
        // Unfortunately we can't do UTF-16 in place so we just use std.conv.to.
        else
        {
            result.characterCount = std.utf.count(input);
            result.utf8 = input.to!(char[]);
        }
    }

    try final switch(encoding)
    {
        case UTFEncoding.UTF_8:
            result.utf8 = cast(char[])input;
            const validateResult = result.utf8.validateUTF8NoGC();
            if(!validateResult.valid)
            {
                result.errorMessage = "UTF-8 validation error after character #" ~
                                      validateResult.characterCount.to!string ~ ": " ~
                                      validateResult.msg;
            }
            result.characterCount = validateResult.characterCount;
            break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            encode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0, "UTF-32 buffer size must be a multiple of 4");
            encode(cast(dchar[])input, result);
            break;
    }
    catch(ConvException e) { result.errorMessage = e.msg; }
    catch(UTFException e)  { result.errorMessage = e.msg; }
    catch(Exception e)
    {
        assert(false, "Unexpected exception in encode(): " ~ e.msg);
    }

    return result;
}

/// Determine if all characters (code points, not bytes) in a string are printable.
bool isPrintableValidUTF8(const char[] chars) @trusted pure nothrow @nogc
{
    // This is oversized (only 128 entries are necessary) simply because having 256
    // entries improves performance... for some reason (alignment?)
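    // The table marks TAB (0x09), LF (0x0A), CR (0x0D) and bytes 0x20..0x7F as
    // printable. Indices 0x80..0xFF are never looked up on the ASCII fast paths
    // below; non-ASCII input is decoded and range-checked separately.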
    bool[256] printable = [false, false, false, false, false, false, false, false,
                           false, true,  true,  false, false, true,  false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,

                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,

                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,

                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,

                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false];

    for(size_t index = 0; index < chars.length;)
    {
        // Fast path for ASCII.
        // Both this while() block and the if() block below it are optimized, unrolled
        // versions of the for() block below them; the while()/if() block could be
        // removed without affecting logic, but both help increase performance.
        size_t asciiCount = countASCII(chars[index .. $]);
        // 8 ASCII iterations unrolled, looping while there are more than 8 ASCII chars.
        while(asciiCount > 8)
        {
            const dchar b0 = chars[index];
            const dchar b1 = chars[index + 1];
            const dchar b2 = chars[index + 2];
            const dchar b3 = chars[index + 3];
            const dchar b4 = chars[index + 4];
            const dchar b5 = chars[index + 5];
            const dchar b6 = chars[index + 6];
            const dchar b7 = chars[index + 7];

            index += 8;
            asciiCount -= 8;

            const all = printable[b0] & printable[b1] & printable[b2] & printable[b3] &
                        printable[b4] & printable[b5] & printable[b6] & printable[b7];
            if(!all)
            {
                return false;
            }
        }
        // 4 ASCII iterations unrolled.
        if(asciiCount > 4)
        {
            const char b0 = chars[index];
            const char b1 = chars[index + 1];
            const char b2 = chars[index + 2];
            const char b3 = chars[index + 3];

            index += 4;
            asciiCount -= 4;

            if(!printable[b0]) { return false; }
            if(!printable[b1]) { return false; }
            if(!printable[b2]) { return false; }
            if(!printable[b3]) { return false; }
        }
        // Any remaining ASCII chars. This is really the only code needed to handle
        // ASCII, the above if() and while() blocks are just an optimization.
        for(; asciiCount > 0; --asciiCount)
        {
            const char b = chars[index];
            ++index;
            if(b >= 0x20)    { continue; }
            if(printable[b]) { continue; }
            return false;
        }

        if(index == chars.length) { break; }

        // Not ASCII, need to decode.
        const dchar c = decodeValidUTF8NoGC(chars, index);
        // We know c is not ASCII, so only check for printable non-ASCII chars.
        if(!(c == 0x85 || (c >= 0xA0 && c <= '\uD7FF') ||
             (c >= '\uE000' && c <= '\uFFFD')))
        {
            return false;
        }
    }
    return true;
}

/// Counts the number of ASCII characters in buffer until the first non-ASCII
/// (multibyte UTF-8) sequence.
///
/// Used to determine how many characters we can process without decoding.
size_t countASCII(const(char)[] buffer) @trusted pure nothrow @nogc
{
    size_t count = 0;

    // The topmost bit in ASCII characters is always 0.
    enum ulong  Mask8 = 0x7f7f7f7f7f7f7f7f;
    enum uint   Mask4 = 0x7f7f7f7f;
    enum ushort Mask2 = 0x7f7f;

    // Start by checking in 8-byte chunks.
    while(buffer.length >= Mask8.sizeof)
    {
        const block  = *cast(typeof(Mask8)*)buffer.ptr;
        const masked = Mask8 & block;
        if(masked != block) { break; }
        count += Mask8.sizeof;
        buffer = buffer[Mask8.sizeof .. $];
    }

    // If 8 bytes didn't match, try 4, 2 bytes.
    import std.typetuple;
    foreach(Mask; TypeTuple!(Mask4, Mask2))
    {
        if(buffer.length < Mask.sizeof) { continue; }
        const block  = *cast(typeof(Mask)*)buffer.ptr;
        const masked = Mask & block;
        if(masked != block) { continue; }
        count += Mask.sizeof;
        buffer = buffer[Mask.sizeof .. $];
    }

    // If even a 2-byte chunk didn't match, test just one byte.
    if(buffer.empty || buffer[0] >= 0x80) { return count; }
    ++count;

    return count;
}

// Unittests.

void testEndian(R)()
{
    writeln(typeid(R).toString() ~ ": endian unittest");
    void endian_test(ubyte[] data, Encoding encoding_expected, Endian endian_expected)
    {
        auto reader = new R(data);
        assert(reader.encoding == encoding_expected);
        assert(reader.endian_ == endian_expected);
    }
    ubyte[] little_endian_utf_16 = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] big_endian_utf_16    = [0xFE, 0xFF, 0x00, 0x7A];
    endian_test(little_endian_utf_16, Encoding.UTF_16, Endian.littleEndian);
    endian_test(big_endian_utf_16, Encoding.UTF_16, Endian.bigEndian);
}

void testPeekPrefixForward(R)()
{
    import dyaml.stream;
    writeln(typeid(R).toString() ~ ": peek/prefix/forward unittest");
    ubyte[] data = ByteOrderMarks[BOM.UTF8] ~ cast(ubyte[])"data";
    auto reader = new R(data);
    assert(reader.peek() == 'd');
    assert(reader.peek(1) == 'a');
    assert(reader.peek(2) == 't');
    assert(reader.peek(3) == 'a');
    assert(reader.peek(4) == '\0');
    assert(reader.prefix(4) == "data");
    // assert(reader.prefix(6) == "data\0");
    reader.forward(2);
    assert(reader.peek(1) == 'a');
    // assert(collectException(reader.peek(3)));
}

void testUTF(R)()
{
    import dyaml.stream;
    writeln(typeid(R).toString() ~ ": UTF formats unittest");
    dchar[] data = cast(dchar[])"data";
    void utf_test(T)(T[] data, BOM bom)
    {
        ubyte[] bytes = ByteOrderMarks[bom] ~
                        (cast(ubyte[])data)[0 .. data.length * T.sizeof];
        auto reader = new R(bytes);
        assert(reader.peek() == 'd');
        assert(reader.peek(1) == 'a');
        assert(reader.peek(2) == 't');
        assert(reader.peek(3) == 'a');
    }
    utf_test!char(to!(char[])(data), BOM.UTF8);
    utf_test!wchar(to!(wchar[])(data), endian == Endian.bigEndian ? BOM.UTF16BE : BOM.UTF16LE);
    utf_test(data, endian == Endian.bigEndian ? BOM.UTF32BE : BOM.UTF32LE);
}

void test1Byte(R)()
{
    writeln(typeid(R).toString() ~ ": 1 byte file unittest");
    ubyte[] data = [97];

    auto reader = new R(data);
    assert(reader.peek() == 'a');
    assert(reader.peek(1) == '\0');
    // assert(collectException(reader.peek(2)));
}

unittest
{
    testEndian!Reader();
    testPeekPrefixForward!Reader();
    testUTF!Reader();
    test1Byte!Reader();
}
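// Illustrative sketch (not part of the original test suite): shows the intended
// SliceBuilder workflow - begin(), write()-ing data just returned by get(),
// committing or reverting a Transaction, and finish()-ing the slice. The input
// string used here is arbitrary.
unittest
{
    writeln("D:YAML Reader SliceBuilder usage sketch");
    auto reader = new Reader(cast(ubyte[])"abcdefgh".dup);

    // Build a slice from the first 4 characters; the Transaction is committed,
    // so the characters written while it was active are kept.
    reader.sliceBuilder.begin();
    {
        auto transaction = SliceBuilder.Transaction(reader.sliceBuilder);
        // get() moves the buffer position, so its result may be written to the slice.
        reader.sliceBuilder.write(reader.get(2));
        transaction.commit();
    }
    reader.sliceBuilder.write(reader.get(2));
    assert(reader.sliceBuilder.finish() == "abcd");

    // A Transaction destroyed without commit() reverts the slice end, discarding
    // the characters written while it was active.
    reader.sliceBuilder.begin();
    {
        auto transaction = SliceBuilder.Transaction(reader.sliceBuilder);
        reader.sliceBuilder.write(reader.get(2));
    }
    assert(reader.sliceBuilder.length == 0);
    reader.sliceBuilder.finish();
}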