// Copyright Ferdinand Majerech 2011-2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

module dyaml.reader;


import core.stdc.stdlib;
import core.stdc.string;
import core.thread;

import std.algorithm;
import std.array;
import std.conv;
import std.exception;
import std.range;
import std.string;
import std.system;
import std.typecons;
import std.utf;

import tinyendian;

import dyaml.encoding;
import dyaml.exception;

alias isBreak = among!('\n', '\u0085', '\u2028', '\u2029');

package:


/// Exception thrown at Reader errors.
class ReaderException : YAMLException
{
    this(string msg, string file = __FILE__, size_t line = __LINE__)
        @safe pure nothrow
    {
        super("Reader error: " ~ msg, file, line);
    }
}

/// Provides an API to read characters from a UTF-8 buffer and build slices into that
/// buffer to avoid allocations (see SliceBuilder).
final class Reader
{
    private:
        // Buffer of currently loaded characters.
        char[] buffer_;

        // Current position within buffer. Only data after this position can be read.
        size_t bufferOffset_;

        // Index of the current character in the buffer.
        size_t charIndex_;
        // Number of characters (code points) in buffer_.
        size_t characterCount_;

        // Current line in file.
        uint line_;
        // Current column in file.
        uint column_;

        // Original Unicode encoding of the data.
        Encoding encoding_;

        version(unittest)
        {
            // Endianness of the input before it was converted (for testing)
            Endian endian_;
        }

        // The number of consecutive ASCII characters starting at bufferOffset_.
        //
        // Used to minimize UTF-8 decoding.
        size_t upcomingASCII_;

        // Index to buffer_ where the last decoded character starts.
        size_t lastDecodedBufferOffset_;
        // Offset, relative to charIndex_, of the last decoded character,
        // in code points, not chars.
        size_t lastDecodedCharOffset_;

    public:
        /// Construct a Reader.
        ///
        /// Params:  buffer = Buffer with YAML data. This may be e.g. the entire
        ///                   contents of a file or a string. It $(B will) be modified by
        ///                   the Reader and other parts of D:YAML (D:YAML tries to
        ///                   reuse the buffer to minimize memory allocations).
        ///
        /// Throws:  ReaderException on a UTF decoding error or if there are
        ///          nonprintable Unicode characters illegal in YAML.
        this(ubyte[] buffer) @safe pure
        {
            auto endianResult = fixUTFByteOrder(buffer);
            if(endianResult.bytesStripped > 0)
            {
                throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned " ~
                                          "to 2 or 4 bytes, respectively");
            }

            version(unittest) { endian_ = endianResult.endian; }
            encoding_ = endianResult.encoding;

            auto utf8Result = toUTF8(endianResult.array, endianResult.encoding);
            const msg = utf8Result.errorMessage;
            if(msg !is null)
            {
                throw new ReaderException("Error when converting to UTF-8: " ~ msg);
            }

            buffer_ = utf8Result.utf8;

            characterCount_ = utf8Result.characterCount;
            // Check that all characters in the buffer are printable.
            enforce(isPrintableValidUTF8(buffer_),
                    new ReaderException("Special unicode characters are not allowed"));

            this.sliceBuilder = SliceBuilder(this);
            checkASCII();
        }

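        // A minimal usage sketch of the constructor; the inputs ("foo: bar" and the
        // 0x01 byte) are arbitrary examples of what is accepted and rejected.
        @system unittest
        {
            // Valid UTF-8 without a BOM is used as-is. The buffer is dup-ed because
            // the Reader may modify it.
            auto reader = new Reader(cast(ubyte[])"foo: bar".dup);
            assert(reader.encoding == Encoding.UTF_8);
            assert(reader.peek() == 'f');

            // Nonprintable characters (here 0x01) are rejected.
            assertThrown!ReaderException(new Reader(cast(ubyte[])"foo\x01bar".dup));
        }
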
        /// Get character at specified index relative to current position.
        ///
        /// Params:  index = Index of the character to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; in that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Character at specified position or '\0' if outside of the buffer.
        ///
        // XXX removed; search for 'risky' to find why.
        // Throws:  ReaderException if trying to read past the end of the buffer.
        dchar peek(const size_t index) @safe pure
        {
            if(index < upcomingASCII_) { return buffer_[bufferOffset_ + index]; }
            if(characterCount_ <= charIndex_ + index)
            {
                // XXX This is risky; revert this if bugs are introduced. We rely on
                // the assumption that Reader only uses peek() to detect end of buffer.
                // The test suite passes.
                // Revert this case here and in other peek() versions if this causes
                // errors.
                // throw new ReaderException("Trying to read past the end of the buffer");
                return '\0';
            }

            // Optimized path for Scanner code that peeks chars in linear order to
            // determine the length of some sequence.
            if(index == lastDecodedCharOffset_)
            {
                ++lastDecodedCharOffset_;
                const char b = buffer_[lastDecodedBufferOffset_];
                // ASCII
                if(b < 0x80)
                {
                    ++lastDecodedBufferOffset_;
                    return b;
                }
                return decode(buffer_, lastDecodedBufferOffset_);
            }

            // 'Slow' path where we decode everything up to the requested character.
            const asciiToTake = min(upcomingASCII_, index);
            lastDecodedCharOffset_   = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;
            dchar d;
            while(lastDecodedCharOffset_ <= index)
            {
                d = decodeNext();
            }

            return d;
        }

        /// Optimized version of peek() for the case where peek index is 0.
        dchar peek() @safe pure
        {
            if(upcomingASCII_ > 0)            { return buffer_[bufferOffset_]; }
            if(characterCount_ <= charIndex_) { return '\0'; }

            lastDecodedCharOffset_   = 0;
            lastDecodedBufferOffset_ = bufferOffset_;
            return decodeNext();
        }

        /// Get byte at specified index relative to current position.
        ///
        /// Params:  index = Index of the byte to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; in that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Byte at specified position or '\0' if outside of the buffer.
        char peekByte(const size_t index) @safe pure nothrow @nogc
        {
            return characterCount_ > (charIndex_ + index) ? buffer_[bufferOffset_ + index] : '\0';
        }

        /// Optimized version of peekByte() for the case where peek byte index is 0.
        char peekByte() @safe pure nothrow @nogc
        {
            return characterCount_ > charIndex_ ? buffer_[bufferOffset_] : '\0';
        }


        /// Get specified number of characters starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls. Use SliceBuilder to build slices
        ///       for permanent use.
        ///
        /// Params:  length = Number of characters (code points, not bytes) to get. May
        ///                   reach past the end of the buffer; in that case the returned
        ///                   slice will be shorter.
        ///
        /// Returns: Characters starting at current position or an empty slice if out of bounds.
        char[] prefix(const size_t length) @safe pure
        {
            return slice(length);
        }

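        // A sketch of the code point vs. byte distinction between peek()/prefix() and
        // peekByte(): the input "\u00E9x" (an arbitrary example) is three bytes
        // (0xC3 0xA9 0x78) but only two code points.
        @system unittest
        {
            auto reader = new Reader(cast(ubyte[])"\u00E9x".dup);
            // peek() and prefix() index code points...
            assert(reader.peek() == '\u00E9');
            assert(reader.peek(1) == 'x');
            assert(reader.prefix(2) == "\u00E9x");
            // ...while peekByte() indexes individual UTF-8 code units.
            assert(reader.peekByte(0) == 0xC3);
            assert(reader.peekByte(1) == 0xA9);
        }
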
        /// Get specified number of bytes, not code points, starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls. Use SliceBuilder to build slices
        ///       for permanent use.
        ///
        /// Params:  length = Number of bytes (not code points) to get. May NOT reach past
        ///                   the end of the buffer; should be used with peek() to avoid
        ///                   this.
        ///
        /// Returns: Bytes starting at current position.
        char[] prefixBytes(const size_t length) @safe pure nothrow @nogc
        in(length == 0 || bufferOffset_ + length < buffer_.length, "prefixBytes out of bounds")
        {
            return buffer_[bufferOffset_ .. bufferOffset_ + length];
        }

        /// Get a slice view of the internal buffer, starting at the current position.
        ///
        /// Note: This gets only a "view" into the internal buffer,
        ///       which will be invalidated after other Reader calls.
        ///
        /// Params:  end = End of the slice relative to current position. May reach past
        ///                the end of the buffer; in that case the returned slice will
        ///                be shorter.
        ///
        /// Returns: Slice into the internal buffer or an empty slice if out of bounds.
        char[] slice(const size_t end) @safe pure
        {
            // Fast path in case the caller has already peek()ed all the way to end.
            if(end == lastDecodedCharOffset_)
            {
                return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
            }

            const asciiToTake = min(upcomingASCII_, end, buffer_.length);
            lastDecodedCharOffset_   = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;

            // 'Slow' path - decode everything up to end.
            while(lastDecodedCharOffset_ < end &&
                  lastDecodedBufferOffset_ < buffer_.length)
            {
                decodeNext();
            }

            return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
        }

        /// Get the next character, moving buffer position beyond it.
        ///
        /// Returns: Next character.
        ///
        /// Throws:  ReaderException if trying to read past the end of the buffer
        ///          or if invalid data is read.
        dchar get() @safe pure
        {
            const result = peek();
            forward();
            return result;
        }

        /// Get specified number of characters, moving buffer position beyond them.
        ///
        /// Params:  length = Number of characters (code points, not bytes) to get.
        ///
        /// Returns: Characters starting at current position.
        char[] get(const size_t length) @safe pure
        {
            auto result = slice(length);
            forward(length);
            return result;
        }

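        // A sketch contrasting get() with peek(): get() returns characters and moves
        // the position past them, peek() does not. The input "scalar" is arbitrary.
        @system unittest
        {
            auto reader = new Reader(cast(ubyte[])"scalar".dup);
            assert(reader.get(3) == "sca");
            assert(reader.peek() == 'l');   // peek() does not move the position
            assert(reader.get() == 'l');    // get() moves past the character
            assert(reader.charIndex == 4);
        }
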
        /// Move current position forward.
        ///
        /// Params:  length = Number of characters to move position forward.
        void forward(size_t length) @safe pure
        {
            while(length > 0)
            {
                auto asciiToTake = min(upcomingASCII_, length);
                charIndex_ += asciiToTake;
                length -= asciiToTake;
                upcomingASCII_ -= asciiToTake;

                for(; asciiToTake > 0; --asciiToTake)
                {
                    const c = buffer_[bufferOffset_++];
                    // c is ASCII, so we only need to check for ASCII line breaks.
                    if(c == '\n' || (c == '\r' && buffer_[bufferOffset_] != '\n'))
                    {
                        ++line_;
                        column_ = 0;
                        continue;
                    }
                    ++column_;
                }

                // If we have used up all upcoming ASCII chars, the next char is
                // non-ASCII even after this returns, so upcomingASCII_ doesn't need to
                // be updated - it's zero.
                if(length == 0) { break; }

                assert(upcomingASCII_ == 0,
                       "Running unicode handling code but we haven't run out of ASCII chars");
                assert(bufferOffset_ < buffer_.length,
                       "Attempted to decode past the end of YAML buffer");
                assert(buffer_[bufferOffset_] >= 0x80,
                       "ASCII must be handled by preceding code");

                ++charIndex_;
                const c = decode(buffer_, bufferOffset_);

                // New line. (can compare with '\n' without decoding since it's ASCII)
                if(c.isBreak || (c == '\r' && buffer_[bufferOffset_] != '\n'))
                {
                    ++line_;
                    column_ = 0;
                }
                else if(c != '\uFEFF') { ++column_; }
                --length;
                checkASCII();
            }

            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;
        }

        /// Move current position forward by one character.
        void forward() @safe pure
        {
            ++charIndex_;
            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;

            // ASCII
            if(upcomingASCII_ > 0)
            {
                --upcomingASCII_;
                const c = buffer_[bufferOffset_++];

                if(c == '\n' || (c == '\r' && buffer_[bufferOffset_] != '\n'))
                {
                    ++line_;
                    column_ = 0;
                    return;
                }
                ++column_;
                return;
            }

            // UTF-8
            assert(bufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            assert(buffer_[bufferOffset_] >= 0x80,
                   "ASCII must be handled by preceding code");

            const c = decode(buffer_, bufferOffset_);

            // New line. (can compare with '\n' without decoding since it's ASCII)
            if(c.isBreak || (c == '\r' && buffer_[bufferOffset_] != '\n'))
            {
                ++line_;
                column_ = 0;
            }
            else if(c != '\uFEFF') { ++column_; }

            checkASCII();
        }

        /// Used to build slices of read data in the Reader, to avoid allocations.
        SliceBuilder sliceBuilder;

        /// Get a Mark describing the current buffer position, used for error messages.
        Mark mark() const pure nothrow @nogc @safe { return Mark(line_, column_); }

        /// Get current line number.
        uint line() const @safe pure nothrow @nogc { return line_; }

        /// Get current column number.
        uint column() const @safe pure nothrow @nogc { return column_; }

        /// Get index of the current character in the buffer.
        size_t charIndex() const @safe pure nothrow @nogc { return charIndex_; }

        /// Get encoding of the input buffer.
        Encoding encoding() const @safe pure nothrow @nogc { return encoding_; }

    private:
        // Update upcomingASCII_ (should be called after forward()ing over a UTF-8
        // sequence).
        void checkASCII() @safe pure nothrow @nogc
        {
            upcomingASCII_ = countASCII(buffer_[bufferOffset_ .. $]);
        }

        // Decode the next character relative to
        // lastDecodedCharOffset_/lastDecodedBufferOffset_ and update them.
        //
        // Does not advance the buffer position. Used in peek() and slice().
        dchar decodeNext() @safe pure
        {
            assert(lastDecodedBufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            const char b = buffer_[lastDecodedBufferOffset_];
            ++lastDecodedCharOffset_;
            // ASCII
            if(b < 0x80)
            {
                ++lastDecodedBufferOffset_;
                return b;
            }

            return decode(buffer_, lastDecodedBufferOffset_);
        }
}

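// A sketch of how forward() maintains the (zero-based) line and column counters used
// for error reporting; the two-line input is an arbitrary example.
@system unittest
{
    auto reader = new Reader(cast(ubyte[])"ab\ncd".dup);
    assert(reader.line == 0 && reader.column == 0);
    reader.forward(3);   // move past 'a', 'b' and the line break
    assert(reader.line == 1 && reader.column == 0);
    assert(reader.peek() == 'c');
}
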
/// Used to build slices of already read data in Reader buffer, avoiding allocations.
///
/// Usually these slices point to unchanged Reader data, but sometimes the data is
/// changed due to how YAML interprets certain characters/strings.
///
/// See begin() documentation.
struct SliceBuilder
{
    private:
        // No copying by the user.
        @disable this(this);
        @disable void opAssign(ref SliceBuilder);

        // Reader this builder works in.
        Reader reader_;

        // Start of the slice in reader_.buffer_ (size_t.max while no slice is being built).
        size_t start_ = size_t.max;
        // End of the slice in reader_.buffer_ (size_t.max while no slice is being built).
        size_t end_ = size_t.max;

        // Stack of slice ends to revert to (see Transaction).
        //
        // Very few levels as we don't want arbitrarily nested transactions.
        size_t[4] endStack_;
        // The number of elements currently in endStack_.
        size_t endStackUsed_;

        @safe const pure nothrow @nogc invariant()
        {
            if(!inProgress) { return; }
            assert(end_ <= reader_.bufferOffset_, "Slice ends after buffer position");
            assert(start_ <= end_, "Slice start after slice end");
        }

        // Is a slice currently being built?
        bool inProgress() @safe const pure nothrow @nogc
        in(start_ == size_t.max ? end_ == size_t.max : end_ != size_t.max, "start_/end_ are not consistent")
        {
            return start_ != size_t.max;
        }

    public:
        /// Begin building a slice.
        ///
        /// Only one slice can be built at any given time; before beginning a new slice,
        /// finish the previous one (if any).
        ///
        /// The slice starts at the current position in the Reader buffer. It can only be
        /// extended up to the current position in the buffer; Reader methods get() and
        /// forward() move the position. E.g. it is valid to extend a slice by write()-ing
        /// a string just returned by get() - but not one returned by prefix() unless the
        /// position has changed since the prefix() call.
        void begin() @safe pure nothrow @nogc
        in(!inProgress, "Beginning a slice while another slice is being built")
        in(endStackUsed_ == 0, "Slice stack not empty at slice begin")
        {
            start_ = reader_.bufferOffset_;
            end_   = reader_.bufferOffset_;
        }

        /// Finish building a slice and return it.
        ///
        /// Any Transactions on the slice must be committed or destroyed before the slice
        /// is finished.
        ///
        /// Returns a string; once a slice is finished it is definitive that its contents
        /// will not be changed.
        char[] finish() @safe pure nothrow @nogc
        in(inProgress, "finish called without begin")
        in(endStackUsed_ == 0, "Finishing a slice with running transactions.")
        {
            auto result = reader_.buffer_[start_ .. end_];
            start_ = end_ = size_t.max;
            return result;
        }

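        // A sketch of the intended begin()/write()/finish() workflow, driven through a
        // Reader (the input "key: value" is an arbitrary example): characters returned
        // by get() start right at the end of the slice, so write() merely extends it.
        @system unittest
        {
            auto reader = new Reader(cast(ubyte[])"key: value".dup);
            reader.sliceBuilder.begin();
            char[] chunk = reader.get(3);       // "key"; also moves the buffer position
            reader.sliceBuilder.write(chunk);
            assert(reader.sliceBuilder.finish() == "key");
        }
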
        /// Write a string to the slice being built.
        ///
        /// Data can only be written up to the current position in the Reader buffer.
        ///
        /// If str is a string returned by a Reader method, and str starts right after the
        /// end of the slice being built, the slice is extended (trivial operation).
        ///
        /// See_Also: begin
        void write(scope char[] str) @safe pure nothrow @nogc
        {
            assert(inProgress, "write called without begin");
            assert(end_ <= reader_.bufferOffset_,
                   "AT START: Slice ends after buffer position");

            // Nothing? Already done.
            if (str.length == 0) { return; }
            // If str starts at the end of the slice (is a string returned by a Reader
            // method), just extend the slice to contain str.
            if(&str[0] == &reader_.buffer_[end_])
            {
                end_ += str.length;
            }
            // Even if str does not start at the end of the slice, it still may be returned
            // by a Reader method and point to buffer. So we need to memmove.
            else
            {
                copy(str, reader_.buffer_[end_ .. end_ + str.length * char.sizeof]);
                end_ += str.length;
            }
        }

        /// Write a character to the slice being built.
        ///
        /// Data can only be written up to the current position in the Reader buffer.
        ///
        /// See_Also: begin
        void write(dchar c) @safe pure
        in(inProgress, "write called without begin")
        {
            if(c < 0x80)
            {
                reader_.buffer_[end_++] = cast(char)c;
                return;
            }

            // We need to encode a non-ASCII dchar into UTF-8
            char[4] encodeBuf;
            const bytes = encode(encodeBuf, c);
            reader_.buffer_[end_ .. end_ + bytes] = encodeBuf[0 .. bytes];
            end_ += bytes;
        }

        /// Insert a character to a specified position in the slice.
        ///
        /// Enlarges the slice by 1 char. Note that the slice can only extend up to the
        /// current position in the Reader buffer.
        ///
        /// Params:
        ///
        /// c        = The character to insert.
        /// position = Position to insert the character at in code units, not code points.
        ///            Must be less than slice length(); a previously returned length()
        ///            can be used.
        void insert(const dchar c, const size_t position) @safe pure
        in(inProgress, "insert called without begin")
        in(start_ + position <= end_, "Trying to insert after the end of the slice")
        {
            const point       = start_ + position;
            const movedLength = end_ - point;

            // Encode c into UTF-8
            char[4] encodeBuf;
            if(c < 0x80) { encodeBuf[0] = cast(char)c; }
            const size_t bytes = c < 0x80 ? 1 : encode(encodeBuf, c);

            if(movedLength > 0)
            {
                copy(reader_.buffer_[point .. point + movedLength * char.sizeof],
                     reader_.buffer_[point + bytes .. point + bytes + movedLength * char.sizeof]);
            }
            reader_.buffer_[point .. point + bytes] = encodeBuf[0 .. bytes];
            end_ += bytes;
        }

        /// Get the current length of the slice.
        size_t length() @safe const pure nothrow @nogc
        {
            return end_ - start_;
        }

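        // A sketch of insert(): it writes a character at a position inside the slice
        // and shifts the rest forward. The forward() call is needed because the slice
        // may only grow up to the current Reader position. The input "abcd" is an
        // arbitrary example.
        @system unittest
        {
            auto reader = new Reader(cast(ubyte[])"abcd".dup);
            reader.sliceBuilder.begin();
            reader.sliceBuilder.write(reader.get(2));   // slice is now "ab"
            reader.forward();                           // make room in the buffer
            reader.sliceBuilder.insert('X', 1);         // "ab" -> "aXb"
            assert(reader.sliceBuilder.finish() == "aXb");
        }
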
        /// A slice building transaction.
        ///
        /// Can be used to save and revert back to slice state.
        struct Transaction
        {
            private:
                // The slice builder affected by the transaction.
                SliceBuilder* builder_;
                // Index of the return point of the transaction in SliceBuilder.endStack_.
                size_t stackLevel_;
                // True after commit() has been called.
                bool committed_;

            public:
                /// Begins a transaction on a SliceBuilder object.
                ///
                /// The transaction must end $(B after) any transactions created within the
                /// transaction but $(B before) the slice is finish()-ed. A transaction can be
                /// ended either by commit()-ing or reverting through the destructor.
                ///
                /// Saves the current state of a slice.
                this(SliceBuilder* builder) @safe pure nothrow @nogc
                {
                    builder_ = builder;
                    stackLevel_ = builder_.endStackUsed_;
                    builder_.push();
                }

                /// Commit changes to the slice.
                ///
                /// Ends the transaction - can only be called once, and removes the possibility
                /// to revert slice state.
                ///
                /// Does nothing for a default-initialized transaction (the transaction has not
                /// been started yet).
                void commit() @safe pure nothrow @nogc
                in(!committed_, "Can't commit a transaction more than once")
                {
                    if(builder_ is null) { return; }
                    assert(builder_.endStackUsed_ == stackLevel_ + 1,
                           "Parent transactions don't fully contain child transactions");
                    builder_.apply();
                    committed_ = true;
                }

                /// End the transaction and revert it if it hasn't been committed yet.
                void end() @safe pure nothrow @nogc
                in(builder_ && builder_.endStackUsed_ == stackLevel_ + 1,
                   "Parent transactions don't fully contain child transactions")
                {
                    builder_.pop();
                    builder_ = null;
                }

        }

    private:
        // Push the current end of the slice so we can revert to it if needed.
        //
        // Used by Transaction.
        void push() @safe pure nothrow @nogc
        in(inProgress, "push called without begin")
        in(endStackUsed_ < endStack_.length, "Slice stack overflow")
        {
            endStack_[endStackUsed_++] = end_;
        }

        // Pop the current end of endStack_ and set the end of the slice to the popped
        // value, reverting changes since the old end was pushed.
        //
        // Used by Transaction.
        void pop() @safe pure nothrow @nogc
        in(inProgress, "pop called without begin")
        in(endStackUsed_ > 0, "Trying to pop an empty slice stack")
        {
            end_ = endStack_[--endStackUsed_];
        }

        // Pop the current end of endStack_, but keep the current end of the slice, applying
        // changes made since pushing the old end.
        //
        // Used by Transaction.
        void apply() @safe pure nothrow @nogc
        in(inProgress, "apply called without begin")
        in(endStackUsed_ > 0, "Trying to apply an empty slice stack")
        {
            --endStackUsed_;
        }
}

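// A sketch of SliceBuilder.Transaction: changes written to the slice after the
// transaction starts are reverted by end() unless commit() was called first. The
// input "abcdef" is an arbitrary example.
@system unittest
{
    auto reader = new Reader(cast(ubyte[])"abcdef".dup);
    reader.sliceBuilder.begin();
    reader.sliceBuilder.write(reader.get(2));       // slice is "ab"

    auto transaction = SliceBuilder.Transaction(&reader.sliceBuilder);
    reader.sliceBuilder.write(reader.get(2));       // slice is "abcd"
    transaction.end();                              // not committed - revert to "ab"

    assert(reader.sliceBuilder.finish() == "ab");
}
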

private:

// Convert a UTF-8/16/32 buffer to UTF-8, in-place if possible.
//
// Params:
//
// input    = Buffer with UTF-8/16/32 data to decode. May be overwritten by the
//            conversion, in which case the result will be a slice of this buffer.
// encoding = Encoding of input.
//
// Returns:
//
// A struct with the following members:
//
// $(D string errorMessage)   In case of an error, the error message is stored here. If
//                            there was no error, errorMessage is null. Always check
//                            this first.
// $(D char[] utf8)           input converted to UTF-8. May be a slice of input.
// $(D size_t characterCount) Number of characters (code points) in input.
auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;
        char[] utf8;
        size_t characterCount;
    }

    Result result;

    // Encode input into UTF-8 if it's encoded as UTF-16 or UTF-32.
    //
    // Params:
    //
    // input  = The input buffer to encode.
    // result = A Result struct to put the encoded result and any error messages to.
    //
    // On error, result.errorMessage will be set.
    static void encode(C)(C[] input, ref Result result) @safe pure
    {
        // We can do UTF-32->UTF-8 in place because all UTF-8 sequences are 4 or
        // less bytes.
        static if(is(C == dchar))
        {
            char[4] encodeBuf;
            auto utf8 = cast(char[])input;
            auto length = 0;
            foreach(dchar c; input)
            {
                ++result.characterCount;
                // ASCII
                if(c < 0x80)
                {
                    utf8[length++] = cast(char)c;
                    continue;
                }

                std.utf.encode(encodeBuf, c);
                const bytes = codeLength!char(c);
                utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
                length += bytes;
            }
            result.utf8 = utf8[0 .. length];
        }
        // Unfortunately we can't do UTF-16 in place so we just use std.conv.to.
        else
        {
            result.characterCount = std.utf.count(input);
            result.utf8 = input.to!(char[]);
        }
    }

    try final switch(encoding)
    {
        case UTFEncoding.UTF_8:
            result.utf8 = cast(char[])input;
            result.utf8.validate();
            result.characterCount = std.utf.count(result.utf8);
            break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            encode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0, "UTF-32 buffer size must be a multiple of 4");
            encode(cast(dchar[])input, result);
            break;
    }
    catch(ConvException e) { result.errorMessage = e.msg; }
    catch(UTFException e)  { result.errorMessage = e.msg; }
    catch(Exception e)
    {
        assert(false, "Unexpected exception in encode(): " ~ e.msg);
    }

    return result;
}

/// Determine if all characters (code points, not bytes) in a string are printable.
bool isPrintableValidUTF8(const char[] chars) @safe pure
{
    import std.uni : isControl, isWhite;
    foreach (dchar chr; chars)
    {
        if (!chr.isValidDchar || (chr.isControl && !chr.isWhite))
        {
            return false;
        }
    }
    return true;
}

/// Counts the number of ASCII characters in buffer until the first UTF-8 sequence.
///
/// Used to determine how many characters we can process without decoding.
size_t countASCII(const(char)[] buffer) @safe pure nothrow @nogc
{
    return buffer.byCodeUnit.until!(x => x > 0x7F).walkLength;
}

// Unittests.

void testEndian(R)()
{
    void endian_test(ubyte[] data, Encoding encoding_expected, Endian endian_expected)
    {
        auto reader = new R(data);
        assert(reader.encoding == encoding_expected);
        assert(reader.endian_ == endian_expected);
    }
    ubyte[] little_endian_utf_16 = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] big_endian_utf_16 = [0xFE, 0xFF, 0x00, 0x7A];
    endian_test(little_endian_utf_16, Encoding.UTF_16, Endian.littleEndian);
    endian_test(big_endian_utf_16, Encoding.UTF_16, Endian.bigEndian);
}

void testPeekPrefixForward(R)()
{
    import std.encoding;
    ubyte[] data = bomTable[BOM.utf8].sequence ~ cast(ubyte[])"data";
    auto reader = new R(data);
    assert(reader.peek() == 'd');
    assert(reader.peek(1) == 'a');
    assert(reader.peek(2) == 't');
    assert(reader.peek(3) == 'a');
    assert(reader.peek(4) == '\0');
    assert(reader.prefix(4) == "data");
    // assert(reader.prefix(6) == "data\0");
    reader.forward(2);
    assert(reader.peek(1) == 'a');
    // assert(collectException(reader.peek(3)));
}

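// A sketch of isPrintableValidUTF8() and countASCII() on arbitrary inputs:
// countASCII() stops at the first non-ASCII byte, and isPrintableValidUTF8()
// rejects control characters other than whitespace.
@safe unittest
{
    assert(countASCII("abc\u00E9def") == 3);
    assert(isPrintableValidUTF8("plain text\n"));
    assert(!isPrintableValidUTF8("bell\x07"));
}
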
void testUTF(R)()
{
    import std.encoding;
    dchar[] data = cast(dchar[])"data";
    void utf_test(T)(T[] data, BOM bom)
    {
        ubyte[] bytes = bomTable[bom].sequence ~
                        (cast(ubyte[])data)[0 .. data.length * T.sizeof];
        auto reader = new R(bytes);
        assert(reader.peek() == 'd');
        assert(reader.peek(1) == 'a');
        assert(reader.peek(2) == 't');
        assert(reader.peek(3) == 'a');
    }
    utf_test!char(to!(char[])(data), BOM.utf8);
    utf_test!wchar(to!(wchar[])(data), endian == Endian.bigEndian ? BOM.utf16be : BOM.utf16le);
    utf_test(data, endian == Endian.bigEndian ? BOM.utf32be : BOM.utf32le);
}

void test1Byte(R)()
{
    ubyte[] data = [97];

    auto reader = new R(data);
    assert(reader.peek() == 'a');
    assert(reader.peek(1) == '\0');
    // assert(collectException(reader.peek(2)));
}

@system unittest
{
    testEndian!Reader();
    testPeekPrefixForward!Reader();
    testUTF!Reader();
    test1Byte!Reader();
}
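
// A sketch of toUTF8() with a native-endian UTF-16 input (as it would look after
// fixUTFByteOrder); the "hi" payload is an arbitrary example.
@system unittest
{
    wchar[] utf16 = ['h', 'i'];
    auto result = toUTF8(cast(ubyte[])utf16, UTFEncoding.UTF_16);
    assert(result.errorMessage is null);
    assert(result.utf8 == "hi");
    assert(result.characterCount == 2);
}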