//          Copyright Ferdinand Majerech 2011-2014.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)

module dyaml.reader;


import core.stdc.stdlib;
import core.stdc.string;
import core.thread;

import std.algorithm;
import std.array;
import std.conv;
import std.exception;
import std.range;
import std.string;
import std.system;
import std.typecons;
import std.utf;

import tinyendian;

import dyaml.encoding;
import dyaml.exception;

/// Nonzero if the character is one of the line breaks LF, NEL, LS or PS.
///
/// '\r' is deliberately not listed: whether it is a break depends on the
/// character that follows it, which the Reader checks separately.
alias isBreak = among!('\n', '\u0085', '\u2028', '\u2029');

package:


/// Provides an API to read characters from a UTF-8 buffer.
struct Reader
{
    private:
        // Buffer of currently loaded characters.
        char[] buffer_;

        // Current position within buffer. Only data after this position can be read.
        size_t bufferOffset_;

        // Index of the current character in the buffer.
        size_t charIndex_;
        // Number of characters (code points) in buffer_.
        size_t characterCount_;

        // File name
        string name_;
        // Current line in file.
        uint line_;
        // Current column in file.
        uint column_;

        // Original Unicode encoding of the data.
        Encoding encoding_;

        version(unittest)
        {
            // Endianness of the input before it was converted (for testing)
            Endian endian_;
        }

        // The number of consecutive ASCII characters starting at bufferOffset_.
        //
        // Used to minimize UTF-8 decoding.
        size_t upcomingASCII_;

        // Index to buffer_ where the last decoded character starts.
        size_t lastDecodedBufferOffset_;
        // Offset, relative to charIndex_, of the last decoded character,
        // in code points, not chars.
        size_t lastDecodedCharOffset_;

    public:
        /// Construct a Reader.
        ///
        /// Params:  buffer = Buffer with YAML data. This may be e.g. the entire
        ///                   contents of a file or a string. $(B will) be modified by
        ///                   the Reader and other parts of D:YAML (D:YAML tries to
        ///                   reuse the buffer to minimize memory allocations)
        ///          name   = File name if the buffer is the contents of a file or
        ///                   `"<unknown>"` if the buffer is the contents of a string.
        ///
        /// Throws:  ReaderException on a UTF decoding error or if there are
        ///          nonprintable Unicode characters illegal in YAML.
        this(ubyte[] buffer, string name = "<unknown>") @safe pure
        {
            name_ = name;
            auto endianResult = fixUTFByteOrder(buffer);
            if(endianResult.bytesStripped > 0)
            {
                // TODO: add line and column
                throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned " ~
                                          "to 2 or 4 bytes, respectively", Mark(name, 0, 0));
            }

            version(unittest) { endian_ = endianResult.endian; }
            encoding_ = endianResult.encoding;

            auto utf8Result = toUTF8(endianResult.array, endianResult.encoding);
            const msg = utf8Result.errorMessage;
            if(msg !is null)
            {
                // TODO: add line and column
                throw new ReaderException("Error when converting to UTF-8: " ~ msg, Mark(name, 0, 0));
            }

            buffer_ = utf8Result.utf8;

            characterCount_ = utf8Result.characterCount;
            // Check that all characters in buffer are printable.
            // TODO: add line and column
            enforce(isPrintableValidUTF8(buffer_),
                    new ReaderException("Special unicode characters are not allowed", Mark(name, 0, 0)));

            checkASCII();
        }

        /// Get character at specified index relative to current position.
        ///
        /// Params:  index = Index of the character to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; In that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Character at specified position or '\0' if outside of the buffer.
        ///
        // XXX removed; search for 'risky' to find why.
        // Throws:  ReaderException if trying to read past the end of the buffer.
        dchar peek(const size_t index) @safe pure
        {
            if(index < upcomingASCII_) { return buffer_[bufferOffset_ + index]; }
            if(characterCount_ <= charIndex_ + index)
            {
                // XXX This is risky; revert this if bugs are introduced. We rely on
                // the assumption that Reader only uses peek() to detect end of buffer.
                // The test suite passes.
                // Revert this case here and in other peek() versions if this causes
                // errors.
                // throw new ReaderException("Trying to read past the end of the buffer");
                return '\0';
            }

            // Optimized path for Scanner code that peeks chars in linear order to
            // determine the length of some sequence.
            if(index == lastDecodedCharOffset_)
            {
                ++lastDecodedCharOffset_;
                const char b = buffer_[lastDecodedBufferOffset_];
                // ASCII
                if(b < 0x80)
                {
                    ++lastDecodedBufferOffset_;
                    return b;
                }
                return decode(buffer_, lastDecodedBufferOffset_);
            }

            // 'Slow' path where we decode everything up to the requested character.
            const asciiToTake = min(upcomingASCII_, index);
            lastDecodedCharOffset_   = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;
            dchar d;
            while(lastDecodedCharOffset_ <= index)
            {
                d = decodeNext();
            }

            return d;
        }

        /// Optimized version of peek() for the case where peek index is 0.
        dchar peek() @safe pure
        {
            if(upcomingASCII_ > 0)            { return buffer_[bufferOffset_]; }
            if(characterCount_ <= charIndex_) { return '\0'; }

            lastDecodedCharOffset_   = 0;
            lastDecodedBufferOffset_ = bufferOffset_;
            return decodeNext();
        }

        /// Get byte at specified index relative to current position.
        ///
        /// Params:  index = Index of the byte to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; In that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Byte at specified position or '\0' if outside of the buffer.
        char peekByte(const size_t index) @safe pure nothrow @nogc
        {
            return characterCount_ > (charIndex_ + index) ? buffer_[bufferOffset_ + index] : '\0';
        }

        /// Optimized version of peekByte() for the case where peek byte index is 0.
        char peekByte() @safe pure nothrow @nogc
        {
            return characterCount_ > charIndex_ ? buffer_[bufferOffset_] : '\0';
        }


        /// Get specified number of characters starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls.
        ///
        /// Params: length = Number of characters (code points, not bytes) to get. May
        ///                  reach past the end of the buffer; in that case the returned
        ///                  slice will be shorter.
        ///
        /// Returns: Characters starting at current position or an empty slice if out of bounds.
        char[] prefix(const size_t length) @safe pure
        {
            return slice(length);
        }

        /// Get specified number of bytes, not code points, starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls.
        ///
        /// Params: length = Number of bytes (not code points) to get. May NOT reach past
        ///                  the end of the buffer; should be used with peek() to avoid
        ///                  this.
        ///
        /// Returns: Bytes starting at current position.
        char[] prefixBytes(const size_t length) @safe pure nothrow @nogc
        in(length == 0 || bufferOffset_ + length <= buffer_.length, "prefixBytes out of bounds")
        {
            return buffer_[bufferOffset_ .. bufferOffset_ + length];
        }

        /// Get a slice view of the internal buffer, starting at the current position.
        ///
        /// Note: This gets only a "view" into the internal buffer,
        ///       which gets invalidated after other Reader calls.
        ///
        /// Params:  end = End of the slice relative to current position. May reach past
        ///                the end of the buffer; in that case the returned slice will
        ///                be shorter.
        ///
        /// Returns: Slice into the internal buffer or an empty slice if out of bounds.
        char[] slice(const size_t end) @safe pure
        {
            // Fast path in case the caller has already peek()ed all the way to end.
            if(end == lastDecodedCharOffset_)
            {
                return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
            }

            const asciiToTake = min(upcomingASCII_, end, buffer_.length);
            lastDecodedCharOffset_   = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;

            // 'Slow' path - decode everything up to end.
            while(lastDecodedCharOffset_ < end &&
                  lastDecodedBufferOffset_ < buffer_.length)
            {
                decodeNext();
            }

            return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
        }

        /// Get the next character, moving buffer position beyond it.
        ///
        /// Must not be called at the end of the buffer: peek() returns '\0' there,
        /// but forward() asserts that there is a character to move past.
        ///
        /// Returns: Next character.
        dchar get() @safe pure
        {
            const result = peek();
            forward();
            return result;
        }

        /// Get specified number of characters, moving buffer position beyond them.
        ///
        /// Params:  length = Number of characters (code points, not bytes) to get.
        ///
        /// Returns: Characters starting at current position.
        char[] get(const size_t length) @safe pure
        {
            auto result = slice(length);
            forward(length);
            return result;
        }

        /// Move current position forward.
        ///
        /// Updates the line and column counters. '\n', NEL, LS, PS and a '\r' that
        /// is not followed by '\n' (including a '\r' that is the last byte of the
        /// buffer) each start a new line.
        ///
        /// Params:  length = Number of characters to move position forward.
        void forward(size_t length) @safe pure
        {
            while(length > 0)
            {
                auto asciiToTake = min(upcomingASCII_, length);
                charIndex_     += asciiToTake;
                length         -= asciiToTake;
                upcomingASCII_ -= asciiToTake;

                for(; asciiToTake > 0; --asciiToTake)
                {
                    const c = buffer_[bufferOffset_++];
                    // c is ASCII, so we only need to check for ASCII line breaks.
                    // bufferOffset_ may now equal buffer_.length, hence the
                    // bounds-checked lookahead.
                    if(c == '\n' || (c == '\r' && notFollowedByLF(bufferOffset_)))
                    {
                        ++line_;
                        column_ = 0;
                        continue;
                    }
                    ++column_;
                }

                // If we have used up all upcoming ASCII chars, the next char is
                // non-ASCII even after this returns, so upcomingASCII_ doesn't need to
                // be updated - it's zero.
                if(length == 0) { break; }

                assert(upcomingASCII_ == 0,
                       "Running unicode handling code but we haven't run out of ASCII chars");
                assert(bufferOffset_ < buffer_.length,
                       "Attempted to decode past the end of YAML buffer");
                assert(buffer_[bufferOffset_] >= 0x80,
                       "ASCII must be handled by preceding code");

                ++charIndex_;
                const c = decode(buffer_, bufferOffset_);

                // New line. The byte after the decoded character can be compared
                // with '\n' directly because '\n' is a single ASCII byte.
                if(c.isBreak || (c == '\r' && notFollowedByLF(bufferOffset_)))
                {
                    ++line_;
                    column_ = 0;
                }
                // The byte order mark does not occupy a column.
                else if(c != '\uFEFF') { ++column_; }
                --length;
                checkASCII();
            }

            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;
        }

        /// Move current position forward by one character.
        ///
        /// Line and column counters are updated the same way as in forward(size_t).
        void forward() @safe pure
        {
            ++charIndex_;
            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;

            // ASCII
            if(upcomingASCII_ > 0)
            {
                --upcomingASCII_;
                const c = buffer_[bufferOffset_++];

                // bufferOffset_ may now equal buffer_.length, hence the
                // bounds-checked lookahead.
                if(c == '\n' || (c == '\r' && notFollowedByLF(bufferOffset_)))
                {
                    ++line_;
                    column_ = 0;
                    return;
                }
                ++column_;
                return;
            }

            // UTF-8
            assert(bufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            assert(buffer_[bufferOffset_] >= 0x80,
                   "ASCII must be handled by preceding code");

            const c = decode(buffer_, bufferOffset_);

            // New line. The byte after the decoded character can be compared
            // with '\n' directly because '\n' is a single ASCII byte.
            if(c.isBreak || (c == '\r' && notFollowedByLF(bufferOffset_)))
            {
                ++line_;
                column_ = 0;
            }
            // The byte order mark does not occupy a column.
            else if(c != '\uFEFF') { ++column_; }

            checkASCII();
        }

        /// Get filename, line and column of current position.
        Mark mark() const pure nothrow @nogc @safe { return Mark(name_, line_, column_); }

        /// Get filename, line and column of current position + some number of chars
        ///
        /// Does not move the Reader. Lines and columns are counted with the same
        /// rules as forward(), so the result equals what mark() would return after
        /// forward(advance). Counting stops early at the end of the buffer.
        ///
        /// Params:  advance = Number of characters (code points) to look ahead.
        Mark mark(size_t advance) const pure @safe
        {
            uint lineTemp = line_;
            uint columnTemp = column_;
            size_t bufferOffsetTemp = bufferOffset_;
            for (size_t pos = 0; pos < advance; pos++)
            {
                if (bufferOffsetTemp >= buffer_.length)
                {
                    break;
                }
                // decode() moves bufferOffsetTemp past the decoded character.
                const c = decode(buffer_, bufferOffsetTemp);
                if (c.isBreak || (c == '\r' && notFollowedByLF(bufferOffsetTemp)))
                {
                    lineTemp++;
                    columnTemp = 0;
                }
                // The byte order mark does not occupy a column.
                else if (c != '\uFEFF') { columnTemp++; }
            }
            return Mark(name_, lineTemp, columnTemp);
        }

        /// Get file name.
        ref inout(string) name() inout @safe return pure nothrow @nogc { return name_; }

        /// Get current line number.
        uint line() const @safe pure nothrow @nogc { return line_; }

        /// Get current column number.
        uint column() const @safe pure nothrow @nogc { return column_; }

        /// Get index of the current character in the buffer.
        size_t charIndex() const @safe pure nothrow @nogc { return charIndex_; }

        /// Get encoding of the input buffer.
        Encoding encoding() const @safe pure nothrow @nogc { return encoding_; }

private:
        // Update upcomingASCII_ (should be called after forward()ing over a UTF-8 sequence)
        void checkASCII() @safe pure nothrow @nogc
        {
            upcomingASCII_ = countASCII(buffer_[bufferOffset_ .. $]);
        }

        // Check that no '\n' is stored at the given buffer offset.
        //
        // Used right after a '\r' has been consumed, to tell a lone '\r' (a line
        // break on its own) from the first half of a "\r\n" pair. The end of the
        // buffer counts as "not followed by '\n'", so the lookup never goes out
        // of bounds.
        //
        // Params:  offset = Index into buffer_ of the byte following the '\r'.
        //
        // Returns: true if offset is past the end of buffer_ or the byte there is
        //          not '\n'.
        bool notFollowedByLF(const size_t offset) const @safe pure nothrow @nogc
        {
            return offset >= buffer_.length || buffer_[offset] != '\n';
        }

        // Decode the next character relative to
        // lastDecodedCharOffset_/lastDecodedBufferOffset_ and update them.
        //
        // Does not advance the buffer position. Used in peek() and slice().
        dchar decodeNext() @safe pure
        {
            assert(lastDecodedBufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            const char b = buffer_[lastDecodedBufferOffset_];
            ++lastDecodedCharOffset_;
            // ASCII
            if(b < 0x80)
            {
                ++lastDecodedBufferOffset_;
                return b;
            }

            return decode(buffer_, lastDecodedBufferOffset_);
        }
}

private:

// Convert a UTF-8/16/32 buffer to UTF-8, in-place if possible.
//
// Params:
//
// input    = Buffer with UTF-8/16/32 data to decode. May be overwritten by the
//            conversion, in which case the result will be a slice of this buffer.
// encoding = Encoding of input.
//
// Returns:
//
// A struct with the following members:
//
// $(D string errorMessage)   In case of an error, the error message is stored here. If
//                            there was no error, errorMessage is NULL. Always check
//                            this first.
// $(D char[] utf8)           input converted to UTF-8. May be a slice of input.
// $(D size_t characterCount) Number of characters (code points) in input.
auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;
        char[] utf8;
        size_t characterCount;
    }

    Result result;

    // Encode input into UTF-8 if it's encoded as UTF-16 or UTF-32.
    //
    // Params:
    //
    // input  = The input buffer to encode.
    // result = A Result struct to put encoded result and any error messages to.
    //
    // On error, result.errorMessage will be set.
    static void encode(C)(C[] input, ref Result result) @safe pure
    {
        // We can do UTF-32->UTF-8 in place because all UTF-8 sequences are 4 or
        // less bytes.
        static if(is(C == dchar))
        {
            char[4] encodeBuf;
            auto utf8 = cast(char[])input;
            // Number of UTF-8 bytes written so far. size_t, because it indexes
            // a buffer whose length is a size_t.
            size_t length = 0;
            foreach(dchar c; input)
            {
                ++result.characterCount;
                // ASCII
                if(c < 0x80)
                {
                    utf8[length++] = cast(char)c;
                    continue;
                }

                std.utf.encode(encodeBuf, c);
                const bytes = codeLength!char(c);
                utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
                length += bytes;
            }
            result.utf8 = utf8[0 .. length];
        }
        // Unfortunately we can't do UTF-16 in place so we just use std.conv.to
        else
        {
            result.characterCount = std.utf.count(input);
            result.utf8 = input.to!(char[]);
        }
    }

    try final switch(encoding)
    {
        case UTFEncoding.UTF_8:
            result.utf8 = cast(char[])input;
            result.utf8.validate();
            result.characterCount = std.utf.count(result.utf8);
            break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            encode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0, "UTF-32 buffer size must be a multiple of 4");
            encode(cast(dchar[])input, result);
            break;
    }
    catch(ConvException e) { result.errorMessage = e.msg; }
    catch(UTFException e)  { result.errorMessage = e.msg; }
    catch(Exception e)
    {
        assert(false, "Unexpected exception in encode(): " ~ e.msg);
    }

    return result;
}

/// Determine if all characters (code points, not bytes) in a string are printable.
///
/// A character is rejected if it is not a valid dchar or if it is a control
/// character that is not also whitespace.
bool isPrintableValidUTF8(const char[] chars) @safe pure
{
    import std.uni : isControl, isWhite;
    foreach (dchar chr; chars)
    {
        if (!chr.isValidDchar || (chr.isControl && !chr.isWhite))
        {
            return false;
        }
    }
    return true;
}

/// Counts the number of ASCII characters in buffer until the first UTF-8 sequence.
///
/// Used to determine how many characters we can process without decoding.
size_t countASCII(const(char)[] buffer) @safe pure nothrow @nogc
{
    return buffer.byCodeUnit.until!(x => x > 0x7F).walkLength;
}
// Unittests.

// A lone '\r' that is the last byte of the buffer is a line break and must not
// cause a read past the end of the buffer.
@safe unittest
{
    auto reader = Reader(cast(ubyte[])"a\r".dup);
    reader.forward(2);
    assert(reader.line == 1);
    assert(reader.column == 0);

    auto single = Reader(cast(ubyte[])"\r".dup);
    single.forward();
    assert(single.line == 1);
    assert(single.column == 0);
}

// mark(advance) must report the position that forward(advance) arrives at,
// counting "\r\n" as a single line break.
@safe unittest
{
    auto reader = Reader(cast(ubyte[])"a\r\nb".dup);
    auto ahead = reader.mark(3);
    reader.forward(3);
    assert(ahead == reader.mark());
    assert(reader.line == 1);
    assert(reader.column == 0);
}
// Checks that the encoding and byte order of UTF-16 input are detected from
// its byte order mark.
void testEndian(R)()
{
    // Build a reader over `bytes` and compare what it detected with what we expect.
    void checkDetection(ubyte[] bytes, Encoding wantEncoding, Endian wantEndian)
    {
        auto reader = new R(bytes);
        assert(reader.encoding == wantEncoding);
        assert(reader.endian_ == wantEndian);
    }

    // BOM followed by the letter 'z', once in each byte order.
    ubyte[] utf16LE = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] utf16BE = [0xFE, 0xFF, 0x00, 0x7A];

    checkDetection(utf16LE, Encoding.UTF_16, Endian.littleEndian);
    checkDetection(utf16BE, Encoding.UTF_16, Endian.bigEndian);
}

// Checks peek(), prefix() and forward() on a short UTF-8 input that starts
// with a byte order mark.
void testPeekPrefixForward(R)()
{
    import std.encoding;

    ubyte[] input = bomTable[BOM.utf8].sequence ~ cast(ubyte[])"data";
    auto reader = new R(input);

    assert(reader.peek() == 'd');
    assert(reader.peek(1) == 'a');
    assert(reader.peek(2) == 't');
    assert(reader.peek(3) == 'a');
    // One past the last character reads as '\0'.
    assert(reader.peek(4) == '\0');
    assert(reader.prefix(4) == "data");
    // assert(reader.prefix(6) == "data\0");

    reader.forward(2);
    assert(reader.peek(1) == 'a');
    // assert(collectException(reader.peek(3)));
}

// Checks that the same text is readable when supplied as UTF-8, UTF-16 and
// UTF-32, each prefixed with the matching byte order mark.
void testUTF(R)()
{
    import std.encoding;

    dchar[] text = cast(dchar[])"data";

    // Prefix the code units with `bom`, feed the bytes to a reader and peek
    // at all four characters.
    void checkReadable(T)(T[] units, BOM bom)
    {
        const byteLength = units.length * T.sizeof;
        ubyte[] bytes = bomTable[bom].sequence ~ (cast(ubyte[])units)[0 .. byteLength];
        auto reader = new R(bytes);
        assert(reader.peek() == 'd');
        assert(reader.peek(1) == 'a');
        assert(reader.peek(2) == 't');
        assert(reader.peek(3) == 'a');
    }

    // The in-memory code units are in host byte order, so pick the BOM that
    // matches the host.
    const hostIsBigEndian = endian == Endian.bigEndian;

    checkReadable!char(to!(char[])(text), BOM.utf8);
    checkReadable!wchar(to!(wchar[])(text), hostIsBigEndian ? BOM.utf16be : BOM.utf16le);
    checkReadable(text, hostIsBigEndian ? BOM.utf32be : BOM.utf32le);
}

// Checks peeking at and just past the only character of a one-byte input.
void test1Byte(R)()
{
    ubyte[] input = [97];

    auto reader = new R(input);
    assert(reader.peek() == 'a');
    assert(reader.peek(1) == '\0');
    // assert(collectException(reader.peek(2)));
}

@system unittest
{
    testEndian!Reader();
    testPeekPrefixForward!Reader();
    testUTF!Reader();
    test1Byte!Reader();
}
//Issue 257 - https://github.com/dlang-community/D-YAML/issues/257
@safe unittest
{
    import dyaml.loader : Loader;

    auto source = "hello ";
    auto document = Loader.fromString(source).load();

    assert(document.isValid);
}