1 
2 //          Copyright Ferdinand Majerech 2011-2014.
3 // Distributed under the Boost Software License, Version 1.0.
4 //    (See accompanying file LICENSE_1_0.txt or copy at
5 //          http://www.boost.org/LICENSE_1_0.txt)
6 
7 module dyaml.reader;
8 
9 
10 import core.stdc.stdlib;
11 import core.stdc.string;
12 import core.thread;
13 
14 import std.algorithm;
15 import std.array;
16 import std.conv;
17 import std.exception;
18 import std.range;
19 import std.string;
20 import std.system;
21 import std.typecons;
22 import std.utf;
23 
24 import tinyendian;
25 
26 import dyaml.encoding;
27 import dyaml.exception;
28 
// Tests whether a character is one of the line breaks '\n', U+0085, U+2028 or
// U+2029; the result is nonzero (usable as a boolean) when it is.
// '\r' is deliberately not listed: the Reader handles it separately so that a
// "\r\n" pair is counted as a single line break.
alias isBreak = among!('\n', '\u0085', '\u2028', '\u2029');
30 
31 package:
32 
33 
/// Provides an API to read characters from a UTF-8 buffer.
///
/// The input is converted to UTF-8 once, in the constructor. Afterwards the
/// Reader keeps a cursor into that buffer (byte offset, code point index, line
/// and column) and offers peeking, slicing and forwarding relative to it.
struct Reader
{
    private:
        // Buffer of currently loaded characters.
        char[] buffer_;

        // Current position within buffer. Only data after this position can be read.
        size_t bufferOffset_;

        // Index of the current character in the buffer.
        size_t charIndex_;
        // Number of characters (code points) in buffer_.
        size_t characterCount_;

        // File name
        string name_;
        // Current line in file.
        uint line_;
        // Current column in file.
        uint column_;

        // Original Unicode encoding of the data.
        Encoding encoding_;

        version(unittest)
        {
            // Endianness of the input before it was converted (for testing)
            Endian endian_;
        }

        // The number of consecutive ASCII characters starting at bufferOffset_.
        //
        // Used to minimize UTF-8 decoding.
        size_t upcomingASCII_;

        // Index to buffer_ where the last decoded character starts.
        //
        // Together with lastDecodedCharOffset_ this forms a small cache used by
        // peek() and slice(); it must never point before bufferOffset_.
        size_t lastDecodedBufferOffset_;
        // Offset, relative to charIndex_, of the last decoded character,
        // in code points, not chars.
        size_t lastDecodedCharOffset_;

    public:
        /// Construct a Reader.
        ///
        /// Params:  buffer = Buffer with YAML data. This may be e.g. the entire
        ///                   contents of a file or a string. $(B will) be modified by
        ///                   the Reader and other parts of D:YAML (D:YAML tries to
        ///                   reuse the buffer to minimize memory allocations)
        ///          name   = File name if the buffer is the contents of a file or
        ///                   `"<unknown>"` if the buffer is the contents of a string.
        ///
        /// Throws:  ReaderException on a UTF decoding error or if there are
        ///          nonprintable Unicode characters illegal in YAML.
        this(ubyte[] buffer, string name = "<unknown>") @safe pure
        {
            name_ = name;
            auto endianResult = fixUTFByteOrder(buffer);
            if(endianResult.bytesStripped > 0)
            {
                // TODO: add line and column
                throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned " ~
                                          "to 2 or 4 bytes, respectively", Mark(name, 0, 0));
            }

            version(unittest) { endian_ = endianResult.endian; }
            encoding_ = endianResult.encoding;

            auto utf8Result = toUTF8(endianResult.array, endianResult.encoding);
            const msg = utf8Result.errorMessage;
            if(msg !is null)
            {
                // TODO: add line and column
                throw new ReaderException("Error when converting to UTF-8: " ~ msg, Mark(name, 0, 0));
            }

            buffer_ = utf8Result.utf8;

            characterCount_ = utf8Result.characterCount;
            // Check that all characters in buffer are printable.
            // TODO: add line and column
            enforce(isPrintableValidUTF8(buffer_),
                    new ReaderException("Special unicode characters are not allowed", Mark(name, 0, 0)));

            checkASCII();
        }

        /// Get character at specified index relative to current position.
        ///
        /// Params:  index = Index of the character to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; In that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Character at specified position or '\0' if outside of the buffer.
        ///
        // XXX removed; search for 'risky' to find why.
        // Throws:  ReaderException if trying to read past the end of the buffer.
        dchar peek(const size_t index) @safe pure
        {
            // Within the known run of ASCII: code point index == byte index.
            if(index < upcomingASCII_) { return buffer_[bufferOffset_ + index]; }
            if(characterCount_ <= charIndex_ + index)
            {
                // XXX This is risky; revert this if bugs are introduced. We rely on
                // the assumption that Reader only uses peek() to detect end of buffer.
                // The test suite passes.
                // Revert this case here and in other peek() versions if this causes
                // errors.
                // throw new ReaderException("Trying to read past the end of the buffer");
                return '\0';
            }

            // Optimized path for Scanner code that peeks chars in linear order to
            // determine the length of some sequence: the requested character is
            // exactly the one following the last decoded one.
            if(index == lastDecodedCharOffset_) { return decodeNext(); }

            // 'Slow' path where we decode everything up to the requested character.
            const asciiToTake = min(upcomingASCII_, index);
            lastDecodedCharOffset_   = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;
            dchar d;
            while(lastDecodedCharOffset_ <= index)
            {
                d = decodeNext();
            }

            return d;
        }

        /// Optimized version of peek() for the case where peek index is 0.
        dchar peek() @safe pure
        {
            if(upcomingASCII_ > 0)            { return buffer_[bufferOffset_]; }
            if(characterCount_ <= charIndex_) { return '\0'; }

            lastDecodedCharOffset_   = 0;
            lastDecodedBufferOffset_ = bufferOffset_;
            return decodeNext();
        }

        /// Get byte at specified index relative to current position.
        ///
        /// Params:  index = Index of the byte to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; In that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Byte at specified position or '\0' if outside of the buffer.
        ///
        /// Note: The end-of-buffer check compares `index` against the number of
        ///       remaining code points, not bytes.
        char peekByte(const size_t index) @safe pure nothrow @nogc
        {
            return characterCount_ > (charIndex_ + index) ? buffer_[bufferOffset_ + index] : '\0';
        }

        /// Optimized version of peekByte() for the case where peek byte index is 0.
        char peekByte() @safe pure nothrow @nogc
        {
            return characterCount_ > charIndex_ ? buffer_[bufferOffset_] : '\0';
        }


        /// Get specified number of characters starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls.
        ///
        /// Params: length = Number of characters (code points, not bytes) to get. May
        ///                  reach past the end of the buffer; in that case the returned
        ///                  slice will be shorter.
        ///
        /// Returns: Characters starting at current position or an empty slice if out of bounds.
        char[] prefix(const size_t length) @safe pure
        {
            return slice(length);
        }

        /// Get specified number of bytes, not code points, starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls.
        ///
        /// Params: length = Number bytes (not code points) to get. May NOT reach past
        ///                  the end of the buffer; should be used with peek() to avoid
        ///                  this.
        ///
        /// Returns: Bytes starting at current position.
        char[] prefixBytes(const size_t length) @safe pure nothrow @nogc
        in(length == 0 || bufferOffset_ + length <= buffer_.length, "prefixBytes out of bounds")
        {
            return buffer_[bufferOffset_ .. bufferOffset_ + length];
        }

        /// Get a slice view of the internal buffer, starting at the current position.
        ///
        /// Note: This gets only a "view" into the internal buffer,
        ///       which get invalidated after other Reader calls.
        ///
        /// Params:  end = End of the slice relative to current position. May reach past
        ///                the end of the buffer; in that case the returned slice will
        ///                be shorter.
        ///
        /// Returns: Slice into the internal buffer or an empty slice if out of bounds.
        char[] slice(const size_t end) @safe pure
        {
            // Fast path in case the caller has already peek()ed all the way to end.
            if(end == lastDecodedCharOffset_)
            {
                return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
            }

            const asciiToTake = min(upcomingASCII_, end, buffer_.length);
            lastDecodedCharOffset_   = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;

            // 'Slow' path - decode everything up to end.
            while(lastDecodedCharOffset_ < end &&
                  lastDecodedBufferOffset_ < buffer_.length)
            {
                decodeNext();
            }

            return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
        }

        /// Get the next character, moving buffer position beyond it.
        ///
        /// Returns: Next character.
        ///
        /// Throws:  ReaderException if trying to read past the end of the buffer
        ///          or if invalid data is read.
        dchar get() @safe pure
        {
            const result = peek();
            forward();
            return result;
        }

        /// Get specified number of characters, moving buffer position beyond them.
        ///
        /// Params:  length = Number or characters (code points, not bytes) to get.
        ///
        /// Returns: Characters starting at current position.
        char[] get(const size_t length) @safe pure
        {
            auto result = slice(length);
            forward(length);
            return result;
        }

        /// Move current position forward.
        ///
        /// Updates the line and column: '\n', U+0085, U+2028, U+2029 and a '\r'
        /// that is not followed by '\n' each start a new line (column 0). Any
        /// other character except the BOM (U+FEFF) advances the column by one.
        ///
        /// Params:  length = Number of characters to move position forward.
        void forward(size_t length) @safe pure
        {
            while(length > 0)
            {
                // Consume as much of the known ASCII run as requested; no
                // decoding is needed for it.
                auto asciiToTake = min(upcomingASCII_, length);
                charIndex_     += asciiToTake;
                length         -= asciiToTake;
                upcomingASCII_ -= asciiToTake;

                for(; asciiToTake > 0; --asciiToTake)
                {
                    const char c = buffer_[bufferOffset_];
                    ++bufferOffset_;
                    trackPosition(c);
                }

                // If we have used up all upcoming ASCII chars, the next char is
                // non-ASCII even after this returns, so upcomingASCII_ doesn't need to
                // be updated - it's zero.
                if(length == 0) { break; }

                assert(upcomingASCII_ == 0,
                       "Running unicode handling code but we haven't run out of ASCII chars");
                assert(bufferOffset_ < buffer_.length,
                       "Attempted to decode past the end of YAML buffer");
                assert(buffer_[bufferOffset_] >= 0x80,
                       "ASCII must be handled by preceding code");

                ++charIndex_;
                // decode() advances bufferOffset_ past the multi-byte sequence.
                const c = decode(buffer_, bufferOffset_);
                trackPosition(c);
                --length;
                checkASCII();
            }

            // Restart the peek()/slice() decode cache at the new position.
            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;
        }

        /// Move current position forward by one character.
        ///
        /// Line and column are updated the same way as in forward(size_t).
        void forward() @safe pure
        {
            ++charIndex_;

            if(upcomingASCII_ > 0)
            {
                // ASCII: a single byte, no decoding needed.
                --upcomingASCII_;
                const char c = buffer_[bufferOffset_];
                ++bufferOffset_;
                trackPosition(c);
            }
            else
            {
                // UTF-8
                assert(bufferOffset_ < buffer_.length,
                       "Attempted to decode past the end of YAML buffer");
                assert(buffer_[bufferOffset_] >= 0x80,
                       "ASCII must be handled by preceding code");

                // decode() advances bufferOffset_ past the multi-byte sequence.
                const c = decode(buffer_, bufferOffset_);
                trackPosition(c);
                checkASCII();
            }

            // Restart the peek()/slice() decode cache at the NEW position. This
            // must happen after bufferOffset_ has been advanced; otherwise the
            // cache would point at the character that was just consumed.
            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;
        }

        /// Get filename, line and column of current position.
        Mark mark() const pure nothrow @nogc @safe { return Mark(name_, line_, column_); }

        /// Get filename, line and column of current position + some number of chars
        ///
        /// Does not move the current position. Line and column are computed with
        /// the same rules forward() uses, so the result matches what mark() would
        /// return after forward(advance).
        ///
        /// Params:  advance = Number of characters (code points) past the current
        ///                    position. Stops early at the end of the buffer.
        Mark mark(size_t advance) const pure @safe
        {
            uint lineTemp = line_;
            uint columnTemp = column_;
            size_t bufferOffsetTemp = bufferOffset_;
            for(size_t pos = 0; pos < advance && bufferOffsetTemp < buffer_.length; ++pos)
            {
                // decode() advances bufferOffsetTemp past the character.
                const c = decode(buffer_, bufferOffsetTemp);
                if(endsLine(c, bufferOffsetTemp))
                {
                    ++lineTemp;
                    columnTemp = 0;
                }
                else if(c != '\uFEFF') { ++columnTemp; }
            }
            return Mark(name_, lineTemp, columnTemp);
        }

        /// Get file name.
        ref inout(string) name() inout @safe return pure nothrow @nogc { return name_; }

        /// Get current line number.
        uint line() const @safe pure nothrow @nogc { return line_; }

        /// Get current column number.
        uint column() const @safe pure nothrow @nogc { return column_; }

        /// Get index of the current character in the buffer.
        size_t charIndex() const @safe pure nothrow @nogc { return charIndex_; }

        /// Get encoding of the input buffer.
        Encoding encoding() const @safe pure nothrow @nogc { return encoding_; }

    private:
        // Update upcomingASCII_ (should be called forward()ing over a UTF-8 sequence)
        void checkASCII() @safe pure nothrow @nogc
        {
            upcomingASCII_ = countASCII(buffer_[bufferOffset_ .. $]);
        }

        // Determine whether character c terminates a line.
        //
        // nextOffset is the index into buffer_ of the byte right after c. It is
        // only used for '\r': a '\r' directly followed by '\n' is not a line end
        // by itself (the '\n' is), while a lone '\r' - including one that is the
        // very last byte of the buffer - is.
        bool endsLine(const dchar c, const size_t nextOffset) const @safe pure nothrow @nogc
        {
            if(c.isBreak) { return true; }
            return c == '\r' &&
                   (nextOffset >= buffer_.length || buffer_[nextOffset] != '\n');
        }

        // Update line_/column_ for a character c that has just been consumed.
        //
        // Must be called after bufferOffset_ has been moved past c.
        void trackPosition(const dchar c) @safe pure nothrow @nogc
        {
            if(endsLine(c, bufferOffset_))
            {
                ++line_;
                column_ = 0;
            }
            // The byte order mark does not occupy a column.
            else if(c != '\uFEFF') { ++column_; }
        }

        // Decode the next character relative to
        // lastDecodedCharOffset_/lastDecodedBufferOffset_ and update them.
        //
        // Does not advance the buffer position. Used in peek() and slice().
        dchar decodeNext() @safe pure
        {
            assert(lastDecodedBufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            const char b = buffer_[lastDecodedBufferOffset_];
            ++lastDecodedCharOffset_;
            // ASCII
            if(b < 0x80)
            {
                ++lastDecodedBufferOffset_;
                return b;
            }

            return decode(buffer_, lastDecodedBufferOffset_);
        }
}
456 
457 private:
458 
// Convert a UTF-8/16/32 buffer to UTF-8, in-place if possible.
//
// UTF-8 input is validated and returned as-is; UTF-32 input is re-encoded inside
// the input buffer; UTF-16 input is converted into a newly allocated array.
//
// Params:
//
// input    = Buffer with UTF-8/16/32 data to decode. May be overwritten by the
//            conversion, in which case the result will be a slice of this buffer.
// encoding = Encoding of input.
//
// Returns:
//
// A struct with the following members:
//
// $(D string errorMessage)   In case of an error, the error message is stored here. If
//                            there was no error, errorMessage is NULL. Always check
//                            this first.
// $(D char[] utf8)           input converted to UTF-8. May be a slice of input.
// $(D size_t characterCount) Number of characters (code points) in input.
auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;
        char[] utf8;
        size_t characterCount;
    }

    Result result;

    // Encode input into UTF-8 if it's encoded as UTF-16 or UTF-32.
    //
    // Params:
    //
    // input  = The input buffer to encode.
    // result = A Result struct to put encoded result and any error messages to.
    //
    // Throws a UTFException (or ConvException) on invalid input; the caller
    // turns that into result.errorMessage.
    static void encode(C)(C[] input, ref Result result) @safe pure
    {
        // We can do UTF-32->UTF-8 in place because all UTF-8 sequences are 4 or
        // less bytes, so the write position never overtakes the read position.
        static if(is(C == dchar))
        {
            char[4] encodeBuf;
            auto utf8 = cast(char[])input;
            // Number of UTF-8 bytes written so far. size_t (not int) so that the
            // cursor cannot overflow on very large inputs.
            size_t length = 0;
            foreach(dchar c; input)
            {
                ++result.characterCount;
                // ASCII
                if(c < 0x80)
                {
                    utf8[length++] = cast(char)c;
                    continue;
                }

                // std.utf.encode returns the number of code units it wrote.
                const bytes = std.utf.encode(encodeBuf, c);
                utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
                length += bytes;
            }
            result.utf8 = utf8[0 .. length];
        }
        // Unfortunately we can't do UTF-16 in place so we just use std.conv.to
        else
        {
            result.characterCount = std.utf.count(input);
            result.utf8 = input.to!(char[]);
        }
    }

    try final switch(encoding)
    {
        case UTFEncoding.UTF_8:
            result.utf8 = cast(char[])input;
            result.utf8.validate();
            result.characterCount = std.utf.count(result.utf8);
            break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            encode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0, "UTF-32 buffer size must be a multiple of 4");
            encode(cast(dchar[])input, result);
            break;
    }
    // Expected failures on malformed input are reported through errorMessage.
    catch(ConvException e) { result.errorMessage = e.msg; }
    catch(UTFException e)  { result.errorMessage = e.msg; }
    // Anything else is a programming error; this also keeps the function nothrow.
    catch(Exception e)
    {
        assert(false, "Unexpected exception in encode(): " ~ e.msg);
    }

    return result;
}
555 
/// Determine if all characters (code points, not bytes) in a string are printable.
///
/// A code point is accepted when it is a valid `dchar` and is either whitespace
/// or not a control character.
///
/// Params:  chars = UTF-8 text to check; it is decoded code point by code point.
///
/// Returns: `true` if every code point is accepted (also for empty input),
///          `false` as soon as one is not.
bool isPrintableValidUTF8(const char[] chars) @safe pure
{
    import std.uni : isControl, isWhite;
    foreach (dchar codePoint; chars)
    {
        const bool printable = codePoint.isValidDchar
                            && (codePoint.isWhite || !codePoint.isControl);
        if (!printable) { return false; }
    }
    return true;
}
569 
/// Counts the number of ASCII characters in buffer until the first UTF-8 sequence.
///
/// Used to determine how many characters we can process without decoding.
///
/// Params:  buffer = UTF-8 text, inspected code unit by code unit.
///
/// Returns: Length of the leading run of code units that are <= 0x7F;
///          `buffer.length` if there is no non-ASCII code unit at all.
size_t countASCII(const(char)[] buffer) @safe pure nothrow @nogc
{
    size_t asciiCount = 0;
    // Iterating with a `char` element visits raw code units, no decoding.
    foreach (const char unit; buffer)
    {
        if (unit > 0x7F) { break; }
        ++asciiCount;
    }
    return asciiCount;
}
577 // Unittests.
578 
// Checks that a reader of type R reports the encoding and the byte order of
// BOM-prefixed UTF-16 input, for both little and big endian data.
void testEndian(R)()
{
    // Builds a reader over `input` and compares what it detected with the
    // expected encoding and endianness.
    void expectDetected(ubyte[] input, Encoding wantEncoding, Endian wantEndian)
    {
        auto reader = new R(input);
        assert(reader.encoding == wantEncoding);
        assert(reader.endian_ == wantEndian);
    }

    // A UTF-16 byte order mark followed by the single code unit 0x007A,
    // once in each byte order.
    ubyte[] utf16LittleEndian = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] utf16BigEndian    = [0xFE, 0xFF, 0x00, 0x7A];

    expectDetected(utf16LittleEndian, Encoding.UTF_16, Endian.littleEndian);
    expectDetected(utf16BigEndian, Encoding.UTF_16, Endian.bigEndian);
}
592 
// Checks peek(), prefix() and forward() of a reader of type R on the text
// "data" preceded by a UTF-8 byte order mark.
void testPeekPrefixForward(R)()
{
    import std.encoding;
    ubyte[] input = bomTable[BOM.utf8].sequence ~ cast(ubyte[])"data";
    auto reader = new R(input);

    // The argument-less overload returns the current character.
    assert(reader.peek() == 'd');

    // Characters after the current one; peeking one past the end gives '\0'.
    immutable dchar[] following = ['a', 't', 'a', '\0'];
    foreach(i, expected; following)
    {
        assert(reader.peek(i + 1) == expected);
    }

    assert(reader.prefix(4) == "data");
    // assert(reader.prefix(6) == "data\0");

    // After skipping "da" the character at offset 1 is the final 'a'.
    reader.forward(2);
    assert(reader.peek(1) == 'a');
    // assert(collectException(reader.peek(3)));
}
609 
// Checks that a reader of type R yields the same characters for the text
// "data" whether it is supplied as UTF-8, UTF-16 or UTF-32 (each with a BOM,
// UTF-16/32 in the byte order of the host).
void testUTF(R)()
{
    import std.encoding;
    dchar[] data = cast(dchar[])"data";

    // Prefixes the raw bytes of `text` with the BOM `bom`, feeds them to a
    // reader and checks the four characters it returns.
    void checkDecoding(T)(T[] text, BOM bom)
    {
        ubyte[] payload = cast(ubyte[])text;
        ubyte[] bytes = bomTable[bom].sequence ~ payload;
        auto reader = new R(bytes);
        assert(reader.peek() == 'd');
        assert(reader.peek(1) == 'a');
        assert(reader.peek(2) == 't');
        assert(reader.peek(3) == 'a');
    }

    // The in-memory wchar/dchar arrays are in host byte order, so the BOM has
    // to announce that order.
    const bool hostIsBigEndian = endian == Endian.bigEndian;
    const BOM utf16BOM = hostIsBigEndian ? BOM.utf16be : BOM.utf16le;
    const BOM utf32BOM = hostIsBigEndian ? BOM.utf32be : BOM.utf32le;

    checkDecoding!char(to!(char[])(data), BOM.utf8);
    checkDecoding!wchar(to!(wchar[])(data), utf16BOM);
    checkDecoding(data, utf32BOM);
}
628 
// Checks a reader of type R on the smallest possible input: the single byte 'a'.
void test1Byte(R)()
{
    ubyte[] singleByte = [97];  // 97 == 'a'
    auto reader = new R(singleByte);

    // The only character is returned; peeking past it yields '\0'.
    assert(reader.peek() == 'a');
    assert(reader.peek(1) == '\0');
    // assert(collectException(reader.peek(2)));
}
638 
// Runs every generic Reader test helper of this module, instantiated with the
// Reader struct.
@system unittest
{
    testEndian!Reader();
    testPeekPrefixForward!Reader();
    testUTF!Reader();
    test1Byte!Reader();
}
//Issue 257 - https://github.com/dlang-community/D-YAML/issues/257
// Regression test: loading a document that ends in a trailing space
// ("hello ") must succeed and produce a valid root node.
@safe unittest
{
    import dyaml.loader : Loader;
    auto yaml = "hello ";
    auto root = Loader.fromString(yaml).load();

    assert(root.isValid);
}