1 
2 //          Copyright Ferdinand Majerech 2011-2014.
3 // Distributed under the Boost Software License, Version 1.0.
4 //    (See accompanying file LICENSE_1_0.txt or copy at
5 //          http://www.boost.org/LICENSE_1_0.txt)
6 
7 module dyaml.reader;
8 
9 
10 import core.stdc.stdlib;
11 import core.stdc.string;
12 import core.thread;
13 
14 import std.algorithm;
15 import std.array;
16 import std.conv;
17 import std.exception;
18 import std.range;
19 import std.string;
20 import std.system;
21 import std.typecons;
22 import std.utf;
23 
24 import tinyendian;
25 
26 import dyaml.encoding;
27 import dyaml.exception;
28 
/// Tests a character against the line breaks listed below.
///
/// Being an instance of `among`, it evaluates to 0 when the character is none of
/// them, and to the 1-based position in the list otherwise.
alias isBreak = among!(
    '\n',     // U+000A line feed
    '\u0085', // U+0085 next line
    '\u2028', // U+2028 line separator
    '\u2029'  // U+2029 paragraph separator
);
30 
31 package:
32 
33 
/// Exception thrown when the Reader cannot process its input.
///
/// Every message is prefixed with "Reader error: ".
class ReaderException : YAMLException
{
    /// Params:  msg  = Description of the problem (without the prefix).
    ///          file = Source file the exception is thrown from.
    ///          line = Source line the exception is thrown from.
    this(string msg, string file = __FILE__, size_t line = __LINE__)
        @safe pure nothrow
    {
        const fullMessage = "Reader error: " ~ msg;
        super(fullMessage, file, line);
    }
}
43 
/// Provides an API to read characters from a UTF-8 buffer and build slices into that
/// buffer to avoid allocations (see SliceBuilder).
///
/// The constructor converts the input to UTF-8 and checks that it is printable.
/// Positions and lengths in this API are counted in code points unless a method
/// explicitly says it works with bytes.
final class Reader
{
    private:
        // Buffer of currently loaded characters.
        char[] buffer_;

        // Current position within buffer. Only data after this position can be read.
        size_t bufferOffset_;

        // Index of the current character in the buffer.
        size_t charIndex_;
        // Number of characters (code points) in buffer_.
        size_t characterCount_;

        // File name
        string name_;
        // Current line in file.
        uint line_;
        // Current column in file.
        uint column_;

        // Original Unicode encoding of the data.
        Encoding encoding_;

        version(unittest)
        {
            // Endianness of the input before it was converted (for testing)
            Endian endian_;
        }

        // The number of consecutive ASCII characters starting at bufferOffset_.
        //
        // Used to minimize UTF-8 decoding.
        size_t upcomingASCII_;

        // Decode cache shared by peek() and slice().
        //
        // Index into buffer_ just past the last character decoded by decodeNext(),
        // i.e. where the next character to decode starts.
        size_t lastDecodedBufferOffset_;
        // Number of code points, counted from the current position, that lie between
        // bufferOffset_ and lastDecodedBufferOffset_.
        size_t lastDecodedCharOffset_;

    public:
        /// Construct a Reader.
        ///
        /// Params:  buffer = Buffer with YAML data. This may be e.g. the entire
        ///                   contents of a file or a string. $(B will) be modified by
        ///                   the Reader and other parts of D:YAML (D:YAML tries to
        ///                   reuse the buffer to minimize memory allocations)
        ///          name   = File name if the buffer is the contents of a file or
        ///                   `"<unknown>"` if the buffer is the contents of a string.
        ///
        /// Throws:  ReaderException on a UTF decoding error, if the size of UTF-16/32
        ///          input is not a multiple of the code unit size, or if there are
        ///          nonprintable Unicode characters illegal in YAML.
        this(ubyte[] buffer, string name = "<unknown>") @safe pure
        {
            name_ = name;
            auto endianResult = fixUTFByteOrder(buffer);
            if(endianResult.bytesStripped > 0)
            {
                throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned " ~
                                          "to 2 or 4 bytes, respectively");
            }

            version(unittest) { endian_ = endianResult.endian; }
            encoding_ = endianResult.encoding;

            auto utf8Result = toUTF8(endianResult.array, endianResult.encoding);
            const msg = utf8Result.errorMessage;
            if(msg !is null)
            {
                throw new ReaderException("Error when converting to UTF-8: " ~ msg);
            }

            buffer_ = utf8Result.utf8;

            characterCount_ = utf8Result.characterCount;
            // Check that all characters in buffer are printable.
            enforce(isPrintableValidUTF8(buffer_),
                    new ReaderException("Special unicode characters are not allowed"));

            this.sliceBuilder = SliceBuilder(this);
            checkASCII();
        }

        /// Get character at specified index relative to current position.
        ///
        /// Params:  index = Index of the character to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; In that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Character at specified position or '\0' if outside of the buffer.
        ///
        // XXX removed; search for 'risky' to find why.
        // Throws:  ReaderException if trying to read past the end of the buffer.
        dchar peek(const size_t index) @safe pure
        {
            // No decoding needed while we are inside the upcoming ASCII run.
            if(index < upcomingASCII_) { return buffer_[bufferOffset_ + index]; }
            if(characterCount_ <= charIndex_ + index)
            {
                // XXX This is risky; revert this if bugs are introduced. We rely on
                // the assumption that Reader only uses peek() to detect end of buffer.
                // The test suite passes.
                // Revert this case here and in other peek() versions if this causes
                // errors.
                // throw new ReaderException("Trying to read past the end of the buffer");
                return '\0';
            }

            // Optimized path for Scanner code that peeks chars in linear order to
            // determine the length of some sequence.
            if(index == lastDecodedCharOffset_)
            {
                ++lastDecodedCharOffset_;
                const char b = buffer_[lastDecodedBufferOffset_];
                // ASCII
                if(b < 0x80)
                {
                    ++lastDecodedBufferOffset_;
                    return b;
                }
                return decode(buffer_, lastDecodedBufferOffset_);
            }

            // 'Slow' path where we decode everything up to the requested character.
            const asciiToTake = min(upcomingASCII_, index);
            lastDecodedCharOffset_   = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;
            dchar d;
            while(lastDecodedCharOffset_ <= index)
            {
                d = decodeNext();
            }

            return d;
        }

        /// Optimized version of peek() for the case where peek index is 0.
        ///
        /// Returns: Character at the current position or '\0' at the end of the buffer.
        dchar peek() @safe pure
        {
            if(upcomingASCII_ > 0)            { return buffer_[bufferOffset_]; }
            if(characterCount_ <= charIndex_) { return '\0'; }

            lastDecodedCharOffset_   = 0;
            lastDecodedBufferOffset_ = bufferOffset_;
            return decodeNext();
        }

        /// Get byte at specified index relative to current position.
        ///
        /// Params:  index = Index of the byte to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; In that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Byte at specified position or '\0' if outside of the buffer.
        ///
        /// Note: The end-of-buffer test compares `index` with the number of remaining
        ///       code points, not bytes. With multi-byte characters ahead, '\0' may
        ///       therefore be returned for a byte that is still inside the buffer.
        char peekByte(const size_t index) @safe pure nothrow @nogc
        {
            return characterCount_ > (charIndex_ + index) ? buffer_[bufferOffset_ + index] : '\0';
        }

        /// Optimized version of peekByte() for the case where peek byte index is 0.
        char peekByte() @safe pure nothrow @nogc
        {
            return characterCount_ > charIndex_ ? buffer_[bufferOffset_] : '\0';
        }


        /// Get specified number of characters starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls. Use SliceBuilder to build slices
        ///       for permanent use.
        ///
        /// Params: length = Number of characters (code points, not bytes) to get. May
        ///                  reach past the end of the buffer; in that case the returned
        ///                  slice will be shorter.
        ///
        /// Returns: Characters starting at current position or an empty slice if out of bounds.
        char[] prefix(const size_t length) @safe pure
        {
            return slice(length);
        }

        /// Get specified number of bytes, not code points, starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls. Use SliceBuilder to build slices
        ///       for permanent use.
        ///
        /// Params: length = Number bytes (not code points) to get. May NOT reach past
        ///                  the end of the buffer; should be used with peek() to avoid
        ///                  this.
        ///
        /// Returns: Bytes starting at current position.
        char[] prefixBytes(const size_t length) @safe pure nothrow @nogc
        in(length == 0 || bufferOffset_ + length <= buffer_.length, "prefixBytes out of bounds")
        {
            return buffer_[bufferOffset_ .. bufferOffset_ + length];
        }

        /// Get a slice view of the internal buffer, starting at the current position.
        ///
        /// Note: This gets only a "view" into the internal buffer,
        ///       which get invalidated after other Reader calls.
        ///
        /// Params:  end = End of the slice relative to current position. May reach past
        ///                the end of the buffer; in that case the returned slice will
        ///                be shorter.
        ///
        /// Returns: Slice into the internal buffer or an empty slice if out of bounds.
        char[] slice(const size_t end) @safe pure
        {
            // Fast path in case the caller has already peek()ed all the way to end.
            if(end == lastDecodedCharOffset_)
            {
                return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
            }

            const asciiToTake = min(upcomingASCII_, end, buffer_.length);
            lastDecodedCharOffset_   = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;

            // 'Slow' path - decode everything up to end.
            while(lastDecodedCharOffset_ < end &&
                  lastDecodedBufferOffset_ < buffer_.length)
            {
                decodeNext();
            }

            return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
        }

        /// Get the next character, moving buffer position beyond it.
        ///
        /// Must not be called at the end of the buffer: peek() would yield '\0' there,
        /// but forward() asserts that there is a character left to skip.
        ///
        /// Returns: Next character.
        dchar get() @safe pure
        {
            const result = peek();
            forward();
            return result;
        }

        /// Get specified number of characters, moving buffer position beyond them.
        ///
        /// Params:  length = Number or characters (code points, not bytes) to get.
        ///
        /// Returns: Characters starting at current position.
        char[] get(const size_t length) @safe pure
        {
            auto result = slice(length);
            forward(length);
            return result;
        }

        /// Move current position forward.
        ///
        /// Updates the line/column counters on the way. A byte order mark (U+FEFF)
        /// does not advance the column.
        ///
        /// Params:  length = Number of characters to move position forward.
        void forward(size_t length) @safe pure
        {
            while(length > 0)
            {
                auto asciiToTake = min(upcomingASCII_, length);
                charIndex_     += asciiToTake;
                length         -= asciiToTake;
                upcomingASCII_ -= asciiToTake;

                for(; asciiToTake > 0; --asciiToTake)
                {
                    const c = buffer_[bufferOffset_++];
                    // c is ASCII, so we only need to check for ASCII line breaks.
                    if(isAsciiLineEnd(c))
                    {
                        ++line_;
                        column_ = 0;
                        continue;
                    }
                    ++column_;
                }

                // If we have used up all upcoming ASCII chars, the next char is
                // non-ASCII even after this returns, so upcomingASCII_ doesn't need to
                // be updated - it's zero.
                if(length == 0) { break; }

                assert(upcomingASCII_ == 0,
                       "Running unicode handling code but we haven't run out of ASCII chars");
                assert(bufferOffset_ < buffer_.length,
                       "Attempted to decode past the end of YAML buffer");
                assert(buffer_[bufferOffset_] >= 0x80,
                       "ASCII must be handled by preceding code");

                ++charIndex_;
                const c = decode(buffer_, bufferOffset_);

                // New line. c was decoded from a multi-byte sequence, so it can never
                // be '\r'; only the non-ASCII breaks matter here.
                if(c.isBreak)
                {
                    ++line_;
                    column_ = 0;
                }
                else if(c != '\uFEFF') { ++column_; }
                --length;
                checkASCII();
            }

            // The decode cache is relative to the current position; restart it.
            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;
        }

        /// Move current position forward by one character.
        ///
        /// Updates the line/column counters. A byte order mark (U+FEFF) does not
        /// advance the column.
        void forward() @safe pure
        {
            ++charIndex_;

            if(upcomingASCII_ > 0)
            {
                // ASCII
                --upcomingASCII_;
                const c = buffer_[bufferOffset_++];

                if(isAsciiLineEnd(c))
                {
                    ++line_;
                    column_ = 0;
                }
                else { ++column_; }
            }
            else
            {
                // UTF-8
                assert(bufferOffset_ < buffer_.length,
                       "Attempted to decode past the end of YAML buffer");
                assert(buffer_[bufferOffset_] >= 0x80,
                       "ASCII must be handled by preceding code");

                const c = decode(buffer_, bufferOffset_);

                // New line. c was decoded from a multi-byte sequence, so it can never
                // be '\r'; only the non-ASCII breaks matter here.
                if(c.isBreak)
                {
                    ++line_;
                    column_ = 0;
                }
                else if(c != '\uFEFF') { ++column_; }

                checkASCII();
            }

            // Restart the decode cache only AFTER bufferOffset_ has moved (same as
            // forward(size_t)); otherwise it would keep pointing at the character that
            // was just consumed and mislead a following peek(0)/slice(0).
            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;
        }

        /// Used to build slices of read data in Reader; to avoid allocations.
        SliceBuilder sliceBuilder;

        /// Get a string describing current buffer position, used for error messages.
        Mark mark() const pure nothrow @nogc @safe { return Mark(name_, line_, column_); }

        /// Get file name.
        string name() const @safe pure nothrow @nogc { return name_; }

        /// Set file name.
        void name(string name) pure @safe nothrow @nogc { name_ = name; }

        /// Get current line number.
        uint line() const @safe pure nothrow @nogc { return line_; }

        /// Get current column number.
        uint column() const @safe pure nothrow @nogc { return column_; }

        /// Get index of the current character in the buffer.
        size_t charIndex() const @safe pure nothrow @nogc { return charIndex_; }

        /// Get encoding of the input buffer.
        Encoding encoding() const @safe pure nothrow @nogc { return encoding_; }

    private:
        // Update upcomingASCII_ (should be called after forward()ing over a UTF-8
        // sequence).
        void checkASCII() @safe pure nothrow @nogc
        {
            upcomingASCII_ = countASCII(buffer_[bufferOffset_ .. $]);
        }

        // Does the ASCII character c, which has just been consumed (bufferOffset_
        // already points past it), end a line?
        //
        // '\n' always does. '\r' does unless it is immediately followed by '\n', in
        // which case the line is counted when that '\n' is consumed. A '\r' that is the
        // very last byte of the buffer has nothing after it and therefore ends a line;
        // the bounds check avoids indexing past the end of buffer_ in that case.
        bool isAsciiLineEnd(const char c) const @safe pure nothrow @nogc
        {
            if(c == '\n') { return true; }
            if(c != '\r') { return false; }
            return bufferOffset_ >= buffer_.length || buffer_[bufferOffset_] != '\n';
        }

        // Decode the next character relative to
        // lastDecodedCharOffset_/lastDecodedBufferOffset_ and update them.
        //
        // Does not advance the buffer position. Used in peek() and slice().
        dchar decodeNext() @safe pure
        {
            assert(lastDecodedBufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            const char b = buffer_[lastDecodedBufferOffset_];
            ++lastDecodedCharOffset_;
            // ASCII
            if(b < 0x80)
            {
                ++lastDecodedBufferOffset_;
                return b;
            }

            return decode(buffer_, lastDecodedBufferOffset_);
        }
}
450 
/// Builds slices out of data the Reader has already moved past, reusing the Reader
/// buffer instead of allocating.
///
/// Most of the time the resulting slice is simply a window onto untouched Reader
/// data; when YAML requires the text to be altered, the altered bytes are written
/// into the buffer itself.
///
/// See begin() documentation.
struct SliceBuilder
{
private:
    // Users must not copy a builder.
    @disable this(this);
    @disable void opAssign(ref SliceBuilder);

    // The Reader whose buffer the slice lives in.
    Reader reader_;

    // Index in reader_.buffer_ where the slice starts (size_t.max when no slice is
    // being built).
    size_t start_ = size_t.max;
    // Index in reader_.buffer_ where the slice ends (size_t.max when no slice is
    // being built).
    size_t end_   = size_t.max;

    // Saved slice ends that transactions can roll back to (see Transaction).
    //
    // Deliberately shallow: transactions are not meant to nest arbitrarily.
    size_t[4] endStack_;
    // How many entries of endStack_ are in use.
    size_t endStackUsed_;

    @safe const pure nothrow @nogc invariant()
    {
        if(inProgress)
        {
            assert(end_ <= reader_.bufferOffset_, "Slice ends after buffer position");
            assert(start_ <= end_, "Slice start after slice end");
        }
    }

    // True while a slice is being built, i.e. between begin() and finish().
    bool inProgress() @safe const pure nothrow @nogc
    in((start_ == size_t.max) == (end_ == size_t.max), "start_/end_ are not consistent")
    {
        return start_ != size_t.max;
    }

public:
    /// Start building a new slice.
    ///
    /// At most one slice can be under construction; finish() the previous one (if
    /// any) first.
    ///
    /// The new slice is empty and positioned at the Reader's current buffer position.
    /// It may grow only as far as the Reader position has advanced, and the position
    /// advances through Reader.get() and Reader.forward(). So write()-ing a string
    /// that get() just returned is fine, while write()-ing one returned by prefix()
    /// is only valid once the position has moved past it.
    void begin() @safe pure nothrow @nogc
    in(!inProgress, "Beginning a slice while another slice is being built")
    in(endStackUsed_ == 0, "Slice stack not empty at slice begin")
    {
        start_ = end_ = reader_.bufferOffset_;
    }

    /// Stop building and hand out the slice.
    ///
    /// All Transactions on the slice must have been committed or ended beforehand.
    ///
    /// Returns: The finished slice of the Reader buffer. The builder no longer
    ///          tracks it afterwards.
    char[] finish() @safe pure nothrow @nogc
    in(inProgress, "finish called without begin")
    in(endStackUsed_ == 0, "Finishing a slice with running transactions.")
    {
        char[] built = reader_.buffer_[start_ .. end_];
        start_ = size_t.max;
        end_   = size_t.max;
        return built;
    }

    /// Append a string to the slice under construction.
    ///
    /// The slice may only grow up to the current Reader buffer position.
    ///
    /// When str is itself a view of the Reader buffer that begins exactly where the
    /// slice ends, the slice is just extended over it; nothing is copied.
    ///
    /// See_Also: begin
    void write(scope char[] str) @safe pure nothrow @nogc
    {
        assert(inProgress, "write called without begin");
        assert(end_ <= reader_.bufferOffset_,
               "AT START: Slice ends after buffer position");

        // An empty string changes nothing.
        if(str.length == 0) { return; }

        // Is str the piece of the buffer directly following the slice?
        const bool alreadyInPlace = &str[0] == &reader_.buffer_[end_];
        if(!alreadyInPlace)
        {
            // str may still point somewhere else into the buffer (and overlap the
            // destination), so the bytes have to be moved rather than aliased.
            copy(str, reader_.buffer_[end_ .. end_ + str.length * char.sizeof]);
        }
        end_ += str.length;
    }

    /// Append a single character to the slice under construction.
    ///
    /// The slice may only grow up to the current Reader buffer position.
    ///
    /// See_Also: begin
    void write(dchar c) @safe pure
    in(inProgress, "write called without begin")
    {
        if(c >= 0x80)
        {
            // Non-ASCII: encode to UTF-8 first, then store the resulting bytes.
            char[4] utf8Units;
            const unitCount = encode(utf8Units, c);
            reader_.buffer_[end_ .. end_ + unitCount] = utf8Units[0 .. unitCount];
            end_ += unitCount;
            return;
        }

        // ASCIIAmong is a single byte.
        reader_.buffer_[end_++] = cast(char)c;
    }

    /// Insert a character at a given position inside the slice.
    ///
    /// The slice grows by the UTF-8 length of the character. Keep in mind that the
    /// slice may only extend up to the current Reader buffer position.
    ///
    /// Params:
    ///
    /// c        = The character to insert.
    /// position = Where to insert it, in code units (not code points) from the start
    ///            of the slice. Must not exceed the slice length(); a previously
    ///            returned length() can be used.
    void insert(const dchar c, const size_t position) @safe pure
    in(inProgress, "insert called without begin")
    in(start_ + position <= end_, "Trying to insert after the end of the slice")
    {
        const insertAt   = start_ + position;
        const tailLength = end_ - insertAt;

        // UTF-8 form of c.
        char[4] utf8Units;
        size_t unitCount;
        if(c < 0x80)
        {
            utf8Units[0] = cast(char)c;
            unitCount = 1;
        }
        else
        {
            unitCount = encode(utf8Units, c);
        }

        // Shift everything after the insertion point to make room.
        if(tailLength > 0)
        {
            copy(reader_.buffer_[insertAt .. insertAt + tailLength * char.sizeof],
                 reader_.buffer_[insertAt + unitCount ..
                                 insertAt + unitCount + tailLength * char.sizeof]);
        }
        reader_.buffer_[insertAt .. insertAt + unitCount] = utf8Units[0 .. unitCount];
        end_ += unitCount;
    }

    /// Current length of the slice, in code units.
    size_t length() @safe const pure nothrow @nogc
    {
        return end_ - start_;
    }

    /// A slice building transaction.
    ///
    /// Remembers the end of the slice so that later writes can be either kept
    /// (commit()) or discarded (end()).
    struct Transaction
    {
    private:
        // The builder this transaction operates on.
        SliceBuilder* builder_;
        // Position in SliceBuilder.endStack_ holding this transaction's saved end.
        size_t stackLevel_;
        // Set once commit() has applied the transaction.
        bool committed_;

    public:
        /// Start a transaction on a SliceBuilder, saving the current end of its slice.
        ///
        /// A transaction has to finish $(B after) every transaction started inside
        /// it and $(B before) the slice is finish()-ed. It is finished by calling
        /// either commit() or end().
        this(SliceBuilder* builder) @safe pure nothrow @nogc
        {
            builder_    = builder;
            stackLevel_ = builder_.endStackUsed_;
            builder_.push();
        }

        /// Keep everything written to the slice since the transaction started.
        ///
        /// Finishes the transaction; may be called only once, and the slice can no
        /// longer be reverted through this transaction afterwards.
        ///
        /// A default-initialized transaction (one that was never started) is left
        /// untouched.
        void commit() @safe pure nothrow @nogc
        in(!committed_, "Can't commit a transaction more than once")
        {
            if(builder_ !is null)
            {
                assert(builder_.endStackUsed_ == stackLevel_ + 1,
                       "Parent transactions don't fully contain child transactions");
                builder_.apply();
                committed_ = true;
            }
        }

        /// Finish the transaction by reverting the slice to the end saved when the
        /// transaction was started, then detach from the builder.
        void end() @safe pure nothrow @nogc
        in(builder_ && builder_.endStackUsed_ == stackLevel_ + 1, "Parent transactions don't fully contain child transactions")
        {
            builder_.pop();
            builder_ = null;
        }

    }

private:
    // Save the current slice end on endStack_ so it can be restored later.
    //
    // Used by Transaction.
    void push() @safe pure nothrow @nogc
    in(inProgress, "push called without begin")
    in(endStackUsed_ < endStack_.length, "Slice stack overflow")
    {
        endStack_[endStackUsed_] = end_;
        ++endStackUsed_;
    }

    // Remove the most recently saved end from endStack_ and make it the slice end
    // again, discarding whatever was written since it was saved.
    //
    // Used by Transaction.
    void pop() @safe pure nothrow @nogc
    in(inProgress, "pop called without begin")
    in(endStackUsed_ > 0, "Trying to pop an empty slice stack")
    {
        --endStackUsed_;
        end_ = endStack_[endStackUsed_];
    }

    // Remove the most recently saved end from endStack_ without touching the slice
    // end, which keeps whatever was written since it was saved.
    //
    // Used by Transaction.
    void apply() @safe pure nothrow @nogc
    in(inProgress, "apply called without begin")
    in(endStackUsed_ > 0, "Trying to apply an empty slice stack")
    {
        --endStackUsed_;
    }
}
710 
711 
712 private:
713 
// Convert a UTF-8/16/32 buffer to UTF-8, in-place if possible.
//
// UTF-8 input is only validated and counted; UTF-32 input is re-encoded inside the
// input buffer itself; UTF-16 input is converted into a newly allocated array.
//
// Params:
//
// input    = Buffer with UTF-8/16/32 data to decode. May be overwritten by the
//            conversion, in which case the result will be a slice of this buffer.
// encoding = Encoding of input.
//
// Returns:
//
// A struct with the following members:
//
// $(D string errorMessage)   In case of an error, the error message is stored here. If
//                            there was no error, errorMessage is null. Always check
//                            this first.
// $(D char[] utf8)           input converted to UTF-8. May be a slice of input.
// $(D size_t characterCount) Number of characters (code points) in input.
auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;
        char[] utf8;
        size_t characterCount;
    }

    Result result;

    // Encode input into UTF-8 if it's encoded as UTF-16 or UTF-32.
    //
    // Params:
    //
    // input  = The input buffer to encode.
    // result = A Result struct to put the encoded result into.
    //
    // Throws: Whatever the std.utf/std.conv routines throw on invalid input; the
    //         caller below turns that into result.errorMessage.
    static void encode(C)(C[] input, ref Result result) @safe pure
    {
        // We can do UTF-32->UTF-8 in place because all UTF-8 sequences are 4 or
        // less bytes.
        static if(is(C == dchar))
        {
            char[4] encodeBuf;
            auto utf8 = cast(char[])input;
            // Write cursor into utf8, in bytes. size_t (not int) so that it cannot
            // overflow on very large buffers.
            size_t length;
            foreach(dchar c; input)
            {
                ++result.characterCount;
                // ASCII
                if(c < 0x80)
                {
                    utf8[length++] = cast(char)c;
                    continue;
                }

                // encode() returns the number of code units it wrote.
                const bytes = std.utf.encode(encodeBuf, c);
                utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
                length += bytes;
            }
            result.utf8 = utf8[0 .. length];
        }
        // Unfortunately we can't do UTF-16 in place so we just use std.conv.to
        else
        {
            result.characterCount = std.utf.count(input);
            result.utf8 = input.to!(char[]);
        }
    }

    try final switch(encoding)
    {
        case UTFEncoding.UTF_8:
            result.utf8 = cast(char[])input;
            result.utf8.validate();
            result.characterCount = std.utf.count(result.utf8);
            break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            encode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0, "UTF-32 buffer size must be a multiple of 4");
            encode(cast(dchar[])input, result);
            break;
    }
    // Invalid input is reported through errorMessage rather than by throwing.
    catch(ConvException e) { result.errorMessage = e.msg; }
    catch(UTFException e)  { result.errorMessage = e.msg; }
    catch(Exception e)
    {
        assert(false, "Unexpected exception in encode(): " ~ e.msg);
    }

    return result;
}
810 
/// Check that a UTF-8 string consists solely of acceptable code points.
///
/// A code point is rejected when it is not a valid `dchar`, or when it is a control
/// character that is not also whitespace.
///
/// Params:  chars = UTF-8 text to examine; it is decoded code point by code point.
///
/// Returns: true if no code point was rejected (also for an empty string).
bool isPrintableValidUTF8(const char[] chars) @safe pure
{
    import std.uni : isControl, isWhite;
    foreach (dchar codePoint; chars)
    {
        const bool acceptable = codePoint.isValidDchar
                             && (!codePoint.isControl || codePoint.isWhite);
        if (!acceptable) { return false; }
    }
    return true;
}
824 
/// Measure the run of ASCII characters at the beginning of a buffer.
///
/// Scanning stops at the first code unit above 0x7F, i.e. at the first byte that
/// belongs to a multi-byte UTF-8 sequence.
///
/// Used to determine how many characters we can process without decoding.
///
/// Returns: Number of leading code units that are ASCII (0 for an empty buffer).
size_t countASCII(const(char)[] buffer) @safe pure nothrow @nogc
{
    size_t asciiRun;
    foreach (const char unit; buffer)
    {
        if (unit > 0x7F) { break; }
        ++asciiRun;
    }
    return asciiRun;
}
832 // Unittests.
833 
// Checks that a reader of type R reports the encoding and byte order indicated by
// the byte order mark of UTF-16 input.
void testEndian(R)()
{
    // Builds a reader over `input` and compares what it detected with expectations.
    void expectDetected(ubyte[] input, Encoding wantedEncoding, Endian wantedEndian)
    {
        auto reader = new R(input);
        assert(reader.encoding == wantedEncoding);
        assert(reader.endian_ == wantedEndian);
    }

    // BOM followed by the letter 'z', in both byte orders.
    ubyte[] utf16LE = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] utf16BE = [0xFE, 0xFF, 0x00, 0x7A];

    expectDetected(utf16LE, Encoding.UTF_16, Endian.littleEndian);
    expectDetected(utf16BE, Encoding.UTF_16, Endian.bigEndian);
}
847 
// Exercises peek(), prefix() and forward() of a reader of type R on the UTF-8 text
// "data" preceded by a byte order mark.
void testPeekPrefixForward(R)()
{
    import std.encoding;
    ubyte[] input = bomTable[BOM.utf8].sequence ~ cast(ubyte[])"data";
    auto reader = new R(input);

    // The BOM is not visible; the first character is 'd'.
    assert(reader.peek() == 'd');
    // Characters after the first one, peeked in order.
    foreach(offset, expected; "ata")
    {
        assert(reader.peek(offset + 1) == expected);
    }
    // One past the last character reads as '\0'.
    assert(reader.peek(4) == '\0');
    assert(reader.prefix(4) == "data");
    // assert(reader.prefix(6) == "data\0");

    // After skipping "da" the character at index 1 is the final 'a'.
    reader.forward(2);
    assert(reader.peek(1) == 'a');
    // assert(collectException(reader.peek(3)));
}
864 
// Feeds the text "data" to a reader of type R as UTF-8, UTF-16 and UTF-32 (native
// byte order, each with the matching byte order mark) and checks what peek() sees.
void testUTF(R)()
{
    import std.encoding;
    dchar[] data = cast(dchar[])"data";

    // Prepends the BOM to the raw bytes of `text` and peeks the four characters.
    void checkDecoded(T)(T[] text, BOM bom)
    {
        ubyte[] payload = (cast(ubyte[])text)[0 .. text.length * T.sizeof];
        ubyte[] bytes = bomTable[bom].sequence ~ payload;
        auto reader = new R(bytes);
        assert(reader.peek() == 'd');
        assert(reader.peek(1) == 'a');
        assert(reader.peek(2) == 't');
        assert(reader.peek(3) == 'a');
    }

    const bool bigEndianHost = endian == Endian.bigEndian;
    checkDecoded!char(to!(char[])(data), BOM.utf8);
    checkDecoded!wchar(to!(wchar[])(data), bigEndianHost ? BOM.utf16be : BOM.utf16le);
    checkDecoded(data, bigEndianHost ? BOM.utf32be : BOM.utf32le);
}
883 
// Checks a reader of type R on the smallest non-empty input: the single byte 'a'.
void test1Byte(R)()
{
    ubyte[] singleByte = [97]; // 'a'
    auto reader = new R(singleByte);

    assert(reader.peek() == 'a');
    // Past the only character the reader reports '\0'.
    assert(reader.peek(1) == '\0');
    // assert(collectException(reader.peek(2)));
}
893 
// Runs every generic reader test above against Reader.
@system unittest
{
    alias Tested = Reader;

    testEndian!Tested();
    testPeekPrefixForward!Tested();
    testUTF!Tested();
    test1Byte!Tested();
}
// Issue 257 - https://github.com/dlang-community/D-YAML/issues/257
//
// A document consisting of a word followed by a trailing space must load into a
// valid node.
@safe unittest
{
    import dyaml.loader : Loader;

    string document = "hello ";
    auto loaded = Loader.fromString(document).load();

    assert(loaded.isValid);
}