// Copyright Ferdinand Majerech 2011-2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
//  http://www.boost.org/LICENSE_1_0.txt)

module dyaml.reader;


import core.stdc.stdlib;
import core.stdc.string;
import core.thread;

import std.algorithm;
import std.array;
import std.conv;
import std.exception;
import std.stdio;
import std.string;
import std.system;
import std.typecons;
import std.utf;

import tinyendian;

import dyaml.fastcharsearch;
import dyaml.encoding;
import dyaml.exception;
import dyaml.nogcutil;


package:


/// Exception thrown at Reader errors.
class ReaderException : YAMLException
{
    this(string msg, string file = __FILE__, int line = __LINE__)
        @safe pure nothrow
    {
        super("Reader error: " ~ msg, file, line);
    }
}

/// Provides an API to read characters from a UTF-8 buffer and build slices into that
/// buffer to avoid allocations (see SliceBuilder).
final class Reader
{
private:
    // Buffer of currently loaded characters.
    char[] buffer_ = null;

    // Current position within buffer. Only data after this position can be read.
    size_t bufferOffset_ = 0;

    // Index of the current character in the buffer.
    size_t charIndex_ = 0;
    // Number of characters (code points) in buffer_.
    size_t characterCount_ = 0;

    // Current line in file.
    uint line_;
    // Current column in file.
    uint column_;

    // Original Unicode encoding of the data.
    Encoding encoding_;

    version(unittest)
    {
        // Endianness of the input before it was converted (for testing).
        Endian endian_;
    }

    // The number of consecutive ASCII characters starting at bufferOffset_.
    //
    // Used to minimize UTF-8 decoding.
    size_t upcomingASCII_ = 0;

    // Index to buffer_ where the last decoded character starts.
    size_t lastDecodedBufferOffset_ = 0;
    // Offset, relative to charIndex_, of the last decoded character,
    // in code points, not chars.
    size_t lastDecodedCharOffset_ = 0;

public:
    /// Construct a Reader.
    ///
    /// Params:  buffer = Buffer with YAML data. This may be e.g. the entire contents
    ///                   of a file or a string. It $(B will) be modified by the Reader
    ///                   and other parts of D:YAML (D:YAML tries to reuse the buffer
    ///                   to minimize memory allocations).
    ///
    /// Throws:  ReaderException on a UTF decoding error or if there are
    ///          nonprintable Unicode characters illegal in YAML.
    this(ubyte[] buffer) @trusted pure //!nothrow
    {
        auto endianResult = fixUTFByteOrder(buffer);
        if(endianResult.bytesStripped > 0)
        {
            throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned " ~
                                      "to 2 or 4 bytes, respectively");
        }

        version(unittest) { endian_ = endianResult.endian; }
        encoding_ = endianResult.encoding;

        auto utf8Result = toUTF8(endianResult.array, endianResult.encoding);
        const msg = utf8Result.errorMessage;
        if(msg !is null)
        {
            throw new ReaderException("Error when converting to UTF-8: " ~ msg);
        }

        buffer_ = utf8Result.utf8;

        characterCount_ = utf8Result.characterCount;
        // Check that all characters in buffer are printable.
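        // ("Printable" here means TAB, LF, CR, bytes 0x20..0x7F, NEL (U+0085) and the
        // U+00A0..U+D7FF / U+E000..U+FFFD ranges; see isPrintableValidUTF8() below.)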
        enforce(isPrintableValidUTF8(buffer_),
                new ReaderException("Special unicode characters are not allowed"));

        this.sliceBuilder = SliceBuilder(this);
        checkASCII();
    }

pure nothrow @nogc:
    /// Get character at specified index relative to current position.
    ///
    /// Params:  index = Index of the character to get relative to current position
    ///                  in the buffer. Can point outside of the buffer; In that
    ///                  case, '\0' will be returned.
    ///
    /// Returns: Character at specified position or '\0' if outside of the buffer.
    ///
    // XXX removed; search for 'risky' to find why.
    // Throws:  ReaderException if trying to read past the end of the buffer.
    dchar peek(const size_t index) @safe
    {
        if(index < upcomingASCII_) { return buffer_[bufferOffset_ + index]; }
        if(characterCount_ <= charIndex_ + index)
        {
            // XXX This is risky; revert this if bugs are introduced. We rely on
            // the assumption that Reader only uses peek() to detect end of buffer.
            // The test suite passes.
            // Revert this case here and in other peek() versions if this causes
            // errors.
            // throw new ReaderException("Trying to read past the end of the buffer");
            return '\0';
        }

        // Optimized path for Scanner code that peeks chars in linear order to
        // determine the length of some sequence.
        if(index == lastDecodedCharOffset_)
        {
            ++lastDecodedCharOffset_;
            const char b = buffer_[lastDecodedBufferOffset_];
            // ASCII
            if(b < 0x80)
            {
                ++lastDecodedBufferOffset_;
                return b;
            }
            return decodeValidUTF8NoGC(buffer_, lastDecodedBufferOffset_);
        }

        // 'Slow' path where we decode everything up to the requested character.
        const asciiToTake = min(upcomingASCII_, index);
        lastDecodedCharOffset_   = asciiToTake;
        lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;
        dchar d;
        while(lastDecodedCharOffset_ <= index)
        {
            d = decodeNext();
        }

        return d;
    }

    /// Optimized version of peek() for the case where peek index is 0.
    dchar peek() @safe
    {
        if(upcomingASCII_ > 0)            { return buffer_[bufferOffset_]; }
        if(characterCount_ <= charIndex_) { return '\0'; }

        lastDecodedCharOffset_   = 0;
        lastDecodedBufferOffset_ = bufferOffset_;
        return decodeNext();
    }

    /// Get byte at specified index relative to current position.
    ///
    /// Params:  index = Index of the byte to get relative to current position
    ///                  in the buffer. Can point outside of the buffer; In that
    ///                  case, '\0' will be returned.
    ///
    /// Returns: Byte at specified position or '\0' if outside of the buffer.
    char peekByte(const size_t index) @safe
    {
        return characterCount_ > (charIndex_ + index) ? buffer_[bufferOffset_ + index] : '\0';
    }

    /// Optimized version of peekByte() for the case where peek byte index is 0.
    char peekByte() @safe
    {
        return characterCount_ > charIndex_ ? buffer_[bufferOffset_] : '\0';
    }


    /// Get specified number of characters starting at current position.
    ///
    /// Note: This gets only a "view" into the internal buffer, which will be
    ///       invalidated after other Reader calls. Use SliceBuilder to build slices
    ///       for permanent use.
    ///
    /// Params:  length = Number of characters (code points, not bytes) to get. May
    ///                   reach past the end of the buffer; in that case the returned
    ///                   slice will be shorter.
    ///
    /// Returns: Characters starting at current position or an empty slice if out of bounds.
    char[] prefix(const size_t length) @safe
    {
        return slice(length);
    }

    /// Get specified number of bytes, not code points, starting at current position.
    ///
    /// Note: This gets only a "view" into the internal buffer, which will be
    ///       invalidated after other Reader calls. Use SliceBuilder to build slices
    ///       for permanent use.
    ///
    /// Params:  length = Number of bytes (not code points) to get. May NOT reach past
    ///                   the end of the buffer; should be used with peek() to avoid
    ///                   this.
    ///
    /// Returns: Bytes starting at current position.
    char[] prefixBytes(const size_t length) @safe
    {
        assert(length == 0 || bufferOffset_ + length < buffer_.length,
               "prefixBytes out of bounds");
        return buffer_[bufferOffset_ .. bufferOffset_ + length];
    }

    /// Get a slice view of the internal buffer, starting at the current position.
    ///
    /// Note: This gets only a "view" into the internal buffer,
    ///       which gets invalidated after other Reader calls.
    ///
    /// Params:  end = End of the slice relative to current position. May reach past
    ///                the end of the buffer; in that case the returned slice will
    ///                be shorter.
    ///
    /// Returns: Slice into the internal buffer or an empty slice if out of bounds.
    char[] slice(const size_t end) @safe
    {
        // Fast path in case the caller has already peek()ed all the way to end.
        if(end == lastDecodedCharOffset_)
        {
            return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
        }

        const asciiToTake = min(upcomingASCII_, end, buffer_.length);
        lastDecodedCharOffset_   = asciiToTake;
        lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;

        // 'Slow' path - decode everything up to end.
        while(lastDecodedCharOffset_ < end &&
              lastDecodedBufferOffset_ < buffer_.length)
        {
            decodeNext();
        }

        return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
    }

    /// Get the next character, moving buffer position beyond it.
    ///
    /// Returns: Next character.
    ///
    /// Throws:  ReaderException if trying to read past the end of the buffer
    ///          or if invalid data is read.
    dchar get() @safe
    {
        const result = peek();
        forward();
        return result;
    }

    /// Get specified number of characters, moving buffer position beyond them.
    ///
    /// Params:  length = Number of characters (code points, not bytes) to get.
    ///
    /// Returns: Characters starting at current position.
    char[] get(const size_t length) @safe
    {
        auto result = slice(length);
        forward(length);
        return result;
    }

    /// Move current position forward.
    ///
    /// Params:  length = Number of characters to move position forward.
    void forward(size_t length) @safe
    {
        mixin FastCharSearch!"\n\u0085\u2028\u2029"d search;

        while(length > 0)
        {
            auto asciiToTake = min(upcomingASCII_, length);
            charIndex_     += asciiToTake;
            length         -= asciiToTake;
            upcomingASCII_ -= asciiToTake;

            for(; asciiToTake > 0; --asciiToTake)
            {
                const c = buffer_[bufferOffset_++];
                // c is ASCII, so we only need to check for ASCII line breaks.
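                // A '\r' counts as a line break only when not followed by '\n'; in a
                // "\r\n" pair the line is incremented on the following '\n' iteration,
                // so CRLF is treated as a single line break.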
                if(c == '\n' || (c == '\r' && buffer_[bufferOffset_] != '\n'))
                {
                    ++line_;
                    column_ = 0;
                    continue;
                }
                ++column_;
            }

            // If we have used up all upcoming ASCII chars, the next char is
            // non-ASCII even after this returns, so upcomingASCII_ doesn't need to
            // be updated - it's zero.
            if(length == 0) { break; }

            assert(upcomingASCII_ == 0,
                   "Running unicode handling code but we haven't run out of ASCII chars");
            assert(bufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            assert(buffer_[bufferOffset_] >= 0x80,
                   "ASCII must be handled by preceding code");

            ++charIndex_;
            const c = decodeValidUTF8NoGC(buffer_, bufferOffset_);

            // New line. (can compare with '\n' without decoding since it's ASCII)
            if(search.canFind(c) || (c == '\r' && buffer_[bufferOffset_] != '\n'))
            {
                ++line_;
                column_ = 0;
            }
            else if(c != '\uFEFF') { ++column_; }
            --length;
            checkASCII();
        }

        lastDecodedBufferOffset_ = bufferOffset_;
        lastDecodedCharOffset_   = 0;
    }

    /// Move current position forward by one character.
    void forward() @trusted
    {
        ++charIndex_;
        lastDecodedBufferOffset_ = bufferOffset_;
        lastDecodedCharOffset_   = 0;

        // ASCII
        if(upcomingASCII_ > 0)
        {
            --upcomingASCII_;
            const c = buffer_[bufferOffset_++];

            if(c == '\n' || (c == '\r' && buffer_[bufferOffset_] != '\n'))
            {
                ++line_;
                column_ = 0;
                return;
            }
            ++column_;
            return;
        }

        // UTF-8
        mixin FastCharSearch!"\n\u0085\u2028\u2029"d search;
        assert(bufferOffset_ < buffer_.length,
               "Attempted to decode past the end of YAML buffer");
        assert(buffer_[bufferOffset_] >= 0x80,
               "ASCII must be handled by preceding code");

        const c = decodeValidUTF8NoGC(buffer_, bufferOffset_);

        // New line. (can compare with '\n' without decoding since it's ASCII)
        if(search.canFind(c) || (c == '\r' && buffer_[bufferOffset_] != '\n'))
        {
            ++line_;
            column_ = 0;
        }
        else if(c != '\uFEFF') { ++column_; }

        checkASCII();
    }

    /// Used to build slices of read data in Reader, to avoid allocations.
    SliceBuilder sliceBuilder;

@safe pure nothrow @nogc:
    /// Get a string describing current buffer position, used for error messages.
    Mark mark() const { return Mark(line_, column_); }

    /// Get current line number.
    uint line() const { return line_; }

    /// Get current column number.
    uint column() const { return column_; }

    /// Get index of the current character in the buffer.
    size_t charIndex() const { return charIndex_; }

    /// Get encoding of the input buffer.
    Encoding encoding() const { return encoding_; }

private:
    // Update upcomingASCII_ (should be called after forward()ing over a UTF-8 sequence).
    void checkASCII()
    {
        upcomingASCII_ = countASCII(buffer_[bufferOffset_ .. $]);
    }

    // Decode the next character relative to
    // lastDecodedCharOffset_/lastDecodedBufferOffset_ and update them.
    //
    // Does not advance the buffer position. Used in peek() and slice().
    dchar decodeNext()
    {
        assert(lastDecodedBufferOffset_ < buffer_.length,
               "Attempted to decode past the end of YAML buffer");
        const char b = buffer_[lastDecodedBufferOffset_];
        ++lastDecodedCharOffset_;
        // ASCII
        if(b < 0x80)
        {
            ++lastDecodedBufferOffset_;
            return b;
        }

        return decodeValidUTF8NoGC(buffer_, lastDecodedBufferOffset_);
    }
}

/// Used to build slices of already read data in Reader buffer, avoiding allocations.
///
/// Usually these slices point to unchanged Reader data, but sometimes the data is
/// changed due to how YAML interprets certain characters/strings.
///
/// See begin() documentation.
struct SliceBuilder
{
pure nothrow @nogc:
private:
    // No copying by the user.
    @disable this(this);
    @disable void opAssign(ref SliceBuilder);

    // Reader this builder works in.
    Reader reader_;

    // Start of the slice in reader_.buffer_ (size_t.max while no slice is being built).
    size_t start_ = size_t.max;
    // End of the slice in reader_.buffer_ (size_t.max while no slice is being built).
    size_t end_ = size_t.max;

    // Stack of slice ends to revert to (see Transaction).
    //
    // Very few levels as we don't want arbitrarily nested transactions.
    size_t[4] endStack_;
    // The number of elements currently in endStack_.
    size_t endStackUsed_ = 0;

    @safe const invariant()
    {
        if(!inProgress) { return; }
        assert(end_ <= reader_.bufferOffset_, "Slice ends after buffer position");
        assert(start_ <= end_, "Slice start after slice end");
    }

    // Is a slice currently being built?
    bool inProgress() @safe const
    {
        assert(start_ == size_t.max ? end_ == size_t.max : end_ != size_t.max,
               "start_/end_ are not consistent");
        return start_ != size_t.max;
    }

public:
    /// Begin building a slice.
    ///
    /// Only one slice can be built at any given time; before beginning a new slice,
    /// finish the previous one (if any).
    ///
    /// The slice starts at the current position in the Reader buffer. It can only be
    /// extended up to the current position in the buffer; Reader methods get() and
    /// forward() move the position. E.g. it is valid to extend a slice by write()-ing
    /// a string just returned by get() - but not one returned by prefix() unless the
    /// position has changed since the prefix() call.
    void begin() @system
    {
        assert(!inProgress, "Beginning a slice while another slice is being built");
        assert(endStackUsed_ == 0, "Slice stack not empty at slice begin");

        start_ = reader_.bufferOffset_;
        end_   = reader_.bufferOffset_;
    }

    /// Finish building a slice and return it.
    ///
    /// Any Transactions on the slice must be committed or destroyed before the slice
    /// is finished.
    ///
    /// Returns a string; once a slice is finished it is definitive that its contents
    /// will not be changed.
    char[] finish() @system
    {
        assert(inProgress, "finish called without begin");
        assert(endStackUsed_ == 0, "Finishing a slice with running transactions.");

        auto result = reader_.buffer_[start_ .. end_];
        start_ = end_ = size_t.max;
        return result;
    }

    /// Write a string to the slice being built.
    ///
    /// Data can only be written up to the current position in the Reader buffer.
    ///
    /// If str is a string returned by a Reader method, and str starts right after the
    /// end of the slice being built, the slice is extended (trivial operation).
    ///
    /// See_Also: begin
    void write(char[] str) @system
    {
        assert(inProgress, "write called without begin");
        assert(end_ <= reader_.bufferOffset_,
               "AT START: Slice ends after buffer position");

        // If str starts at the end of the slice (is a string returned by a Reader
        // method), just extend the slice to contain str.
        if(str.ptr == reader_.buffer_.ptr + end_)
        {
            end_ += str.length;
        }
        // Even if str does not start at the end of the slice, it still may be returned
        // by a Reader method and point to buffer. So we need to memmove.
        else
        {
            core.stdc.string.memmove(reader_.buffer_.ptr + end_, cast(char*)str.ptr,
                                     str.length * char.sizeof);
            end_ += str.length;
        }
    }

    /// Write a character to the slice being built.
    ///
    /// Data can only be written up to the current position in the Reader buffer.
    ///
    /// See_Also: begin
    void write(dchar c) @system
    {
        assert(inProgress, "write called without begin");
        if(c < 0x80)
        {
            reader_.buffer_[end_++] = cast(char)c;
            return;
        }

        // We need to encode a non-ASCII dchar into UTF-8.
        char[4] encodeBuf;
        const bytes = encodeValidCharNoGC(encodeBuf, c);
        reader_.buffer_[end_ .. end_ + bytes] = encodeBuf[0 .. bytes];
        end_ += bytes;
    }

    /// Insert a character to a specified position in the slice.
    ///
    /// Enlarges the slice by 1 char. Note that the slice can only extend up to the
    /// current position in the Reader buffer.
    ///
    /// Params:
    ///
    /// c        = The character to insert.
    /// position = Position to insert the character at in code units, not code points.
    ///            Must be less than slice length(); a previously returned length()
    ///            can be used.
    void insert(const dchar c, const size_t position) @system
    {
        assert(inProgress, "insert called without begin");
        assert(start_ + position <= end_, "Trying to insert after the end of the slice");

        const point       = start_ + position;
        const movedLength = end_ - point;

        // Encode c into UTF-8.
        char[4] encodeBuf;
        if(c < 0x80) { encodeBuf[0] = cast(char)c; }
        const size_t bytes = c < 0x80 ? 1 : encodeValidCharNoGC(encodeBuf, c);

        if(movedLength > 0)
        {
            core.stdc.string.memmove(reader_.buffer_.ptr + point + bytes,
                                     reader_.buffer_.ptr + point,
                                     movedLength * char.sizeof);
        }
        reader_.buffer_[point .. point + bytes] = encodeBuf[0 .. bytes];
        end_ += bytes;
    }

    /// Get the current length of the slice.
    size_t length() @safe const
    {
        return end_ - start_;
    }

    /// A slice building transaction.
    ///
    /// Can be used to save and revert back to slice state.
    struct Transaction
    {
    @system pure nothrow @nogc:
    private:
        // The slice builder affected by the transaction.
        SliceBuilder* builder_ = null;
        // Index of the return point of the transaction in SliceBuilder.endStack_.
        size_t stackLevel_;
        // True after commit() has been called.
        bool committed_;

    public:
        /// Begins a transaction on a SliceBuilder object.
        ///
        /// The transaction must end $(B after) any transactions created within the
        /// transaction but $(B before) the slice is finish()-ed. A transaction can be
        /// ended either by commit()-ing or reverting through the destructor.
        ///
        /// Saves the current state of a slice.
        this(ref SliceBuilder builder)
        {
            builder_ = &builder;
            stackLevel_ = builder_.endStackUsed_;
            builder_.push();
        }

        /// Commit changes to the slice.
        ///
        /// Ends the transaction - can only be called once, and removes the possibility
        /// to revert slice state.
        ///
        /// Does nothing for a default-initialized transaction (the transaction has not
        /// been started yet).
        void commit()
        {
            assert(!committed_, "Can't commit a transaction more than once");

            if(builder_ is null) { return; }
            assert(builder_.endStackUsed_ == stackLevel_ + 1,
                   "Parent transactions don't fully contain child transactions");
            builder_.apply();
            committed_ = true;
        }

        /// Destroy the transaction and revert it if it hasn't been committed yet.
        ///
        /// Does nothing for a default-initialized transaction.
        ~this()
        {
            if(builder_ is null || committed_) { return; }
            assert(builder_.endStackUsed_ == stackLevel_ + 1,
                   "Parent transactions don't fully contain child transactions");
            builder_.pop();
            builder_ = null;
        }
    }

private:
    // Push the current end of the slice so we can revert to it if needed.
    //
    // Used by Transaction.
    void push() @system
    {
        assert(inProgress, "push called without begin");
        assert(endStackUsed_ < endStack_.length, "Slice stack overflow");
        endStack_[endStackUsed_++] = end_;
    }

    // Pop the current end of endStack_ and set the end of the slice to the popped
    // value, reverting changes since the old end was pushed.
    //
    // Used by Transaction.
    void pop() @system
    {
        assert(inProgress, "pop called without begin");
        assert(endStackUsed_ > 0, "Trying to pop an empty slice stack");
        end_ = endStack_[--endStackUsed_];
    }

    // Pop the current end of endStack_, but keep the current end of the slice, applying
    // changes made since pushing the old end.
    //
    // Used by Transaction.
    void apply() @system
    {
        assert(inProgress, "apply called without begin");
        assert(endStackUsed_ > 0, "Trying to apply an empty slice stack");
        --endStackUsed_;
    }
}


private:

// Convert a UTF-8/16/32 buffer to UTF-8, in-place if possible.
//
// Params:
//
// input    = Buffer with UTF-8/16/32 data to decode. May be overwritten by the
//            conversion, in which case the result will be a slice of this buffer.
// encoding = Encoding of input.
//
// Returns:
//
// A struct with the following members:
//
// $(D string errorMessage)   In case of an error, the error message is stored here. If
//                            there was no error, errorMessage is null. Always check
//                            this first.
// $(D char[] utf8)           input converted to UTF-8. May be a slice of input.
// $(D size_t characterCount) Number of characters (code points) in input.
auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;
        char[] utf8;
        size_t characterCount;
    }

    Result result;

    // Encode input into UTF-8 if it's encoded as UTF-16 or UTF-32.
    //
    // Params:
    //
    // input  = The input buffer to encode.
    // result = A Result struct to put the encoded result and any error messages to.
    //
    // On error, result.errorMessage will be set.
    static void encode(C)(C[] input, ref Result result) @safe pure
    {
        // We can do UTF-32->UTF-8 in place because all UTF-8 sequences are at most
        // 4 bytes.
        static if(is(C == dchar))
        {
            char[4] encodeBuf;
            auto utf8 = cast(char[])input;
            auto length = 0;
            foreach(dchar c; input)
            {
                ++result.characterCount;
                // ASCII
                if(c < 0x80)
                {
                    utf8[length++] = cast(char)c;
                    continue;
                }

                const encodeResult = encodeCharNoGC!(No.validated)(encodeBuf, c);
                if(encodeResult.errorMessage !is null)
                {
                    result.errorMessage = encodeResult.errorMessage;
                    return;
                }
                const bytes = encodeResult.bytes;
                utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
                length += bytes;
            }
            result.utf8 = utf8[0 .. length];
        }
        // Unfortunately we can't do UTF-16 in place so we just use std.conv.to.
        else
        {
            result.characterCount = std.utf.count(input);
            result.utf8 = input.to!(char[]);
        }
    }

    try final switch(encoding)
    {
        case UTFEncoding.UTF_8:
            result.utf8 = cast(char[])input;
            const validateResult = result.utf8.validateUTF8NoGC();
            if(!validateResult.valid)
            {
                result.errorMessage = "UTF-8 validation error after character #" ~
                                      validateResult.characterCount.to!string ~ ": " ~
                                      validateResult.msg;
            }
            result.characterCount = validateResult.characterCount;
            break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            encode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0, "UTF-32 buffer size must be a multiple of 4");
            encode(cast(dchar[])input, result);
            break;
    }
    catch(ConvException e) { result.errorMessage = e.msg; }
    catch(UTFException e)  { result.errorMessage = e.msg; }
    catch(Exception e)
    {
        assert(false, "Unexpected exception in encode(): " ~ e.msg);
    }

    return result;
}

/// Determine if all characters (code points, not bytes) in a string are printable.
bool isPrintableValidUTF8(const char[] chars) @trusted pure nothrow @nogc
{
    // This is oversized (only 128 entries are necessary) simply because having 256
    // entries improves performance... for some reason (alignment?)
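    // The table marks TAB (0x09), LF (0x0A), CR (0x0D) and bytes 0x20..0x7F as
    // printable. Indices 0x80..0xFF are never looked up on the ASCII fast paths
    // below; non-ASCII input is decoded and range-checked separately.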
    bool[256] printable = [false, false, false, false, false, false, false, false,
                           false, true,  true,  false, false, true,  false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,

                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,

                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,

                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,

                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false];

    for(size_t index = 0; index < chars.length;)
    {
        // Fast path for ASCII.
        // Both this while() block and the if() block below it are optimized, unrolled
        // versions of the for() block below them; the while()/if() block could be
        // removed without affecting logic, but both help increase performance.
        size_t asciiCount = countASCII(chars[index .. $]);
        // 8 ASCII iterations unrolled, looping while there are more than 8 ASCII chars.
        while(asciiCount > 8)
        {
            const dchar b0 = chars[index];
            const dchar b1 = chars[index + 1];
            const dchar b2 = chars[index + 2];
            const dchar b3 = chars[index + 3];
            const dchar b4 = chars[index + 4];
            const dchar b5 = chars[index + 5];
            const dchar b6 = chars[index + 6];
            const dchar b7 = chars[index + 7];

            index += 8;
            asciiCount -= 8;

            const all = printable[b0] & printable[b1] & printable[b2] & printable[b3] &
                        printable[b4] & printable[b5] & printable[b6] & printable[b7];
            if(!all)
            {
                return false;
            }
        }
        // 4 ASCII iterations unrolled.
        if(asciiCount > 4)
        {
            const char b0 = chars[index];
            const char b1 = chars[index + 1];
            const char b2 = chars[index + 2];
            const char b3 = chars[index + 3];

            index += 4;
            asciiCount -= 4;

            if(!printable[b0]) { return false; }
            if(!printable[b1]) { return false; }
            if(!printable[b2]) { return false; }
            if(!printable[b3]) { return false; }
        }
        // Any remaining ASCII chars. This is really the only code needed to handle
        // ASCII, the above if() and while() blocks are just an optimization.
        for(; asciiCount > 0; --asciiCount)
        {
            const char b = chars[index];
            ++index;
            if(b >= 0x20)    { continue; }
            if(printable[b]) { continue; }
            return false;
        }

        if(index == chars.length) { break; }

        // Not ASCII, need to decode.
        const dchar c = decodeValidUTF8NoGC(chars, index);
        // We know c is not ASCII, so only check for printable non-ASCII chars.
        if(!(c == 0x85 || (c >= 0xA0 && c <= '\uD7FF') ||
             (c >= '\uE000' && c <= '\uFFFD')))
        {
            return false;
        }
    }
    return true;
}

/// Counts the number of ASCII characters in buffer until the first non-ASCII
/// (multibyte UTF-8) sequence.
///
/// Used to determine how many characters we can process without decoding.
size_t countASCII(const(char)[] buffer) @trusted pure nothrow @nogc
{
    size_t count = 0;

    // The topmost bit in ASCII characters is always 0.
    enum ulong  Mask8 = 0x7f7f7f7f7f7f7f7f;
    enum uint   Mask4 = 0x7f7f7f7f;
    enum ushort Mask2 = 0x7f7f;

    // Start by checking in 8-byte chunks.
    while(buffer.length >= Mask8.sizeof)
    {
        const block  = *cast(typeof(Mask8)*)buffer.ptr;
        const masked = Mask8 & block;
        if(masked != block) { break; }
        count += Mask8.sizeof;
        buffer = buffer[Mask8.sizeof .. $];
    }

    // If 8 bytes didn't match, try 4, 2 bytes.
    import std.typetuple;
    foreach(Mask; TypeTuple!(Mask4, Mask2))
    {
        if(buffer.length < Mask.sizeof) { continue; }
        const block  = *cast(typeof(Mask)*)buffer.ptr;
        const masked = Mask & block;
        if(masked != block) { continue; }
        count += Mask.sizeof;
        buffer = buffer[Mask.sizeof .. $];
    }

    // If even a 2-byte chunk didn't match, test just one byte.
    if(buffer.empty || buffer[0] >= 0x80) { return count; }
    ++count;

    return count;
}

// Unittests.

void testEndian(R)()
{
    writeln(typeid(R).toString() ~ ": endian unittest");
    void endian_test(ubyte[] data, Encoding encoding_expected, Endian endian_expected)
    {
        auto reader = new R(data);
        assert(reader.encoding == encoding_expected);
        assert(reader.endian_ == endian_expected);
    }
    ubyte[] little_endian_utf_16 = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] big_endian_utf_16    = [0xFE, 0xFF, 0x00, 0x7A];
    endian_test(little_endian_utf_16, Encoding.UTF_16, Endian.littleEndian);
    endian_test(big_endian_utf_16, Encoding.UTF_16, Endian.bigEndian);
}

void testPeekPrefixForward(R)()
{
    import dyaml.stream;
    writeln(typeid(R).toString() ~ ": peek/prefix/forward unittest");
    ubyte[] data = ByteOrderMarks[BOM.UTF8] ~ cast(ubyte[])"data";
    auto reader = new R(data);
    assert(reader.peek() == 'd');
    assert(reader.peek(1) == 'a');
    assert(reader.peek(2) == 't');
    assert(reader.peek(3) == 'a');
    assert(reader.peek(4) == '\0');
    assert(reader.prefix(4) == "data");
    // assert(reader.prefix(6) == "data\0");
    reader.forward(2);
    assert(reader.peek(1) == 'a');
    // assert(collectException(reader.peek(3)));
}

void testUTF(R)()
{
    import dyaml.stream;
    writeln(typeid(R).toString() ~ ": UTF formats unittest");
    dchar[] data = cast(dchar[])"data";
    void utf_test(T)(T[] data, BOM bom)
    {
        ubyte[] bytes = ByteOrderMarks[bom] ~
                        (cast(ubyte[])data)[0 .. data.length * T.sizeof];
        auto reader = new R(bytes);
        assert(reader.peek() == 'd');
        assert(reader.peek(1) == 'a');
        assert(reader.peek(2) == 't');
        assert(reader.peek(3) == 'a');
    }
    utf_test!char(to!(char[])(data), BOM.UTF8);
    utf_test!wchar(to!(wchar[])(data), endian == Endian.bigEndian ? BOM.UTF16BE : BOM.UTF16LE);
    utf_test(data, endian == Endian.bigEndian ? BOM.UTF32BE : BOM.UTF32LE);
}

void test1Byte(R)()
{
    writeln(typeid(R).toString() ~ ": 1 byte file unittest");
    ubyte[] data = [97];

    auto reader = new R(data);
    assert(reader.peek() == 'a');
    assert(reader.peek(1) == '\0');
    // assert(collectException(reader.peek(2)));
}

unittest
{
    testEndian!Reader();
    testPeekPrefixForward!Reader();
    testUTF!Reader();
    test1Byte!Reader();
}
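// Illustrative sketch (not part of the original test suite): shows the intended
// SliceBuilder workflow - begin(), write()-ing data just returned by get(),
// committing or reverting a Transaction, and finish()-ing the slice. The input
// string used here is arbitrary.
unittest
{
    writeln("D:YAML Reader SliceBuilder usage sketch");
    auto reader = new Reader(cast(ubyte[])"abcdefgh".dup);

    // Build a slice from the first 4 characters; the Transaction is committed,
    // so the characters written while it was active are kept.
    reader.sliceBuilder.begin();
    {
        auto transaction = SliceBuilder.Transaction(reader.sliceBuilder);
        // get() moves the buffer position, so its result may be written to the slice.
        reader.sliceBuilder.write(reader.get(2));
        transaction.commit();
    }
    reader.sliceBuilder.write(reader.get(2));
    assert(reader.sliceBuilder.finish() == "abcd");

    // A Transaction destroyed without commit() reverts the slice end, discarding
    // the characters written while it was active.
    reader.sliceBuilder.begin();
    {
        auto transaction = SliceBuilder.Transaction(reader.sliceBuilder);
        reader.sliceBuilder.write(reader.get(2));
    }
    assert(reader.sliceBuilder.length == 0);
    reader.sliceBuilder.finish();
}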