// Copyright Ferdinand Majerech 2011-2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

module dyaml.reader;


import core.stdc.stdlib;
import core.stdc.string;
import core.thread;

import std.algorithm;
import std.array;
import std.conv;
import std.exception;
import std.range;
import std.string;
import std.system;
import std.typecons;
import std.utf;

import tinyendian;

import dyaml.encoding;
import dyaml.exception;

alias isBreak = among!('\n', '\u0085', '\u2028', '\u2029');

package:


/// Exception thrown at Reader errors.
class ReaderException : YAMLException
{
    this(string msg, string file = __FILE__, size_t line = __LINE__)
        @safe pure nothrow
    {
        super("Reader error: " ~ msg, file, line);
    }
}

/// Provides an API to read characters from a UTF-8 buffer and build slices into that
/// buffer to avoid allocations (see SliceBuilder).
final class Reader
{
    private:
        // Buffer of currently loaded characters.
        char[] buffer_;

        // Current position within buffer. Only data after this position can be read.
        size_t bufferOffset_;

        // Index of the current character in the buffer.
        size_t charIndex_;
        // Number of characters (code points) in buffer_.
        size_t characterCount_;

        // Current line in file.
        uint line_;
        // Current column in file.
        uint column_;

        // Original Unicode encoding of the data.
        Encoding encoding_;

        version(unittest)
        {
            // Endianness of the input before it was converted (for testing)
            Endian endian_;
        }

        // The number of consecutive ASCII characters starting at bufferOffset_.
        //
        // Used to minimize UTF-8 decoding.
        size_t upcomingASCII_;

        // Index to buffer_ where the last decoded character starts.
        size_t lastDecodedBufferOffset_;
        // Offset, relative to charIndex_, of the last decoded character,
        // in code points, not chars.
        size_t lastDecodedCharOffset_;

    public:
        /// Construct a Reader.
        ///
        /// Params:  buffer = Buffer with YAML data. This may be e.g. the entire
        ///                   contents of a file or a string. It $(B will) be modified by
        ///                   the Reader and other parts of D:YAML (D:YAML tries to
        ///                   reuse the buffer to minimize memory allocations).
        ///
        /// Throws:  ReaderException on a UTF decoding error or if there are
        ///          nonprintable Unicode characters illegal in YAML.
        this(ubyte[] buffer) @safe pure
        {
            auto endianResult = fixUTFByteOrder(buffer);
            if(endianResult.bytesStripped > 0)
            {
                throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned " ~
                                          "to 2 or 4 bytes, respectively");
            }

            version(unittest) { endian_ = endianResult.endian; }
            encoding_ = endianResult.encoding;

            auto utf8Result = toUTF8(endianResult.array, endianResult.encoding);
            const msg = utf8Result.errorMessage;
            if(msg !is null)
            {
                throw new ReaderException("Error when converting to UTF-8: " ~ msg);
            }

            buffer_ = utf8Result.utf8;

            characterCount_ = utf8Result.characterCount;
            // Check that all characters in the buffer are printable.
            enforce(isPrintableValidUTF8(buffer_),
                    new ReaderException("Special unicode characters are not allowed"));

            this.sliceBuilder = SliceBuilder(this);
            checkASCII();
        }

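        // A minimal usage sketch of the constructor; the inputs ("foo: bar" and the
        // 0x01 byte) are arbitrary examples of what is accepted and rejected.
        @system unittest
        {
            // Valid UTF-8 without a BOM is used as-is. The buffer is dup-ed because
            // the Reader may modify it.
            auto reader = new Reader(cast(ubyte[])"foo: bar".dup);
            assert(reader.encoding == Encoding.UTF_8);
            assert(reader.peek() == 'f');

            // Nonprintable characters (here 0x01) are rejected.
            assertThrown!ReaderException(new Reader(cast(ubyte[])"foo\x01bar".dup));
        }
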
        /// Get character at specified index relative to current position.
        ///
        /// Params:  index = Index of the character to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; in that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Character at specified position or '\0' if outside of the buffer.
        ///
        // XXX removed; search for 'risky' to find why.
        // Throws:  ReaderException if trying to read past the end of the buffer.
        dchar peek(const size_t index) @safe pure
        {
            if(index < upcomingASCII_) { return buffer_[bufferOffset_ + index]; }
            if(characterCount_ <= charIndex_ + index)
            {
                // XXX This is risky; revert this if bugs are introduced. We rely on
                // the assumption that Reader only uses peek() to detect end of buffer.
                // The test suite passes.
                // Revert this case here and in other peek() versions if this causes
                // errors.
                // throw new ReaderException("Trying to read past the end of the buffer");
                return '\0';
            }

            // Optimized path for Scanner code that peeks chars in linear order to
            // determine the length of some sequence.
            if(index == lastDecodedCharOffset_)
            {
                ++lastDecodedCharOffset_;
                const char b = buffer_[lastDecodedBufferOffset_];
                // ASCII
                if(b < 0x80)
                {
                    ++lastDecodedBufferOffset_;
                    return b;
                }
                return decode(buffer_, lastDecodedBufferOffset_);
            }

            // 'Slow' path where we decode everything up to the requested character.
            const asciiToTake = min(upcomingASCII_, index);
            lastDecodedCharOffset_   = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;
            dchar d;
            while(lastDecodedCharOffset_ <= index)
            {
                d = decodeNext();
            }

            return d;
        }

        /// Optimized version of peek() for the case where peek index is 0.
        dchar peek() @safe pure
        {
            if(upcomingASCII_ > 0)            { return buffer_[bufferOffset_]; }
            if(characterCount_ <= charIndex_) { return '\0'; }

            lastDecodedCharOffset_   = 0;
            lastDecodedBufferOffset_ = bufferOffset_;
            return decodeNext();
        }

        /// Get byte at specified index relative to current position.
        ///
        /// Params:  index = Index of the byte to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; in that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Byte at specified position or '\0' if outside of the buffer.
        char peekByte(const size_t index) @safe pure nothrow @nogc
        {
            return characterCount_ > (charIndex_ + index) ? buffer_[bufferOffset_ + index] : '\0';
        }

        /// Optimized version of peekByte() for the case where peek byte index is 0.
        char peekByte() @safe pure nothrow @nogc
        {
            return characterCount_ > charIndex_ ? buffer_[bufferOffset_] : '\0';
        }


        /// Get specified number of characters starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls. Use SliceBuilder to build slices
        ///       for permanent use.
        ///
        /// Params:  length = Number of characters (code points, not bytes) to get. May
        ///                   reach past the end of the buffer; in that case the returned
        ///                   slice will be shorter.
        ///
        /// Returns: Characters starting at current position or an empty slice if out of bounds.
        char[] prefix(const size_t length) @safe pure
        {
            return slice(length);
        }

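        // A sketch of the code point vs. byte distinction between peek()/prefix() and
        // peekByte(): the input "\u00E9x" (an arbitrary example) is three bytes
        // (0xC3 0xA9 0x78) but only two code points.
        @system unittest
        {
            auto reader = new Reader(cast(ubyte[])"\u00E9x".dup);
            // peek() and prefix() index code points...
            assert(reader.peek() == '\u00E9');
            assert(reader.peek(1) == 'x');
            assert(reader.prefix(2) == "\u00E9x");
            // ...while peekByte() indexes individual UTF-8 code units.
            assert(reader.peekByte(0) == 0xC3);
            assert(reader.peekByte(1) == 0xA9);
        }
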
        /// Get specified number of bytes, not code points, starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls. Use SliceBuilder to build slices
        ///       for permanent use.
        ///
        /// Params:  length = Number of bytes (not code points) to get. May NOT reach past
        ///                   the end of the buffer; should be used with peek() to avoid
        ///                   this.
        ///
        /// Returns: Bytes starting at current position.
        char[] prefixBytes(const size_t length) @safe pure nothrow @nogc
        in(length == 0 || bufferOffset_ + length < buffer_.length, "prefixBytes out of bounds")
        {
            return buffer_[bufferOffset_ .. bufferOffset_ + length];
        }

        /// Get a slice view of the internal buffer, starting at the current position.
        ///
        /// Note: This gets only a "view" into the internal buffer,
        ///       which will be invalidated after other Reader calls.
        ///
        /// Params:  end = End of the slice relative to current position. May reach past
        ///                the end of the buffer; in that case the returned slice will
        ///                be shorter.
        ///
        /// Returns: Slice into the internal buffer or an empty slice if out of bounds.
        char[] slice(const size_t end) @safe pure
        {
            // Fast path in case the caller has already peek()ed all the way to end.
            if(end == lastDecodedCharOffset_)
            {
                return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
            }

            const asciiToTake = min(upcomingASCII_, end, buffer_.length);
            lastDecodedCharOffset_   = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;

            // 'Slow' path - decode everything up to end.
            while(lastDecodedCharOffset_ < end &&
                  lastDecodedBufferOffset_ < buffer_.length)
            {
                decodeNext();
            }

            return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
        }

        /// Get the next character, moving buffer position beyond it.
        ///
        /// Returns: Next character.
        ///
        /// Throws:  ReaderException if trying to read past the end of the buffer
        ///          or if invalid data is read.
        dchar get() @safe pure
        {
            const result = peek();
            forward();
            return result;
        }

        /// Get specified number of characters, moving buffer position beyond them.
        ///
        /// Params:  length = Number of characters (code points, not bytes) to get.
        ///
        /// Returns: Characters starting at current position.
        char[] get(const size_t length) @safe pure
        {
            auto result = slice(length);
            forward(length);
            return result;
        }

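        // A sketch contrasting get() with peek(): get() returns characters and moves
        // the position past them, peek() does not. The input "scalar" is arbitrary.
        @system unittest
        {
            auto reader = new Reader(cast(ubyte[])"scalar".dup);
            assert(reader.get(3) == "sca");
            assert(reader.peek() == 'l');   // peek() does not move the position
            assert(reader.get() == 'l');    // get() moves past the character
            assert(reader.charIndex == 4);
        }
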
        /// Move current position forward.
        ///
        /// Params:  length = Number of characters to move position forward.
        void forward(size_t length) @safe pure
        {
            while(length > 0)
            {
                auto asciiToTake = min(upcomingASCII_, length);
                charIndex_ += asciiToTake;
                length -= asciiToTake;
                upcomingASCII_ -= asciiToTake;

                for(; asciiToTake > 0; --asciiToTake)
                {
                    const c = buffer_[bufferOffset_++];
                    // c is ASCII, so we only need to check for ASCII line breaks.
                    if(c == '\n' || (c == '\r' && buffer_[bufferOffset_] != '\n'))
                    {
                        ++line_;
                        column_ = 0;
                        continue;
                    }
                    ++column_;
                }

                // If we have used up all upcoming ASCII chars, the next char is
                // non-ASCII even after this returns, so upcomingASCII_ doesn't need to
                // be updated - it's zero.
                if(length == 0) { break; }

                assert(upcomingASCII_ == 0,
                       "Running unicode handling code but we haven't run out of ASCII chars");
                assert(bufferOffset_ < buffer_.length,
                       "Attempted to decode past the end of YAML buffer");
                assert(buffer_[bufferOffset_] >= 0x80,
                       "ASCII must be handled by preceding code");

                ++charIndex_;
                const c = decode(buffer_, bufferOffset_);

                // New line. (can compare with '\n' without decoding since it's ASCII)
                if(c.isBreak || (c == '\r' && buffer_[bufferOffset_] != '\n'))
                {
                    ++line_;
                    column_ = 0;
                }
                else if(c != '\uFEFF') { ++column_; }
                --length;
                checkASCII();
            }

            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;
        }

        /// Move current position forward by one character.
        void forward() @safe pure
        {
            ++charIndex_;
            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;

            // ASCII
            if(upcomingASCII_ > 0)
            {
                --upcomingASCII_;
                const c = buffer_[bufferOffset_++];

                if(c == '\n' || (c == '\r' && buffer_[bufferOffset_] != '\n'))
                {
                    ++line_;
                    column_ = 0;
                    return;
                }
                ++column_;
                return;
            }

            // UTF-8
            assert(bufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            assert(buffer_[bufferOffset_] >= 0x80,
                   "ASCII must be handled by preceding code");

            const c = decode(buffer_, bufferOffset_);

            // New line. (can compare with '\n' without decoding since it's ASCII)
            if(c.isBreak || (c == '\r' && buffer_[bufferOffset_] != '\n'))
            {
                ++line_;
                column_ = 0;
            }
            else if(c != '\uFEFF') { ++column_; }

            checkASCII();
        }

        /// Used to build slices of read data in the Reader, to avoid allocations.
        SliceBuilder sliceBuilder;

        /// Get a Mark describing the current buffer position, used for error messages.
        Mark mark() const pure nothrow @nogc @safe { return Mark(line_, column_); }

        /// Get current line number.
        uint line() const @safe pure nothrow @nogc { return line_; }

        /// Get current column number.
        uint column() const @safe pure nothrow @nogc { return column_; }

        /// Get index of the current character in the buffer.
        size_t charIndex() const @safe pure nothrow @nogc { return charIndex_; }

        /// Get encoding of the input buffer.
        Encoding encoding() const @safe pure nothrow @nogc { return encoding_; }

    private:
        // Update upcomingASCII_ (should be called after forward()ing over a UTF-8
        // sequence).
        void checkASCII() @safe pure nothrow @nogc
        {
            upcomingASCII_ = countASCII(buffer_[bufferOffset_ .. $]);
        }

        // Decode the next character relative to
        // lastDecodedCharOffset_/lastDecodedBufferOffset_ and update them.
        //
        // Does not advance the buffer position. Used in peek() and slice().
        dchar decodeNext() @safe pure
        {
            assert(lastDecodedBufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            const char b = buffer_[lastDecodedBufferOffset_];
            ++lastDecodedCharOffset_;
            // ASCII
            if(b < 0x80)
            {
                ++lastDecodedBufferOffset_;
                return b;
            }

            return decode(buffer_, lastDecodedBufferOffset_);
        }
}

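// A sketch of how forward() maintains the (zero-based) line and column counters used
// for error reporting; the two-line input is an arbitrary example.
@system unittest
{
    auto reader = new Reader(cast(ubyte[])"ab\ncd".dup);
    assert(reader.line == 0 && reader.column == 0);
    reader.forward(3);   // move past 'a', 'b' and the line break
    assert(reader.line == 1 && reader.column == 0);
    assert(reader.peek() == 'c');
}
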
/// Used to build slices of already read data in Reader buffer, avoiding allocations.
///
/// Usually these slices point to unchanged Reader data, but sometimes the data is
/// changed due to how YAML interprets certain characters/strings.
///
/// See begin() documentation.
struct SliceBuilder
{
    private:
        // No copying by the user.
        @disable this(this);
        @disable void opAssign(ref SliceBuilder);

        // Reader this builder works in.
        Reader reader_;

        // Start of the slice in reader_.buffer_ (size_t.max while no slice is being built).
        size_t start_ = size_t.max;
        // End of the slice in reader_.buffer_ (size_t.max while no slice is being built).
        size_t end_ = size_t.max;

        // Stack of slice ends to revert to (see Transaction).
        //
        // Very few levels as we don't want arbitrarily nested transactions.
        size_t[4] endStack_;
        // The number of elements currently in endStack_.
        size_t endStackUsed_;

        @safe const pure nothrow @nogc invariant()
        {
            if(!inProgress) { return; }
            assert(end_ <= reader_.bufferOffset_, "Slice ends after buffer position");
            assert(start_ <= end_, "Slice start after slice end");
        }

        // Is a slice currently being built?
        bool inProgress() @safe const pure nothrow @nogc
        in(start_ == size_t.max ? end_ == size_t.max : end_ != size_t.max, "start_/end_ are not consistent")
        {
            return start_ != size_t.max;
        }

    public:
        /// Begin building a slice.
        ///
        /// Only one slice can be built at any given time; before beginning a new slice,
        /// finish the previous one (if any).
        ///
        /// The slice starts at the current position in the Reader buffer. It can only be
        /// extended up to the current position in the buffer; Reader methods get() and
        /// forward() move the position. E.g. it is valid to extend a slice by write()-ing
        /// a string just returned by get() - but not one returned by prefix() unless the
        /// position has changed since the prefix() call.
        void begin() @safe pure nothrow @nogc
        in(!inProgress, "Beginning a slice while another slice is being built")
        in(endStackUsed_ == 0, "Slice stack not empty at slice begin")
        {
            start_ = reader_.bufferOffset_;
            end_   = reader_.bufferOffset_;
        }

        /// Finish building a slice and return it.
        ///
        /// Any Transactions on the slice must be committed or destroyed before the slice
        /// is finished.
        ///
        /// Returns a string; once a slice is finished it is definitive that its contents
        /// will not be changed.
        char[] finish() @safe pure nothrow @nogc
        in(inProgress, "finish called without begin")
        in(endStackUsed_ == 0, "Finishing a slice with running transactions.")
        {
            auto result = reader_.buffer_[start_ .. end_];
            start_ = end_ = size_t.max;
            return result;
        }

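        // A sketch of the intended begin()/write()/finish() workflow, driven through a
        // Reader (the input "key: value" is an arbitrary example): characters returned
        // by get() start right at the end of the slice, so write() merely extends it.
        @system unittest
        {
            auto reader = new Reader(cast(ubyte[])"key: value".dup);
            reader.sliceBuilder.begin();
            char[] chunk = reader.get(3);       // "key"; also moves the buffer position
            reader.sliceBuilder.write(chunk);
            assert(reader.sliceBuilder.finish() == "key");
        }
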
        /// Write a string to the slice being built.
        ///
        /// Data can only be written up to the current position in the Reader buffer.
        ///
        /// If str is a string returned by a Reader method, and str starts right after the
        /// end of the slice being built, the slice is extended (trivial operation).
        ///
        /// See_Also: begin
        void write(scope char[] str) @safe pure nothrow @nogc
        {
            assert(inProgress, "write called without begin");
            assert(end_ <= reader_.bufferOffset_,
                   "AT START: Slice ends after buffer position");

            // Nothing? Already done.
            if (str.length == 0) { return; }
            // If str starts at the end of the slice (is a string returned by a Reader
            // method), just extend the slice to contain str.
            if(&str[0] == &reader_.buffer_[end_])
            {
                end_ += str.length;
            }
            // Even if str does not start at the end of the slice, it still may be returned
            // by a Reader method and point to buffer. So we need to memmove.
            else
            {
                copy(str, reader_.buffer_[end_ .. end_ + str.length * char.sizeof]);
                end_ += str.length;
            }
        }

        /// Write a character to the slice being built.
        ///
        /// Data can only be written up to the current position in the Reader buffer.
        ///
        /// See_Also: begin
        void write(dchar c) @safe pure
        in(inProgress, "write called without begin")
        {
            if(c < 0x80)
            {
                reader_.buffer_[end_++] = cast(char)c;
                return;
            }

            // We need to encode a non-ASCII dchar into UTF-8
            char[4] encodeBuf;
            const bytes = encode(encodeBuf, c);
            reader_.buffer_[end_ .. end_ + bytes] = encodeBuf[0 .. bytes];
            end_ += bytes;
        }

        /// Insert a character to a specified position in the slice.
        ///
        /// Enlarges the slice by 1 char. Note that the slice can only extend up to the
        /// current position in the Reader buffer.
        ///
        /// Params:
        ///
        /// c        = The character to insert.
        /// position = Position to insert the character at in code units, not code points.
        ///            Must be less than slice length(); a previously returned length()
        ///            can be used.
        void insert(const dchar c, const size_t position) @safe pure
        in(inProgress, "insert called without begin")
        in(start_ + position <= end_, "Trying to insert after the end of the slice")
        {
            const point       = start_ + position;
            const movedLength = end_ - point;

            // Encode c into UTF-8
            char[4] encodeBuf;
            if(c < 0x80) { encodeBuf[0] = cast(char)c; }
            const size_t bytes = c < 0x80 ? 1 : encode(encodeBuf, c);

            if(movedLength > 0)
            {
                copy(reader_.buffer_[point .. point + movedLength * char.sizeof],
                     reader_.buffer_[point + bytes .. point + bytes + movedLength * char.sizeof]);
            }
            reader_.buffer_[point .. point + bytes] = encodeBuf[0 .. bytes];
            end_ += bytes;
        }

        /// Get the current length of the slice.
        size_t length() @safe const pure nothrow @nogc
        {
            return end_ - start_;
        }

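        // A sketch of insert(): it writes a character at a position inside the slice
        // and shifts the rest forward. The forward() call is needed because the slice
        // may only grow up to the current Reader position. The input "abcd" is an
        // arbitrary example.
        @system unittest
        {
            auto reader = new Reader(cast(ubyte[])"abcd".dup);
            reader.sliceBuilder.begin();
            reader.sliceBuilder.write(reader.get(2));   // slice is now "ab"
            reader.forward();                           // make room in the buffer
            reader.sliceBuilder.insert('X', 1);         // "ab" -> "aXb"
            assert(reader.sliceBuilder.finish() == "aXb");
        }
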
        /// A slice building transaction.
        ///
        /// Can be used to save and revert back to slice state.
        struct Transaction
        {
            private:
                // The slice builder affected by the transaction.
                SliceBuilder* builder_;
                // Index of the return point of the transaction in SliceBuilder.endStack_.
                size_t stackLevel_;
                // True after commit() has been called.
                bool committed_;

            public:
                /// Begins a transaction on a SliceBuilder object.
                ///
                /// The transaction must end $(B after) any transactions created within the
                /// transaction but $(B before) the slice is finish()-ed. A transaction can be
                /// ended either by commit()-ing or reverting through the destructor.
                ///
                /// Saves the current state of a slice.
                this(SliceBuilder* builder) @safe pure nothrow @nogc
                {
                    builder_ = builder;
                    stackLevel_ = builder_.endStackUsed_;
                    builder_.push();
                }

                /// Commit changes to the slice.
                ///
                /// Ends the transaction - can only be called once, and removes the possibility
                /// to revert slice state.
                ///
                /// Does nothing for a default-initialized transaction (the transaction has not
                /// been started yet).
                void commit() @safe pure nothrow @nogc
                in(!committed_, "Can't commit a transaction more than once")
                {
                    if(builder_ is null) { return; }
                    assert(builder_.endStackUsed_ == stackLevel_ + 1,
                           "Parent transactions don't fully contain child transactions");
                    builder_.apply();
                    committed_ = true;
                }

                /// End the transaction and revert it if it hasn't been committed yet.
                void end() @safe pure nothrow @nogc
                in(builder_ && builder_.endStackUsed_ == stackLevel_ + 1,
                   "Parent transactions don't fully contain child transactions")
                {
                    builder_.pop();
                    builder_ = null;
                }

        }

    private:
        // Push the current end of the slice so we can revert to it if needed.
        //
        // Used by Transaction.
        void push() @safe pure nothrow @nogc
        in(inProgress, "push called without begin")
        in(endStackUsed_ < endStack_.length, "Slice stack overflow")
        {
            endStack_[endStackUsed_++] = end_;
        }

        // Pop the current end of endStack_ and set the end of the slice to the popped
        // value, reverting changes since the old end was pushed.
        //
        // Used by Transaction.
        void pop() @safe pure nothrow @nogc
        in(inProgress, "pop called without begin")
        in(endStackUsed_ > 0, "Trying to pop an empty slice stack")
        {
            end_ = endStack_[--endStackUsed_];
        }

        // Pop the current end of endStack_, but keep the current end of the slice, applying
        // changes made since pushing the old end.
        //
        // Used by Transaction.
        void apply() @safe pure nothrow @nogc
        in(inProgress, "apply called without begin")
        in(endStackUsed_ > 0, "Trying to apply an empty slice stack")
        {
            --endStackUsed_;
        }
}

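// A sketch of SliceBuilder.Transaction: changes written to the slice after the
// transaction starts are reverted by end() unless commit() was called first. The
// input "abcdef" is an arbitrary example.
@system unittest
{
    auto reader = new Reader(cast(ubyte[])"abcdef".dup);
    reader.sliceBuilder.begin();
    reader.sliceBuilder.write(reader.get(2));       // slice is "ab"

    auto transaction = SliceBuilder.Transaction(&reader.sliceBuilder);
    reader.sliceBuilder.write(reader.get(2));       // slice is "abcd"
    transaction.end();                              // not committed - revert to "ab"

    assert(reader.sliceBuilder.finish() == "ab");
}
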

private:

// Convert a UTF-8/16/32 buffer to UTF-8, in-place if possible.
//
// Params:
//
// input    = Buffer with UTF-8/16/32 data to decode. May be overwritten by the
//            conversion, in which case the result will be a slice of this buffer.
// encoding = Encoding of input.
//
// Returns:
//
// A struct with the following members:
//
// $(D string errorMessage)   In case of an error, the error message is stored here. If
//                            there was no error, errorMessage is null. Always check
//                            this first.
// $(D char[] utf8)           input converted to UTF-8. May be a slice of input.
// $(D size_t characterCount) Number of characters (code points) in input.
auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;
        char[] utf8;
        size_t characterCount;
    }

    Result result;

    // Encode input into UTF-8 if it's encoded as UTF-16 or UTF-32.
    //
    // Params:
    //
    // input  = The input buffer to encode.
    // result = A Result struct to put the encoded result and any error messages to.
    //
    // On error, result.errorMessage will be set.
    static void encode(C)(C[] input, ref Result result) @safe pure
    {
        // We can do UTF-32->UTF-8 in place because all UTF-8 sequences are 4 or
        // less bytes.
        static if(is(C == dchar))
        {
            char[4] encodeBuf;
            auto utf8 = cast(char[])input;
            auto length = 0;
            foreach(dchar c; input)
            {
                ++result.characterCount;
                // ASCII
                if(c < 0x80)
                {
                    utf8[length++] = cast(char)c;
                    continue;
                }

                std.utf.encode(encodeBuf, c);
                const bytes = codeLength!char(c);
                utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
                length += bytes;
            }
            result.utf8 = utf8[0 .. length];
        }
        // Unfortunately we can't do UTF-16 in place so we just use std.conv.to.
        else
        {
            result.characterCount = std.utf.count(input);
            result.utf8 = input.to!(char[]);
        }
    }

    try final switch(encoding)
    {
        case UTFEncoding.UTF_8:
            result.utf8 = cast(char[])input;
            result.utf8.validate();
            result.characterCount = std.utf.count(result.utf8);
            break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            encode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0, "UTF-32 buffer size must be a multiple of 4");
            encode(cast(dchar[])input, result);
            break;
    }
    catch(ConvException e) { result.errorMessage = e.msg; }
    catch(UTFException e)  { result.errorMessage = e.msg; }
    catch(Exception e)
    {
        assert(false, "Unexpected exception in encode(): " ~ e.msg);
    }

    return result;
}

/// Determine if all characters (code points, not bytes) in a string are printable.
bool isPrintableValidUTF8(const char[] chars) @safe pure
{
    import std.uni : isControl, isWhite;
    foreach (dchar chr; chars)
    {
        if (!chr.isValidDchar || (chr.isControl && !chr.isWhite))
        {
            return false;
        }
    }
    return true;
}

/// Counts the number of ASCII characters in buffer until the first UTF-8 sequence.
///
/// Used to determine how many characters we can process without decoding.
size_t countASCII(const(char)[] buffer) @safe pure nothrow @nogc
{
    return buffer.byCodeUnit.until!(x => x > 0x7F).walkLength;
}

// Unittests.

void testEndian(R)()
{
    void endian_test(ubyte[] data, Encoding encoding_expected, Endian endian_expected)
    {
        auto reader = new R(data);
        assert(reader.encoding == encoding_expected);
        assert(reader.endian_ == endian_expected);
    }
    ubyte[] little_endian_utf_16 = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] big_endian_utf_16 = [0xFE, 0xFF, 0x00, 0x7A];
    endian_test(little_endian_utf_16, Encoding.UTF_16, Endian.littleEndian);
    endian_test(big_endian_utf_16, Encoding.UTF_16, Endian.bigEndian);
}

void testPeekPrefixForward(R)()
{
    import std.encoding;
    ubyte[] data = bomTable[BOM.utf8].sequence ~ cast(ubyte[])"data";
    auto reader = new R(data);
    assert(reader.peek() == 'd');
    assert(reader.peek(1) == 'a');
    assert(reader.peek(2) == 't');
    assert(reader.peek(3) == 'a');
    assert(reader.peek(4) == '\0');
    assert(reader.prefix(4) == "data");
    // assert(reader.prefix(6) == "data\0");
    reader.forward(2);
    assert(reader.peek(1) == 'a');
    // assert(collectException(reader.peek(3)));
}

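// A sketch of isPrintableValidUTF8() and countASCII() on arbitrary inputs:
// countASCII() stops at the first non-ASCII byte, and isPrintableValidUTF8()
// rejects control characters other than whitespace.
@safe unittest
{
    assert(countASCII("abc\u00E9def") == 3);
    assert(isPrintableValidUTF8("plain text\n"));
    assert(!isPrintableValidUTF8("bell\x07"));
}
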
void testUTF(R)()
{
    import std.encoding;
    dchar[] data = cast(dchar[])"data";
    void utf_test(T)(T[] data, BOM bom)
    {
        ubyte[] bytes = bomTable[bom].sequence ~
                        (cast(ubyte[])data)[0 .. data.length * T.sizeof];
        auto reader = new R(bytes);
        assert(reader.peek() == 'd');
        assert(reader.peek(1) == 'a');
        assert(reader.peek(2) == 't');
        assert(reader.peek(3) == 'a');
    }
    utf_test!char(to!(char[])(data), BOM.utf8);
    utf_test!wchar(to!(wchar[])(data), endian == Endian.bigEndian ? BOM.utf16be : BOM.utf16le);
    utf_test(data, endian == Endian.bigEndian ? BOM.utf32be : BOM.utf32le);
}

void test1Byte(R)()
{
    ubyte[] data = [97];

    auto reader = new R(data);
    assert(reader.peek() == 'a');
    assert(reader.peek(1) == '\0');
    // assert(collectException(reader.peek(2)));
}

@system unittest
{
    testEndian!Reader();
    testPeekPrefixForward!Reader();
    testUTF!Reader();
    test1Byte!Reader();
}
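
// A sketch of toUTF8() with a native-endian UTF-16 input (as it would look after
// fixUTFByteOrder); the "hi" payload is an arbitrary example.
@system unittest
{
    wchar[] utf16 = ['h', 'i'];
    auto result = toUTF8(cast(ubyte[])utf16, UTFEncoding.UTF_16);
    assert(result.errorMessage is null);
    assert(result.utf8 == "hi");
    assert(result.characterCount == 2);
}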