1 
2 //          Copyright Ferdinand Majerech 2011-2014.
3 // Distributed under the Boost Software License, Version 1.0.
4 //    (See accompanying file LICENSE_1_0.txt or copy at
5 //          http://www.boost.org/LICENSE_1_0.txt)
6 
7 module dyaml.reader;
8 
9 
10 import core.stdc.stdlib;
11 import core.stdc..string;
12 import core.thread;
13 
14 import std.algorithm;
15 import std.array;
16 import std.conv;
17 import std.exception;
18 import std.range;
19 import std..string;
20 import std.system;
21 import std.typecons;
22 import std.utf;
23 
24 import tinyendian;
25 
26 import dyaml.encoding;
27 import dyaml.exception;
28 
// Predicate for the line-break code points recognised here: LF ('\n'), NEL (U+0085),
// LS (U+2028) and PS (U+2029). Usable with UFCS, e.g. `c.isBreak`.
// Note that '\r' is deliberately not in this list; Reader.forward() handles it
// separately so that a "\r\n" pair is counted as a single line break.
alias isBreak = among!('\n', '\u0085', '\u2028', '\u2029');
30 
31 package:
32 
33 
/// Exception type raised when the Reader fails to process its input.
class ReaderException : YAMLException
{
    /// Construct a ReaderException.
    ///
    /// Params:  msg  = Description of the problem; it is prefixed with "Reader error: ".
    ///          file = Source file the exception originates from.
    ///          line = Source line the exception originates from.
    this(string msg, string file = __FILE__, int line = __LINE__) @safe pure nothrow
    {
        enum prefix = "Reader error: ";
        super(prefix ~ msg, file, line);
    }
}
43 
/// Provides an API to read characters from a UTF-8 buffer and build slices into that
/// buffer to avoid allocations (see SliceBuilder).
final class Reader
{
    private:
        // Buffer of currently loaded characters.
        char[] buffer_;

        // Current position within buffer. Only data after this position can be read.
        size_t bufferOffset_;

        // Index of the current character in the buffer.
        size_t charIndex_;
        // Number of characters (code points) in buffer_.
        size_t characterCount_;

        // Current line in file.
        uint line_;
        // Current column in file.
        uint column_;

        // Original Unicode encoding of the data.
        Encoding encoding_;

        version(unittest)
        {
            // Endianness of the input before it was converted (for testing)
            Endian endian_;
        }

        // The number of consecutive ASCII characters starting at bufferOffset_.
        //
        // Used to minimize UTF-8 decoding.
        size_t upcomingASCII_;

        // Index to buffer_ where the next character to be decoded by
        // peek()/slice() starts (i.e. just past the last decoded character).
        size_t lastDecodedBufferOffset_;
        // Offset, relative to charIndex_, of the character at
        // lastDecodedBufferOffset_, in code points, not chars.
        size_t lastDecodedCharOffset_;

    public:
        /// Construct a Reader.
        ///
        /// Params:  buffer = Buffer with YAML data. This may be e.g. the entire
        ///                   contents of a file or a string. $(B will) be modified by
        ///                   the Reader and other parts of D:YAML (D:YAML tries to
        ///                   reuse the buffer to minimize memory allocations)
        ///
        /// Throws:  ReaderException on a UTF decoding error or if there are
        ///          nonprintable Unicode characters illegal in YAML.
        this(ubyte[] buffer) @safe pure
        {
            auto endianResult = fixUTFByteOrder(buffer);
            if(endianResult.bytesStripped > 0)
            {
                throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned " ~
                                          "to 2 or 4 bytes, respectively");
            }

            version(unittest) { endian_ = endianResult.endian; }
            encoding_ = endianResult.encoding;

            auto utf8Result = toUTF8(endianResult.array, endianResult.encoding);
            const msg = utf8Result.errorMessage;
            if(msg !is null)
            {
                throw new ReaderException("Error when converting to UTF-8: " ~ msg);
            }

            buffer_ = utf8Result.utf8;

            characterCount_ = utf8Result.characterCount;
            // Check that all characters in buffer are printable.
            enforce(isPrintableValidUTF8(buffer_),
                    new ReaderException("Special unicode characters are not allowed"));

            this.sliceBuilder = SliceBuilder(this);
            checkASCII();
        }

        /// Get character at specified index relative to current position.
        ///
        /// Params:  index = Index of the character to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; In that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Character at specified position or '\0' if outside of the buffer.
        ///
        // XXX removed; search for 'risky' to find why.
        // Throws:  ReaderException if trying to read past the end of the buffer.
        dchar peek(const size_t index) @safe pure
        {
            if(index < upcomingASCII_) { return buffer_[bufferOffset_ + index]; }
            if(characterCount_ <= charIndex_ + index)
            {
                // XXX This is risky; revert this if bugs are introduced. We rely on
                // the assumption that Reader only uses peek() to detect end of buffer.
                // The test suite passes.
                // Revert this case here and in other peek() versions if this causes
                // errors.
                // throw new ReaderException("Trying to read past the end of the buffer");
                return '\0';
            }

            // Optimized path for Scanner code that peeks chars in linear order to
            // determine the length of some sequence: the requested character is
            // exactly the next one to decode.
            if(index == lastDecodedCharOffset_) { return decodeNext(); }

            // 'Slow' path where we decode everything up to the requested character.
            const asciiToTake = min(upcomingASCII_, index);
            lastDecodedCharOffset_   = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;
            dchar d;
            while(lastDecodedCharOffset_ <= index)
            {
                d = decodeNext();
            }

            return d;
        }

        /// Optimized version of peek() for the case where peek index is 0.
        dchar peek() @safe pure
        {
            if(upcomingASCII_ > 0)            { return buffer_[bufferOffset_]; }
            if(characterCount_ <= charIndex_) { return '\0'; }

            lastDecodedCharOffset_   = 0;
            lastDecodedBufferOffset_ = bufferOffset_;
            return decodeNext();
        }

        /// Get byte at specified index relative to current position.
        ///
        /// Params:  index = Index of the byte to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; In that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Byte at specified position or '\0' if outside of the buffer.
        char peekByte(const size_t index) @safe pure nothrow @nogc
        {
            // The bound is checked in characters; as every character takes at least
            // one byte this never reads outside buffer_.
            return characterCount_ > (charIndex_ + index) ? buffer_[bufferOffset_ + index] : '\0';
        }

        /// Optimized version of peekByte() for the case where peek byte index is 0.
        char peekByte() @safe pure nothrow @nogc
        {
            return characterCount_ > charIndex_ ? buffer_[bufferOffset_] : '\0';
        }


        /// Get specified number of characters starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls. Use SliceBuilder to build slices
        ///       for permanent use.
        ///
        /// Params: length = Number of characters (code points, not bytes) to get. May
        ///                  reach past the end of the buffer; in that case the returned
        ///                  slice will be shorter.
        ///
        /// Returns: Characters starting at current position or an empty slice if out of bounds.
        char[] prefix(const size_t length) @safe pure
        {
            return slice(length);
        }

        /// Get specified number of bytes, not code points, starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls. Use SliceBuilder to build slices
        ///       for permanent use.
        ///
        /// Params: length = Number bytes (not code points) to get. May NOT reach past
        ///                  the end of the buffer; should be used with peek() to avoid
        ///                  this.
        ///
        /// Returns: Bytes starting at current position.
        char[] prefixBytes(const size_t length) @safe pure nothrow @nogc
        {
            // A prefix that ends exactly at the end of the buffer is still in bounds,
            // hence '<=' and not '<'.
            assert(length == 0 || bufferOffset_ + length <= buffer_.length,
                   "prefixBytes out of bounds");
            return buffer_[bufferOffset_ .. bufferOffset_ + length];
        }

        /// Get a slice view of the internal buffer, starting at the current position.
        ///
        /// Note: This gets only a "view" into the internal buffer,
        ///       which get invalidated after other Reader calls.
        ///
        /// Params:  end = End of the slice relative to current position. May reach past
        ///                the end of the buffer; in that case the returned slice will
        ///                be shorter.
        ///
        /// Returns: Slice into the internal buffer or an empty slice if out of bounds.
        char[] slice(const size_t end) @safe pure
        {
            // Fast path in case the caller has already peek()ed all the way to end.
            if(end == lastDecodedCharOffset_)
            {
                return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
            }

            const asciiToTake = min(upcomingASCII_, end, buffer_.length);
            lastDecodedCharOffset_   = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;

            // 'Slow' path - decode everything up to end.
            while(lastDecodedCharOffset_ < end &&
                  lastDecodedBufferOffset_ < buffer_.length)
            {
                decodeNext();
            }

            return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
        }

        /// Get the next character, moving buffer position beyond it.
        ///
        /// Must not be called at the end of the buffer: peek() would return '\0'
        /// there, but the subsequent forward() asserts that a character is available.
        ///
        /// Returns: Next character.
        dchar get() @safe pure
        {
            const result = peek();
            forward();
            return result;
        }

        /// Get specified number of characters, moving buffer position beyond them.
        ///
        /// Params:  length = Number or characters (code points, not bytes) to get.
        ///
        /// Returns: Characters starting at current position.
        char[] get(const size_t length) @safe pure
        {
            auto result = slice(length);
            forward(length);
            return result;
        }

        /// Move current position forward.
        ///
        /// Params:  length = Number of characters to move position forward.
        void forward(size_t length) @safe pure
        {
            while(length > 0)
            {
                auto asciiToTake = min(upcomingASCII_, length);
                charIndex_     += asciiToTake;
                length         -= asciiToTake;
                upcomingASCII_ -= asciiToTake;

                for(; asciiToTake > 0; --asciiToTake)
                {
                    const c = buffer_[bufferOffset_++];
                    // c is ASCII, so we only need to check for ASCII line breaks.
                    // A '\r' directly followed by '\n' is not counted; the '\n'
                    // that follows will be.
                    if(c == '\n' || (c == '\r' && !atLineFeed()))
                    {
                        ++line_;
                        column_ = 0;
                        continue;
                    }
                    ++column_;
                }

                // If we have used up all upcoming ASCII chars, the next char is
                // non-ASCII even after this returns, so upcomingASCII_ doesn't need to
                // be updated - it's zero.
                if(length == 0) { break; }

                assert(upcomingASCII_ == 0,
                       "Running unicode handling code but we haven't run out of ASCII chars");
                assert(bufferOffset_ < buffer_.length,
                       "Attempted to decode past the end of YAML buffer");
                assert(buffer_[bufferOffset_] >= 0x80,
                       "ASCII must be handled by preceding code");

                ++charIndex_;
                const c = decode(buffer_, bufferOffset_);

                // New line. bufferOffset_ already points past the decoded character.
                if(c.isBreak || (c == '\r' && !atLineFeed()))
                {
                    ++line_;
                    column_ = 0;
                }
                // A byte order mark does not take up a column.
                else if(c != '\uFEFF') { ++column_; }
                --length;
                checkASCII();
            }

            // Restart incremental peek()/slice() decoding from the new position.
            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;
        }

        /// Move current position forward by one character.
        void forward() @safe pure
        {
            ++charIndex_;

            // ASCII
            if(upcomingASCII_ > 0)
            {
                --upcomingASCII_;
                const c = buffer_[bufferOffset_++];

                if(c == '\n' || (c == '\r' && !atLineFeed()))
                {
                    ++line_;
                    column_ = 0;
                }
                else
                {
                    ++column_;
                }
            }
            // UTF-8
            else
            {
                assert(bufferOffset_ < buffer_.length,
                       "Attempted to decode past the end of YAML buffer");
                assert(buffer_[bufferOffset_] >= 0x80,
                       "ASCII must be handled by preceding code");

                const c = decode(buffer_, bufferOffset_);

                // New line. bufferOffset_ already points past the decoded character.
                if(c.isBreak || (c == '\r' && !atLineFeed()))
                {
                    ++line_;
                    column_ = 0;
                }
                // A byte order mark does not take up a column.
                else if(c != '\uFEFF') { ++column_; }

                checkASCII();
            }

            // Restart incremental peek()/slice() decoding from the new position. This
            // has to happen after bufferOffset_ was advanced (as in forward(size_t)),
            // otherwise a following peek(0)/slice(0) would work with the offset of the
            // character that was just consumed.
            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;
        }

        /// Used to build slices of read data in Reader; to avoid allocations.
        SliceBuilder sliceBuilder;

        /// Get a Mark (line and column) describing current buffer position, used for
        /// error messages.
        Mark mark() const pure nothrow @nogc @safe { return Mark(line_, column_); }

        /// Get current line number.
        uint line() const @safe pure nothrow @nogc { return line_; }

        /// Get current column number.
        uint column() const @safe pure nothrow @nogc { return column_; }

        /// Get index of the current character in the buffer.
        size_t charIndex() const @safe pure nothrow @nogc { return charIndex_; }

        /// Get encoding of the input buffer.
        Encoding encoding() const @safe pure nothrow @nogc { return encoding_; }

    private:
        // Update upcomingASCII_ (should be called after forward()ing over a UTF-8
        // sequence)
        void checkASCII() @safe pure nothrow @nogc
        {
            upcomingASCII_ = countASCII(buffer_[bufferOffset_ .. $]);
        }

        // Is there a byte at bufferOffset_ and is it '\n'?
        //
        // Used right after consuming a '\r' to tell a lone '\r' (a line break on its
        // own) from the first half of a "\r\n" pair, without reading past the end of
        // the buffer when '\r' is the very last byte.
        bool atLineFeed() const @safe pure nothrow @nogc
        {
            return bufferOffset_ < buffer_.length && buffer_[bufferOffset_] == '\n';
        }

        // Decode the next character relative to
        // lastDecodedCharOffset_/lastDecodedBufferOffset_ and update them.
        //
        // Does not advance the buffer position. Used in peek() and slice().
        dchar decodeNext() @safe pure
        {
            assert(lastDecodedBufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            const char b = buffer_[lastDecodedBufferOffset_];
            ++lastDecodedCharOffset_;
            // ASCII
            if(b < 0x80)
            {
                ++lastDecodedBufferOffset_;
                return b;
            }

            // decode() advances lastDecodedBufferOffset_ past the whole sequence.
            return decode(buffer_, lastDecodedBufferOffset_);
        }
}
440 
/// Used to build slices of already read data in Reader buffer, avoiding allocations.
///
/// Usually these slices point to unchanged Reader data, but sometimes the data is
/// changed due to how YAML interprets certain characters/strings.
///
/// See begin() documentation.
struct SliceBuilder
{
private:
    // No copying by the user.
    @disable this(this);
    @disable void opAssign(ref SliceBuilder);

    // Reader this builder works in.
    Reader reader_;

    // Start of the slice in reader_.buffer_ (size_t.max while no slice is being built)
    size_t start_ = size_t.max;
    // End of the slice in reader_.buffer_ (size_t.max while no slice is being built)
    size_t end_   = size_t.max;

    // Stack of slice ends to revert to (see Transaction)
    //
    // Very few levels as we don't want arbitrarily nested transactions.
    size_t[4] endStack_;
    // The number of elements currently in endStack_.
    size_t endStackUsed_;

    @safe const pure nothrow @nogc invariant()
    {
        if(!inProgress) { return; }
        assert(end_ <= reader_.bufferOffset_, "Slice ends after buffer position");
        assert(start_ <= end_, "Slice start after slice end");
    }

    // Is a slice currently being built?
    bool inProgress() @safe const pure nothrow @nogc
    {
        // start_ and end_ must either both be size_t.max or neither.
        assert(start_ == size_t.max ? end_ == size_t.max : end_ != size_t.max,
               "start_/end_ are not consistent");
        return start_ != size_t.max;
    }

public:
    /// Begin building a slice.
    ///
    /// Only one slice can be built at any given time; before beginning a new slice,
    /// finish the previous one (if any).
    ///
    /// The slice starts at the current position in the Reader buffer. It can only be
    /// extended up to the current position in the buffer; Reader methods get() and
    /// forward() move the position. E.g. it is valid to extend a slice by write()-ing
    /// a string just returned by get() - but not one returned by prefix() unless the
    /// position has changed since the prefix() call.
    void begin() @safe pure nothrow @nogc
    {
        assert(!inProgress, "Beginning a slice while another slice is being built");
        assert(endStackUsed_ == 0, "Slice stack not empty at slice begin");

        start_ = reader_.bufferOffset_;
        end_   = reader_.bufferOffset_;
    }

    /// Finish building a slice and return it.
    ///
    /// Any Transactions on the slice must be committed or destroyed before the slice
    /// is finished.
    ///
    /// Returns: The built slice of the Reader buffer. Once a slice is finished the
    ///          builder no longer changes its contents.
    char[] finish() @safe pure nothrow @nogc
    {
        assert(inProgress, "finish called without begin");
        assert(endStackUsed_ == 0, "Finishing a slice with running transactions.");

        auto result = reader_.buffer_[start_ .. end_];
        start_ = end_ = size_t.max;
        return result;
    }

    /// Write a string to the slice being built.
    ///
    /// Data can only be written up to the current position in the Reader buffer.
    ///
    /// If str is a string returned by a Reader method, and str starts right after the
    /// end of the slice being built, the slice is extended (trivial operation).
    ///
    /// See_Also: begin
    void write(scope char[] str) @safe pure nothrow @nogc
    {
        assert(inProgress, "write called without begin");
        assert(end_ <= reader_.bufferOffset_,
               "AT START: Slice ends after buffer position");

        // Nothing? Already done.
        if (str.length == 0) { return; }
        // If str starts at the end of the slice (is a string returned by a Reader
        // method), just extend the slice to contain str.
        if(&str[0] == &reader_.buffer_[end_])
        {
            end_ += str.length;
        }
        // Even if str does not start at the end of the slice, it still may be returned
        // by a Reader method and point to buffer, so source and destination may
        // overlap.
        else
        {
            copy(str, reader_.buffer_[end_..end_ + str.length * char.sizeof]);
            end_ += str.length;
        }
    }

    /// Write a character to the slice being built.
    ///
    /// Data can only be written up to the current position in the Reader buffer.
    ///
    /// See_Also: begin
    void write(dchar c) @safe pure
    {
        assert(inProgress, "write called without begin");
        // ASCII takes exactly one byte, no encoding needed.
        if(c < 0x80)
        {
            reader_.buffer_[end_++] = cast(char)c;
            return;
        }

        // We need to encode a non-ASCII dchar into UTF-8
        char[4] encodeBuf;
        const bytes = encode(encodeBuf, c);
        reader_.buffer_[end_ .. end_ + bytes] = encodeBuf[0 .. bytes];
        end_ += bytes;
    }

    /// Insert a character to a specified position in the slice.
    ///
    /// Enlarges the slice by the UTF-8 length of the character. Note that the slice
    /// can only extend up to the current position in the Reader buffer.
    ///
    /// Params:
    ///
    /// c        = The character to insert.
    /// position = Position to insert the character at in code units, not code points.
    ///            Must not be greater than slice length(); a previously returned
    ///            length() can be used.
    void insert(const dchar c, const size_t position) @safe pure
    {
        assert(inProgress, "insert called without begin");
        assert(start_ + position <= end_, "Trying to insert after the end of the slice");

        const point       = start_ + position;
        const movedLength = end_ - point;

        // Encode c into UTF-8
        char[4] encodeBuf;
        if(c < 0x80) { encodeBuf[0] = cast(char)c; }
        const size_t bytes = c < 0x80 ? 1 : encode(encodeBuf, c);

        // Shift the tail of the slice to make room for the encoded character.
        if(movedLength > 0)
        {
            copy(reader_.buffer_[point..point + movedLength * char.sizeof],
                    reader_.buffer_[point + bytes..point + bytes + movedLength * char.sizeof]);
        }
        reader_.buffer_[point .. point + bytes] = encodeBuf[0 .. bytes];
        end_ += bytes;
    }

    /// Get the current length of the slice.
    size_t length() @safe const pure nothrow @nogc
    {
        return end_ - start_;
    }

    /// A slice building transaction.
    ///
    /// Can be used to save and revert back to slice state.
    struct Transaction
    {
    private:
        // The slice builder affected by the transaction.
        SliceBuilder* builder_;
        // Index of the return point of the transaction in SliceBuilder.endStack_.
        size_t stackLevel_;
        // True after commit() has been called.
        bool committed_;

    public:
        /// Begins a transaction on a SliceBuilder object.
        ///
        /// The transaction must end $(B after) any transactions created within the
        /// transaction but $(B before) the slice is finish()-ed. A transaction can be
        /// ended either by commit()-ing or reverting through end().
        ///
        /// Saves the current state of a slice.
        this(SliceBuilder* builder) @safe pure nothrow @nogc
        {
            builder_ = builder;
            stackLevel_ = builder_.endStackUsed_;
            builder_.push();
        }

        /// Commit changes to the slice.
        ///
        /// Ends the transaction - can only be called once, and removes the possibility
        /// to revert slice state.
        ///
        /// Does nothing for a default-initialized transaction (the transaction has not
        /// been started yet).
        void commit() @safe pure nothrow @nogc
        {
            assert(!committed_, "Can't commit a transaction more than once");

            if(builder_ is null) { return; }
            assert(builder_.endStackUsed_ == stackLevel_ + 1,
                   "Parent transactions don't fully contain child transactions");
            builder_.apply();
            committed_ = true;
        }

        /// Destroy the transaction and revert it if it hasn't been committed yet.
        ///
        /// Does nothing (apart from detaching from the builder) for a transaction that
        /// was already committed or that was never started (default-initialized).
        void end() @safe pure nothrow @nogc
        {
            // commit() already removed our entry from the builder's stack; popping
            // again would revert somebody else's state (or underflow the stack).
            if(builder_ is null || committed_)
            {
                builder_ = null;
                return;
            }

            assert(builder_.endStackUsed_ == stackLevel_ + 1,
                   "Parent transactions don't fully contain child transactions");
            builder_.pop();
            builder_ = null;
        }

    }

private:
    // Push the current end of the slice so we can revert to it if needed.
    //
    // Used by Transaction.
    void push() @safe pure nothrow @nogc
    {
        assert(inProgress, "push called without begin");
        assert(endStackUsed_ < endStack_.length, "Slice stack overflow");
        endStack_[endStackUsed_++] = end_;
    }

    // Pop the current end of endStack_ and set the end of the slice to the popped
    // value, reverting changes since the old end was pushed.
    //
    // Used by Transaction.
    void pop() @safe pure nothrow @nogc
    {
        assert(inProgress, "pop called without begin");
        assert(endStackUsed_ > 0, "Trying to pop an empty slice stack");
        end_ = endStack_[--endStackUsed_];
    }

    // Pop the current end of endStack_, but keep the current end of the slice, applying
    // changes made since pushing the old end.
    //
    // Used by Transaction.
    void apply() @safe pure nothrow @nogc
    {
        assert(inProgress, "apply called without begin");
        assert(endStackUsed_ > 0, "Trying to apply an empty slice stack");
        --endStackUsed_;
    }
}
702 
703 
704 private:
705 
// Convert a UTF-8/16/32 buffer to UTF-8, in-place if possible.
//
// Params:
//
// input    = Buffer with UTF-8/16/32 data to decode. May be overwritten by the
//            conversion, in which case the result will be a slice of this buffer.
// encoding = Encoding of input.
//
// Returns:
//
// A struct with the following members:
//
// $(D string errorMessage)   In case of an error, the error message is stored here. If
//                            there was no error, errorMessage is NULL. Always check
//                            this first.
// $(D char[] utf8)           input converted to UTF-8. May be a slice of input.
// $(D size_t characterCount) Number of characters (code points) in input.
auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;
        char[] utf8;
        size_t characterCount;
    }

    Result result;

    // Encode input into UTF-8 if it's encoded as UTF-16 or UTF-32.
    //
    // Params:
    //
    // input  = The input buffer to encode.
    // result = A Result struct to put encoded result to.
    //
    // Throws on invalid input; the caller translates the exception into
    // result.errorMessage.
    static void encode(C)(C[] input, ref Result result) @safe pure
    {
        // We can do UTF-32->UTF-8 in place because all UTF-8 sequences are 4 or
        // less bytes, so the write position never overtakes the read position.
        static if(is(C == dchar))
        {
            char[4] encodeBuf;
            auto utf8 = cast(char[])input;
            // Number of UTF-8 bytes written so far. Must be size_t: it indexes the
            // buffer and an int could overflow on very large inputs.
            size_t length = 0;
            foreach(dchar c; input)
            {
                ++result.characterCount;
                // ASCII
                if(c < 0x80)
                {
                    utf8[length++] = cast(char)c;
                    continue;
                }

                // std.utf.encode returns the number of code units it wrote. Fully
                // qualified because this nested function shadows the name 'encode'.
                const bytes = std.utf.encode(encodeBuf, c);
                utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
                length += bytes;
            }
            result.utf8 = utf8[0 .. length];
        }
        // Unfortunately we can't do UTF-16 in place so we just use std.conv.to
        else
        {
            result.characterCount = std.utf.count(input);
            result.utf8 = input.to!(char[]);
        }
    }

    try final switch(encoding)
    {
        case UTFEncoding.UTF_8:
            // Already UTF-8: reinterpret the bytes and only validate/count.
            result.utf8 = cast(char[])input;
            result.utf8.validate();
            result.characterCount = std.utf.count(result.utf8);
            break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            encode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0, "UTF-32 buffer size must be a multiple of 4");
            encode(cast(dchar[])input, result);
            break;
    }
    // Conversion problems are reported through errorMessage so that this function
    // can stay nothrow.
    catch(ConvException e) { result.errorMessage = e.msg; }
    catch(UTFException e)  { result.errorMessage = e.msg; }
    catch(Exception e)
    {
        assert(false, "Unexpected exception in encode(): " ~ e.msg);
    }

    return result;
}
802 
/// Determine if all characters (code points, not bytes) in a string are printable.
///
/// ASCII code units are classified with a lookup table; every other code point is
/// decoded with std.utf.decode and compared against the accepted non-ASCII ranges.
///
/// Params:  chars = UTF-8 text to check.
///
/// Returns: true if every code point in chars is printable (also for empty input),
///          false as soon as a non-printable code point is found.
///
/// Throws:  UTFException (from std.utf.decode) if a non-ASCII sequence is malformed.
bool isPrintableValidUTF8(const char[] chars) @safe pure
{
    // This is oversized (only 128 entries are necessary) simply because having 256
    // entries improves performance... for some reason (alignment?)
    //
    // Entries 0x00-0x1F: only TAB (0x09), LF (0x0A) and CR (0x0D) are printable.
    // Entries 0x20-0x7F: all printable.
    // NOTE(review): 0x7F (DEL) is accepted here and in the scalar loop below, while
    // YAML's c-printable set appears to end at 0x7E - confirm this is intentional.
    // Entries 0x80-0xFF: never looked up (non-ASCII is decoded instead).
    bool[256] printable = [false, false, false, false, false, false, false, false,
                           false, true,  true,  false, false, true,  false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,

                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,

                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,
                           true,  true,  true,  true, true,  true,  true,  true,

                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,

                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false];

    for(size_t index; index < chars.length;)
    {
        // Fast path for ASCII.
        // Both this while() block and the if() block below it are optimized, unrolled
        // versions of the for() block below them; the while()/if() block could be
        // removed without affecting logic, but both help increase performance.
        size_t asciiCount = countASCII(chars[index .. $]);
        // 8 ASCII iterations unrolled, looping while more than 8 ASCII chars remain.
        while(asciiCount > 8)
        {
            const dchar b0 = chars[index];
            const dchar b1 = chars[index + 1];
            const dchar b2 = chars[index + 2];
            const dchar b3 = chars[index + 3];
            const dchar b4 = chars[index + 4];
            const dchar b5 = chars[index + 5];
            const dchar b6 = chars[index + 6];
            const dchar b7 = chars[index + 7];

            index += 8;
            asciiCount -= 8;

            // Every one of the 8 bytes must be checked; the last operand used to be
            // printable[b1], which let an unprintable 8th byte slip through.
            const all = printable[b0] & printable[b1] & printable[b2] & printable[b3] &
                        printable[b4] & printable[b5] & printable[b6] & printable[b7];
            if(!all)
            {
                return false;
            }
        }
        // 4 ASCII iterations unrolled
        if(asciiCount > 4)
        {
            const char b0 = chars[index];
            const char b1 = chars[index + 1];
            const char b2 = chars[index + 2];
            const char b3 = chars[index + 3];

            index += 4;
            asciiCount -= 4;

            if(!printable[b0]) { return false; }
            if(!printable[b1]) { return false; }
            if(!printable[b2]) { return false; }
            if(!printable[b3]) { return false; }
        }
        // Any remaining ASCII chars. This is really the only code needed to handle
        // ASCII, the above if() and while() blocks are just an optimization.
        for(; asciiCount > 0; --asciiCount)
        {
            const char b = chars[index];
            ++index;
            // Everything from 0x20 up is accepted without a table lookup.
            if(b >= 0x20)    { continue; }
            if(printable[b]) { continue; }
            return false;
        }

        // The ASCII run reached the end of the input; nothing left to decode.
        if(index == chars.length) { break; }

        // Not ASCII, need to decode. decode() advances index past the sequence.
        const dchar c = decode(chars, index);
        // We know c is not ASCII, so only check for printable non-ASCII chars.
        if(!(c == 0x85 || (c >= 0xA0 && c <= '\uD7FF') ||
            (c >= '\uE000' && c <= '\uFFFD') ||
            (c >= '\U00010000' && c <= '\U0010FFFF')))
        {
            return false;
        }
    }
    return true;
}
915 
/// Returns the length of the leading run of ASCII code units in buffer.
///
/// Counting stops at the first code unit above 0x7F (the start of a multi-byte
/// UTF-8 sequence) or at the end of the buffer, whichever comes first. The result
/// tells the caller how many characters can be processed without decoding.
size_t countASCII(const(char)[] buffer) @safe pure nothrow @nogc
{
    size_t asciiRun = 0;
    // Iterating with a char loop variable walks raw code units; nothing is decoded.
    foreach(const char unit; buffer)
    {
        if(unit > 0x7F) { break; }
        ++asciiRun;
    }
    return asciiRun;
}
923 // Unittests.
924 
/// Checks that a reader of type R reports UTF-16 and the expected byte order for
/// little-endian and big-endian input.
void testEndian(R)()
{
    // Builds a reader over `bytes` and compares what it reports with the expectations.
    void verify(ubyte[] bytes, Encoding wantEncoding, Endian wantEndian)
    {
        auto r = new R(bytes);
        assert(r.encoding == wantEncoding);
        assert(r.endian_ == wantEndian);
    }

    // A UTF-16 byte order mark followed by U+007A, once in each byte order.
    ubyte[] utf16LE = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] utf16BE = [0xFE, 0xFF, 0x00, 0x7A];

    verify(utf16LE, Encoding.UTF_16, Endian.littleEndian);
    verify(utf16BE, Encoding.UTF_16, Endian.bigEndian);
}
938 
/// Exercises peek(), prefix() and forward() of a reader of type R on the text "data"
/// preceded by a UTF-8 byte order mark.
void testPeekPrefixForward(R)()
{
    import std.encoding;

    ubyte[] input = bomTable[BOM.utf8].sequence ~ cast(ubyte[])"data";
    auto r = new R(input);

    // The BOM is not visible as a character: peeking starts at 'd'.
    assert(r.peek() == 'd');
    assert(r.peek(1) == 'a');
    assert(r.peek(2) == 't');
    assert(r.peek(3) == 'a');
    // One past the last character reads as NUL.
    assert(r.peek(4) == '\0');
    assert(r.prefix(4) == "data");
    // assert(r.prefix(6) == "data\0");

    // After skipping "da", index 1 is the final 'a'.
    r.forward(2);
    assert(r.peek(1) == 'a');
    // assert(collectException(r.peek(3)));
}
955 
/// Feeds the text "data" to a reader of type R as UTF-8, UTF-16 and UTF-32 (each with
/// a matching byte order mark in host byte order) and checks the characters read back.
void testUTF(R)()
{
    import std.encoding;

    dchar[] source = cast(dchar[])"data";

    // Prepends the BOM to the raw bytes of `units`, builds a reader over the result
    // and checks the first four characters.
    void checkDecoding(T)(T[] units, BOM bom)
    {
        ubyte[] raw = bomTable[bom].sequence ~
                      (cast(ubyte[])units)[0 .. units.length * T.sizeof];
        auto r = new R(raw);
        assert(r.peek() == 'd');
        assert(r.peek(1) == 'a');
        assert(r.peek(2) == 't');
        assert(r.peek(3) == 'a');
    }

    // The in-memory wchar/dchar data is in host byte order, so pick the BOM to match.
    const hostIsBigEndian = endian == Endian.bigEndian;

    checkDecoding!char(source.to!(char[]), BOM.utf8);
    checkDecoding!wchar(source.to!(wchar[]), hostIsBigEndian ? BOM.utf16be : BOM.utf16le);
    checkDecoding!dchar(source, hostIsBigEndian ? BOM.utf32be : BOM.utf32le);
}
974 
/// Checks that a reader of type R copes with a single-byte input: the one character
/// is readable and the position after it reads as NUL.
void test1Byte(R)()
{
    // 0x61 is ASCII 'a'.
    ubyte[] single = [0x61];
    auto r = new R(single);

    assert(r.peek() == 'a');
    assert(r.peek(1) == '\0');
    // assert(collectException(r.peek(2)));
}
984 
// Runs every generic reader test above against the Reader class.
@system unittest
{
    alias R = Reader;

    testEndian!R();
    testPeekPrefixForward!R();
    testUTF!R();
    test1Byte!R();
}