1 //          Copyright Ferdinand Majerech 2014.
2 // Distributed under the Boost Software License, Version 1.0.
3 //    (See accompanying file LICENSE_1_0.txt or copy at
4 //          http://www.boost.org/LICENSE_1_0.txt)
5 
6 
7 // Code that is currently unused but may be useful for future D:YAML releases
8 module dyaml.unused;
9 
10 
11 
12 import std.utf;
13 
14 import tinyendian;
15 
// Decode an UTF-8/16/32 buffer to UTF-32 (for UTF-32 this does nothing).
//
// Params:
//
// input    = The UTF-8/16/32 buffer to decode. For UTF-16 its length must be even,
//            for UTF-32 it must be a multiple of 4 (both checked by assertions).
// encoding = Encoding of input.
//
// Returns:
//
// A struct with the following members:
//
// $(D string errorMessage) In case of a decoding error, the error message is stored
//                          here. If there was no error, errorMessage is null. Always
//                          check this first before using the other members.
// $(D dchar[] decoded)     The decoded UTF-32 characters. For UTF-8/16 input this is
//                          a GC-allocated buffer; for UTF-32 input it is the input
//                          buffer itself reinterpreted as dchar[] (no copy is made).
auto decodeUTF(ubyte[] input, UTFEncoding encoding) @safe pure nothrow
{
    // Documented in the function comment above.
    struct Result
    {
        string errorMessage;
        dchar[] decoded;
    }

    Result result;

    // Decode input if it's encoded as UTF-8 or UTF-16.
    //
    // Params:
    //
    // input  = The input buffer to decode.
    // result = A Result struct to put decoded result and any error messages to.
    //
    // On error, result.errorMessage will be set.
    static void decode(C)(C[] input, ref Result result) @safe pure nothrow
    {
        // The complete sequences must cover the whole input; anything less means
        // the input ends with an incomplete (truncated) UTF sequence.
        if(endOfLastUTFSequence(input) < input.length)
        {
            result.errorMessage = "Invalid UTF character at the end of input";
            return;
        }

        // Every code unit yields at most one code point, so reserving
        // input.length elements up front avoids repeated reallocation while
        // appending below.
        result.decoded.reserve(input.length);

        try
        {
            for(size_t srcpos = 0; srcpos < input.length;)
            {
                const c = input[srcpos];
                if(c < 0x80)
                {
                    // Fast path: ASCII code units map directly to code points.
                    result.decoded ~= c;
                    ++srcpos;
                }
                else
                {
                    // std.utf.decode advances srcpos past the decoded sequence.
                    result.decoded ~= std.utf.decode(input, srcpos);
                }
            }
        }
        catch(UTFException e)
        {
            result.errorMessage = e.msg;
            return;
        }
        catch(Exception e)
        {
            assert(false, "Unexpected exception in decode(): " ~ e.msg);
        }
    }

    final switch(encoding)
    {
        case UTFEncoding.UTF_8:
            decode(cast(char[])input, result);
            break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            decode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0,
                    "UTF-32 buffer size must be a multiple of 4");
            // No need to decode anything; the result aliases the input buffer.
            result.decoded = cast(dchar[])input;
            break;
    }

    // result.errorMessage is non-null here if decoding failed.
    return result;
}
106 
107 
// Determine the end of last UTF-8 or UTF-16 sequence in a raw buffer.
//
// Params:
//
// buffer = Raw buffer of UTF-8 (char) or UTF-16 (wchar) code units.
//
// Returns:
//
// Index just past the last complete sequence: buffer.length if the buffer does not
// end in a truncated sequence, otherwise the index where the truncated sequence
// starts. For UTF-8, 0 is returned if no byte in the buffer can start a sequence.
// Validity of the sequences themselves is not checked here; invalid code units are
// caught when decoding.
size_t endOfLastUTFSequence(C)(const C[] buffer)
    @safe pure nothrow @nogc
{
    static if(is(C == char))
    {
        // Walk backwards to the last byte that can start a sequence. The counter
        // is kept one past the inspected index so it never underflows, even for
        // an empty buffer or on targets where size_t is 32 bits wide.
        for(size_t end = buffer.length; end > 0; --end)
        {
            const start = end - 1;
            const seqLength = utf8Stride[buffer[start]];
            if(seqLength != 0xFF)
            {
                // If the sequence goes beyond end of the buffer, it is truncated
                // and the complete part ends where it starts. Otherwise the last
                // sequence ends at buffer.length, so we can return that. (Unless
                // there is an invalid code unit, which is caught at decoding)
                return (seqLength > buffer.length - start) ? start : buffer.length;
            }
        }
        return 0;
    }
    else static if(is(C == wchar))
    {
        // TODO this is O(N), which is slow. Find out if we can somehow go
        // from the end backwards with UTF-16.
        size_t end = 0;
        while(end < buffer.length)
        {
            // Length (in code units) of the sequence starting at end.
            const s = stride(buffer, end);
            // A sequence reaching past the buffer is truncated; stop before it.
            if(s + end > buffer.length) { break; }
            end += s;
        }
        return end;
    }
    else
    {
        static assert(false, "endOfLastUTFSequence supports only char and wchar buffers");
    }
}
142 
// UTF-8 sequence lengths indexed by the first byte of a sequence (0xFF marks bytes
// that can't start a sequence). The table is computed at compile time.
immutable ubyte[256] utf8Stride = ()
{
    ubyte[256] strides;
    foreach(leadByte; 0 .. 256)
    {
        ubyte length;
        if(leadByte < 0x80)      { length = 1; }    // 0x00-0x7F
        else if(leadByte < 0xC0) { length = 0xFF; } // 0x80-0xBF
        else if(leadByte < 0xE0) { length = 2; }    // 0xC0-0xDF
        else if(leadByte < 0xF0) { length = 3; }    // 0xE0-0xEF
        else if(leadByte < 0xF8) { length = 4; }    // 0xF0-0xF7
        else if(leadByte < 0xFC) { length = 5; }    // 0xF8-0xFB
        else if(leadByte < 0xFE) { length = 6; }    // 0xFC-0xFD
        else                     { length = 0xFF; } // 0xFE-0xFF
        strides[leadByte] = length;
    }
    return strides;
}();