//          Copyright Ferdinand Majerech 2014.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)


// Code that is currently unused but may be useful for future D:YAML releases
module dyaml.unused;



import std.utf;

import tinyendian;

// Decode an UTF-8/16/32 buffer to UTF-32 (for UTF-32 this does nothing).
//
// Params:
//
// input    = The UTF-8/16/32 buffer to decode. For UTF-16 its length must be
//            even, for UTF-32 a multiple of 4 (checked by assertions).
// encoding = Encoding of input.
//
// Returns:
//
// A struct with the following members:
//
// $(D string errorMessage) In case of a decoding error, the error message is stored
//                          here. If there was no error, errorMessage is NULL. Always
//                          check this first before using the other members.
// $(D dchar[] decoded)     The decoded UTF-32 characters. For UTF-8/16 input this is
//                          a GC-allocated buffer; for UTF-32 input it is the input
//                          buffer itself, reinterpreted (no copy, no validation).
auto decodeUTF(ubyte[] input, UTFEncoding encoding) @safe pure nothrow
{
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;
        dchar[] decoded;
    }

    Result result;

    // Decode input if it's encoded as UTF-8 or UTF-16.
    //
    // Params:
    //
    // input  = The input buffer to decode.
    // result = A Result struct to put decoded result and any error messages to.
    //
    // On error, result.errorMessage will be set.
    static void decode(C)(C[] input, ref Result result) @safe pure nothrow
    {
        // End of the part of input that contains only complete sequences.
        const size_t end = endOfLastUTFSequence(input);
        // If end is before the end of the buffer, the buffer ends with an
        // incomplete UTF sequence (or contains no sequence start at all).
        if(end < input.length)
        {
            result.errorMessage = "Invalid UTF character at the end of input";
            return;
        }

        const srclength = input.length;
        try
        {
            for(size_t srcpos = 0; srcpos < srclength;)
            {
                const c = input[srcpos];
                if(c < 0x80)
                {
                    // ASCII fast path: the code unit is the code point.
                    result.decoded ~= c;
                    ++srcpos;
                }
                else
                {
                    // std.utf.decode advances srcpos past the decoded sequence.
                    result.decoded ~= std.utf.decode(input, srcpos);
                }
            }
        }
        catch(UTFException e)
        {
            result.errorMessage = e.msg;
            return;
        }
        catch(Exception e)
        {
            assert(false, "Unexpected exception in decode(): " ~ e.msg);
        }
    }

    final switch(encoding)
    {
        case UTFEncoding.UTF_8:
            decode(cast(char[])input, result);
            break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            decode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0,
                   "UTF-32 buffer size must be a multiple of 4");
            // No need to decode anything
            result.decoded = cast(dchar[])input;
            break;
    }

    // On error result.errorMessage is set; the caller must check it first.
    return result;
}


// Determine the end of last UTF-8 or UTF-16 sequence in a raw buffer.
//
// Params:
//
// buffer = Buffer of UTF-8 (char) or UTF-16 (wchar) code units.
//
// Returns:
//
// Index one past the last complete sequence in buffer. This equals
// buffer.length unless the buffer ends with a truncated sequence. For UTF-8,
// 0 is returned if no code unit in the buffer can start a sequence (this
// includes an empty buffer). Invalid code units inside the buffer are not
// detected here; they are caught at decoding.
size_t endOfLastUTFSequence(C)(const C[] buffer)
    @safe pure nothrow @nogc
{
    static if(is(C == char))
    {
        // Scan backwards for the last code unit that can start a sequence.
        // An unsigned counter one past the inspected index is used so that no
        // signed/unsigned conversion is needed; a signed index initialised
        // from `buffer.length - 1` misbehaves for an empty buffer when size_t
        // is 32 bits wide.
        for(size_t next = buffer.length; next > 0; --next)
        {
            const size_t start = next - 1;
            const seqLength = utf8Stride[buffer[start]];
            // 0xFF: this code unit can't start a sequence, keep looking.
            if(seqLength == 0xFF) { continue; }

            // If the sequence goes beyond end of the buffer, return its start.
            // Otherwise the last sequence ends at buffer.length, so we can
            // return that. (Unless there is an invalid code unit, which is
            // caught at decoding)
            return (seqLength > buffer.length - start) ? start : buffer.length;
        }
        return 0;
    }
    else static if(is(C == wchar))
    {
        // TODO this is O(N), which is slow. Find out if we can somehow go
        // from the end backwards with UTF-16.
        size_t end = 0;
        while(end < buffer.length)
        {
            const s = stride(buffer, end);
            if(s + end > buffer.length) { break; }
            end += s;
        }
        return end;
    }
    else
    {
        static assert(false, "endOfLastUTFSequence only supports char and wchar buffers");
    }
}

// UTF-8 strides indexed by the first code unit of a sequence
// (0xFF marks code units that can't start a sequence).
immutable ubyte[256] utf8Stride =
[
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];