Wire Sysio Wire Sysion 1.0.0
Loading...
Searching...
No Matches
core.h
Go to the documentation of this file.
1// Copyright 2006 Nemanja Trifunovic
2
3/*
4Permission is hereby granted, free of charge, to any person or organization
5obtaining a copy of the software and accompanying documentation covered by
6this license (the "Software") to use, reproduce, display, distribute,
7execute, and transmit the Software, and to prepare derivative works of the
8Software, and to permit third-parties to whom the Software is furnished to
9do so, all subject to the following:
10
11The copyright notices in the Software and this entire statement, including
12the above license grant, this restriction and the following disclaimer,
13must be included in all copies of the Software, in whole or in part, and
14all derivative works of the Software, unless such copies or derivative
15works are solely in the form of machine-executable object code generated by
16a source language processor.
17
18THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24DEALINGS IN THE SOFTWARE.
25*/
26
27
28#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
29#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
30
31#include <iterator>
32
33namespace utf8
34{
// Fixed-width unsigned integer aliases used throughout the library.
// They mirror the names found in cstdint / boost/cstdint; adjust the
// underlying types here if they do not have these widths on your platform.
typedef unsigned char  uint8_t;   // one UTF-8 octet
typedef unsigned short uint16_t;  // one UTF-16 code unit
typedef unsigned int   uint32_t;  // one code point
41
42// Helper code - not intended to be directly called by the library users. May be changed at any time
43namespace internal
44{
// Unicode constants
// Leading (high) surrogates: 0xd800 - 0xdbff
// Trailing (low) surrogates: 0xdc00 - 0xdfff
const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
// Added to (cp >> 10) to obtain the lead surrogate of a supplementary code point.
const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
// Added to (lead << 10) + trail to obtain the code point of a surrogate pair
// (relies on unsigned wrap-around).
const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

// Maximum valid value for a Unicode code point
const uint32_t CODE_POINT_MAX      = 0x0010ffffu;
57
// Reduces `oc` to its least significant 8 bits and returns them as uint8_t.
// Lets callers treat possibly-signed char values as plain octets.
template<typename octet_type>
inline uint8_t mask8(octet_type oc)
{
    const uint8_t low_octet = static_cast<uint8_t>(oc & 0xff);
    return low_octet;
}
// Reduces `oc` to its least significant 16 bits and returns them as uint16_t.
template<typename u16_type>
inline uint16_t mask16(u16_type oc)
{
    const uint16_t low_word = static_cast<uint16_t>(oc & 0xffff);
    return low_word;
}
68 template<typename octet_type>
69 inline bool is_trail(octet_type oc)
70 {
71 return ((utf8::internal::mask8(oc) >> 6) == 0x2);
72 }
73
74 template <typename u16>
75 inline bool is_lead_surrogate(u16 cp)
76 {
77 return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
78 }
79
80 template <typename u16>
81 inline bool is_trail_surrogate(u16 cp)
82 {
83 return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
84 }
85
86 template <typename u16>
87 inline bool is_surrogate(u16 cp)
88 {
89 return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
90 }
91
92 template <typename u32>
93 inline bool is_code_point_valid(u32 cp)
94 {
95 return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
96 }
97
98 template <typename octet_iterator>
99 inline typename std::iterator_traits<octet_iterator>::difference_type
100 sequence_length(octet_iterator lead_it)
101 {
102 uint8_t lead = utf8::internal::mask8(*lead_it);
103 if (lead < 0x80)
104 return 1;
105 else if ((lead >> 5) == 0x6)
106 return 2;
107 else if ((lead >> 4) == 0xe)
108 return 3;
109 else if ((lead >> 3) == 0x1e)
110 return 4;
111 else
112 return 0;
113 }
114
// Reports whether `cp` was encoded with more octets than necessary.
// `length` is the number of octets the sequence actually used.
// Code points below 0x80 need 1 octet, below 0x800 need 2, below 0x10000 need 3;
// any other length for those ranges is reported as overlong. Code points of
// 0x10000 and above are never reported.
template <typename octet_difference_type>
inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
{
    if (cp < 0x80)
        return length != 1;
    if (cp < 0x800)
        return length != 2;
    if (cp < 0x10000)
        return length != 3;
    return false;
}
133
// Status codes returned by the internal decoding helpers.
// UTF8_OK signals success; every other value names the reason decoding failed.
enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

137 template <typename octet_iterator>
138 utf_error increase_safely(octet_iterator& it, octet_iterator end)
139 {
140 if (++it == end)
141 return NOT_ENOUGH_ROOM;
142
143 if (!utf8::internal::is_trail(*it))
144 return INCOMPLETE_SEQUENCE;
145
146 return UTF8_OK;
147 }
148
// Advances IT via increase_safely and makes the *enclosing function* return
// the error code if the advance fails. Expands to a braced block, so it is
// used as a statement on its own without a trailing semicolon.
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) \
    { \
        utf_error ret = increase_safely(IT, END); \
        if (ret != UTF8_OK) \
            return ret; \
    }
150
152 template <typename octet_iterator>
153 utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
154 {
155 if (it == end)
156 return NOT_ENOUGH_ROOM;
157
158 code_point = utf8::internal::mask8(*it);
159
160 return UTF8_OK;
161 }
162
163 template <typename octet_iterator>
164 utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
165 {
166 if (it == end)
167 return NOT_ENOUGH_ROOM;
168
169 code_point = utf8::internal::mask8(*it);
170
172
173 code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
174
175 return UTF8_OK;
176 }
177
178 template <typename octet_iterator>
179 utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
180 {
181 if (it == end)
182 return NOT_ENOUGH_ROOM;
183
184 code_point = utf8::internal::mask8(*it);
185
187
188 code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
189
191
192 code_point += (*it) & 0x3f;
193
194 return UTF8_OK;
195 }
196
197 template <typename octet_iterator>
198 utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
199 {
200 if (it == end)
201 return NOT_ENOUGH_ROOM;
202
203 code_point = utf8::internal::mask8(*it);
204
206
207 code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
208
210
211 code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
212
214
215 code_point += (*it) & 0x3f;
216
217 return UTF8_OK;
218 }
219
220 #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
221
222 template <typename octet_iterator>
223 utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
224 {
225 // Save the original value of it so we can go back in case of failure
226 // Of course, it does not make much sense with i.e. stream iterators
227 octet_iterator original_it = it;
228
229 uint32_t cp = 0;
230 // Determine the sequence length based on the lead octet
231 typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
232 const octet_difference_type length = utf8::internal::sequence_length(it);
233
234 // Get trail octets and calculate the code point
235 utf_error err = UTF8_OK;
236 switch (length) {
237 case 0:
238 return INVALID_LEAD;
239 case 1:
240 err = utf8::internal::get_sequence_1(it, end, cp);
241 break;
242 case 2:
243 err = utf8::internal::get_sequence_2(it, end, cp);
244 break;
245 case 3:
246 err = utf8::internal::get_sequence_3(it, end, cp);
247 break;
248 case 4:
249 err = utf8::internal::get_sequence_4(it, end, cp);
250 break;
251 }
252
253 if (err == UTF8_OK) {
254 // Decoding succeeded. Now, security checks...
256 if (!utf8::internal::is_overlong_sequence(cp, length)){
257 // Passed! Return here.
258 code_point = cp;
259 ++it;
260 return UTF8_OK;
261 }
262 else
263 err = OVERLONG_SEQUENCE;
264 }
265 else
266 err = INVALID_CODE_POINT;
267 }
268
269 // Failure branch - restore the original value of the iterator
270 it = original_it;
271 return err;
272 }
273
274 template <typename octet_iterator>
275 inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
276 uint32_t ignored;
277 return utf8::internal::validate_next(it, end, ignored);
278 }
279
280} // namespace internal
281
283
// The three octets of the UTF-8 byte order mark (U+FEFF encoded as UTF-8).
const uint8_t bom[3] = {0xef, 0xbb, 0xbf};
286
287 template <typename octet_iterator>
288 octet_iterator find_invalid(octet_iterator start, octet_iterator end)
289 {
290 octet_iterator result = start;
291 while (result != end) {
293 if (err_code != internal::UTF8_OK)
294 return result;
295 }
296 return result;
297 }
298
299 template <typename octet_iterator>
300 inline bool is_valid(octet_iterator start, octet_iterator end)
301 {
302 return (utf8::find_invalid(start, end) == end);
303 }
304
305 template <typename octet_iterator>
306 inline bool starts_with_bom (octet_iterator it, octet_iterator end)
307 {
308 return (
309 ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
310 ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
311 ((it != end) && (utf8::internal::mask8(*it)) == bom[2])
312 );
313 }
314
315 //Deprecated in release 2.3
316 template <typename octet_iterator>
317 inline bool is_bom (octet_iterator it)
318 {
319 return (
320 (utf8::internal::mask8(*it++)) == bom[0] &&
321 (utf8::internal::mask8(*it++)) == bom[1] &&
322 (utf8::internal::mask8(*it)) == bom[2]
323 );
324 }
325} // namespace utf8
326
327#endif // header guard
328
329
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END)
Definition core.h:149
const uint32_t CODE_POINT_MAX
Definition core.h:56
bool is_lead_surrogate(u16 cp)
Definition core.h:75
bool is_trail_surrogate(u16 cp)
Definition core.h:81
bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
Definition core.h:116
bool is_surrogate(u16 cp)
Definition core.h:87
const uint32_t SURROGATE_OFFSET
Definition core.h:53
utf_error increase_safely(octet_iterator &it, octet_iterator end)
Helper for get_sequence_x.
Definition core.h:138
const uint16_t TRAIL_SURROGATE_MAX
Definition core.h:51
const uint16_t LEAD_OFFSET
Definition core.h:52
utf_error get_sequence_1(octet_iterator &it, octet_iterator end, uint32_t &code_point)
get_sequence_x functions decode utf-8 sequences of the length x
Definition core.h:153
const uint16_t TRAIL_SURROGATE_MIN
Definition core.h:50
bool is_code_point_valid(u32 cp)
Definition core.h:93
utf_error get_sequence_2(octet_iterator &it, octet_iterator end, uint32_t &code_point)
Definition core.h:164
@ INCOMPLETE_SEQUENCE
Definition core.h:134
@ OVERLONG_SEQUENCE
Definition core.h:134
@ INVALID_CODE_POINT
Definition core.h:134
@ NOT_ENOUGH_ROOM
Definition core.h:134
uint16_t mask16(u16_type oc)
Definition core.h:64
bool is_trail(octet_type oc)
Definition core.h:69
utf_error validate_next(octet_iterator &it, octet_iterator end, uint32_t &code_point)
Definition core.h:223
utf_error get_sequence_3(octet_iterator &it, octet_iterator end, uint32_t &code_point)
Definition core.h:179
const uint16_t LEAD_SURROGATE_MIN
Definition core.h:48
utf_error get_sequence_4(octet_iterator &it, octet_iterator end, uint32_t &code_point)
Definition core.h:198
std::iterator_traits< octet_iterator >::difference_type sequence_length(octet_iterator lead_it)
Definition core.h:100
uint8_t mask8(octet_type oc)
Definition core.h:59
const uint16_t LEAD_SURROGATE_MAX
Definition core.h:49
Definition checked.h:35
bool starts_with_bom(octet_iterator it, octet_iterator end)
Definition core.h:306
const uint8_t bom[]
The library API - functions intended to be called by the users.
Definition core.h:285
bool is_valid(octet_iterator start, octet_iterator end)
Definition core.h:300
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
Definition core.h:288
bool is_bom(octet_iterator it)
Definition core.h:317
unsigned short uint16_t
Definition stdint.h:125
unsigned int uint32_t
Definition stdint.h:126
unsigned char uint8_t
Definition stdint.h:124