Wire Sysio Wire Sysion 1.0.0
Loading...
Searching...
No Matches
utf8::internal Namespace Reference

Enumerations

enum  utf_error {
  UTF8_OK , NOT_ENOUGH_ROOM , INVALID_LEAD , INCOMPLETE_SEQUENCE ,
  OVERLONG_SEQUENCE , INVALID_CODE_POINT
}
 

Functions

template<typename octet_type >
uint8_t mask8 (octet_type oc)
 
template<typename u16_type >
uint16_t mask16 (u16_type oc)
 
template<typename octet_type >
bool is_trail (octet_type oc)
 
template<typename u16 >
bool is_lead_surrogate (u16 cp)
 
template<typename u16 >
bool is_trail_surrogate (u16 cp)
 
template<typename u16 >
bool is_surrogate (u16 cp)
 
template<typename u32 >
bool is_code_point_valid (u32 cp)
 
template<typename octet_iterator >
std::iterator_traits< octet_iterator >::difference_type sequence_length (octet_iterator lead_it)
 
template<typename octet_difference_type >
bool is_overlong_sequence (uint32_t cp, octet_difference_type length)
 
template<typename octet_iterator >
utf_error increase_safely (octet_iterator &it, octet_iterator end)
 Helper for the get_sequence_x functions: advances the iterator to the next octet and checks that it is a trail octet.
 
template<typename octet_iterator >
utf_error get_sequence_1 (octet_iterator &it, octet_iterator end, uint32_t &code_point)
 The get_sequence_x functions decode UTF-8 sequences of length x.
 
template<typename octet_iterator >
utf_error get_sequence_2 (octet_iterator &it, octet_iterator end, uint32_t &code_point)
 
template<typename octet_iterator >
utf_error get_sequence_3 (octet_iterator &it, octet_iterator end, uint32_t &code_point)
 
template<typename octet_iterator >
utf_error get_sequence_4 (octet_iterator &it, octet_iterator end, uint32_t &code_point)
 
template<typename octet_iterator >
utf_error validate_next (octet_iterator &it, octet_iterator end, uint32_t &code_point)
 
template<typename octet_iterator >
utf_error validate_next (octet_iterator &it, octet_iterator end)
 

Variables

const uint16_t LEAD_SURROGATE_MIN = 0xd800u
 
const uint16_t LEAD_SURROGATE_MAX = 0xdbffu
 
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u
 
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu
 
const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10)
 
const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN
 
const uint32_t CODE_POINT_MAX = 0x0010ffffu
 

Enumeration Type Documentation

◆ utf_error

Enumerator
UTF8_OK 
NOT_ENOUGH_ROOM 
INVALID_LEAD 
INCOMPLETE_SEQUENCE 
OVERLONG_SEQUENCE 
INVALID_CODE_POINT 

Definition at line 134 of file core.h.

Function Documentation

◆ get_sequence_1()

template<typename octet_iterator >
utf_error utf8::internal::get_sequence_1 ( octet_iterator & it,
octet_iterator end,
uint32_t & code_point )

Definition at line 153 of file core.h.

154 {
155 if (it == end)
156 return NOT_ENOUGH_ROOM;
157
158 code_point = utf8::internal::mask8(*it);
159
160 return UTF8_OK;
161 }
uint8_t mask8(octet_type oc)
Definition core.h:59
Here is the call graph for this function:
Here is the caller graph for this function:

◆ get_sequence_2()

template<typename octet_iterator >
utf_error utf8::internal::get_sequence_2 ( octet_iterator & it,
octet_iterator end,
uint32_t & code_point )

Definition at line 164 of file core.h.

165 {
166 if (it == end)
167 return NOT_ENOUGH_ROOM;
168
169 code_point = utf8::internal::mask8(*it);
170
171 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
172
173 code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
174
175 return UTF8_OK;
176 }
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END)
Definition core.h:149
Here is the call graph for this function:
Here is the caller graph for this function:

◆ get_sequence_3()

template<typename octet_iterator >
utf_error utf8::internal::get_sequence_3 ( octet_iterator & it,
octet_iterator end,
uint32_t & code_point )

Definition at line 179 of file core.h.

180 {
181 if (it == end)
182 return NOT_ENOUGH_ROOM;
183
184 code_point = utf8::internal::mask8(*it);
185
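186 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
187
188 code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
189
190 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
191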
192 code_point += (*it) & 0x3f;
193
194 return UTF8_OK;
195 }
Here is the call graph for this function:
Here is the caller graph for this function:

◆ get_sequence_4()

template<typename octet_iterator >
utf_error utf8::internal::get_sequence_4 ( octet_iterator & it,
octet_iterator end,
uint32_t & code_point )

Definition at line 198 of file core.h.

199 {
200 if (it == end)
201 return NOT_ENOUGH_ROOM;
202
203 code_point = utf8::internal::mask8(*it);
204
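205 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
206
207 code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
208
209 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
210
211 code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
212
213 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
214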
215 code_point += (*it) & 0x3f;
216
217 return UTF8_OK;
218 }
Here is the call graph for this function:
Here is the caller graph for this function:

◆ increase_safely()

template<typename octet_iterator >
utf_error utf8::internal::increase_safely ( octet_iterator & it,
octet_iterator end )

Definition at line 138 of file core.h.

139 {
140 if (++it == end)
141 return NOT_ENOUGH_ROOM;
142
143 if (!utf8::internal::is_trail(*it))
144 return INCOMPLETE_SEQUENCE;
145
146 return UTF8_OK;
147 }
bool is_trail(octet_type oc)
Definition core.h:69
Here is the call graph for this function:

◆ is_code_point_valid()

template<typename u32 >
bool utf8::internal::is_code_point_valid ( u32 cp)
inline

Definition at line 93 of file core.h.

94 {
95 return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
96 }
bool is_surrogate(u16 cp)
Definition core.h:87
Here is the call graph for this function:
Here is the caller graph for this function:

◆ is_lead_surrogate()

template<typename u16 >
bool utf8::internal::is_lead_surrogate ( u16 cp)
inline

Definition at line 75 of file core.h.

76 {
77 return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
78 }
Here is the caller graph for this function:

◆ is_overlong_sequence()

template<typename octet_difference_type >
bool utf8::internal::is_overlong_sequence ( uint32_t cp,
octet_difference_type length )
inline

Definition at line 116 of file core.h.

117 {
118 if (cp < 0x80) {
119 if (length != 1)
120 return true;
121 }
122 else if (cp < 0x800) {
123 if (length != 2)
124 return true;
125 }
126 else if (cp < 0x10000) {
127 if (length != 3)
128 return true;
129 }
130
131 return false;
132 }
Here is the caller graph for this function:

◆ is_surrogate()

template<typename u16 >
bool utf8::internal::is_surrogate ( u16 cp)
inline

Definition at line 87 of file core.h.

88 {
89 return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
90 }
Here is the caller graph for this function:

◆ is_trail()

template<typename octet_type >
bool utf8::internal::is_trail ( octet_type oc)
inline

Definition at line 69 of file core.h.

70 {
71 return ((utf8::internal::mask8(oc) >> 6) == 0x2);
72 }
Here is the call graph for this function:
Here is the caller graph for this function:

◆ is_trail_surrogate()

template<typename u16 >
bool utf8::internal::is_trail_surrogate ( u16 cp)
inline

Definition at line 81 of file core.h.

82 {
83 return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
84 }
Here is the caller graph for this function:

◆ mask16()

template<typename u16_type >
uint16_t utf8::internal::mask16 ( u16_type oc)
inline

Definition at line 64 of file core.h.

65 {
66 return static_cast<uint16_t>(0xffff & oc);
67 }
unsigned short uint16_t
Definition stdint.h:125
Here is the caller graph for this function:

◆ mask8()

template<typename octet_type >
uint8_t utf8::internal::mask8 ( octet_type oc)
inline

Definition at line 59 of file core.h.

60 {
61 return static_cast<uint8_t>(0xff & oc);
62 }
unsigned char uint8_t
Definition stdint.h:124
Here is the caller graph for this function:

◆ sequence_length()

template<typename octet_iterator >
std::iterator_traits< octet_iterator >::difference_type utf8::internal::sequence_length ( octet_iterator lead_it)
inline

Definition at line 100 of file core.h.

101 {
102 uint8_t lead = utf8::internal::mask8(*lead_it);
103 if (lead < 0x80)
104 return 1;
105 else if ((lead >> 5) == 0x6)
106 return 2;
107 else if ((lead >> 4) == 0xe)
108 return 3;
109 else if ((lead >> 3) == 0x1e)
110 return 4;
111 else
112 return 0;
113 }
Here is the call graph for this function:
Here is the caller graph for this function:

◆ validate_next() [1/2]

template<typename octet_iterator >
utf_error utf8::internal::validate_next ( octet_iterator & it,
octet_iterator end )
inline

Definition at line 275 of file core.h.

275 {
276 uint32_t ignored;
277 return utf8::internal::validate_next(it, end, ignored);
278 }
utf_error validate_next(octet_iterator &it, octet_iterator end, uint32_t &code_point)
Definition core.h:223
unsigned int uint32_t
Definition stdint.h:126
Here is the call graph for this function:

◆ validate_next() [2/2]

template<typename octet_iterator >
utf_error utf8::internal::validate_next ( octet_iterator & it,
octet_iterator end,
uint32_t & code_point )

Definition at line 223 of file core.h.

224 {
225 // Save the original value of it so we can go back in case of failure
226 // Of course, it does not make much sense with e.g. stream iterators
227 octet_iterator original_it = it;
228
229 uint32_t cp = 0;
230 // Determine the sequence length based on the lead octet
231 typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
232 const octet_difference_type length = utf8::internal::sequence_length(it);
233
234 // Get trail octets and calculate the code point
235 utf_error err = UTF8_OK;
236 switch (length) {
237 case 0:
238 return INVALID_LEAD;
239 case 1:
240 err = utf8::internal::get_sequence_1(it, end, cp);
241 break;
242 case 2:
243 err = utf8::internal::get_sequence_2(it, end, cp);
244 break;
245 case 3:
246 err = utf8::internal::get_sequence_3(it, end, cp);
247 break;
248 case 4:
249 err = utf8::internal::get_sequence_4(it, end, cp);
250 break;
251 }
252
253 if (err == UTF8_OK) {
254 // Decoding succeeded. Now, security checks...
255 if (utf8::internal::is_code_point_valid(cp)) {
256 if (!utf8::internal::is_overlong_sequence(cp, length)){
257 // Passed! Return here.
258 code_point = cp;
259 ++it;
260 return UTF8_OK;
261 }
262 else
263 err = OVERLONG_SEQUENCE;
264 }
265 else
266 err = INVALID_CODE_POINT;
267 }
268
269 // Failure branch - restore the original value of the iterator
270 it = original_it;
271 return err;
272 }
bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
Definition core.h:116
utf_error get_sequence_1(octet_iterator &it, octet_iterator end, uint32_t &code_point)
The get_sequence_x functions decode UTF-8 sequences of length x.
Definition core.h:153
bool is_code_point_valid(u32 cp)
Definition core.h:93
utf_error get_sequence_2(octet_iterator &it, octet_iterator end, uint32_t &code_point)
Definition core.h:164
utf_error get_sequence_3(octet_iterator &it, octet_iterator end, uint32_t &code_point)
Definition core.h:179
utf_error get_sequence_4(octet_iterator &it, octet_iterator end, uint32_t &code_point)
Definition core.h:198
std::iterator_traits< octet_iterator >::difference_type sequence_length(octet_iterator lead_it)
Definition core.h:100
Here is the call graph for this function:
Here is the caller graph for this function:

Variable Documentation

◆ CODE_POINT_MAX

const uint32_t utf8::internal::CODE_POINT_MAX = 0x0010ffffu

Definition at line 56 of file core.h.

◆ LEAD_OFFSET

const uint16_t utf8::internal::LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10)

Definition at line 52 of file core.h.

◆ LEAD_SURROGATE_MAX

const uint16_t utf8::internal::LEAD_SURROGATE_MAX = 0xdbffu

Definition at line 49 of file core.h.

◆ LEAD_SURROGATE_MIN

const uint16_t utf8::internal::LEAD_SURROGATE_MIN = 0xd800u

Definition at line 48 of file core.h.

◆ SURROGATE_OFFSET

const uint32_t utf8::internal::SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN

Definition at line 53 of file core.h.

◆ TRAIL_SURROGATE_MAX

const uint16_t utf8::internal::TRAIL_SURROGATE_MAX = 0xdfffu

Definition at line 51 of file core.h.

◆ TRAIL_SURROGATE_MIN

const uint16_t utf8::internal::TRAIL_SURROGATE_MIN = 0xdc00u

Definition at line 50 of file core.h.