~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

TidyLib
tidy/src/utf8.c

Version: ~ [ 1.0 ] ~

** Warning: Cannot open xref database.

1 /* utf8.c -- convert characters to/from UTF-8 2 3 (c) 1998-2004 (W3C) MIT, ERCIM, Keio University 4 See tidy.h for the copyright notice. 5 6 CVS Info : 7 8 $Author: terry_teague $ 9 $Date: 2004/08/02 02:32:36 $ 10 $Revision: 1.7 $ 11 12 Uses public interfaces to abstract input source and output 13 sink, which may be user supplied or either FILE* or memory 14 based Tidy implementations. Encoding support is uniform 15 regardless of I/O mechanism. 16 17 Note, UTF-8 encoding, by itself, does not affect the actual 18 "codepoints" of the underlying character encoding. In the 19 cases of ASCII, Latin1, Unicode (16-bit, BMP), these all 20 refer to ISO-10646 "codepoints". For anything else, they 21 refer to some other "codepoint" set. 22 23 Put another way, UTF-8 is a variable length method to 24 represent any non-negative integer value. The glyph 25 that a integer value represents is unchanged and defined 26 externally (e.g. by ISO-10646, Big5, Win1252, MacRoman, 27 Latin2-9, and so on). 28 29 Put still another way, UTF-8 is more of a _transfer_ encoding 30 than a _character_ encoding, per se. 31 */ 32 33 #include "tidy.h" 34 #include "utf8.h" 35 36 /* 37 UTF-8 encoding/decoding functions 38 Return # of bytes in UTF-8 sequence; result < 0 if illegal sequence 39 40 Also see below for UTF-16 encoding/decoding functions 41 42 References : 43 44 1) UCS Transformation Format 8 (UTF-8): 45 ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D 46 <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335> 47 <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html> 48 49 Table 4 - Mapping from UCS-4 to UTF-8 50 51 2) Unicode standards: 52 <http://www.unicode.org/unicode/standard/standard.html> 53 54 3) Legal UTF-8 byte sequences: 55 <http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html> 56 57 Code point 1st byte 2nd byte 3rd byte 4th byte 58 ---------- -------- -------- -------- -------- 59 U+0000..U+007F 00..7F 60 U+0080..U+07FF C2..DF 80..BF 61 U+0800..U+0FFF E0 A0..BF 80..BF 62 U+1000..U+FFFF E1..EF 80..BF 80..BF 63 U+10000..U+3FFFF F0 90..BF 80..BF 80..BF 64 U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF 65 U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 66 67 The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also 68 allows for the use of five- and six-byte sequences to encode 69 characters that are outside the range of the Unicode character 70 set; those five- and six-byte sequences are illegal for the use 71 of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646 72 does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF 73 (but it does allow other noncharacters). 74 75 4) RFC 2279: UTF-8, a transformation format of ISO 10646: 76 <http://www.ietf.org/rfc/rfc2279.txt> 77 78 5) UTF-8 and Unicode FAQ: 79 <http://www.cl.cam.ac.uk/~mgk25/unicode.html> 80 81 6) Markus Kuhn's UTF-8 decoder stress test file: 82 <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt> 83 84 7) UTF-8 Demo: 85 <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt> 86 87 8) UTF-8 Sampler: 88 <http://www.columbia.edu/kermit/utf8.html> 89 90 9) Transformation Format for 16 Planes of Group 00 (UTF-16): 91 ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C 92 <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf> 93 <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html> 94 95 10) RFC 2781: UTF-16, an encoding of ISO 10646: 96 <http://www.ietf.org/rfc/rfc2781.txt> 97 98 11) UTF-16 invalid surrogate pairs: 99 <http://www.unicode.org/unicode/faq/utf_bom.html#16> 100 101 UTF-16 UTF-8 UCS-4 102 D83F DFF* F0 9F BF B* 0001FFF* 103 D87F DFF* F0 AF BF B* 0002FFF* 104 D8BF DFF* F0 BF BF B* 0003FFF* 105 D8FF DFF* F1 8F BF B* 0004FFF* 106 D93F DFF* F1 9F BF B* 0005FFF* 107 D97F DFF* F1 AF BF B* 0006FFF* 108 ... 109 DBBF DFF* F3 BF BF B* 000FFFF* 110 DBFF DFF* F4 8F BF B* 0010FFF* 111 112 * = E or F 113 114 1010 A 115 1011 B 116 1100 C 117 1101 D 118 1110 E 119 1111 F 120 121 */ 122 123 #define kNumUTF8Sequences 7 124 #define kMaxUTF8Bytes 4 125 126 #define kUTF8ByteSwapNotAChar 0xFFFE 127 #define kUTF8NotAChar 0xFFFF 128 129 #define kMaxUTF8FromUCS4 0x10FFFF 130 131 #define kUTF16SurrogatesBegin 0x10000 132 #define kMaxUTF16FromUCS4 0x10FFFF 133 134 /* UTF-16 surrogate pair areas */ 135 #define kUTF16LowSurrogateBegin 0xD800 136 #define kUTF16LowSurrogateEnd 0xDBFF 137 #define kUTF16HighSurrogateBegin 0xDC00 138 #define kUTF16HighSurrogateEnd 0xDFFF 139 140 141 /* offsets into validUTF8 table below */ 142 static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] = 143 { 144 0, /* 1 byte */ 145 1, /* 2 bytes */ 146 2, /* 3 bytes */ 147 4, /* 4 bytes */ 148 kNumUTF8Sequences /* must be last */ 149 }; 150 151 static const struct validUTF8Sequence 152 { 153 uint lowChar; 154 uint highChar; 155 int numBytes; 156 byte validBytes[8]; 157 } validUTF8[kNumUTF8Sequences] = 158 { 159 /* low high #bytes byte 1 byte 2 byte 3 byte 4 */ 160 {0x0000, 0x007F, 1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, 161 {0x0080, 0x07FF, 2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}}, 162 {0x0800, 0x0FFF, 3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}}, 163 {0x1000, 0xFFFF, 3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}}, 164 {0x10000, 0x3FFFF, 4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}}, 165 {0x40000, 0xFFFFF, 4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}}, 166 {0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}} 167 }; 168 169 int DecodeUTF8BytesToChar( uint* c, uint firstByte, ctmbstr successorBytes, 170 TidyInputSource* inp, int* count ) 171 { 172 byte tempbuf[10]; 173 byte *buf = &tempbuf[0]; 174 uint ch = 0, n = 0; 175 int i, bytes = 0; 176 Bool hasError = no; 177 178 if ( successorBytes ) 179 buf = (byte*) successorBytes; 180 181 /* special check if we have been passed an EOF char */ 182 if ( firstByte == EndOfStream ) 183 { 184 /* at present */ 185 *c = firstByte; 186 *count = 1; 187 return 0; 188 } 189 190 ch = firstByte; /* first byte is passed in separately */ 191 192 if (ch <= 0x7F) /* 0XXX XXXX one byte */ 193 { 194 n = ch; 195 bytes = 1; 196 } 197 else if ((ch & 0xE0) == 0xC0) /* 110X XXXX two bytes */ 198 { 199 n = ch & 31; 200 bytes = 2; 201 } 202 else if ((ch & 0xF0) == 0xE0) /* 1110 XXXX three bytes */ 203 { 204 n = ch & 15; 205 bytes = 3; 206 } 207 else if ((ch & 0xF8) == 0xF0) /* 1111 0XXX four bytes */ 208 { 209 n = ch & 7; 210 bytes = 4; 211 } 212 else if ((ch & 0xFC) == 0xF8) /* 1111 10XX five bytes */ 213 { 214 n = ch & 3; 215 bytes = 5; 216 hasError = yes; 217 } 218 else if ((ch & 0xFE) == 0xFC) /* 1111 110X six bytes */ 219 { 220 n = ch & 1; 221 bytes = 6; 222 hasError = yes; 223 } 224 else 225 { 226 /* not a valid first byte of a UTF-8 sequence */ 227 n = ch; 228 bytes = 1; 229 hasError = yes; 230 } 231 232 /* successor bytes should have the form 10XX XXXX */ 233 234 /* If caller supplied buffer, use it. Else see if caller 235 ** supplied an input source, use that. 236 */ 237 if ( successorBytes ) 238 { 239 for ( i=0; i < bytes-1; ++i ) 240 { 241 if ( !buf[i] || (buf[i] & 0xC0) != 0x80 ) 242 { 243 hasError = yes; 244 bytes = i; 245 break; 246 } 247 n = (n << 6) | (buf[i] & 0x3F); 248 } 249 } 250 else if ( inp ) 251 { 252 for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i ) 253 { 254 int b = inp->getByte( inp->sourceData ); 255 buf[i] = (tmbchar) b; 256 257 /* End of data or illegal successor byte value */ 258 if ( b == EOF || (buf[i] & 0xC0) != 0x80 ) 259 { 260 hasError = yes; 261 bytes = i; 262 if ( b != EOF ) 263 inp->ungetByte( inp->sourceData, buf[i] ); 264 break; 265 } 266 n = (n << 6) | (buf[i] & 0x3F); 267 } 268 } 269 else if ( bytes > 1 ) 270 { 271 hasError = yes; 272 bytes = 1; 273 } 274 275 if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar))) 276 hasError = yes; 277 278 if (!hasError && (n > kMaxUTF8FromUCS4)) 279 hasError = yes; 280 281 #if 0 /* Breaks Big5 D8 - DF */ 282 if (!hasError && (n >= kUTF16LowSurrogateBegin) && (n <= kUTF16HighSurrogateEnd)) 283 /* unpaired surrogates not allowed */ 284 hasError = yes; 285 #endif 286 287 if (!hasError) 288 { 289 int lo, hi; 290 291 lo = offsetUTF8Sequences[bytes - 1]; 292 hi = offsetUTF8Sequences[bytes] - 1; 293 294 /* check for overlong sequences */ 295 if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar)) 296 hasError = yes; 297 else 298 { 299 hasError = yes; /* assume error until proven otherwise */ 300 301 for (i = lo; i <= hi; i++) 302 { 303 int tempCount; 304 byte theByte; 305 306 for (tempCount = 0; tempCount < bytes; tempCount++) 307 { 308 if (!tempCount) 309 theByte = (tmbchar) firstByte; 310 else 311 theByte = buf[tempCount - 1]; 312 313 if ( theByte >= validUTF8[i].validBytes[(tempCount * 2)] && 314 theByte <= validUTF8[i].validBytes[(tempCount * 2) + 1] ) 315 hasError = no; 316 if (hasError) 317 break; 318 } 319 } 320 } 321 } 322 323 #if 1 && defined(_DEBUG) 324 if ( hasError ) 325 { 326 /* debug */ 327 fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes ); 328 fprintf( stderr, "0x%02x ", firstByte ); 329 for (i = 1; i < bytes; i++) 330 fprintf( stderr, "0x%02x ", buf[i - 1] ); 331 fprintf( stderr, " = U+%04ulx\n", n ); 332 } 333 #endif 334 335 *count = bytes; 336 *c = n; 337 if ( hasError ) 338 return -1; 339 return 0; 340 } 341 342 int EncodeCharToUTF8Bytes( uint c, tmbstr encodebuf, 343 TidyOutputSink* outp, int* count ) 344 { 345 byte tempbuf[10] = {0}; 346 byte* buf = &tempbuf[0]; 347 int bytes = 0; 348 Bool hasError = no; 349 350 if ( encodebuf ) 351 buf = (byte*) encodebuf; 352 353 if (c <= 0x7F) /* 0XXX XXXX one byte */ 354 { 355 buf[0] = (tmbchar) c; 356 bytes = 1; 357 } 358 else if (c <= 0x7FF) /* 110X XXXX two bytes */ 359 { 360 buf[0] = (tmbchar) ( 0xC0 | (c >> 6) ); 361 buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) ); 362 bytes = 2; 363 } 364 else if (c <= 0xFFFF) /* 1110 XXXX three bytes */ 365 { 366 buf[0] = (tmbchar) (0xE0 | (c >> 12)); 367 buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F)); 368 buf[2] = (tmbchar) (0x80 | (c & 0x3F)); 369 bytes = 3; 370 if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar ) 371 hasError = yes; 372 #if 0 /* Breaks Big5 D8 - DF */ 373 else if ( c >= kUTF16LowSurrogateBegin && c <= kUTF16HighSurrogateEnd ) 374 /* unpaired surrogates not allowed */ 375 hasError = yes; 376 #endif 377 } 378 else if (c <= 0x1FFFFF) /* 1111 0XXX four bytes */ 379 { 380 buf[0] = (tmbchar) (0xF0 | (c >> 18)); 381 buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F)); 382 buf[2] = (tmbchar) (0x80 | ((c >> 6) & 0x3F)); 383 buf[3] = (tmbchar) (0x80 | (c & 0x3F)); 384 bytes = 4; 385 if (c > kMaxUTF8FromUCS4) 386 hasError = yes; 387 } 388 else if (c <= 0x3FFFFFF) /* 1111 10XX five bytes */ 389 { 390 buf[0] = (tmbchar) (0xF8 | (c >> 24)); 391 buf[1] = (tmbchar) (0x80 | (c >> 18)); 392 buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F)); 393 buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F)); 394 buf[4] = (tmbchar) (0x80 | (c & 0x3F)); 395 bytes = 5; 396 hasError = yes; 397 } 398 else if (c <= 0x7FFFFFFF) /* 1111 110X six bytes */ 399 { 400 buf[0] = (tmbchar) (0xFC | (c >> 30)); 401 buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F)); 402 buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F)); 403 buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F)); 404 buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F)); 405 buf[5] = (tmbchar) (0x80 | (c & 0x3F)); 406 bytes = 6; 407 hasError = yes; 408 } 409 else 410 hasError = yes; 411 412 /* don't output invalid UTF-8 byte sequence to a stream */ 413 if ( !hasError && outp != NULL ) 414 { 415 int ix; 416 for ( ix=0; ix < bytes; ++ix ) 417 outp->putByte( outp->sinkData, buf[ix] ); 418 } 419 420 #if 1 && defined(_DEBUG) 421 if ( hasError ) 422 { 423 int i; 424 fprintf( stderr, "UTF-8 encoding error for U+%x : ", c ); 425 for (i = 0; i < bytes; i++) 426 fprintf( stderr, "0x%02x ", buf[i] ); 427 fprintf( stderr, "\n" ); 428 } 429 #endif 430 431 *count = bytes; 432 if (hasError) 433 return -1; 434 return 0; 435 } 436 437 438 /* return one less than the number of bytes used by the UTF-8 byte sequence */ 439 /* str points to the UTF-8 byte sequence */ 440 /* the Unicode char is returned in *ch */ 441 uint GetUTF8( ctmbstr str, uint *ch ) 442 { 443 uint n; 444 int bytes; 445 446 int err; 447 448 bytes = 0; 449 450 /* first byte "str[0]" is passed in separately from the */ 451 /* rest of the UTF-8 byte sequence starting at "str[1]" */ 452 err = DecodeUTF8BytesToChar( &n, str[0], str+1, NULL, &bytes ); 453 if (err) 454 { 455 #if 1 && defined(_DEBUG) 456 fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n); 457 #endif 458 n = 0xFFFD; /* replacement char */ 459 } 460 461 *ch = n; 462 return bytes - 1; 463 } 464 465 /* store char c as UTF-8 encoded byte stream */ 466 tmbstr PutUTF8( tmbstr buf, uint c ) 467 { 468 int err, count = 0; 469 470 err = EncodeCharToUTF8Bytes( c, buf, NULL, &count ); 471 if (err) 472 { 473 #if 1 && defined(_DEBUG) 474 fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c); 475 #endif 476 /* replacement char 0xFFFD encoded as UTF-8 */ 477 buf[0] = (byte) 0xEF; 478 buf[1] = (byte) 0xBF; 479 buf[2] = (byte) 0xBD; 480 count = 3; 481 } 482 483 buf += count; 484 return buf; 485 } 486 487 Bool IsValidUTF16FromUCS4( tchar ucs4 ) 488 { 489 return ( ucs4 <= kMaxUTF16FromUCS4 ); 490 } 491 492 Bool IsHighSurrogate( tchar ch ) 493 { 494 return ( ch >= kUTF16HighSurrogateBegin && ch <= kUTF16HighSurrogateEnd ); 495 } 496 Bool IsLowSurrogate( tchar ch ) 497 { 498 return ( ch >= kUTF16LowSurrogateBegin && ch <= kUTF16LowSurrogateEnd ); 499 } 500 501 tchar CombineSurrogatePair( tchar high, tchar low ) 502 { 503 assert( IsHighSurrogate(high) && IsLowSurrogate(low) ); 504 return ( ((low - kUTF16LowSurrogateBegin) * 0x400) + 505 high - kUTF16HighSurrogateBegin + 0x10000 ); 506 } 507 508 Bool SplitSurrogatePair( tchar utf16, tchar* low, tchar* high ) 509 { 510 Bool status = ( IsValidCombinedChar( utf16 ) && high && low ); 511 if ( status ) 512 { 513 *low = (utf16 - kUTF16SurrogatesBegin) / 0x400 + kUTF16LowSurrogateBegin; 514 *high = (utf16 - kUTF16SurrogatesBegin) % 0x400 + kUTF16HighSurrogateBegin; 515 } 516 return status; 517 } 518 519 Bool IsValidCombinedChar( tchar ch ) 520 { 521 return ( ch >= kUTF16SurrogatesBegin && 522 (ch & 0x0000FFFE) != 0x0000FFFE && 523 (ch & 0x0000FFFF) != 0x0000FFFF ); 524 } 525 526 Bool IsCombinedChar( tchar ch ) 527 { 528 return ( ch >= kUTF16SurrogatesBegin ); 529 } 530

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.