~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

TidyLib
tidy/src/streamio.c

Version: ~ [ 1.0 ] ~

** Warning: Cannot open xref database.

1 /* streamio.c -- handles character stream I/O 2 3 (c) 1998-2005 (W3C) MIT, ERCIM, Keio University 4 See tidy.h for the copyright notice. 5 6 CVS Info : 7 8 $Author: arnaud02 $ 9 $Date: 2005/07/22 15:54:58 $ 10 $Revision: 1.30 $ 11 12 Wrapper around Tidy input source and output sink 13 that calls appropriate interfaces, and applies 14 necessary char encoding transformations: to/from 15 ISO-10646 and/or UTF-8. 16 17 */ 18 19 #include <stdio.h> 20 #include <errno.h> 21 22 #include "streamio.h" 23 #include "tidy-int.h" 24 #include "lexer.h" 25 #include "message.h" 26 #include "utf8.h" 27 #include "tmbstr.h" 28 29 #ifdef TIDY_WIN32_MLANG_SUPPORT 30 #include "win32tc.h" 31 #endif 32 33 /************************ 34 ** Forward Declarations 35 ************************/ 36 37 static uint ReadCharFromStream( StreamIn* in ); 38 39 static uint ReadByte( StreamIn* in ); 40 static void UngetByte( StreamIn* in, uint byteValue ); 41 42 static void PutByte( uint byteValue, StreamOut* out ); 43 44 static void EncodeWin1252( uint c, StreamOut* out ); 45 static void EncodeMacRoman( uint c, StreamOut* out ); 46 static void EncodeIbm858( uint c, StreamOut* out ); 47 static void EncodeLatin0( uint c, StreamOut* out ); 48 49 /****************************** 50 ** Static (duration) Globals 51 ******************************/ 52 53 static StreamOut stderrStreamOut = 54 { 55 ASCII, 56 FSM_ASCII, 57 DEFAULT_NL_CONFIG, 58 #ifdef TIDY_WIN32_MLANG_SUPPORT 59 (ulong)NULL, 60 #endif 61 FileIO, 62 { 0, filesink_putByte } 63 }; 64 65 static StreamOut stdoutStreamOut = 66 { 67 ASCII, 68 FSM_ASCII, 69 DEFAULT_NL_CONFIG, 70 #ifdef TIDY_WIN32_MLANG_SUPPORT 71 (ulong)NULL, 72 #endif 73 FileIO, 74 { 0, filesink_putByte } 75 }; 76 77 StreamOut* StdErrOutput(void) 78 { 79 if ( stderrStreamOut.sink.sinkData == 0 ) 80 stderrStreamOut.sink.sinkData = (ulong) stderr; 81 return &stderrStreamOut; 82 } 83 84 StreamOut* StdOutOutput(void) 85 { 86 if ( stdoutStreamOut.sink.sinkData == 0 ) 87 stdoutStreamOut.sink.sinkData = (ulong) stdout; 88 return &stdoutStreamOut; 89 } 90 91 void ReleaseStreamOut( StreamOut* out ) 92 { 93 if ( out && out != &stderrStreamOut && out != &stdoutStreamOut ) 94 { 95 if ( out->iotype == FileIO ) 96 fclose( (FILE*) out->sink.sinkData ); 97 MemFree( out ); 98 } 99 } 100 101 102 /************************ 103 ** Source 104 ************************/ 105 106 static StreamIn* initStreamIn( TidyDocImpl* doc, int encoding ) 107 { 108 StreamIn *in = (StreamIn*) MemAlloc( sizeof(StreamIn) ); 109 110 ClearMemory( in, sizeof(StreamIn) ); 111 in->curline = 1; 112 in->curcol = 1; 113 in->encoding = encoding; 114 in->state = FSM_ASCII; 115 in->doc = doc; 116 in->bufsize = CHARBUF_SIZE; 117 in->charbuf = (tchar*)MemAlloc(sizeof(tchar) * in->bufsize); 118 #ifdef TIDY_STORE_ORIGINAL_TEXT 119 in->otextbuf = NULL; 120 in->otextlen = 0; 121 in->otextsize = 0; 122 #endif 123 return in; 124 } 125 126 void freeStreamIn(StreamIn* in) 127 { 128 #ifdef TIDY_STORE_ORIGINAL_TEXT 129 if (in->otextbuf) 130 MemFree(in->otextbuf); 131 #endif 132 MemFree(in->charbuf); 133 MemFree(in); 134 } 135 136 StreamIn* FileInput( TidyDocImpl* doc, FILE *fp, int encoding ) 137 { 138 StreamIn *in = initStreamIn( doc, encoding ); 139 initFileSource( &in->source, fp ); 140 in->iotype = FileIO; 141 return in; 142 } 143 144 StreamIn* BufferInput( TidyDocImpl* doc, TidyBuffer* buf, int encoding ) 145 { 146 StreamIn *in = initStreamIn( doc, encoding ); 147 initInputBuffer( &in->source, buf ); 148 in->iotype = BufferIO; 149 return in; 150 } 151 152 StreamIn* UserInput( TidyDocImpl* doc, TidyInputSource* source, int encoding ) 153 { 154 StreamIn *in = initStreamIn( doc, encoding ); 155 memcpy( &in->source, source, sizeof(TidyInputSource) ); 156 in->iotype = UserIO; 157 return in; 158 } 159 160 int ReadBOMEncoding(StreamIn *in) 161 { 162 uint c, c1; 163 #if SUPPORT_UTF16_ENCODINGS 164 uint bom; 165 #endif 166 167 c = ReadByte(in); 168 if (c == EndOfStream) 169 return -1; 170 171 c1 = ReadByte( in ); 172 if (c1 == EndOfStream) 173 { 174 UngetByte(in, c); 175 return -1; 176 } 177 178 /* todo: dont warn about mismatch for auto input encoding */ 179 /* todo: let the user override the encoding found here */ 180 181 #if SUPPORT_UTF16_ENCODINGS 182 bom = (c << 8) + c1; 183 184 if ( bom == UNICODE_BOM_BE ) 185 { 186 /* big-endian UTF-16 */ 187 if ( in->encoding != UTF16 && in->encoding != UTF16BE ) 188 ReportEncodingWarning(in->doc, ENCODING_MISMATCH, UTF16BE); 189 190 return UTF16BE; /* return decoded BOM */ 191 } 192 else if (bom == UNICODE_BOM_LE) 193 { 194 /* little-endian UTF-16 */ 195 if (in->encoding != UTF16 && in->encoding != UTF16LE) 196 ReportEncodingWarning(in->doc, ENCODING_MISMATCH, UTF16LE); 197 198 return UTF16LE; /* return decoded BOM */ 199 } 200 else 201 #endif /* SUPPORT_UTF16_ENCODINGS */ 202 { 203 uint c2 = ReadByte(in); 204 205 if (c2 == EndOfStream) 206 { 207 UngetByte(in, c1); 208 UngetByte(in, c); 209 return -1; 210 } 211 212 if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8) 213 { 214 /* UTF-8 */ 215 if (in->encoding != UTF8) 216 ReportEncodingWarning(in->doc, ENCODING_MISMATCH, UTF8); 217 218 return UTF8; 219 } 220 else 221 UngetByte( in, c2 ); 222 } 223 224 UngetByte(in, c1); 225 UngetByte(in, c); 226 227 return -1; 228 } 229 230 #ifdef TIDY_STORE_ORIGINAL_TEXT 231 void AddByteToOriginalText(StreamIn *in, tmbchar c) 232 { 233 if (in->otextlen + 1 >= in->otextsize) 234 { 235 size_t size = in->otextsize ? 1 : 2; 236 in->otextbuf = MemRealloc(in->otextbuf, in->otextsize + size); 237 in->otextsize += size; 238 } 239 in->otextbuf[in->otextlen++] = c; 240 in->otextbuf[in->otextlen ] = 0; 241 } 242 243 void AddCharToOriginalText(StreamIn *in, tchar c) 244 { 245 int i, err, count = 0; 246 tmbchar buf[10] = {0}; 247 248 err = EncodeCharToUTF8Bytes(c, buf, NULL, &count); 249 250 if (err) 251 { 252 /* replacement character 0xFFFD encoded as UTF-8 */ 253 buf[0] = (byte) 0xEF; 254 buf[1] = (byte) 0xBF; 255 buf[2] = (byte) 0xBD; 256 count = 3; 257 } 258 259 for (i = 0; i < count; ++i) 260 AddByteToOriginalText(in, buf[i]); 261 } 262 #endif 263 264 265 uint ReadChar( StreamIn *in ) 266 { 267 uint c = EndOfStream; 268 uint tabsize = cfg( in->doc, TidyTabSize ); 269 #ifdef TIDY_STORE_ORIGINAL_TEXT 270 Bool added = no; 271 #endif 272 273 if ( in->pushed ) 274 return PopChar( in ); 275 276 in->lastcol = in->curcol; 277 278 if ( in->tabs > 0 ) 279 { 280 in->curcol++; 281 in->tabs--; 282 return ' '; 283 } 284 285 for (;;) 286 { 287 c = ReadCharFromStream(in); 288 289 if ( EndOfStream == c ) 290 return EndOfStream; 291 292 if (c == '\n') 293 { 294 #ifdef TIDY_STORE_ORIGINAL_TEXT 295 added = yes; 296 AddCharToOriginalText(in, (tchar)c); 297 #endif 298 in->curcol = 1; 299 in->curline++; 300 break; 301 } 302 303 if (c == '\t') 304 { 305 #ifdef TIDY_STORE_ORIGINAL_TEXT 306 added = yes; 307 AddCharToOriginalText(in, (tchar)c); 308 #endif 309 in->tabs = tabsize - ((in->curcol - 1) % tabsize) - 1; 310 in->curcol++; 311 c = ' '; 312 break; 313 } 314 315 /* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */ 316 if (c == '\r') 317 { 318 #ifdef TIDY_STORE_ORIGINAL_TEXT 319 added = yes; 320 AddCharToOriginalText(in, (tchar)c); 321 #endif 322 c = ReadCharFromStream(in); 323 if (c != '\n') 324 { 325 UngetChar( c, in ); 326 c = '\n'; 327 } 328 else 329 { 330 #ifdef TIDY_STORE_ORIGINAL_TEXT 331 AddCharToOriginalText(in, (tchar)c); 332 #endif 333 } 334 in->curcol = 1; 335 in->curline++; 336 break; 337 } 338 339 #ifndef NO_NATIVE_ISO2022_SUPPORT 340 /* strip control characters, except for Esc */ 341 if (c == '\033') 342 break; 343 #endif 344 345 /* Form Feed is allowed in HTML */ 346 if ( c == '\015' && !cfgBool(in->doc, TidyXmlTags) ) 347 break; 348 349 if ( c < 32 ) 350 continue; /* discard control char */ 351 352 /* watch out for chars that have already been decoded such as */ 353 /* IS02022, UTF-8 etc, that don't require further decoding */ 354 355 if ( 356 in->encoding == RAW 357 #ifndef NO_NATIVE_ISO2022_SUPPORT 358 || in->encoding == ISO2022 359 #endif 360 || in->encoding == UTF8 361 362 #if SUPPORT_ASIAN_ENCODINGS 363 || in->encoding == SHIFTJIS /* #431953 - RJ */ 364 || in->encoding == BIG5 /* #431953 - RJ */ 365 #endif 366 ) 367 { 368 in->curcol++; 369 break; 370 } 371 372 #if SUPPORT_UTF16_ENCODINGS 373 /* handle surrogate pairs */ 374 if ( in->encoding == UTF16LE || 375 in->encoding == UTF16 || 376 in->encoding == UTF16BE ) 377 { 378 if ( !IsValidUTF16FromUCS4(c) ) 379 { 380 /* invalid UTF-16 value */ 381 ReportEncodingError(in->doc, INVALID_UTF16, c, yes); 382 c = 0; 383 } 384 else if ( IsLowSurrogate(c) ) 385 { 386 uint n = c; 387 uint m = ReadCharFromStream( in ); 388 if ( m == EndOfStream ) 389 return EndOfStream; 390 391 c = 0; 392 if ( IsHighSurrogate(m) ) 393 { 394 n = CombineSurrogatePair( m, n ); 395 if ( IsValidCombinedChar(n) ) 396 c = n; 397 } 398 /* not a valid pair */ 399 if ( 0 == c ) 400 ReportEncodingError( in->doc, INVALID_UTF16, c, yes ); 401 } 402 } 403 #endif 404 405 /* Do first: acts on range 128 - 255 */ 406 switch ( in->encoding ) 407 { 408 case MACROMAN: 409 c = DecodeMacRoman( c ); 410 break; 411 case IBM858: 412 c = DecodeIbm850( c ); 413 break; 414 case LATIN0: 415 c = DecodeLatin0( c ); 416 break; 417 } 418 419 /* produced e.g. as a side-effect of smart quotes in Word */ 420 /* but can't happen if using MACROMAN encoding */ 421 if ( 127 < c && c < 160 ) 422 { 423 uint c1 = 0, replMode = DISCARDED_CHAR; 424 Bool isVendorChar = ( in->encoding == WIN1252 || 425 in->encoding == MACROMAN ); 426 Bool isWinChar = ( in->encoding == WIN1252 || 427 ReplacementCharEncoding == WIN1252 ); 428 Bool isMacChar = ( in->encoding == MACROMAN || 429 ReplacementCharEncoding == MACROMAN ); 430 431 /* set error position just before offending character */ 432 in->doc->lexer->lines = in->curline; 433 in->doc->lexer->columns = in->curcol; 434 435 if ( isWinChar ) 436 c1 = DecodeWin1252( c ); 437 else if ( isMacChar ) 438 c1 = DecodeMacRoman( c ); 439 if ( c1 ) 440 replMode = REPLACED_CHAR; 441 442 if ( c1 == 0 && isVendorChar ) 443 ReportEncodingError(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR); 444 else if ( ! isVendorChar ) 445 ReportEncodingError(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR); 446 447 c = c1; 448 } 449 450 if ( c == 0 ) 451 continue; /* illegal char is discarded */ 452 453 in->curcol++; 454 break; 455 } 456 457 #ifdef TIDY_STORE_ORIGINAL_TEXT 458 if (!added) 459 AddCharToOriginalText(in, (tchar)c); 460 #endif 461 462 return c; 463 } 464 465 uint PopChar( StreamIn *in ) 466 { 467 uint c = EndOfStream; 468 if ( in->pushed ) 469 { 470 assert( in->bufpos > 0 ); 471 c = in->charbuf[ --in->bufpos ]; 472 if ( in->bufpos == 0 ) 473 in->pushed = no; 474 475 if ( c == '\n' ) 476 { 477 in->curcol = 1; 478 in->curline++; 479 return c; 480 } 481 in->curcol++; 482 } 483 return c; 484 } 485 486 void UngetChar( uint c, StreamIn *in ) 487 { 488 if (c == EndOfStream) 489 { 490 /* fprintf(stderr, "Attempt to UngetChar EOF\n"); */ 491 return; 492 } 493 494 in->pushed = yes; 495 496 if (in->bufpos + 1 >= in->bufsize) 497 in->charbuf = (tchar*)MemRealloc(in->charbuf, sizeof(tchar) * ++(in->bufsize)); 498 499 in->charbuf[(in->bufpos)++] = c; 500 501 if (c == '\n') 502 --(in->curline); 503 504 in->curcol = in->lastcol; 505 } 506 507 508 509 /************************ 510 ** Sink 511 ************************/ 512 513 static StreamOut* initStreamOut( int encoding, uint nl ) 514 { 515 StreamOut* out = (StreamOut*) MemAlloc( sizeof(StreamOut) ); 516 ClearMemory( out, sizeof(StreamOut) ); 517 out->encoding = encoding; 518 out->state = FSM_ASCII; 519 out->nl = nl; 520 return out; 521 } 522 523 StreamOut* FileOutput( FILE* fp, int encoding, uint nl ) 524 { 525 StreamOut* out = initStreamOut( encoding, nl ); 526 initFileSink( &out->sink, fp ); 527 out->iotype = FileIO; 528 return out; 529 } 530 StreamOut* BufferOutput( TidyBuffer* buf, int encoding, uint nl ) 531 { 532 StreamOut* out = initStreamOut( encoding, nl ); 533 initOutputBuffer( &out->sink, buf ); 534 out->iotype = BufferIO; 535 return out; 536 } 537 StreamOut* UserOutput( TidyOutputSink* sink, int encoding, uint nl ) 538 { 539 StreamOut* out = initStreamOut( encoding, nl ); 540 memcpy( &out->sink, sink, sizeof(TidyOutputSink) ); 541 out->iotype = UserIO; 542 return out; 543 } 544 545 void WriteChar( uint c, StreamOut* out ) 546 { 547 /* Translate outgoing newlines */ 548 if ( LF == c ) 549 { 550 if ( out->nl == TidyCRLF ) 551 WriteChar( CR, out ); 552 else if ( out->nl == TidyCR ) 553 c = CR; 554 } 555 556 if (out->encoding == MACROMAN) 557 { 558 EncodeMacRoman( c, out ); 559 } 560 else if (out->encoding == WIN1252) 561 { 562 EncodeWin1252( c, out ); 563 } 564 else if (out->encoding == IBM858) 565 { 566 EncodeIbm858( c, out ); 567 } 568 else if (out->encoding == LATIN0) 569 { 570 EncodeLatin0( c, out ); 571 } 572 573 else if (out->encoding == UTF8) 574 { 575 int count = 0; 576 577 EncodeCharToUTF8Bytes( c, NULL, &out->sink, &count ); 578 if (count <= 0) 579 { 580 /* ReportEncodingError(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */ 581 /* replacement char 0xFFFD encoded as UTF-8 */ 582 PutByte(0xEF, out); PutByte(0xBF, out); PutByte(0xBF, out); 583 } 584 } 585 #ifndef NO_NATIVE_ISO2022_SUPPORT 586 else if (out->encoding == ISO2022) 587 { 588 if (c == 0x1b) /* ESC */ 589 out->state = FSM_ESC; 590 else 591 { 592 switch (out->state) 593 { 594 case FSM_ESC: 595 if (c == '$') 596 out->state = FSM_ESCD; 597 else if (c == '(') 598 out->state = FSM_ESCP; 599 else 600 out->state = FSM_ASCII; 601 break; 602 603 case FSM_ESCD: 604 if (c == '(') 605 out->state = FSM_ESCDP; 606 else 607 out->state = FSM_NONASCII; 608 break; 609 610 case FSM_ESCDP: 611 out->state = FSM_NONASCII; 612 break; 613 614 case FSM_ESCP: 615 out->state = FSM_ASCII; 616 break; 617 618 case FSM_NONASCII: 619 c &= 0x7F; 620 break; 621 } 622 } 623 624 PutByte(c, out); 625 } 626 #endif /* NO_NATIVE_ISO2022_SUPPORT */ 627 628 #if SUPPORT_UTF16_ENCODINGS 629 else if ( out->encoding == UTF16LE || 630 out->encoding == UTF16BE || 631 out->encoding == UTF16 ) 632 { 633 int i, numChars = 1; 634 uint theChars[2]; 635 636 if ( !IsValidUTF16FromUCS4(c) ) 637 { 638 /* invalid UTF-16 value */ 639 /* ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */ 640 c = 0; 641 numChars = 0; 642 } 643 else if ( IsCombinedChar(c) ) 644 { 645 /* output both, unless something goes wrong */ 646 numChars = 2; 647 if ( !SplitSurrogatePair(c, &theChars[0], &theChars[1]) ) 648 { 649 /* ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */ 650 c = 0; 651 numChars = 0; 652 } 653 } 654 else 655 { 656 /* just put the char out */ 657 theChars[0] = c; 658 } 659 660 for (i = 0; i < numChars; i++) 661 { 662 c = theChars[i]; 663 664 if (out->encoding == UTF16LE) 665 { 666 uint ch = c & 0xFF; PutByte(ch, out); 667 ch = (c >> 8) & 0xFF; PutByte(ch, out); 668 } 669 670 else if (out->encoding == UTF16BE || out->encoding == UTF16) 671 { 672 uint ch = (c >> 8) & 0xFF; PutByte(ch, out); 673 ch = c & 0xFF; PutByte(ch, out); 674 } 675 } 676 } 677 #endif 678 679 #if SUPPORT_ASIAN_ENCODINGS 680 else if (out->encoding == BIG5 || out->encoding == SHIFTJIS) 681 { 682 if (c < 128) 683 PutByte(c, out); 684 else 685 { 686 uint ch = (c >> 8) & 0xFF; PutByte(ch, out); 687 ch = c & 0xFF; PutByte(ch, out); 688 } 689 } 690 #endif 691 692 else 693 PutByte( c, out ); 694 } 695 696 697 698 /**************************** 699 ** Miscellaneous / Helpers 700 ****************************/ 701 702 /* char encoding used when replacing illegal SGML chars, 703 ** regardless of specified encoding. Set at compile time 704 ** to either Windows or Mac. 705 */ 706 const int ReplacementCharEncoding = DFLT_REPL_CHARENC; 707 708 709 /* Mapping for Windows Western character set CP 1252 710 ** (chars 128-159/U+0080-U+009F) to Unicode. 711 */ 712 static const uint Win2Unicode[32] = 713 { 714 0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 715 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000, 716 0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 717 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178 718 }; 719 720 /* Function for conversion from Windows-1252 to Unicode */ 721 uint DecodeWin1252(uint c) 722 { 723 if (127 < c && c < 160) 724 c = Win2Unicode[c - 128]; 725 726 return c; 727 } 728 729 static void EncodeWin1252( uint c, StreamOut* out ) 730 { 731 if (c < 128 || (c > 159 && c < 256)) 732 PutByte(c, out); 733 else 734 { 735 int i; 736 737 for (i = 128; i < 160; i++) 738 if (Win2Unicode[i - 128] == c) 739 { 740 PutByte(i, out); 741 break; 742 } 743 } 744 } 745 746 /* 747 John Love-Jensen contributed this table for mapping MacRoman 748 character set to Unicode 749 */ 750 751 /* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */ 752 static const uint Mac2Unicode[128] = 753 { 754 /* x7F = DEL */ 755 756 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1, 757 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8, 758 759 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3, 760 0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC, 761 762 0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF, 763 0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8, 764 765 0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211, 766 /* =BD U+2126 OHM SIGN */ 767 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8, 768 769 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB, 770 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153, 771 772 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA, 773 /* =DB U+00A4 CURRENCY SIGN */ 774 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02, 775 776 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1, 777 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4, 778 /* xF0 = Apple Logo */ 779 /* =F0 U+2665 BLACK HEART SUIT */ 780 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC, 781 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7 782 }; 783 784 /* Function to convert from MacRoman to Unicode */ 785 uint DecodeMacRoman(uint c) 786 { 787 if (127 < c) 788 c = Mac2Unicode[c - 128]; 789 return c; 790 } 791 792 static void EncodeMacRoman( uint c, StreamOut* out ) 793 { 794 if (c < 128) 795 PutByte(c, out); 796 else 797 { 798 /* For mac users, map Unicode back to MacRoman. */ 799 int i; 800 for (i = 128; i < 256; i++) 801 { 802 if (Mac2Unicode[i - 128] == c) 803 { 804 PutByte(i, out); 805 break; 806 } 807 } 808 } 809 } 810 811 /* Mapping for OS/2 Western character set CP 850 812 ** (chars 128-255) to Unicode. 813 */ 814 static const uint IBM2Unicode[128] = 815 { 816 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7, 817 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5, 818 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9, 819 0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192, 820 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA, 821 0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB, 822 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0, 823 0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510, 824 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3, 825 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4, 826 0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x20AC, 0x00cd, 0x00ce, 827 0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580, 828 0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe, 829 0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4, 830 0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8, 831 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0 832 }; 833 834 /* Function for conversion from OS/2-850 to Unicode */ 835 uint DecodeIbm850(uint c) 836 { 837 if (127 < c && c < 256) 838 c = IBM2Unicode[c - 128]; 839 840 return c; 841 } 842 843 /* For OS/2,Java users, map Unicode back to IBM858 (IBM850+Euro). */ 844 static void EncodeIbm858( uint c, StreamOut* out ) 845 { 846 if (c < 128) 847 PutByte(c, out); 848 else 849 { 850 int i; 851 for (i = 128; i < 256; i++) 852 { 853 if (IBM2Unicode[i - 128] == c) 854 { 855 PutByte(i, out); 856 break; 857 } 858 } 859 } 860 } 861 862 863 /* Convert from Latin0 (aka Latin9, ISO-8859-15) to Unicode */ 864 uint DecodeLatin0(uint c) 865 { 866 if (159 < c && c < 191) 867 { 868 switch (c) 869 { 870 case 0xA4: c = 0x20AC; break; 871 case 0xA6: c = 0x0160; break; 872 case 0xA8: c = 0x0161; break; 873 case 0xB4: c = 0x017D; break; 874 case 0xB8: c = 0x017E; break; 875 case 0xBC: c = 0x0152; break; 876 case 0xBD: c = 0x0153; break; 877 case 0xBE: c = 0x0178; break; 878 } 879 } 880 return c; 881 } 882 883 /* Map Unicode back to ISO-8859-15. */ 884 static void EncodeLatin0( uint c, StreamOut* out ) 885 { 886 switch (c) 887 { 888 case 0x20AC: c = 0xA4; break; 889 case 0x0160: c = 0xA6; break; 890 case 0x0161: c = 0xA8; break; 891 case 0x017D: c = 0xB4; break; 892 case 0x017E: c = 0xB8; break; 893 case 0x0152: c = 0xBC; break; 894 case 0x0153: c = 0xBD; break; 895 case 0x0178: c = 0xBE; break; 896 } 897 PutByte(c, out); 898 } 899 900 /* 901 Table to map symbol font characters to Unicode; undefined 902 characters are mapped to 0x0000 and characters without any 903 Unicode equivalent are mapped to '?'. Is this appropriate? 904 */ 905 906 static const uint Symbol2Unicode[] = 907 { 908 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 909 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, 910 911 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 912 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, 913 914 0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D, 915 0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F, 916 917 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 918 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, 919 920 0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393, 921 0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F, 922 923 0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9, 924 0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F, 925 926 0x00AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3, 927 0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF, 928 929 0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9, 930 0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x003F, 931 932 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 933 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 934 935 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 936 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 937 938 0x00A0, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663, 939 0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193, 940 941 0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x00B7, 942 0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x003F, 0x003F, 0x21B5, 943 944 0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229, 945 0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209, 946 947 0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5, 948 0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3, 949 950 0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x003F, 0x003F, 951 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 952 953 0x20AC, 0x232A, 0x222B, 0x2320, 0x003F, 0x2321, 0x003F, 0x003F, 954 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F 955 }; 956 957 /* Function to convert from Symbol Font chars to Unicode */ 958 uint DecodeSymbolFont(uint c) 959 { 960 if (c > 255) 961 return c; 962 963 /* todo: add some error message */ 964 965 return Symbol2Unicode[c]; 966 } 967 968 969 /* Facilitates user defined source by providing 970 ** an entry point to marshal pointers-to-functions. 971 ** Needed by .NET and possibly other language bindings. 972 */ 973 Bool TIDY_CALL tidyInitSource( TidyInputSource* source, 974 void* srcData, 975 TidyGetByteFunc gbFunc, 976 TidyUngetByteFunc ugbFunc, 977 TidyEOFFunc endFunc ) 978 { 979 Bool status = ( source && srcData && gbFunc && ugbFunc && endFunc ); 980 981 if ( status ) 982 { 983 source->sourceData = (ulong) srcData; 984 source->getByte = gbFunc; 985 source->ungetByte = ugbFunc; 986 source->eof = endFunc; 987 } 988 989 return status; 990 } 991 992 Bool TIDY_CALL tidyInitSink( TidyOutputSink* sink, 993 void* snkData, 994 TidyPutByteFunc pbFunc ) 995 { 996 Bool status = ( sink && snkData && pbFunc ); 997 if ( status ) 998 { 999 sink->sinkData = (ulong) snkData; 1000 sink->putByte = pbFunc; 1001 } 1002 return status; 1003 } 1004 1005 /* GetByte must return a byte value in a signed 1006 ** integer so that a negative value can signal EOF 1007 ** without interfering w/ 0-255 legitimate byte values. 1008 */ 1009 uint TIDY_CALL tidyGetByte( TidyInputSource* source ) 1010 { 1011 int bv = source->getByte( source->sourceData ); 1012 return (uint) bv; 1013 } 1014 Bool TIDY_CALL tidyIsEOF( TidyInputSource* source ) 1015 { 1016 return source->eof( source->sourceData ); 1017 } 1018 void TIDY_CALL tidyUngetByte( TidyInputSource* source, uint ch ) 1019 { 1020 source->ungetByte( source->sourceData, (byte) ch ); 1021 } 1022 void TIDY_CALL tidyPutByte( TidyOutputSink* sink, uint ch ) 1023 { 1024 sink->putByte( sink->sinkData, (byte) ch ); 1025 } 1026 1027 static uint ReadByte( StreamIn* in ) 1028 { 1029 return tidyGetByte( &in->source ); 1030 } 1031 Bool IsEOF( StreamIn* in ) 1032 { 1033 return tidyIsEOF( &in->source ); 1034 } 1035 static void UngetByte( StreamIn* in, uint byteValue ) 1036 { 1037 tidyUngetByte( &in->source, byteValue ); 1038 } 1039 static void PutByte( uint byteValue, StreamOut* out ) 1040 { 1041 tidyPutByte( &out->sink, byteValue ); 1042 } 1043 1044 #if 0 1045 static void UngetRawBytesToStream( StreamIn *in, byte* buf, int *count ) 1046 { 1047 int i; 1048 1049 for (i = 0; i < *count; i++) 1050 { 1051 /* should never get here; testing for 0xFF, a valid char, is not a good idea */ 1052 if ( in && IsEOF(in) ) 1053 { 1054 /* fprintf(stderr,"Attempt to unget EOF in UngetRawBytesToStream\n"); */ 1055 *count = -i; 1056 return; 1057 } 1058 1059 in->source.ungetByte( in->source.sourceData, buf[i] ); 1060 } 1061 } 1062 1063 /* 1064 Read raw bytes from stream, return <= 0 if EOF; or if 1065 "unget" is true, Unget the bytes to re-synchronize the input stream 1066 Normally UTF-8 successor bytes are read using this routine. 1067 */ 1068 static void ReadRawBytesFromStream( StreamIn *in, byte* buf, int *count ) 1069 { 1070 int ix; 1071 for ( ix=0; ix < *count; ++ix ) 1072 { 1073 if ( in->rawPushed ) 1074 { 1075 buf[ix] = in->rawBytebuf[ --in->rawBufpos ]; 1076 if ( in->rawBufpos == 0 ) 1077 in->rawPushed = no; 1078 } 1079 else 1080 { 1081 if ( in->source.eof(in->source.sourceData) ) 1082 { 1083 *count = -i; 1084 break; 1085 } 1086 buf[ix] = in->source.getByte( in->source.sourceData ); 1087 } 1088 } 1089 } 1090 #endif /* 0 */ 1091 1092 /* read char from stream */ 1093 static uint ReadCharFromStream( StreamIn* in ) 1094 { 1095 uint c, n; 1096 #ifdef TIDY_WIN32_MLANG_SUPPORT 1097 uint bytesRead = 0; 1098 #endif 1099 1100 if ( IsEOF(in) ) 1101 return EndOfStream; 1102 1103 c = ReadByte( in ); 1104 1105 if (c == EndOfStream) 1106 return c; 1107 1108 #ifndef NO_NATIVE_ISO2022_SUPPORT 1109 /* 1110 A document in ISO-2022 based encoding uses some ESC sequences 1111 called "designator" to switch character sets. The designators 1112 defined and used in ISO-2022-JP are: 1113 1114 "ESC" + "(" + ? for ISO646 variants 1115 1116 "ESC" + "$" + ? and 1117 "ESC" + "$" + "(" + ? for multibyte character sets 1118 1119 Where ? stands for a single character used to indicate the 1120 character set for multibyte characters. 1121 1122 Tidy handles this by preserving the escape sequence and 1123 setting the top bit of each byte for non-ascii chars. This 1124 bit is then cleared on output. The input stream keeps track 1125 of the state to determine when to set/clear the bit. 1126 */ 1127 1128 if (in->encoding == ISO2022) 1129 { 1130 if (c == 0x1b) /* ESC */ 1131 { 1132 in->state = FSM_ESC; 1133 return c; 1134 } 1135 1136 switch (in->state) 1137 { 1138 case FSM_ESC: 1139 if (c == '$') 1140 in->state = FSM_ESCD; 1141 else if (c == '(') 1142 in->state = FSM_ESCP; 1143 else 1144 in->state = FSM_ASCII; 1145 break; 1146 1147 case FSM_ESCD: 1148 if (c == '(') 1149 in->state = FSM_ESCDP; 1150 else 1151 in->state = FSM_NONASCII; 1152 break; 1153 1154 case FSM_ESCDP: 1155 in->state = FSM_NONASCII; 1156 break; 1157 1158 case FSM_ESCP: 1159 in->state = FSM_ASCII; 1160 break; 1161 1162 case FSM_NONASCII: 1163 c |= 0x80; 1164 break; 1165 } 1166 1167 return c; 1168 } 1169 #endif /* #ifndef NO_NATIVE_ISO2022_SUPPORT */ 1170 1171 #if SUPPORT_UTF16_ENCODINGS 1172 if ( in->encoding == UTF16LE ) 1173 { 1174 uint c1 = ReadByte( in ); 1175 if ( EndOfStream == c1 ) 1176 return EndOfStream; 1177 n = (c1 << 8) + c; 1178 return n; 1179 } 1180 1181 if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */ 1182 { 1183 uint c1 = ReadByte( in ); 1184 if ( EndOfStream == c1 ) 1185 return EndOfStream; 1186 n = (c << 8) + c1; 1187 return n; 1188 } 1189 #endif 1190 1191 if ( in->encoding == UTF8 ) 1192 { 1193 /* deal with UTF-8 encoded char */ 1194 1195 int err, count = 0; 1196 1197 /* first byte "c" is passed in separately */ 1198 err = DecodeUTF8BytesToChar( &n, c, NULL, &in->source, &count ); 1199 if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */ 1200 return EndOfStream; 1201 else if (err) 1202 { 1203 /* set error position just before offending character */ 1204 in->doc->lexer->lines = in->curline; 1205 in->doc->lexer->columns = in->curcol; 1206 1207 ReportEncodingError(in->doc, INVALID_UTF8, n, no); 1208 n = 0xFFFD; /* replacement char */ 1209 } 1210 1211 return n; 1212 } 1213 1214 #if SUPPORT_ASIAN_ENCODINGS 1215 /* 1216 This section is suitable for any "multibyte" variable-width 1217 character encoding in which a one-byte code is less than 1218 128, and the first byte of a two-byte code is greater or 1219 equal to 128. Note that Big5 and ShiftJIS fit into this 1220 kind, even though their second byte may be less than 128 1221 */ 1222 if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS)) 1223 { 1224 if (c < 128) 1225 return c; 1226 else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */ 1227 { 1228 /* 1229 Rick Cameron pointed out that for Shift_JIS, the values from 1230 0xa1 through 0xdf represent singe-byte characters 1231 (U+FF61 to U+FF9F - half-shift Katakana) 1232 */ 1233 return c; 1234 } 1235 else 1236 { 1237 uint c1 = ReadByte( in ); 1238 if ( EndOfStream == c1 ) 1239 return EndOfStream; 1240 n = (c << 8) + c1; 1241 return n; 1242 } 1243 } 1244 #endif 1245 1246 #ifdef TIDY_WIN32_MLANG_SUPPORT 1247 else if (in->encoding > WIN32MLANG) 1248 { 1249 assert( in->mlang != 0 ); 1250 return Win32MLangGetChar((byte)c, in, &bytesRead); 1251 } 1252 #endif 1253 1254 else 1255 n = c; 1256 1257 return n; 1258 } 1259 1260 /* Output a Byte Order Mark if required */ 1261 void outBOM( StreamOut *out ) 1262 { 1263 if ( out->encoding == UTF8 1264 #if SUPPORT_UTF16_ENCODINGS 1265 || out->encoding == UTF16LE 1266 || out->encoding == UTF16BE 1267 || out->encoding == UTF16 1268 #endif 1269 ) 1270 { 1271 /* this will take care of encoding the BOM correctly */ 1272 WriteChar( UNICODE_BOM, out ); 1273 } 1274 } 1275 1276 /* this is in intermediate fix for various problems in the */ 1277 /* long term code and data in charsets.c should be used */ 1278 static struct _enc2iana 1279 { 1280 uint id; 1281 ctmbstr name; 1282 ctmbstr tidyOptName; 1283 } const enc2iana[] = 1284 { 1285 { ASCII, "us-ascii", "ascii" }, 1286 { LATIN0, "iso-8859-15", "latin0" }, 1287 { LATIN1, "iso-8859-1", "latin1" }, 1288 { UTF8, "utf-8", "utf8" }, 1289 { MACROMAN, "macintosh", "mac" }, 1290 { WIN1252, "windows-1252", "win1252" }, 1291 { IBM858, "ibm00858", "ibm858" }, 1292 #if SUPPORT_UTF16_ENCODINGS 1293 { UTF16LE, "utf-16", "utf16le" }, 1294 { UTF16BE, "utf-16", "utf16be" }, 1295 { UTF16, "utf-16", "utf16" }, 1296 #endif 1297 #if SUPPORT_ASIAN_ENCODINGS 1298 { BIG5, "big5", "big5" }, 1299 { SHIFTJIS, "shift_jis", "shiftjis"}, 1300 #endif 1301 #ifndef NO_NATIVE_ISO2022_SUPPORT 1302 { ISO2022, NULL, "iso2022" }, 1303 #endif 1304 { RAW, NULL, "raw" } 1305 }; 1306 1307 ctmbstr GetEncodingNameFromTidyId(uint id) 1308 { 1309 uint i; 1310 1311 for (i = 0; enc2iana[i].name; ++i) 1312 if (enc2iana[i].id == id) 1313 return enc2iana[i].name; 1314 1315 return NULL; 1316 } 1317 1318 ctmbstr GetEncodingOptNameFromTidyId(uint id) 1319 { 1320 uint i; 1321 1322 for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i) 1323 if (enc2iana[i].id == id) 1324 return enc2iana[i].tidyOptName; 1325 1326 return NULL; 1327 } 1328 1329 int GetCharEncodingFromOptName( ctmbstr charenc ) 1330 { 1331 uint i; 1332 1333 for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i) 1334 if (tmbstrcasecmp(charenc, enc2iana[i].tidyOptName) == 0 ) 1335 return enc2iana[i].id; 1336 1337 return -1; 1338 } 1339

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.