Version:
~ [ 1.0 ] ~
** Warning: Cannot open xref database.
1 /* streamio.c -- handles character stream I/O
2
3 (c) 1998-2005 (W3C) MIT, ERCIM, Keio University
4 See tidy.h for the copyright notice.
5
6 CVS Info :
7
8 $Author: arnaud02 $
9 $Date: 2005/07/22 15:54:58 $
10 $Revision: 1.30 $
11
12 Wrapper around Tidy input source and output sink
13 that calls appropriate interfaces, and applies
14 necessary char encoding transformations: to/from
15 ISO-10646 and/or UTF-8.
16
17 */
18
19 #include <stdio.h>
20 #include <errno.h>
21
22 #include "streamio.h"
23 #include "tidy-int.h"
24 #include "lexer.h"
25 #include "message.h"
26 #include "utf8.h"
27 #include "tmbstr.h"
28
29 #ifdef TIDY_WIN32_MLANG_SUPPORT
30 #include "win32tc.h"
31 #endif
32
33 /************************
34 ** Forward Declarations
35 ************************/
36
37 static uint ReadCharFromStream( StreamIn* in );
38
39 static uint ReadByte( StreamIn* in );
40 static void UngetByte( StreamIn* in, uint byteValue );
41
42 static void PutByte( uint byteValue, StreamOut* out );
43
44 static void EncodeWin1252( uint c, StreamOut* out );
45 static void EncodeMacRoman( uint c, StreamOut* out );
46 static void EncodeIbm858( uint c, StreamOut* out );
47 static void EncodeLatin0( uint c, StreamOut* out );
48
49 /******************************
50 ** Static (duration) Globals
51 ******************************/
52
/* Shared, statically allocated output stream for stderr.
** The initializer is positional; judging from initStreamOut() and
** FileOutput() the members are presumably encoding, state, newline
** mode, (MLang handle on Win32), I/O type and sink -- TODO confirm
** against the StreamOut declaration in streamio.h.
** The sink data starts out as 0 and is bound to stderr lazily by
** StdErrOutput().
*/
53 static StreamOut stderrStreamOut =
54 {
55 ASCII,
56 FSM_ASCII,
57 DEFAULT_NL_CONFIG,
58 #ifdef TIDY_WIN32_MLANG_SUPPORT
59 (ulong)NULL,
60 #endif
61 FileIO,
62 { 0, filesink_putByte }
63 };
64
/* Shared, statically allocated output stream for stdout.
** Same positional layout as the stderr stream; the sink data starts
** out as 0 and is bound to stdout lazily by StdOutOutput().
*/
65 static StreamOut stdoutStreamOut =
66 {
67 ASCII,
68 FSM_ASCII,
69 DEFAULT_NL_CONFIG,
70 #ifdef TIDY_WIN32_MLANG_SUPPORT
71 (ulong)NULL,
72 #endif
73 FileIO,
74 { 0, filesink_putByte }
75 };
76
77 StreamOut* StdErrOutput(void)
78 {
79 if ( stderrStreamOut.sink.sinkData == 0 )
80 stderrStreamOut.sink.sinkData = (ulong) stderr;
81 return &stderrStreamOut;
82 }
83
84 StreamOut* StdOutOutput(void)
85 {
86 if ( stdoutStreamOut.sink.sinkData == 0 )
87 stdoutStreamOut.sink.sinkData = (ulong) stdout;
88 return &stdoutStreamOut;
89 }
90
91 void ReleaseStreamOut( StreamOut* out )
92 {
93 if ( out && out != &stderrStreamOut && out != &stdoutStreamOut )
94 {
95 if ( out->iotype == FileIO )
96 fclose( (FILE*) out->sink.sinkData );
97 MemFree( out );
98 }
99 }
100
101
102 /************************
103 ** Source
104 ************************/
105
106 static StreamIn* initStreamIn( TidyDocImpl* doc, int encoding )
107 {
108 StreamIn *in = (StreamIn*) MemAlloc( sizeof(StreamIn) );
109
110 ClearMemory( in, sizeof(StreamIn) );
111 in->curline = 1;
112 in->curcol = 1;
113 in->encoding = encoding;
114 in->state = FSM_ASCII;
115 in->doc = doc;
116 in->bufsize = CHARBUF_SIZE;
117 in->charbuf = (tchar*)MemAlloc(sizeof(tchar) * in->bufsize);
118 #ifdef TIDY_STORE_ORIGINAL_TEXT
119 in->otextbuf = NULL;
120 in->otextlen = 0;
121 in->otextsize = 0;
122 #endif
123 return in;
124 }
125
126 void freeStreamIn(StreamIn* in)
127 {
128 #ifdef TIDY_STORE_ORIGINAL_TEXT
129 if (in->otextbuf)
130 MemFree(in->otextbuf);
131 #endif
132 MemFree(in->charbuf);
133 MemFree(in);
134 }
135
136 StreamIn* FileInput( TidyDocImpl* doc, FILE *fp, int encoding )
137 {
138 StreamIn *in = initStreamIn( doc, encoding );
139 initFileSource( &in->source, fp );
140 in->iotype = FileIO;
141 return in;
142 }
143
144 StreamIn* BufferInput( TidyDocImpl* doc, TidyBuffer* buf, int encoding )
145 {
146 StreamIn *in = initStreamIn( doc, encoding );
147 initInputBuffer( &in->source, buf );
148 in->iotype = BufferIO;
149 return in;
150 }
151
152 StreamIn* UserInput( TidyDocImpl* doc, TidyInputSource* source, int encoding )
153 {
154 StreamIn *in = initStreamIn( doc, encoding );
155 memcpy( &in->source, source, sizeof(TidyInputSource) );
156 in->iotype = UserIO;
157 return in;
158 }
159
160 int ReadBOMEncoding(StreamIn *in)
161 {
162 uint c, c1;
163 #if SUPPORT_UTF16_ENCODINGS
164 uint bom;
165 #endif
166
167 c = ReadByte(in);
168 if (c == EndOfStream)
169 return -1;
170
171 c1 = ReadByte( in );
172 if (c1 == EndOfStream)
173 {
174 UngetByte(in, c);
175 return -1;
176 }
177
178 /* todo: dont warn about mismatch for auto input encoding */
179 /* todo: let the user override the encoding found here */
180
181 #if SUPPORT_UTF16_ENCODINGS
182 bom = (c << 8) + c1;
183
184 if ( bom == UNICODE_BOM_BE )
185 {
186 /* big-endian UTF-16 */
187 if ( in->encoding != UTF16 && in->encoding != UTF16BE )
188 ReportEncodingWarning(in->doc, ENCODING_MISMATCH, UTF16BE);
189
190 return UTF16BE; /* return decoded BOM */
191 }
192 else if (bom == UNICODE_BOM_LE)
193 {
194 /* little-endian UTF-16 */
195 if (in->encoding != UTF16 && in->encoding != UTF16LE)
196 ReportEncodingWarning(in->doc, ENCODING_MISMATCH, UTF16LE);
197
198 return UTF16LE; /* return decoded BOM */
199 }
200 else
201 #endif /* SUPPORT_UTF16_ENCODINGS */
202 {
203 uint c2 = ReadByte(in);
204
205 if (c2 == EndOfStream)
206 {
207 UngetByte(in, c1);
208 UngetByte(in, c);
209 return -1;
210 }
211
212 if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8)
213 {
214 /* UTF-8 */
215 if (in->encoding != UTF8)
216 ReportEncodingWarning(in->doc, ENCODING_MISMATCH, UTF8);
217
218 return UTF8;
219 }
220 else
221 UngetByte( in, c2 );
222 }
223
224 UngetByte(in, c1);
225 UngetByte(in, c);
226
227 return -1;
228 }
229
230 #ifdef TIDY_STORE_ORIGINAL_TEXT
231 void AddByteToOriginalText(StreamIn *in, tmbchar c)
232 {
233 if (in->otextlen + 1 >= in->otextsize)
234 {
235 size_t size = in->otextsize ? 1 : 2;
236 in->otextbuf = MemRealloc(in->otextbuf, in->otextsize + size);
237 in->otextsize += size;
238 }
239 in->otextbuf[in->otextlen++] = c;
240 in->otextbuf[in->otextlen ] = 0;
241 }
242
243 void AddCharToOriginalText(StreamIn *in, tchar c)
244 {
245 int i, err, count = 0;
246 tmbchar buf[10] = {0};
247
248 err = EncodeCharToUTF8Bytes(c, buf, NULL, &count);
249
250 if (err)
251 {
252 /* replacement character 0xFFFD encoded as UTF-8 */
253 buf[0] = (byte) 0xEF;
254 buf[1] = (byte) 0xBF;
255 buf[2] = (byte) 0xBD;
256 count = 3;
257 }
258
259 for (i = 0; i < count; ++i)
260 AddByteToOriginalText(in, buf[i]);
261 }
262 #endif
263
264
265 uint ReadChar( StreamIn *in )
266 {
267 uint c = EndOfStream;
268 uint tabsize = cfg( in->doc, TidyTabSize );
269 #ifdef TIDY_STORE_ORIGINAL_TEXT
270 Bool added = no;
271 #endif
272
273 if ( in->pushed )
274 return PopChar( in );
275
276 in->lastcol = in->curcol;
277
278 if ( in->tabs > 0 )
279 {
280 in->curcol++;
281 in->tabs--;
282 return ' ';
283 }
284
285 for (;;)
286 {
287 c = ReadCharFromStream(in);
288
289 if ( EndOfStream == c )
290 return EndOfStream;
291
292 if (c == '\n')
293 {
294 #ifdef TIDY_STORE_ORIGINAL_TEXT
295 added = yes;
296 AddCharToOriginalText(in, (tchar)c);
297 #endif
298 in->curcol = 1;
299 in->curline++;
300 break;
301 }
302
303 if (c == '\t')
304 {
305 #ifdef TIDY_STORE_ORIGINAL_TEXT
306 added = yes;
307 AddCharToOriginalText(in, (tchar)c);
308 #endif
309 in->tabs = tabsize - ((in->curcol - 1) % tabsize) - 1;
310 in->curcol++;
311 c = ' ';
312 break;
313 }
314
315 /* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */
316 if (c == '\r')
317 {
318 #ifdef TIDY_STORE_ORIGINAL_TEXT
319 added = yes;
320 AddCharToOriginalText(in, (tchar)c);
321 #endif
322 c = ReadCharFromStream(in);
323 if (c != '\n')
324 {
325 UngetChar( c, in );
326 c = '\n';
327 }
328 else
329 {
330 #ifdef TIDY_STORE_ORIGINAL_TEXT
331 AddCharToOriginalText(in, (tchar)c);
332 #endif
333 }
334 in->curcol = 1;
335 in->curline++;
336 break;
337 }
338
339 #ifndef NO_NATIVE_ISO2022_SUPPORT
340 /* strip control characters, except for Esc */
341 if (c == '\033')
342 break;
343 #endif
344
345 /* Form Feed is allowed in HTML */
346 if ( c == '\015' && !cfgBool(in->doc, TidyXmlTags) )
347 break;
348
349 if ( c < 32 )
350 continue; /* discard control char */
351
352 /* watch out for chars that have already been decoded such as */
353 /* IS02022, UTF-8 etc, that don't require further decoding */
354
355 if (
356 in->encoding == RAW
357 #ifndef NO_NATIVE_ISO2022_SUPPORT
358 || in->encoding == ISO2022
359 #endif
360 || in->encoding == UTF8
361
362 #if SUPPORT_ASIAN_ENCODINGS
363 || in->encoding == SHIFTJIS /* #431953 - RJ */
364 || in->encoding == BIG5 /* #431953 - RJ */
365 #endif
366 )
367 {
368 in->curcol++;
369 break;
370 }
371
372 #if SUPPORT_UTF16_ENCODINGS
373 /* handle surrogate pairs */
374 if ( in->encoding == UTF16LE ||
375 in->encoding == UTF16 ||
376 in->encoding == UTF16BE )
377 {
378 if ( !IsValidUTF16FromUCS4(c) )
379 {
380 /* invalid UTF-16 value */
381 ReportEncodingError(in->doc, INVALID_UTF16, c, yes);
382 c = 0;
383 }
384 else if ( IsLowSurrogate(c) )
385 {
386 uint n = c;
387 uint m = ReadCharFromStream( in );
388 if ( m == EndOfStream )
389 return EndOfStream;
390
391 c = 0;
392 if ( IsHighSurrogate(m) )
393 {
394 n = CombineSurrogatePair( m, n );
395 if ( IsValidCombinedChar(n) )
396 c = n;
397 }
398 /* not a valid pair */
399 if ( 0 == c )
400 ReportEncodingError( in->doc, INVALID_UTF16, c, yes );
401 }
402 }
403 #endif
404
405 /* Do first: acts on range 128 - 255 */
406 switch ( in->encoding )
407 {
408 case MACROMAN:
409 c = DecodeMacRoman( c );
410 break;
411 case IBM858:
412 c = DecodeIbm850( c );
413 break;
414 case LATIN0:
415 c = DecodeLatin0( c );
416 break;
417 }
418
419 /* produced e.g. as a side-effect of smart quotes in Word */
420 /* but can't happen if using MACROMAN encoding */
421 if ( 127 < c && c < 160 )
422 {
423 uint c1 = 0, replMode = DISCARDED_CHAR;
424 Bool isVendorChar = ( in->encoding == WIN1252 ||
425 in->encoding == MACROMAN );
426 Bool isWinChar = ( in->encoding == WIN1252 ||
427 ReplacementCharEncoding == WIN1252 );
428 Bool isMacChar = ( in->encoding == MACROMAN ||
429 ReplacementCharEncoding == MACROMAN );
430
431 /* set error position just before offending character */
432 in->doc->lexer->lines = in->curline;
433 in->doc->lexer->columns = in->curcol;
434
435 if ( isWinChar )
436 c1 = DecodeWin1252( c );
437 else if ( isMacChar )
438 c1 = DecodeMacRoman( c );
439 if ( c1 )
440 replMode = REPLACED_CHAR;
441
442 if ( c1 == 0 && isVendorChar )
443 ReportEncodingError(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR);
444 else if ( ! isVendorChar )
445 ReportEncodingError(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR);
446
447 c = c1;
448 }
449
450 if ( c == 0 )
451 continue; /* illegal char is discarded */
452
453 in->curcol++;
454 break;
455 }
456
457 #ifdef TIDY_STORE_ORIGINAL_TEXT
458 if (!added)
459 AddCharToOriginalText(in, (tchar)c);
460 #endif
461
462 return c;
463 }
464
465 uint PopChar( StreamIn *in )
466 {
467 uint c = EndOfStream;
468 if ( in->pushed )
469 {
470 assert( in->bufpos > 0 );
471 c = in->charbuf[ --in->bufpos ];
472 if ( in->bufpos == 0 )
473 in->pushed = no;
474
475 if ( c == '\n' )
476 {
477 in->curcol = 1;
478 in->curline++;
479 return c;
480 }
481 in->curcol++;
482 }
483 return c;
484 }
485
486 void UngetChar( uint c, StreamIn *in )
487 {
488 if (c == EndOfStream)
489 {
490 /* fprintf(stderr, "Attempt to UngetChar EOF\n"); */
491 return;
492 }
493
494 in->pushed = yes;
495
496 if (in->bufpos + 1 >= in->bufsize)
497 in->charbuf = (tchar*)MemRealloc(in->charbuf, sizeof(tchar) * ++(in->bufsize));
498
499 in->charbuf[(in->bufpos)++] = c;
500
501 if (c == '\n')
502 --(in->curline);
503
504 in->curcol = in->lastcol;
505 }
506
507
508
509 /************************
510 ** Sink
511 ************************/
512
513 static StreamOut* initStreamOut( int encoding, uint nl )
514 {
515 StreamOut* out = (StreamOut*) MemAlloc( sizeof(StreamOut) );
516 ClearMemory( out, sizeof(StreamOut) );
517 out->encoding = encoding;
518 out->state = FSM_ASCII;
519 out->nl = nl;
520 return out;
521 }
522
523 StreamOut* FileOutput( FILE* fp, int encoding, uint nl )
524 {
525 StreamOut* out = initStreamOut( encoding, nl );
526 initFileSink( &out->sink, fp );
527 out->iotype = FileIO;
528 return out;
529 }
530 StreamOut* BufferOutput( TidyBuffer* buf, int encoding, uint nl )
531 {
532 StreamOut* out = initStreamOut( encoding, nl );
533 initOutputBuffer( &out->sink, buf );
534 out->iotype = BufferIO;
535 return out;
536 }
537 StreamOut* UserOutput( TidyOutputSink* sink, int encoding, uint nl )
538 {
539 StreamOut* out = initStreamOut( encoding, nl );
540 memcpy( &out->sink, sink, sizeof(TidyOutputSink) );
541 out->iotype = UserIO;
542 return out;
543 }
544
545 void WriteChar( uint c, StreamOut* out )
546 {
547 /* Translate outgoing newlines */
548 if ( LF == c )
549 {
550 if ( out->nl == TidyCRLF )
551 WriteChar( CR, out );
552 else if ( out->nl == TidyCR )
553 c = CR;
554 }
555
556 if (out->encoding == MACROMAN)
557 {
558 EncodeMacRoman( c, out );
559 }
560 else if (out->encoding == WIN1252)
561 {
562 EncodeWin1252( c, out );
563 }
564 else if (out->encoding == IBM858)
565 {
566 EncodeIbm858( c, out );
567 }
568 else if (out->encoding == LATIN0)
569 {
570 EncodeLatin0( c, out );
571 }
572
573 else if (out->encoding == UTF8)
574 {
575 int count = 0;
576
577 EncodeCharToUTF8Bytes( c, NULL, &out->sink, &count );
578 if (count <= 0)
579 {
580 /* ReportEncodingError(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */
581 /* replacement char 0xFFFD encoded as UTF-8 */
582 PutByte(0xEF, out); PutByte(0xBF, out); PutByte(0xBF, out);
583 }
584 }
585 #ifndef NO_NATIVE_ISO2022_SUPPORT
586 else if (out->encoding == ISO2022)
587 {
588 if (c == 0x1b) /* ESC */
589 out->state = FSM_ESC;
590 else
591 {
592 switch (out->state)
593 {
594 case FSM_ESC:
595 if (c == '$')
596 out->state = FSM_ESCD;
597 else if (c == '(')
598 out->state = FSM_ESCP;
599 else
600 out->state = FSM_ASCII;
601 break;
602
603 case FSM_ESCD:
604 if (c == '(')
605 out->state = FSM_ESCDP;
606 else
607 out->state = FSM_NONASCII;
608 break;
609
610 case FSM_ESCDP:
611 out->state = FSM_NONASCII;
612 break;
613
614 case FSM_ESCP:
615 out->state = FSM_ASCII;
616 break;
617
618 case FSM_NONASCII:
619 c &= 0x7F;
620 break;
621 }
622 }
623
624 PutByte(c, out);
625 }
626 #endif /* NO_NATIVE_ISO2022_SUPPORT */
627
628 #if SUPPORT_UTF16_ENCODINGS
629 else if ( out->encoding == UTF16LE ||
630 out->encoding == UTF16BE ||
631 out->encoding == UTF16 )
632 {
633 int i, numChars = 1;
634 uint theChars[2];
635
636 if ( !IsValidUTF16FromUCS4(c) )
637 {
638 /* invalid UTF-16 value */
639 /* ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
640 c = 0;
641 numChars = 0;
642 }
643 else if ( IsCombinedChar(c) )
644 {
645 /* output both, unless something goes wrong */
646 numChars = 2;
647 if ( !SplitSurrogatePair(c, &theChars[0], &theChars[1]) )
648 {
649 /* ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
650 c = 0;
651 numChars = 0;
652 }
653 }
654 else
655 {
656 /* just put the char out */
657 theChars[0] = c;
658 }
659
660 for (i = 0; i < numChars; i++)
661 {
662 c = theChars[i];
663
664 if (out->encoding == UTF16LE)
665 {
666 uint ch = c & 0xFF; PutByte(ch, out);
667 ch = (c >> 8) & 0xFF; PutByte(ch, out);
668 }
669
670 else if (out->encoding == UTF16BE || out->encoding == UTF16)
671 {
672 uint ch = (c >> 8) & 0xFF; PutByte(ch, out);
673 ch = c & 0xFF; PutByte(ch, out);
674 }
675 }
676 }
677 #endif
678
679 #if SUPPORT_ASIAN_ENCODINGS
680 else if (out->encoding == BIG5 || out->encoding == SHIFTJIS)
681 {
682 if (c < 128)
683 PutByte(c, out);
684 else
685 {
686 uint ch = (c >> 8) & 0xFF; PutByte(ch, out);
687 ch = c & 0xFF; PutByte(ch, out);
688 }
689 }
690 #endif
691
692 else
693 PutByte( c, out );
694 }
695
696
697
698 /****************************
699 ** Miscellaneous / Helpers
700 ****************************/
701
702 /* char encoding used when replacing illegal SGML chars,
703 ** regardless of specified encoding. Set at compile time
704 ** to either Windows or Mac (via DFLT_REPL_CHARENC).
** ReadChar() compares it against WIN1252 and MACROMAN to decide
** how characters in the range 128-159 are decoded.
705 */
706 const int ReplacementCharEncoding = DFLT_REPL_CHARENC;
707
708
709 /* Mapping for Windows Western character set CP 1252
710 ** (chars 128-159/U+0080-U+009F) to Unicode.
** Indexed by (byte value - 128). An entry of 0x0000 means the
** table has no mapping for that byte; ReadChar() discards a
** character whose decoded value is 0.
711 */
712 static const uint Win2Unicode[32] =
713 {
714 0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
715 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000,
716 0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
717 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178
718 };
719
720 /* Function for conversion from Windows-1252 to Unicode */
721 uint DecodeWin1252(uint c)
722 {
723 if (127 < c && c < 160)
724 c = Win2Unicode[c - 128];
725
726 return c;
727 }
728
729 static void EncodeWin1252( uint c, StreamOut* out )
730 {
731 if (c < 128 || (c > 159 && c < 256))
732 PutByte(c, out);
733 else
734 {
735 int i;
736
737 for (i = 128; i < 160; i++)
738 if (Win2Unicode[i - 128] == c)
739 {
740 PutByte(i, out);
741 break;
742 }
743 }
744 }
745
746 /*
747 John Love-Jensen contributed this table for mapping MacRoman
748 character set to Unicode
Indexed by (byte value - 128); sixteen entries per blank-line
separated group, i.e. one group per high nibble 0x8 to 0xF.
749 */
750
751 /* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */
752 static const uint Mac2Unicode[128] =
753 {
754 /* x7F = DEL */
755
756 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
757 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
758
759 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
760 0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
761
762 0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
763 0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
764
765 0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
766 /* =BD U+2126 OHM SIGN */
767 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
768
769 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
770 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
771
772 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
773 /* =DB U+00A4 CURRENCY SIGN */
774 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
775
776 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
777 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
778 /* xF0 = Apple Logo */
779 /* =F0 U+2665 BLACK HEART SUIT */
780 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
781 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
782 };
783
784 /* Function to convert from MacRoman to Unicode */
785 uint DecodeMacRoman(uint c)
786 {
787 if (127 < c)
788 c = Mac2Unicode[c - 128];
789 return c;
790 }
791
792 static void EncodeMacRoman( uint c, StreamOut* out )
793 {
794 if (c < 128)
795 PutByte(c, out);
796 else
797 {
798 /* For mac users, map Unicode back to MacRoman. */
799 int i;
800 for (i = 128; i < 256; i++)
801 {
802 if (Mac2Unicode[i - 128] == c)
803 {
804 PutByte(i, out);
805 break;
806 }
807 }
808 }
809 }
810
811 /* Mapping for OS/2 Western character set CP 850
812 ** (chars 128-255) to Unicode.
** Indexed by (byte value - 128). The table is also searched in
** reverse by EncodeIbm858(); it contains the Euro sign (0x20AC).
813 */
814 static const uint IBM2Unicode[128] =
815 {
816 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
817 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
818 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
819 0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192,
820 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
821 0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
822 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0,
823 0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510,
824 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3,
825 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
826 0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x20AC, 0x00cd, 0x00ce,
827 0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,
828 0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,
829 0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,
830 0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,
831 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0
832 };
833
834 /* Function for conversion from OS/2-850 to Unicode */
835 uint DecodeIbm850(uint c)
836 {
837 if (127 < c && c < 256)
838 c = IBM2Unicode[c - 128];
839
840 return c;
841 }
842
843 /* For OS/2,Java users, map Unicode back to IBM858 (IBM850+Euro). */
844 static void EncodeIbm858( uint c, StreamOut* out )
845 {
846 if (c < 128)
847 PutByte(c, out);
848 else
849 {
850 int i;
851 for (i = 128; i < 256; i++)
852 {
853 if (IBM2Unicode[i - 128] == c)
854 {
855 PutByte(i, out);
856 break;
857 }
858 }
859 }
860 }
861
862
863 /* Convert from Latin0 (aka Latin9, ISO-8859-15) to Unicode */
864 uint DecodeLatin0(uint c)
865 {
866 if (159 < c && c < 191)
867 {
868 switch (c)
869 {
870 case 0xA4: c = 0x20AC; break;
871 case 0xA6: c = 0x0160; break;
872 case 0xA8: c = 0x0161; break;
873 case 0xB4: c = 0x017D; break;
874 case 0xB8: c = 0x017E; break;
875 case 0xBC: c = 0x0152; break;
876 case 0xBD: c = 0x0153; break;
877 case 0xBE: c = 0x0178; break;
878 }
879 }
880 return c;
881 }
882
883 /* Map Unicode back to ISO-8859-15. */
884 static void EncodeLatin0( uint c, StreamOut* out )
885 {
886 switch (c)
887 {
888 case 0x20AC: c = 0xA4; break;
889 case 0x0160: c = 0xA6; break;
890 case 0x0161: c = 0xA8; break;
891 case 0x017D: c = 0xB4; break;
892 case 0x017E: c = 0xB8; break;
893 case 0x0152: c = 0xBC; break;
894 case 0x0153: c = 0xBD; break;
895 case 0x0178: c = 0xBE; break;
896 }
897 PutByte(c, out);
898 }
899
900 /*
901 Table to map symbol font characters to Unicode; undefined
902 characters are mapped to 0x0000 and characters without any
903 Unicode equivalent are mapped to '?'. Is this appropriate?
The table has 256 entries and is indexed directly by the symbol
font character code (see DecodeSymbolFont); sixteen entries per
blank-line separated group. '?' appears as 0x003F.
904 */
905
906 static const uint Symbol2Unicode[] =
907 {
908 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
909 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
910
911 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
912 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
913
914 0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D,
915 0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,
916
917 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
918 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
919
920 0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,
921 0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,
922
923 0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,
924 0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,
925
926 0x00AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,
927 0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,
928
929 0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,
930 0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x003F,
931
932 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
933 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
934
935 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
936 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
937
938 0x00A0, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663,
939 0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
940
941 0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x00B7,
942 0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x003F, 0x003F, 0x21B5,
943
944 0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
945 0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
946
947 0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5,
948 0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,
949
950 0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x003F, 0x003F,
951 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F,
952
953 0x20AC, 0x232A, 0x222B, 0x2320, 0x003F, 0x2321, 0x003F, 0x003F,
954 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F
955 };
956
957 /* Function to convert from Symbol Font chars to Unicode */
958 uint DecodeSymbolFont(uint c)
959 {
960 if (c > 255)
961 return c;
962
963 /* todo: add some error message */
964
965 return Symbol2Unicode[c];
966 }
967
968
969 /* Facilitates user defined source by providing
970 ** an entry point to marshal pointers-to-functions.
971 ** Needed by .NET and possibly other language bindings.
972 */
973 Bool TIDY_CALL tidyInitSource( TidyInputSource* source,
974 void* srcData,
975 TidyGetByteFunc gbFunc,
976 TidyUngetByteFunc ugbFunc,
977 TidyEOFFunc endFunc )
978 {
979 Bool status = ( source && srcData && gbFunc && ugbFunc && endFunc );
980
981 if ( status )
982 {
983 source->sourceData = (ulong) srcData;
984 source->getByte = gbFunc;
985 source->ungetByte = ugbFunc;
986 source->eof = endFunc;
987 }
988
989 return status;
990 }
991
992 Bool TIDY_CALL tidyInitSink( TidyOutputSink* sink,
993 void* snkData,
994 TidyPutByteFunc pbFunc )
995 {
996 Bool status = ( sink && snkData && pbFunc );
997 if ( status )
998 {
999 sink->sinkData = (ulong) snkData;
1000 sink->putByte = pbFunc;
1001 }
1002 return status;
1003 }
1004
1005 /* GetByte must return a byte value in a signed
1006 ** integer so that a negative value can signal EOF
1007 ** without interfering w/ 0-255 legitimate byte values.
1008 */
1009 uint TIDY_CALL tidyGetByte( TidyInputSource* source )
1010 {
1011 int bv = source->getByte( source->sourceData );
1012 return (uint) bv;
1013 }
1014 Bool TIDY_CALL tidyIsEOF( TidyInputSource* source )
1015 {
1016 return source->eof( source->sourceData );
1017 }
1018 void TIDY_CALL tidyUngetByte( TidyInputSource* source, uint ch )
1019 {
1020 source->ungetByte( source->sourceData, (byte) ch );
1021 }
1022 void TIDY_CALL tidyPutByte( TidyOutputSink* sink, uint ch )
1023 {
1024 sink->putByte( sink->sinkData, (byte) ch );
1025 }
1026
1027 static uint ReadByte( StreamIn* in )
1028 {
1029 return tidyGetByte( &in->source );
1030 }
1031 Bool IsEOF( StreamIn* in )
1032 {
1033 return tidyIsEOF( &in->source );
1034 }
1035 static void UngetByte( StreamIn* in, uint byteValue )
1036 {
1037 tidyUngetByte( &in->source, byteValue );
1038 }
1039 static void PutByte( uint byteValue, StreamOut* out )
1040 {
1041 tidyPutByte( &out->sink, byteValue );
1042 }
1043
1044 #if 0
1045 static void UngetRawBytesToStream( StreamIn *in, byte* buf, int *count )
1046 {
1047 int i;
1048
1049 for (i = 0; i < *count; i++)
1050 {
1051 /* should never get here; testing for 0xFF, a valid char, is not a good idea */
1052 if ( in && IsEOF(in) )
1053 {
1054 /* fprintf(stderr,"Attempt to unget EOF in UngetRawBytesToStream\n"); */
1055 *count = -i;
1056 return;
1057 }
1058
1059 in->source.ungetByte( in->source.sourceData, buf[i] );
1060 }
1061 }
1062
1063 /*
1064 Read raw bytes from stream, return <= 0 if EOF; or if
1065 "unget" is true, Unget the bytes to re-synchronize the input stream
1066 Normally UTF-8 successor bytes are read using this routine.
1067 */
1068 static void ReadRawBytesFromStream( StreamIn *in, byte* buf, int *count )
1069 {
1070 int ix;
1071 for ( ix=0; ix < *count; ++ix )
1072 {
1073 if ( in->rawPushed )
1074 {
1075 buf[ix] = in->rawBytebuf[ --in->rawBufpos ];
1076 if ( in->rawBufpos == 0 )
1077 in->rawPushed = no;
1078 }
1079 else
1080 {
1081 if ( in->source.eof(in->source.sourceData) )
1082 {
1083 *count = -i;
1084 break;
1085 }
1086 buf[ix] = in->source.getByte( in->source.sourceData );
1087 }
1088 }
1089 }
1090 #endif /* 0 */
1091
1092 /* read char from stream */
1093 static uint ReadCharFromStream( StreamIn* in )
1094 {
1095 uint c, n;
1096 #ifdef TIDY_WIN32_MLANG_SUPPORT
1097 uint bytesRead = 0;
1098 #endif
1099
1100 if ( IsEOF(in) )
1101 return EndOfStream;
1102
1103 c = ReadByte( in );
1104
1105 if (c == EndOfStream)
1106 return c;
1107
1108 #ifndef NO_NATIVE_ISO2022_SUPPORT
1109 /*
1110 A document in ISO-2022 based encoding uses some ESC sequences
1111 called "designator" to switch character sets. The designators
1112 defined and used in ISO-2022-JP are:
1113
1114 "ESC" + "(" + ? for ISO646 variants
1115
1116 "ESC" + "$" + ? and
1117 "ESC" + "$" + "(" + ? for multibyte character sets
1118
1119 Where ? stands for a single character used to indicate the
1120 character set for multibyte characters.
1121
1122 Tidy handles this by preserving the escape sequence and
1123 setting the top bit of each byte for non-ascii chars. This
1124 bit is then cleared on output. The input stream keeps track
1125 of the state to determine when to set/clear the bit.
1126 */
1127
1128 if (in->encoding == ISO2022)
1129 {
1130 if (c == 0x1b) /* ESC */
1131 {
1132 in->state = FSM_ESC;
1133 return c;
1134 }
1135
1136 switch (in->state)
1137 {
1138 case FSM_ESC:
1139 if (c == '$')
1140 in->state = FSM_ESCD;
1141 else if (c == '(')
1142 in->state = FSM_ESCP;
1143 else
1144 in->state = FSM_ASCII;
1145 break;
1146
1147 case FSM_ESCD:
1148 if (c == '(')
1149 in->state = FSM_ESCDP;
1150 else
1151 in->state = FSM_NONASCII;
1152 break;
1153
1154 case FSM_ESCDP:
1155 in->state = FSM_NONASCII;
1156 break;
1157
1158 case FSM_ESCP:
1159 in->state = FSM_ASCII;
1160 break;
1161
1162 case FSM_NONASCII:
1163 c |= 0x80;
1164 break;
1165 }
1166
1167 return c;
1168 }
1169 #endif /* #ifndef NO_NATIVE_ISO2022_SUPPORT */
1170
1171 #if SUPPORT_UTF16_ENCODINGS
1172 if ( in->encoding == UTF16LE )
1173 {
1174 uint c1 = ReadByte( in );
1175 if ( EndOfStream == c1 )
1176 return EndOfStream;
1177 n = (c1 << 8) + c;
1178 return n;
1179 }
1180
1181 if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */
1182 {
1183 uint c1 = ReadByte( in );
1184 if ( EndOfStream == c1 )
1185 return EndOfStream;
1186 n = (c << 8) + c1;
1187 return n;
1188 }
1189 #endif
1190
1191 if ( in->encoding == UTF8 )
1192 {
1193 /* deal with UTF-8 encoded char */
1194
1195 int err, count = 0;
1196
1197 /* first byte "c" is passed in separately */
1198 err = DecodeUTF8BytesToChar( &n, c, NULL, &in->source, &count );
1199 if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */
1200 return EndOfStream;
1201 else if (err)
1202 {
1203 /* set error position just before offending character */
1204 in->doc->lexer->lines = in->curline;
1205 in->doc->lexer->columns = in->curcol;
1206
1207 ReportEncodingError(in->doc, INVALID_UTF8, n, no);
1208 n = 0xFFFD; /* replacement char */
1209 }
1210
1211 return n;
1212 }
1213
1214 #if SUPPORT_ASIAN_ENCODINGS
1215 /*
1216 This section is suitable for any "multibyte" variable-width
1217 character encoding in which a one-byte code is less than
1218 128, and the first byte of a two-byte code is greater or
1219 equal to 128. Note that Big5 and ShiftJIS fit into this
1220 kind, even though their second byte may be less than 128
1221 */
1222 if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS))
1223 {
1224 if (c < 128)
1225 return c;
1226 else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */
1227 {
1228 /*
1229 Rick Cameron pointed out that for Shift_JIS, the values from
1230 0xa1 through 0xdf represent singe-byte characters
1231 (U+FF61 to U+FF9F - half-shift Katakana)
1232 */
1233 return c;
1234 }
1235 else
1236 {
1237 uint c1 = ReadByte( in );
1238 if ( EndOfStream == c1 )
1239 return EndOfStream;
1240 n = (c << 8) + c1;
1241 return n;
1242 }
1243 }
1244 #endif
1245
1246 #ifdef TIDY_WIN32_MLANG_SUPPORT
1247 else if (in->encoding > WIN32MLANG)
1248 {
1249 assert( in->mlang != 0 );
1250 return Win32MLangGetChar((byte)c, in, &bytesRead);
1251 }
1252 #endif
1253
1254 else
1255 n = c;
1256
1257 return n;
1258 }
1259
1260 /* Output a Byte Order Mark if required */
1261 void outBOM( StreamOut *out )
1262 {
1263 if ( out->encoding == UTF8
1264 #if SUPPORT_UTF16_ENCODINGS
1265 || out->encoding == UTF16LE
1266 || out->encoding == UTF16BE
1267 || out->encoding == UTF16
1268 #endif
1269 )
1270 {
1271 /* this will take care of encoding the BOM correctly */
1272 WriteChar( UNICODE_BOM, out );
1273 }
1274 }
1275
1276 /* this is an intermediate fix for various problems; in the */
1277 /* long term, the code and data in charsets.c should be used */
/* Table relating Tidy's encoding ids to charset names.
**   id          - Tidy encoding id (ASCII, UTF8, ...)
**   name        - charset name returned by GetEncodingNameFromTidyId();
**                 NULL when the encoding has none
**   tidyOptName - encoding name as used in Tidy option values
** The entries without a charset name are kept at the end of the table.
*/
1278 static struct _enc2iana
1279 {
1280 uint id;
1281 ctmbstr name;
1282 ctmbstr tidyOptName;
1283 } const enc2iana[] =
1284 {
1285 { ASCII, "us-ascii", "ascii" },
1286 { LATIN0, "iso-8859-15", "latin0" },
1287 { LATIN1, "iso-8859-1", "latin1" },
1288 { UTF8, "utf-8", "utf8" },
1289 { MACROMAN, "macintosh", "mac" },
1290 { WIN1252, "windows-1252", "win1252" },
1291 { IBM858, "ibm00858", "ibm858" },
1292 #if SUPPORT_UTF16_ENCODINGS
1293 { UTF16LE, "utf-16", "utf16le" },
1294 { UTF16BE, "utf-16", "utf16be" },
1295 { UTF16, "utf-16", "utf16" },
1296 #endif
1297 #if SUPPORT_ASIAN_ENCODINGS
1298 { BIG5, "big5", "big5" },
1299 { SHIFTJIS, "shift_jis", "shiftjis"},
1300 #endif
1301 #ifndef NO_NATIVE_ISO2022_SUPPORT
1302 { ISO2022, NULL, "iso2022" },
1303 #endif
1304 { RAW, NULL, "raw" }
1305 };
1306
1307 ctmbstr GetEncodingNameFromTidyId(uint id)
1308 {
1309 uint i;
1310
1311 for (i = 0; enc2iana[i].name; ++i)
1312 if (enc2iana[i].id == id)
1313 return enc2iana[i].name;
1314
1315 return NULL;
1316 }
1317
1318 ctmbstr GetEncodingOptNameFromTidyId(uint id)
1319 {
1320 uint i;
1321
1322 for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1323 if (enc2iana[i].id == id)
1324 return enc2iana[i].tidyOptName;
1325
1326 return NULL;
1327 }
1328
1329 int GetCharEncodingFromOptName( ctmbstr charenc )
1330 {
1331 uint i;
1332
1333 for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1334 if (tmbstrcasecmp(charenc, enc2iana[i].tidyOptName) == 0 )
1335 return enc2iana[i].id;
1336
1337 return -1;
1338 }
1339
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more
information.