~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

TidyLib
tidy/src/streamio.c

Version: ~ [ 1.0 ] ~

  1 /* streamio.c -- handles character stream I/O
  2 
  3   (c) 1998-2005 (W3C) MIT, ERCIM, Keio University
  4   See tidy.h for the copyright notice.
  5 
  6   CVS Info :
  7 
  8     $Author: arnaud02 $ 
  9     $Date: 2005/07/22 15:54:58 $ 
 10     $Revision: 1.30 $ 
 11 
 12   Wrapper around Tidy input source and output sink
 13   that calls appropriate interfaces, and applies
 14   necessary char encoding transformations: to/from
 15   ISO-10646 and/or UTF-8.
 16 
 17 */
 18 
 19 #include <stdio.h>
 20 #include <errno.h>
 21 
 22 #include "streamio.h"
 23 #include "tidy-int.h"
 24 #include "lexer.h"
 25 #include "message.h"
 26 #include "utf8.h"
 27 #include "tmbstr.h"
 28 
 29 #ifdef TIDY_WIN32_MLANG_SUPPORT
 30 #include "win32tc.h"
 31 #endif
 32 
 33 /************************
 34 ** Forward Declarations
 35 ************************/
 36 
 37 static uint ReadCharFromStream( StreamIn* in );
 38 
 39 static uint ReadByte( StreamIn* in );
 40 static void UngetByte( StreamIn* in, uint byteValue );
 41 
 42 static void PutByte( uint byteValue, StreamOut* out );
 43 
 44 static void EncodeWin1252( uint c, StreamOut* out );
 45 static void EncodeMacRoman( uint c, StreamOut* out );
 46 static void EncodeIbm858( uint c, StreamOut* out );
 47 static void EncodeLatin0( uint c, StreamOut* out );
 48 
 49 /******************************
 50 ** Static (duration) Globals
 51 ******************************/
 52 
/* Shared, statically allocated sink used for output to stderr.
**
** The initializer is positional.  The values appear to line up with the
** encoding, state, newline style, (MLang slot, Win32 only), I/O type and
** sink members of StreamOut -- confirm against the struct in streamio.h.
**
** sink.sinkData starts out as 0 and is bound to stderr lazily by
** StdErrOutput(), presumably because stderr is not a constant expression
** on every platform.
*/
static StreamOut stderrStreamOut = 
{
    ASCII,
    FSM_ASCII,
    DEFAULT_NL_CONFIG,
#ifdef TIDY_WIN32_MLANG_SUPPORT
    (ulong)NULL,
#endif
    FileIO,
    { 0, filesink_putByte }
};
 64 
/* Shared, statically allocated sink used for output to stdout.
**
** Positional initializer with the same layout as the stderr sink.
** sink.sinkData starts out as 0 and is bound to stdout lazily by
** StdOutOutput().
*/
static StreamOut stdoutStreamOut = 
{
    ASCII,
    FSM_ASCII,
    DEFAULT_NL_CONFIG,
#ifdef TIDY_WIN32_MLANG_SUPPORT
    (ulong)NULL,
#endif
    FileIO,
    { 0, filesink_putByte }
};
 76 
 77 StreamOut* StdErrOutput(void)
 78 {
 79   if ( stderrStreamOut.sink.sinkData == 0 )
 80       stderrStreamOut.sink.sinkData = (ulong) stderr;
 81   return &stderrStreamOut;
 82 }
 83 
 84 StreamOut* StdOutOutput(void)
 85 {
 86   if ( stdoutStreamOut.sink.sinkData == 0 )
 87       stdoutStreamOut.sink.sinkData = (ulong) stdout;
 88   return &stdoutStreamOut;
 89 }
 90 
 91 void  ReleaseStreamOut( StreamOut* out )
 92 {
 93     if ( out && out != &stderrStreamOut && out != &stdoutStreamOut )
 94     {
 95         if ( out->iotype == FileIO )
 96             fclose( (FILE*) out->sink.sinkData );
 97         MemFree( out );
 98     }
 99 }
100 
101 
102 /************************
103 ** Source
104 ************************/
105 
106 static StreamIn* initStreamIn( TidyDocImpl* doc, int encoding )
107 {
108     StreamIn *in = (StreamIn*) MemAlloc( sizeof(StreamIn) );
109 
110     ClearMemory( in, sizeof(StreamIn) );
111     in->curline = 1;
112     in->curcol = 1;
113     in->encoding = encoding;
114     in->state = FSM_ASCII;
115     in->doc = doc;
116     in->bufsize = CHARBUF_SIZE;
117     in->charbuf = (tchar*)MemAlloc(sizeof(tchar) * in->bufsize);
118 #ifdef TIDY_STORE_ORIGINAL_TEXT
119     in->otextbuf = NULL;
120     in->otextlen = 0;
121     in->otextsize = 0;
122 #endif
123     return in;
124 }
125 
126 void freeStreamIn(StreamIn* in)
127 {
128 #ifdef TIDY_STORE_ORIGINAL_TEXT
129     if (in->otextbuf)
130         MemFree(in->otextbuf);
131 #endif
132     MemFree(in->charbuf);
133     MemFree(in);
134 }
135 
136 StreamIn* FileInput( TidyDocImpl* doc, FILE *fp, int encoding )
137 {
138     StreamIn *in = initStreamIn( doc, encoding );
139     initFileSource( &in->source, fp );
140     in->iotype = FileIO;
141     return in;
142 }
143 
144 StreamIn* BufferInput( TidyDocImpl* doc, TidyBuffer* buf, int encoding )
145 {
146     StreamIn *in = initStreamIn( doc, encoding );
147     initInputBuffer( &in->source, buf );
148     in->iotype = BufferIO;
149     return in;
150 }
151 
152 StreamIn* UserInput( TidyDocImpl* doc, TidyInputSource* source, int encoding )
153 {
154     StreamIn *in = initStreamIn( doc, encoding );
155     memcpy( &in->source, source, sizeof(TidyInputSource) );
156     in->iotype = UserIO;
157     return in;
158 }
159 
/* Looks for a Unicode byte order mark at the current read position.
**
** Returns UTF16BE or UTF16LE (only when SUPPORT_UTF16_ENCODINGS is
** enabled) or UTF8 when the corresponding BOM is found; the BOM bytes
** stay consumed in that case.  Returns -1 when no BOM is recognised or
** the stream ends before enough bytes are available; every byte read
** here is then handed back through UngetByte.
**
** An ENCODING_MISMATCH warning is reported when the BOM disagrees with
** in->encoding.
*/
int ReadBOMEncoding(StreamIn *in)
{
    uint c, c1;
#if SUPPORT_UTF16_ENCODINGS
    uint bom;
#endif

    c = ReadByte(in);
    if (c == EndOfStream)
        return -1;

    c1 = ReadByte( in );
    if (c1 == EndOfStream)
    {
        UngetByte(in, c);
        return -1;
    }

    /* todo: dont warn about mismatch for auto input encoding */
    /* todo: let the user override the encoding found here */

#if SUPPORT_UTF16_ENCODINGS
    /* the two UTF-16 BOMs are two bytes long */
    bom = (c << 8) + c1;

    if ( bom == UNICODE_BOM_BE )
    {
        /* big-endian UTF-16 */
        if ( in->encoding != UTF16 && in->encoding != UTF16BE )
            ReportEncodingWarning(in->doc, ENCODING_MISMATCH, UTF16BE);

        return UTF16BE; /* return decoded BOM */
    }
    else if (bom == UNICODE_BOM_LE)
    {
        /* little-endian UTF-16 */
        if (in->encoding != UTF16 && in->encoding != UTF16LE)
            ReportEncodingWarning(in->doc, ENCODING_MISMATCH, UTF16LE);

        return UTF16LE; /* return decoded BOM */
    }
    else
#endif /* SUPPORT_UTF16_ENCODINGS */
    {
        /* the UTF-8 BOM needs a third byte */
        uint c2 = ReadByte(in);

        if (c2 == EndOfStream)
        {
            /* hand bytes back in reverse order of reading */
            UngetByte(in, c1);
            UngetByte(in, c);
            return -1;
        }

        if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8)
        {
            /* UTF-8 */
            if (in->encoding != UTF8)
                ReportEncodingWarning(in->doc, ENCODING_MISMATCH, UTF8);

            return UTF8;
        }
        else
            UngetByte( in, c2 );
    }

    /* no BOM: restore the first two bytes, last read first */
    UngetByte(in, c1);
    UngetByte(in, c);

    return -1;
}
229 
230 #ifdef TIDY_STORE_ORIGINAL_TEXT
231 void AddByteToOriginalText(StreamIn *in, tmbchar c)
232 {
233     if (in->otextlen + 1 >= in->otextsize)
234     {
235         size_t size = in->otextsize ? 1 : 2;
236         in->otextbuf = MemRealloc(in->otextbuf, in->otextsize + size);
237         in->otextsize += size;
238     }
239     in->otextbuf[in->otextlen++] = c;
240     in->otextbuf[in->otextlen  ] = 0;
241 }
242 
243 void AddCharToOriginalText(StreamIn *in, tchar c)
244 {
245     int i, err, count = 0;
246     tmbchar buf[10] = {0};
247     
248     err = EncodeCharToUTF8Bytes(c, buf, NULL, &count);
249 
250     if (err)
251     {
252         /* replacement character 0xFFFD encoded as UTF-8 */
253         buf[0] = (byte) 0xEF;
254         buf[1] = (byte) 0xBF;
255         buf[2] = (byte) 0xBD;
256         count = 3;
257     }
258     
259     for (i = 0; i < count; ++i)
260         AddByteToOriginalText(in, buf[i]);
261 }
262 #endif
263 
264 
265 uint ReadChar( StreamIn *in )
266 {
267     uint c = EndOfStream;
268     uint tabsize = cfg( in->doc, TidyTabSize );
269 #ifdef TIDY_STORE_ORIGINAL_TEXT
270     Bool added = no;
271 #endif
272 
273     if ( in->pushed )
274         return PopChar( in );
275 
276     in->lastcol = in->curcol;
277 
278     if ( in->tabs > 0 )
279     {
280         in->curcol++;
281         in->tabs--;
282         return ' ';
283     }
284     
285     for (;;)
286     {
287         c = ReadCharFromStream(in);
288 
289         if ( EndOfStream == c )
290             return EndOfStream;
291 
292         if (c == '\n')
293         {
294 #ifdef TIDY_STORE_ORIGINAL_TEXT
295             added = yes;
296             AddCharToOriginalText(in, (tchar)c);
297 #endif
298             in->curcol = 1;
299             in->curline++;
300             break;
301         }
302 
303         if (c == '\t')
304         {
305 #ifdef TIDY_STORE_ORIGINAL_TEXT
306             added = yes;
307             AddCharToOriginalText(in, (tchar)c);
308 #endif
309             in->tabs = tabsize - ((in->curcol - 1) % tabsize) - 1;
310             in->curcol++;
311             c = ' ';
312             break;
313         }
314 
315         /* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */
316         if (c == '\r')
317         {
318 #ifdef TIDY_STORE_ORIGINAL_TEXT
319             added = yes;
320             AddCharToOriginalText(in, (tchar)c);
321 #endif
322             c = ReadCharFromStream(in);
323             if (c != '\n')
324             {
325                 UngetChar( c, in );
326                 c = '\n';
327             }
328             else
329             {
330 #ifdef TIDY_STORE_ORIGINAL_TEXT
331                 AddCharToOriginalText(in, (tchar)c);
332 #endif
333             }
334             in->curcol = 1;
335             in->curline++;
336             break;
337         }
338 
339 #ifndef NO_NATIVE_ISO2022_SUPPORT
340         /* strip control characters, except for Esc */
341         if (c == '\033')
342             break;
343 #endif
344 
345         /* Form Feed is allowed in HTML */
346         if ( c == '\015' && !cfgBool(in->doc, TidyXmlTags) )
347             break;
348             
349         if ( c < 32 )
350             continue; /* discard control char */
351 
352         /* watch out for chars that have already been decoded such as */
353         /* IS02022, UTF-8 etc, that don't require further decoding */
354 
355         if (
356             in->encoding == RAW
357 #ifndef NO_NATIVE_ISO2022_SUPPORT
358          || in->encoding == ISO2022
359 #endif
360          || in->encoding == UTF8
361 
362 #if SUPPORT_ASIAN_ENCODINGS
363          || in->encoding == SHIFTJIS /* #431953 - RJ */
364          || in->encoding == BIG5     /* #431953 - RJ */
365 #endif
366            )
367         {
368             in->curcol++;
369             break;
370         }
371 
372 #if SUPPORT_UTF16_ENCODINGS
373         /* handle surrogate pairs */
374         if ( in->encoding == UTF16LE ||
375              in->encoding == UTF16   ||
376              in->encoding == UTF16BE )
377         {
378             if ( !IsValidUTF16FromUCS4(c) )
379             {
380                 /* invalid UTF-16 value */
381                 ReportEncodingError(in->doc, INVALID_UTF16, c, yes);
382                 c = 0;
383             }
384             else if ( IsLowSurrogate(c) )
385             {
386                 uint n = c;
387                 uint m = ReadCharFromStream( in );
388                 if ( m == EndOfStream )
389                    return EndOfStream;
390 
391                 c = 0;
392                 if ( IsHighSurrogate(m) )
393                 {
394                     n = CombineSurrogatePair( m, n );
395                     if ( IsValidCombinedChar(n) )
396                         c = n;
397                 }
398                 /* not a valid pair */
399                 if ( 0 == c )
400                     ReportEncodingError( in->doc, INVALID_UTF16, c, yes );
401             }
402         }
403 #endif
404 
405         /* Do first: acts on range 128 - 255 */
406         switch ( in->encoding )
407         {
408         case MACROMAN:
409             c = DecodeMacRoman( c );
410             break;
411         case IBM858:
412             c = DecodeIbm850( c );
413             break;
414         case LATIN0:
415             c = DecodeLatin0( c );
416             break;
417         }
418 
419         /* produced e.g. as a side-effect of smart quotes in Word */
420         /* but can't happen if using MACROMAN encoding */
421         if ( 127 < c && c < 160 )
422         {
423             uint c1 = 0, replMode = DISCARDED_CHAR;
424             Bool isVendorChar = ( in->encoding == WIN1252 ||
425                                   in->encoding == MACROMAN );
426             Bool isWinChar    = ( in->encoding == WIN1252 ||
427                                   ReplacementCharEncoding == WIN1252 );
428             Bool isMacChar    = ( in->encoding == MACROMAN ||
429                                   ReplacementCharEncoding == MACROMAN );
430             
431             /* set error position just before offending character */
432             in->doc->lexer->lines = in->curline;
433             in->doc->lexer->columns = in->curcol;
434                 
435             if ( isWinChar )
436                 c1 = DecodeWin1252( c );
437             else if ( isMacChar )
438                 c1 = DecodeMacRoman( c );
439             if ( c1 )
440                 replMode = REPLACED_CHAR;
441                 
442             if ( c1 == 0 && isVendorChar )
443                 ReportEncodingError(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR);
444             else if ( ! isVendorChar )
445                 ReportEncodingError(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR);
446                 
447             c = c1;
448         }
449 
450         if ( c == 0 )
451             continue; /* illegal char is discarded */
452         
453         in->curcol++;
454         break;
455     }
456 
457 #ifdef TIDY_STORE_ORIGINAL_TEXT
458     if (!added)
459         AddCharToOriginalText(in, (tchar)c);
460 #endif
461 
462     return c;
463 }
464 
465 uint PopChar( StreamIn *in )
466 {
467     uint c = EndOfStream;
468     if ( in->pushed )
469     {
470         assert( in->bufpos > 0 );
471         c = in->charbuf[ --in->bufpos ];
472         if ( in->bufpos == 0 )
473             in->pushed = no;
474 
475         if ( c == '\n' )
476         {
477             in->curcol = 1;
478             in->curline++;
479             return c;
480         }
481         in->curcol++;
482     }
483     return c;
484 }
485 
486 void UngetChar( uint c, StreamIn *in )
487 {
488     if (c == EndOfStream)
489     {
490         /* fprintf(stderr, "Attempt to UngetChar EOF\n"); */
491         return;
492     }
493     
494     in->pushed = yes;
495 
496     if (in->bufpos + 1 >= in->bufsize)
497         in->charbuf = (tchar*)MemRealloc(in->charbuf, sizeof(tchar) * ++(in->bufsize));
498 
499     in->charbuf[(in->bufpos)++] = c;
500 
501     if (c == '\n')
502         --(in->curline);
503 
504     in->curcol = in->lastcol;
505 }
506 
507 
508 
509 /************************
510 ** Sink
511 ************************/
512 
513 static StreamOut* initStreamOut( int encoding, uint nl )
514 {
515     StreamOut* out = (StreamOut*) MemAlloc( sizeof(StreamOut) );
516     ClearMemory( out, sizeof(StreamOut) );
517     out->encoding = encoding;
518     out->state = FSM_ASCII;
519     out->nl = nl;
520     return out;
521 }
522 
523 StreamOut* FileOutput( FILE* fp, int encoding, uint nl )
524 {
525     StreamOut* out = initStreamOut( encoding, nl );
526     initFileSink( &out->sink, fp );
527     out->iotype = FileIO;
528     return out;
529 }
530 StreamOut* BufferOutput( TidyBuffer* buf, int encoding, uint nl )
531 {
532     StreamOut* out = initStreamOut( encoding, nl );
533     initOutputBuffer( &out->sink, buf );
534     out->iotype = BufferIO;
535     return out;
536 }
537 StreamOut* UserOutput( TidyOutputSink* sink, int encoding, uint nl )
538 {
539     StreamOut* out = initStreamOut( encoding, nl );
540     memcpy( &out->sink, sink, sizeof(TidyOutputSink) );
541     out->iotype = UserIO;
542     return out;
543 }
544 
545 void WriteChar( uint c, StreamOut* out )
546 {
547     /* Translate outgoing newlines */
548     if ( LF == c )
549     {
550       if ( out->nl == TidyCRLF )
551           WriteChar( CR, out );
552       else if ( out->nl == TidyCR )
553           c = CR;
554     }
555 
556     if (out->encoding == MACROMAN)
557     {
558         EncodeMacRoman( c, out );
559     }
560     else if (out->encoding == WIN1252)
561     {
562         EncodeWin1252( c, out );
563     }
564     else if (out->encoding == IBM858)
565     {
566         EncodeIbm858( c, out );
567     }
568     else if (out->encoding == LATIN0)
569     {
570         EncodeLatin0( c, out );
571     }
572 
573     else if (out->encoding == UTF8)
574     {
575         int count = 0;
576         
577         EncodeCharToUTF8Bytes( c, NULL, &out->sink, &count );
578         if (count <= 0)
579         {
580           /* ReportEncodingError(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */
581             /* replacement char 0xFFFD encoded as UTF-8 */
582             PutByte(0xEF, out); PutByte(0xBF, out); PutByte(0xBF, out);
583         }
584     }
585 #ifndef NO_NATIVE_ISO2022_SUPPORT
586     else if (out->encoding == ISO2022)
587     {
588         if (c == 0x1b)  /* ESC */
589             out->state = FSM_ESC;
590         else
591         {
592             switch (out->state)
593             {
594             case FSM_ESC:
595                 if (c == '$')
596                     out->state = FSM_ESCD;
597                 else if (c == '(')
598                     out->state = FSM_ESCP;
599                 else
600                     out->state = FSM_ASCII;
601                 break;
602 
603             case FSM_ESCD:
604                 if (c == '(')
605                     out->state = FSM_ESCDP;
606                 else
607                     out->state = FSM_NONASCII;
608                 break;
609 
610             case FSM_ESCDP:
611                 out->state = FSM_NONASCII;
612                 break;
613 
614             case FSM_ESCP:
615                 out->state = FSM_ASCII;
616                 break;
617 
618             case FSM_NONASCII:
619                 c &= 0x7F;
620                 break;
621             }
622         }
623 
624         PutByte(c, out);
625     }
626 #endif /* NO_NATIVE_ISO2022_SUPPORT */
627 
628 #if SUPPORT_UTF16_ENCODINGS
629     else if ( out->encoding == UTF16LE ||
630               out->encoding == UTF16BE ||
631               out->encoding == UTF16 )
632     {
633         int i, numChars = 1;
634         uint theChars[2];
635         
636         if ( !IsValidUTF16FromUCS4(c) )
637         {
638             /* invalid UTF-16 value */
639             /* ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
640             c = 0;
641             numChars = 0;
642         }
643         else if ( IsCombinedChar(c) )
644         {
645             /* output both, unless something goes wrong */
646             numChars = 2;
647             if ( !SplitSurrogatePair(c, &theChars[0], &theChars[1]) )
648             {
649                 /* ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
650                 c = 0;
651                 numChars = 0;
652             }
653         }
654         else
655         {
656             /* just put the char out */
657             theChars[0] = c;
658         }
659         
660         for (i = 0; i < numChars; i++)
661         {
662             c = theChars[i];
663             
664             if (out->encoding == UTF16LE)
665             {
666                 uint ch = c & 0xFF; PutByte(ch, out); 
667                 ch = (c >> 8) & 0xFF; PutByte(ch, out); 
668             }
669     
670             else if (out->encoding == UTF16BE || out->encoding == UTF16)
671             {
672                 uint ch = (c >> 8) & 0xFF; PutByte(ch, out); 
673                 ch = c & 0xFF; PutByte(ch, out); 
674             }
675         }
676     }
677 #endif
678 
679 #if SUPPORT_ASIAN_ENCODINGS
680     else if (out->encoding == BIG5 || out->encoding == SHIFTJIS)
681     {
682         if (c < 128)
683             PutByte(c, out);
684         else
685         {
686             uint ch = (c >> 8) & 0xFF; PutByte(ch, out); 
687             ch = c & 0xFF; PutByte(ch, out); 
688         }
689     }
690 #endif
691 
692     else
693         PutByte( c, out );
694 }
695 
696 
697 
698 /****************************
699 ** Miscellaneous / Helpers
700 ****************************/
701 
/* Character encoding used when replacing illegal SGML characters
** (bytes 128-159), regardless of the encoding specified for the
** document.  Fixed at compile time through DFLT_REPL_CHARENC to
** either Windows (WIN1252) or Mac (MACROMAN).
*/
const int ReplacementCharEncoding = DFLT_REPL_CHARENC;
707 
708 
/* Mapping for Windows Western character set CP 1252 
** (chars 128-159/U+0080-U+009F) to Unicode.
** Indexed by (byte value - 128).  A 0x0000 entry means the byte has
** no mapping in this table.
*/
static const uint Win2Unicode[32] =
{
    0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000,
    0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178
};
719 
720 /* Function for conversion from Windows-1252 to Unicode */
721 uint DecodeWin1252(uint c)
722 {
723     if (127 < c && c < 160)
724         c = Win2Unicode[c - 128];
725         
726     return c;
727 }
728 
729 static void EncodeWin1252( uint c, StreamOut* out )
730 {
731     if (c < 128 || (c > 159 && c < 256))
732         PutByte(c, out);
733     else
734     {
735         int i;
736 
737         for (i = 128; i < 160; i++)
738             if (Win2Unicode[i - 128] == c)
739             {
740                 PutByte(i, out);
741                 break;
742             }
743     }
744 }
745 
/*
   John Love-Jensen contributed this table for mapping MacRoman
   character set to Unicode
*/

/* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */
/* Indexed by (byte value - 128).  The "=XX U+..." notes inside the table
** appear to record alternative mappings for the entry on the next row;
** they are kept from the original source.
*/
static const uint Mac2Unicode[128] = 
{
    /* x7F = DEL */
    
    0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
    0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,

    0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
    0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,

    0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
    0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,

    0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
                                            /* =BD U+2126 OHM SIGN */
    0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,

    0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
    0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,

    0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
                            /* =DB U+00A4 CURRENCY SIGN */
    0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,

    0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
    0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
    /* xF0 = Apple Logo */
    /* =F0 U+2665 BLACK HEART SUIT */
    0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
    0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
};
783 
784 /* Function to convert from MacRoman to Unicode */
785 uint DecodeMacRoman(uint c)
786 {
787     if (127 < c)
788         c = Mac2Unicode[c - 128];
789     return c;
790 }
791 
792 static void EncodeMacRoman( uint c, StreamOut* out )
793 {
794         if (c < 128)
795             PutByte(c, out);
796         else
797         {
798             /* For mac users, map Unicode back to MacRoman. */
799             int i;
800             for (i = 128; i < 256; i++)
801             {
802                 if (Mac2Unicode[i - 128] == c)
803                 {
804                     PutByte(i, out);
805                     break;
806                 }
807             }
808         }
809 }
810 
/* Mapping for OS/2 Western character set CP 850
** (chars 128-255) to Unicode.
** Indexed by (byte value - 128).  The table contains U+20AC (euro sign),
** which matches the "IBM850+Euro" (IBM858) note on EncodeIbm858.
*/
static const uint IBM2Unicode[128] =
{
    0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
    0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
    0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
    0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192,
    0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
    0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0,
    0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510,
    0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3,
    0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
    0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x20AC, 0x00cd, 0x00ce,
    0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,
    0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,
    0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,
    0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,
    0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0
};
833 
834 /* Function for conversion from OS/2-850 to Unicode */
835 uint DecodeIbm850(uint c)
836 {
837     if (127 < c && c < 256)
838         c = IBM2Unicode[c - 128];
839 
840     return c;
841 }
842 
843 /* For OS/2,Java users, map Unicode back to IBM858 (IBM850+Euro). */
844 static void EncodeIbm858( uint c, StreamOut* out )
845 {
846     if (c < 128)
847         PutByte(c, out);
848     else
849     {
850         int i;
851         for (i = 128; i < 256; i++)
852         {
853             if (IBM2Unicode[i - 128] == c)
854             {
855                 PutByte(i, out);
856                 break;
857             }
858         }
859     }
860 }
861 
862 
863 /* Convert from Latin0 (aka Latin9, ISO-8859-15) to Unicode */
864 uint DecodeLatin0(uint c)
865 {
866     if (159 < c && c < 191)
867     {
868         switch (c)
869         {
870         case 0xA4: c = 0x20AC; break;
871         case 0xA6: c = 0x0160; break;
872         case 0xA8: c = 0x0161; break;
873         case 0xB4: c = 0x017D; break;
874         case 0xB8: c = 0x017E; break;
875         case 0xBC: c = 0x0152; break;
876         case 0xBD: c = 0x0153; break;
877         case 0xBE: c = 0x0178; break;
878         }
879     }
880     return c;
881 }
882 
883 /* Map Unicode back to ISO-8859-15. */
884 static void EncodeLatin0( uint c, StreamOut* out )
885 {
886     switch (c)
887     {
888     case 0x20AC: c = 0xA4; break;
889     case 0x0160: c = 0xA6; break;
890     case 0x0161: c = 0xA8; break;
891     case 0x017D: c = 0xB4; break;
892     case 0x017E: c = 0xB8; break;
893     case 0x0152: c = 0xBC; break;
894     case 0x0153: c = 0xBD; break;
895     case 0x0178: c = 0xBE; break;
896     }
897     PutByte(c, out);
898 }
899 
/*
   Table to map symbol font characters to Unicode, indexed by the
   symbol font character code (0-255). Undefined characters are mapped
   to 0x0000 and characters without any Unicode equivalent are mapped
   to '?' (0x003F). Is this appropriate?
*/

static const uint Symbol2Unicode[] = 
{
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
    0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
    
    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
    0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
    
    0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D,
    0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,
    
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
    
    0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,
    0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,
    
    0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,
    0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,
    
    0x00AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,
    0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,
    
    0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,
    0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x003F,
    
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    
    0x00A0, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663,
    0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
    
    0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x00B7,
    0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x003F, 0x003F, 0x21B5,
    
    0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
    0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
    
    0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5,
    0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,
    
    0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x003F, 0x003F,
    0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F,
    
    0x20AC, 0x232A, 0x222B, 0x2320, 0x003F, 0x2321, 0x003F, 0x003F,
    0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F
};
956 
957 /* Function to convert from Symbol Font chars to Unicode */
958 uint DecodeSymbolFont(uint c)
959 {
960     if (c > 255)
961         return c;
962 
963     /* todo: add some error message */
964 
965     return Symbol2Unicode[c];
966 }
967 
968 
969 /* Facilitates user defined source by providing
970 ** an entry point to marshal pointers-to-functions.
971 ** Needed by .NET and possibly other language bindings.
972 */
973 Bool TIDY_CALL tidyInitSource( TidyInputSource*  source,
974                                void*             srcData,
975                                TidyGetByteFunc   gbFunc,
976                                TidyUngetByteFunc ugbFunc,
977                                TidyEOFFunc       endFunc )
978 {
979   Bool status = ( source && srcData && gbFunc && ugbFunc && endFunc );
980 
981   if ( status )
982   {
983     source->sourceData = (ulong) srcData;
984     source->getByte    = gbFunc;
985     source->ungetByte  = ugbFunc;
986     source->eof        = endFunc;
987   }
988 
989   return status;
990 }
991 
992 Bool TIDY_CALL tidyInitSink( TidyOutputSink* sink,
993                              void*           snkData,
994                              TidyPutByteFunc pbFunc )
995 {
996   Bool status = ( sink && snkData && pbFunc );
997   if ( status )
998   {
999     sink->sinkData = (ulong) snkData;
1000     sink->putByte  = pbFunc;
1001   }
1002   return status;
1003 }
1004 
1005 /* GetByte must return a byte value in a signed
1006 ** integer so that a negative value can signal EOF
1007 ** without interfering w/ 0-255 legitimate byte values.
1008 */
1009 uint TIDY_CALL tidyGetByte( TidyInputSource* source )
1010 {
1011   int bv = source->getByte( source->sourceData );
1012   return (uint) bv;
1013 }
1014 Bool TIDY_CALL tidyIsEOF( TidyInputSource* source )
1015 {
1016   return source->eof( source->sourceData );
1017 }
1018 void TIDY_CALL tidyUngetByte( TidyInputSource* source, uint ch )
1019 {
1020     source->ungetByte( source->sourceData, (byte) ch );
1021 }
1022 void TIDY_CALL tidyPutByte( TidyOutputSink* sink, uint ch )
1023 {
1024     sink->putByte( sink->sinkData, (byte) ch );
1025 }
1026 
1027 static uint ReadByte( StreamIn* in )
1028 {
1029     return tidyGetByte( &in->source );
1030 }
1031 Bool IsEOF( StreamIn* in )
1032 {
1033     return tidyIsEOF( &in->source );
1034 }
1035 static void UngetByte( StreamIn* in, uint byteValue )
1036 {
1037     tidyUngetByte( &in->source, byteValue );
1038 }
1039 static void PutByte( uint byteValue, StreamOut* out )
1040 {
1041     tidyPutByte( &out->sink, byteValue );
1042 }
1043 
1044 #if 0
1045 static void UngetRawBytesToStream( StreamIn *in, byte* buf, int *count )
1046 {
1047     int i;
1048     
1049     for (i = 0; i < *count; i++)
1050     {
1051         /* should never get here; testing for 0xFF, a valid char, is not a good idea */
1052         if ( in && IsEOF(in) )
1053         {
1054             /* fprintf(stderr,"Attempt to unget EOF in UngetRawBytesToStream\n"); */
1055             *count = -i;
1056             return;
1057         }
1058 
1059         in->source.ungetByte( in->source.sourceData, buf[i] );
1060     }
1061 }
1062 
1063 /*
1064    Read raw bytes from stream, return <= 0 if EOF; or if
1065    "unget" is true, Unget the bytes to re-synchronize the input stream
1066    Normally UTF-8 successor bytes are read using this routine.
1067 */
1068 static void ReadRawBytesFromStream( StreamIn *in, byte* buf, int *count )
1069 {
1070     int ix;
1071     for ( ix=0; ix < *count; ++ix )
1072     {
1073         if ( in->rawPushed )
1074         {
1075             buf[ix] = in->rawBytebuf[ --in->rawBufpos ];
1076             if ( in->rawBufpos == 0 )
1077                 in->rawPushed = no;
1078         }
1079         else
1080         {
1081             if ( in->source.eof(in->source.sourceData) )
1082             {
1083                 *count = -i;
1084                 break;
1085             }
1086             buf[ix] = in->source.getByte( in->source.sourceData );
1087         }
1088     }
1089 }
1090 #endif /* 0 */
1091 
1092 /* read char from stream */
1093 static uint ReadCharFromStream( StreamIn* in )
1094 {
1095     uint c, n;
1096 #ifdef TIDY_WIN32_MLANG_SUPPORT
1097     uint bytesRead = 0;
1098 #endif
1099 
1100     if ( IsEOF(in) )
1101         return EndOfStream;
1102     
1103     c = ReadByte( in );
1104 
1105     if (c == EndOfStream)
1106         return c;
1107 
1108 #ifndef NO_NATIVE_ISO2022_SUPPORT
1109     /*
1110        A document in ISO-2022 based encoding uses some ESC sequences
1111        called "designator" to switch character sets. The designators
1112        defined and used in ISO-2022-JP are:
1113 
1114         "ESC" + "(" + ?     for ISO646 variants
1115 
1116         "ESC" + "$" + ?     and
1117         "ESC" + "$" + "(" + ?   for multibyte character sets
1118 
1119        Where ? stands for a single character used to indicate the
1120        character set for multibyte characters.
1121 
1122        Tidy handles this by preserving the escape sequence and
1123        setting the top bit of each byte for non-ascii chars. This
1124        bit is then cleared on output. The input stream keeps track
1125        of the state to determine when to set/clear the bit.
1126     */
1127 
1128     if (in->encoding == ISO2022)
1129     {
1130         if (c == 0x1b)  /* ESC */
1131         {
1132             in->state = FSM_ESC;
1133             return c;
1134         }
1135 
1136         switch (in->state)
1137         {
1138         case FSM_ESC:
1139             if (c == '$')
1140                 in->state = FSM_ESCD;
1141             else if (c == '(')
1142                 in->state = FSM_ESCP;
1143             else
1144                 in->state = FSM_ASCII;
1145             break;
1146 
1147         case FSM_ESCD:
1148             if (c == '(')
1149                 in->state = FSM_ESCDP;
1150             else
1151                 in->state = FSM_NONASCII;
1152             break;
1153 
1154         case FSM_ESCDP:
1155             in->state = FSM_NONASCII;
1156             break;
1157 
1158         case FSM_ESCP:
1159             in->state = FSM_ASCII;
1160             break;
1161 
1162         case FSM_NONASCII:
1163             c |= 0x80;
1164             break;
1165         }
1166 
1167         return c;
1168     }
1169 #endif /* #ifndef NO_NATIVE_ISO2022_SUPPORT */
1170 
1171 #if SUPPORT_UTF16_ENCODINGS
1172     if ( in->encoding == UTF16LE )
1173     {
1174         uint c1 = ReadByte( in );
1175         if ( EndOfStream == c1 )
1176             return EndOfStream;
1177         n = (c1 << 8) + c;
1178         return n;
1179     }
1180 
1181     if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */
1182     {
1183         uint c1 = ReadByte( in );
1184         if ( EndOfStream == c1 )
1185             return EndOfStream;
1186         n = (c << 8) + c1;
1187         return n;
1188     }
1189 #endif
1190 
1191     if ( in->encoding == UTF8 )
1192     {
1193         /* deal with UTF-8 encoded char */
1194 
1195         int err, count = 0;
1196         
1197         /* first byte "c" is passed in separately */
1198         err = DecodeUTF8BytesToChar( &n, c, NULL, &in->source, &count );
1199         if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */
1200             return EndOfStream;
1201         else if (err)
1202         {
1203             /* set error position just before offending character */
1204             in->doc->lexer->lines = in->curline;
1205             in->doc->lexer->columns = in->curcol;
1206 
1207             ReportEncodingError(in->doc, INVALID_UTF8, n, no);
1208             n = 0xFFFD; /* replacement char */
1209         }
1210         
1211         return n;
1212     }
1213     
1214 #if SUPPORT_ASIAN_ENCODINGS
1215     /*
1216        This section is suitable for any "multibyte" variable-width 
1217        character encoding in which a one-byte code is less than
1218        128, and the first byte of a two-byte code is greater or
1219        equal to 128. Note that Big5 and ShiftJIS fit into this
1220        kind, even though their second byte may be less than 128
1221     */
1222     if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS))
1223     {
1224         if (c < 128)
1225             return c;
1226         else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */
1227         {
1228             /*
1229               Rick Cameron pointed out that for Shift_JIS, the values from
1230               0xa1 through 0xdf represent singe-byte characters
1231               (U+FF61 to U+FF9F - half-shift Katakana)
1232             */
1233             return c;
1234         }
1235         else
1236         {
1237             uint c1 = ReadByte( in );
1238             if ( EndOfStream == c1 )
1239                 return EndOfStream;
1240             n = (c << 8) + c1;
1241             return n;
1242         }
1243     }
1244 #endif
1245 
1246 #ifdef TIDY_WIN32_MLANG_SUPPORT
1247     else if (in->encoding > WIN32MLANG)
1248     {
1249         assert( in->mlang != 0 );
1250         return Win32MLangGetChar((byte)c, in, &bytesRead);
1251     }
1252 #endif
1253 
1254     else
1255         n = c;
1256         
1257     return n;
1258 }
1259 
1260 /* Output a Byte Order Mark if required */
1261 void outBOM( StreamOut *out )
1262 {
1263     if ( out->encoding == UTF8
1264 #if SUPPORT_UTF16_ENCODINGS
1265          || out->encoding == UTF16LE
1266          || out->encoding == UTF16BE
1267          || out->encoding == UTF16
1268 #endif
1269        )
1270     {
1271         /* this will take care of encoding the BOM correctly */
1272         WriteChar( UNICODE_BOM, out );
1273     }
1274 }
1275 
1276 /* this is in intermediate fix for various problems in the */
1277 /* long term code and data in charsets.c should be used    */
1278 static struct _enc2iana
1279 {
1280     uint id;
1281     ctmbstr name;
1282     ctmbstr tidyOptName;
1283 } const enc2iana[] =
1284 {
1285   { ASCII,    "us-ascii",     "ascii"   },
1286   { LATIN0,   "iso-8859-15",  "latin0"  },
1287   { LATIN1,   "iso-8859-1",   "latin1"  },
1288   { UTF8,     "utf-8",        "utf8"   },
1289   { MACROMAN, "macintosh",    "mac"     },
1290   { WIN1252,  "windows-1252", "win1252" },
1291   { IBM858,   "ibm00858",     "ibm858"  },
1292 #if SUPPORT_UTF16_ENCODINGS
1293   { UTF16LE,  "utf-16",       "utf16le" },
1294   { UTF16BE,  "utf-16",       "utf16be" },
1295   { UTF16,    "utf-16",       "utf16"   },
1296 #endif
1297 #if SUPPORT_ASIAN_ENCODINGS
1298   { BIG5,     "big5",         "big5"    },
1299   { SHIFTJIS, "shift_jis",    "shiftjis"},
1300 #endif
1301 #ifndef NO_NATIVE_ISO2022_SUPPORT
1302   { ISO2022,  NULL,           "iso2022" },
1303 #endif
1304   { RAW,      NULL,           "raw"     }
1305 };
1306 
1307 ctmbstr GetEncodingNameFromTidyId(uint id)
1308 {
1309     uint i;
1310 
1311     for (i = 0; enc2iana[i].name; ++i)
1312         if (enc2iana[i].id == id)
1313             return enc2iana[i].name;
1314 
1315     return NULL;
1316 }
1317 
1318 ctmbstr GetEncodingOptNameFromTidyId(uint id)
1319 {
1320     uint i;
1321 
1322     for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1323         if (enc2iana[i].id == id)
1324             return enc2iana[i].tidyOptName;
1325 
1326     return NULL;
1327 }
1328 
1329 int GetCharEncodingFromOptName( ctmbstr charenc )
1330 {
1331     uint i;
1332 
1333     for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1334         if (tmbstrcasecmp(charenc, enc2iana[i].tidyOptName) == 0 )
1335             return enc2iana[i].id;
1336 
1337     return -1;
1338 }
1339 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.