~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

TidyLib
tidy/src/lexer.c

Version: ~ [ 1.0 ] ~

  1 /* lexer.c -- Lexer for html parser
  2   
  3   (c) 1998-2005 (W3C) MIT, ERCIM, Keio University
  4   See tidy.h for the copyright notice.
  5   
  6   CVS Info :
  7 
  8     $Author: arnaud02 $ 
  9     $Date: 2005/10/13 12:11:01 $ 
 10     $Revision: 1.173 $ 
 11 
 12 */
 13 
 14 /*
 15   Given a file stream fp it returns a sequence of tokens.
 16 
 17      GetToken(fp) gets the next token
 18      UngetToken(fp) provides one level undo
 19 
 20   The tags include an attribute list:
 21 
 22     - linked list of attribute/value nodes
 23     - each node has 2 NULL-terminated strings.
 24     - entities are replaced in attribute values
 25 
 26   white space is compacted if not in preformatted mode
 27   If not in preformatted mode then leading white space
 28   is discarded and subsequent white space sequences
 29   compacted to single space characters.
 30 
 31   If XmlTags is no then tag names and attribute names
 32   are folded to lower case.
 33 
 34  Not yet done:
 35     -   Doctype subset and marked sections
 36 */
 37 
 38 #include "tidy-int.h"
 39 #include "lexer.h"
 40 #include "parser.h"
 41 #include "entities.h"
 42 #include "streamio.h"
 43 #include "message.h"
 44 #include "tmbstr.h"
 45 #include "clean.h"
 46 #include "utf8.h"
 47 #include "streamio.h"
 48 
 49 /* Forward references
 50 */
 51 /* swallows closing '>' */
 52 static AttVal *ParseAttrs( TidyDocImpl* doc, Bool *isempty );
 53 
 54 static tmbstr ParseAttribute( TidyDocImpl* doc, Bool* isempty, 
 55                              Node **asp, Node **php );
 56 
 57 static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, Bool foldCase,
 58                          Bool *isempty, int *pdelim );
 59 
 60 static Node *ParseDocTypeDecl(TidyDocImpl* doc);
 61 
 62 static void AddAttrToList( AttVal** list, AttVal* av );
 63 
 64 /* used to classify characters for lexical purposes */
 65 #define MAP(c) ((unsigned)c < 128 ? lexmap[(unsigned)c] : 0)
 66 static uint lexmap[128];
 67 
 68 #define IsValidXMLAttrName(name) IsValidXMLID(name)
 69 #define IsValidXMLElemName(name) IsValidXMLID(name)
 70 
 71 static struct _doctypes
 72 {
 73     uint score;
 74     uint vers;
 75     ctmbstr name;
 76     ctmbstr fpi;
 77     ctmbstr si;
 78 } const W3C_Doctypes[] =
 79 {
 80   {  2, HT20, "HTML 2.0",               "-//IETF//DTD HTML 2.0//EN",              NULL,                                                       },
 81   {  2, HT20, "HTML 2.0",               "-//IETF//DTD HTML//EN",                  NULL,                                                       },
 82   {  2, HT20, "HTML 2.0",               "-//W3C//DTD HTML 2.0//EN",               NULL,                                                       },
 83   {  1, HT32, "HTML 3.2",               "-//W3C//DTD HTML 3.2//EN",               NULL,                                                       },
 84   {  1, HT32, "HTML 3.2",               "-//W3C//DTD HTML 3.2 Final//EN",         NULL,                                                       },
 85   {  1, HT32, "HTML 3.2",               "-//W3C//DTD HTML 3.2 Draft//EN",         NULL,                                                       },
 86   {  6, H40S, "HTML 4.0 Strict",        "-//W3C//DTD HTML 4.0//EN",               "http://www.w3.org/TR/REC-html40/strict.dtd"                },
 87   {  8, H40T, "HTML 4.0 Transitional",  "-//W3C//DTD HTML 4.0 Transitional//EN",  "http://www.w3.org/TR/REC-html40/loose.dtd"                 },
 88   {  7, H40F, "HTML 4.0 Frameset",      "-//W3C//DTD HTML 4.0 Frameset//EN",      "http://www.w3.org/TR/REC-html40/frameset.dtd"              },
 89   {  3, H41S, "HTML 4.01 Strict",       "-//W3C//DTD HTML 4.01//EN",              "http://www.w3.org/TR/html4/strict.dtd"                     },
 90   {  5, H41T, "HTML 4.01 Transitional", "-//W3C//DTD HTML 4.01 Transitional//EN", "http://www.w3.org/TR/html4/loose.dtd"                      },
 91   {  4, H41F, "HTML 4.01 Frameset",     "-//W3C//DTD HTML 4.01 Frameset//EN",     "http://www.w3.org/TR/html4/frameset.dtd"                   },
 92   {  9, X10S, "XHTML 1.0 Strict",       "-//W3C//DTD XHTML 1.0 Strict//EN",       "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"         },
 93   { 11, X10T, "XHTML 1.0 Transitional", "-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"   },
 94   { 10, X10F, "XHTML 1.0 Frameset",     "-//W3C//DTD XHTML 1.0 Frameset//EN",     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"       },
 95   { 12, XH11, "XHTML 1.1",              "-//W3C//DTD XHTML 1.1//EN",              "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"              },
 96   { 13, XB10, "XHTML Basic 1.0",        "-//W3C//DTD XHTML Basic 1.0//EN",        "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd"        },
 97 
 98   /* reminder to add XHTML Print 1.0 support, see http://www.w3.org/TR/xhtml-print */
 99 #if 0
100   { 14, XP10, "XHTML Print 1.0",        "-//W3C//DTD XHTML-Print 1.0//EN",         "http://www.w3.org/MarkUp/DTD/xhtml-print10.dtd"           },
101   { 14, XP10, "XHTML Print 1.0",        "-//PWG//DTD XHTML-Print 1.0//EN",         "http://www.xhtml-print.org/xhtml-print/xhtml-print10.dtd" },
102 #endif
103   /* final entry */
104   {  0,    0, NULL,                     NULL,                                     NULL                                                        }
105 };
106 
107 int HTMLVersion(TidyDocImpl* doc)
108 {
109     uint i;
110     uint j = 0;
111     uint score = 0;
112     uint vers = doc->lexer->versions;
113     uint dtver = doc->lexer->doctype;
114     TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
115     Bool xhtml = (cfgBool(doc, TidyXmlOut) || doc->lexer->isvoyager) &&
116                  !cfgBool(doc, TidyHtmlOut);
117     Bool html4 = dtmode == TidyDoctypeStrict || dtmode == TidyDoctypeLoose || VERS_FROM40 & dtver;
118 
119     for (i = 0; W3C_Doctypes[i].name; ++i)
120     {
121         if ((xhtml && !(VERS_XHTML & W3C_Doctypes[i].vers)) ||
122             (html4 && !(VERS_FROM40 & W3C_Doctypes[i].vers)))
123             continue;
124 
125         if (vers & W3C_Doctypes[i].vers &&
126             (W3C_Doctypes[i].score < score || !score))
127         {
128             score = W3C_Doctypes[i].score;
129             j = i;
130         }
131     }
132 
133     if (score)
134         return W3C_Doctypes[j].vers;
135 
136     return VERS_UNKNOWN;
137 }
138 
139 ctmbstr GetFPIFromVers(uint vers)
140 {
141     uint i;
142 
143     for (i = 0; W3C_Doctypes[i].name; ++i)
144         if (W3C_Doctypes[i].vers == vers)
145             return W3C_Doctypes[i].fpi;
146 
147     return NULL;
148 }
149 
150 static ctmbstr GetSIFromVers(uint vers)
151 {
152     uint i;
153 
154     for (i = 0; W3C_Doctypes[i].name; ++i)
155         if (W3C_Doctypes[i].vers == vers)
156             return W3C_Doctypes[i].si;
157 
158     return NULL;
159 }
160 
161 static ctmbstr GetNameFromVers(uint vers)
162 {
163     uint i;
164 
165     for (i = 0; W3C_Doctypes[i].name; ++i)
166         if (W3C_Doctypes[i].vers == vers)
167             return W3C_Doctypes[i].name;
168 
169     return NULL;
170 }
171 
172 static uint GetVersFromFPI(ctmbstr fpi)
173 {
174     uint i;
175 
176     for (i = 0; W3C_Doctypes[i].name; ++i)
177         if (tmbstrcasecmp(W3C_Doctypes[i].fpi, fpi) == 0)
178             return W3C_Doctypes[i].vers;
179 
180     return 0;
181 }
182 
183 /* everything is allowed in proprietary version of HTML */
184 /* this is handled here rather than in the tag/attr dicts */
185 void ConstrainVersion(TidyDocImpl* doc, uint vers)
186 {
187     doc->lexer->versions &= (vers | VERS_PROPRIETARY);
188 }
189 
190 Bool IsWhite(uint c)
191 {
192     uint map = MAP(c);
193 
194     return (map & white)!=0;
195 }
196 
197 Bool IsNewline(uint c)
198 {
199     uint map = MAP(c);
200     return (map & newline)!=0;
201 }
202 
203 Bool IsDigit(uint c)
204 {
205     uint map;
206 
207     map = MAP(c);
208 
209     return (map & digit)!=0;
210 }
211 
212 Bool IsLetter(uint c)
213 {
214     uint map;
215 
216     map = MAP(c);
217 
218     return (map & letter)!=0;
219 }
220 
221 Bool IsNamechar(uint c)
222 {
223     uint map = MAP(c);
224     return (map & namechar)!=0;
225 }
226 
227 Bool IsXMLLetter(uint c)
228 {
229     return ((c >= 0x41 && c <= 0x5a) ||
230         (c >= 0x61 && c <= 0x7a) ||
231         (c >= 0xc0 && c <= 0xd6) ||
232         (c >= 0xd8 && c <= 0xf6) ||
233         (c >= 0xf8 && c <= 0xff) ||
234         (c >= 0x100 && c <= 0x131) ||
235         (c >= 0x134 && c <= 0x13e) ||
236         (c >= 0x141 && c <= 0x148) ||
237         (c >= 0x14a && c <= 0x17e) ||
238         (c >= 0x180 && c <= 0x1c3) ||
239         (c >= 0x1cd && c <= 0x1f0) ||
240         (c >= 0x1f4 && c <= 0x1f5) ||
241         (c >= 0x1fa && c <= 0x217) ||
242         (c >= 0x250 && c <= 0x2a8) ||
243         (c >= 0x2bb && c <= 0x2c1) ||
244         c == 0x386 ||
245         (c >= 0x388 && c <= 0x38a) ||
246         c == 0x38c ||
247         (c >= 0x38e && c <= 0x3a1) ||
248         (c >= 0x3a3 && c <= 0x3ce) ||
249         (c >= 0x3d0 && c <= 0x3d6) ||
250         c == 0x3da ||
251         c == 0x3dc ||
252         c == 0x3de ||
253         c == 0x3e0 ||
254         (c >= 0x3e2 && c <= 0x3f3) ||
255         (c >= 0x401 && c <= 0x40c) ||
256         (c >= 0x40e && c <= 0x44f) ||
257         (c >= 0x451 && c <= 0x45c) ||
258         (c >= 0x45e && c <= 0x481) ||
259         (c >= 0x490 && c <= 0x4c4) ||
260         (c >= 0x4c7 && c <= 0x4c8) ||
261         (c >= 0x4cb && c <= 0x4cc) ||
262         (c >= 0x4d0 && c <= 0x4eb) ||
263         (c >= 0x4ee && c <= 0x4f5) ||
264         (c >= 0x4f8 && c <= 0x4f9) ||
265         (c >= 0x531 && c <= 0x556) ||
266         c == 0x559 ||
267         (c >= 0x561 && c <= 0x586) ||
268         (c >= 0x5d0 && c <= 0x5ea) ||
269         (c >= 0x5f0 && c <= 0x5f2) ||
270         (c >= 0x621 && c <= 0x63a) ||
271         (c >= 0x641 && c <= 0x64a) ||
272         (c >= 0x671 && c <= 0x6b7) ||
273         (c >= 0x6ba && c <= 0x6be) ||
274         (c >= 0x6c0 && c <= 0x6ce) ||
275         (c >= 0x6d0 && c <= 0x6d3) ||
276         c == 0x6d5 ||
277         (c >= 0x6e5 && c <= 0x6e6) ||
278         (c >= 0x905 && c <= 0x939) ||
279         c == 0x93d ||
280         (c >= 0x958 && c <= 0x961) ||
281         (c >= 0x985 && c <= 0x98c) ||
282         (c >= 0x98f && c <= 0x990) ||
283         (c >= 0x993 && c <= 0x9a8) ||
284         (c >= 0x9aa && c <= 0x9b0) ||
285         c == 0x9b2 ||
286         (c >= 0x9b6 && c <= 0x9b9) ||
287         (c >= 0x9dc && c <= 0x9dd) ||
288         (c >= 0x9df && c <= 0x9e1) ||
289         (c >= 0x9f0 && c <= 0x9f1) ||
290         (c >= 0xa05 && c <= 0xa0a) ||
291         (c >= 0xa0f && c <= 0xa10) ||
292         (c >= 0xa13 && c <= 0xa28) ||
293         (c >= 0xa2a && c <= 0xa30) ||
294         (c >= 0xa32 && c <= 0xa33) ||
295         (c >= 0xa35 && c <= 0xa36) ||
296         (c >= 0xa38 && c <= 0xa39) ||
297         (c >= 0xa59 && c <= 0xa5c) ||
298         c == 0xa5e ||
299         (c >= 0xa72 && c <= 0xa74) ||
300         (c >= 0xa85 && c <= 0xa8b) ||
301         c == 0xa8d ||
302         (c >= 0xa8f && c <= 0xa91) ||
303         (c >= 0xa93 && c <= 0xaa8) ||
304         (c >= 0xaaa && c <= 0xab0) ||
305         (c >= 0xab2 && c <= 0xab3) ||
306         (c >= 0xab5 && c <= 0xab9) ||
307         c == 0xabd ||
308         c == 0xae0 ||
309         (c >= 0xb05 && c <= 0xb0c) ||
310         (c >= 0xb0f && c <= 0xb10) ||
311         (c >= 0xb13 && c <= 0xb28) ||
312         (c >= 0xb2a && c <= 0xb30) ||
313         (c >= 0xb32 && c <= 0xb33) ||
314         (c >= 0xb36 && c <= 0xb39) ||
315         c == 0xb3d ||
316         (c >= 0xb5c && c <= 0xb5d) ||
317         (c >= 0xb5f && c <= 0xb61) ||
318         (c >= 0xb85 && c <= 0xb8a) ||
319         (c >= 0xb8e && c <= 0xb90) ||
320         (c >= 0xb92 && c <= 0xb95) ||
321         (c >= 0xb99 && c <= 0xb9a) ||
322         c == 0xb9c ||
323         (c >= 0xb9e && c <= 0xb9f) ||
324         (c >= 0xba3 && c <= 0xba4) ||
325         (c >= 0xba8 && c <= 0xbaa) ||
326         (c >= 0xbae && c <= 0xbb5) ||
327         (c >= 0xbb7 && c <= 0xbb9) ||
328         (c >= 0xc05 && c <= 0xc0c) ||
329         (c >= 0xc0e && c <= 0xc10) ||
330         (c >= 0xc12 && c <= 0xc28) ||
331         (c >= 0xc2a && c <= 0xc33) ||
332         (c >= 0xc35 && c <= 0xc39) ||
333         (c >= 0xc60 && c <= 0xc61) ||
334         (c >= 0xc85 && c <= 0xc8c) ||
335         (c >= 0xc8e && c <= 0xc90) ||
336         (c >= 0xc92 && c <= 0xca8) ||
337         (c >= 0xcaa && c <= 0xcb3) ||
338         (c >= 0xcb5 && c <= 0xcb9) ||
339         c == 0xcde ||
340         (c >= 0xce0 && c <= 0xce1) ||
341         (c >= 0xd05 && c <= 0xd0c) ||
342         (c >= 0xd0e && c <= 0xd10) ||
343         (c >= 0xd12 && c <= 0xd28) ||
344         (c >= 0xd2a && c <= 0xd39) ||
345         (c >= 0xd60 && c <= 0xd61) ||
346         (c >= 0xe01 && c <= 0xe2e) ||
347         c == 0xe30 ||
348         (c >= 0xe32 && c <= 0xe33) ||
349         (c >= 0xe40 && c <= 0xe45) ||
350         (c >= 0xe81 && c <= 0xe82) ||
351         c == 0xe84 ||
352         (c >= 0xe87 && c <= 0xe88) ||
353         c == 0xe8a ||
354         c == 0xe8d ||
355         (c >= 0xe94 && c <= 0xe97) ||
356         (c >= 0xe99 && c <= 0xe9f) ||
357         (c >= 0xea1 && c <= 0xea3) ||
358         c == 0xea5 ||
359         c == 0xea7 ||
360         (c >= 0xeaa && c <= 0xeab) ||
361         (c >= 0xead && c <= 0xeae) ||
362         c == 0xeb0 ||
363         (c >= 0xeb2 && c <= 0xeb3) ||
364         c == 0xebd ||
365         (c >= 0xec0 && c <= 0xec4) ||
366         (c >= 0xf40 && c <= 0xf47) ||
367         (c >= 0xf49 && c <= 0xf69) ||
368         (c >= 0x10a0 && c <= 0x10c5) ||
369         (c >= 0x10d0 && c <= 0x10f6) ||
370         c == 0x1100 ||
371         (c >= 0x1102 && c <= 0x1103) ||
372         (c >= 0x1105 && c <= 0x1107) ||
373         c == 0x1109 ||
374         (c >= 0x110b && c <= 0x110c) ||
375         (c >= 0x110e && c <= 0x1112) ||
376         c == 0x113c ||
377         c == 0x113e ||
378         c == 0x1140 ||
379         c == 0x114c ||
380         c == 0x114e ||
381         c == 0x1150 ||
382         (c >= 0x1154 && c <= 0x1155) ||
383         c == 0x1159 ||
384         (c >= 0x115f && c <= 0x1161) ||
385         c == 0x1163 ||
386         c == 0x1165 ||
387         c == 0x1167 ||
388         c == 0x1169 ||
389         (c >= 0x116d && c <= 0x116e) ||
390         (c >= 0x1172 && c <= 0x1173) ||
391         c == 0x1175 ||
392         c == 0x119e ||
393         c == 0x11a8 ||
394         c == 0x11ab ||
395         (c >= 0x11ae && c <= 0x11af) ||
396         (c >= 0x11b7 && c <= 0x11b8) ||
397         c == 0x11ba ||
398         (c >= 0x11bc && c <= 0x11c2) ||
399         c == 0x11eb ||
400         c == 0x11f0 ||
401         c == 0x11f9 ||
402         (c >= 0x1e00 && c <= 0x1e9b) ||
403         (c >= 0x1ea0 && c <= 0x1ef9) ||
404         (c >= 0x1f00 && c <= 0x1f15) ||
405         (c >= 0x1f18 && c <= 0x1f1d) ||
406         (c >= 0x1f20 && c <= 0x1f45) ||
407         (c >= 0x1f48 && c <= 0x1f4d) ||
408         (c >= 0x1f50 && c <= 0x1f57) ||
409         c == 0x1f59 ||
410         c == 0x1f5b ||
411         c == 0x1f5d ||
412         (c >= 0x1f5f && c <= 0x1f7d) ||
413         (c >= 0x1f80 && c <= 0x1fb4) ||
414         (c >= 0x1fb6 && c <= 0x1fbc) ||
415         c == 0x1fbe ||
416         (c >= 0x1fc2 && c <= 0x1fc4) ||
417         (c >= 0x1fc6 && c <= 0x1fcc) ||
418         (c >= 0x1fd0 && c <= 0x1fd3) ||
419         (c >= 0x1fd6 && c <= 0x1fdb) ||
420         (c >= 0x1fe0 && c <= 0x1fec) ||
421         (c >= 0x1ff2 && c <= 0x1ff4) ||
422         (c >= 0x1ff6 && c <= 0x1ffc) ||
423         c == 0x2126 ||
424         (c >= 0x212a && c <= 0x212b) ||
425         c == 0x212e ||
426         (c >= 0x2180 && c <= 0x2182) ||
427         (c >= 0x3041 && c <= 0x3094) ||
428         (c >= 0x30a1 && c <= 0x30fa) ||
429         (c >= 0x3105 && c <= 0x312c) ||
430         (c >= 0xac00 && c <= 0xd7a3) ||
431         (c >= 0x4e00 && c <= 0x9fa5) ||
432         c == 0x3007 ||
433         (c >= 0x3021 && c <= 0x3029) ||
434         (c >= 0x4e00 && c <= 0x9fa5) ||
435         c == 0x3007 ||
436         (c >= 0x3021 && c <= 0x3029));
437 }
438 
439 Bool IsXMLNamechar(uint c)
440 {
441     return (IsXMLLetter(c) ||
442         c == '.' || c == '_' ||
443         c == ':' || c == '-' ||
444         (c >= 0x300 && c <= 0x345) ||
445         (c >= 0x360 && c <= 0x361) ||
446         (c >= 0x483 && c <= 0x486) ||
447         (c >= 0x591 && c <= 0x5a1) ||
448         (c >= 0x5a3 && c <= 0x5b9) ||
449         (c >= 0x5bb && c <= 0x5bd) ||
450         c == 0x5bf ||
451         (c >= 0x5c1 && c <= 0x5c2) ||
452         c == 0x5c4 ||
453         (c >= 0x64b && c <= 0x652) ||
454         c == 0x670 ||
455         (c >= 0x6d6 && c <= 0x6dc) ||
456         (c >= 0x6dd && c <= 0x6df) ||
457         (c >= 0x6e0 && c <= 0x6e4) ||
458         (c >= 0x6e7 && c <= 0x6e8) ||
459         (c >= 0x6ea && c <= 0x6ed) ||
460         (c >= 0x901 && c <= 0x903) ||
461         c == 0x93c ||
462         (c >= 0x93e && c <= 0x94c) ||
463         c == 0x94d ||
464         (c >= 0x951 && c <= 0x954) ||
465         (c >= 0x962 && c <= 0x963) ||
466         (c >= 0x981 && c <= 0x983) ||
467         c == 0x9bc ||
468         c == 0x9be ||
469         c == 0x9bf ||
470         (c >= 0x9c0 && c <= 0x9c4) ||
471         (c >= 0x9c7 && c <= 0x9c8) ||
472         (c >= 0x9cb && c <= 0x9cd) ||
473         c == 0x9d7 ||
474         (c >= 0x9e2 && c <= 0x9e3) ||
475         c == 0xa02 ||
476         c == 0xa3c ||
477         c == 0xa3e ||
478         c == 0xa3f ||
479         (c >= 0xa40 && c <= 0xa42) ||
480         (c >= 0xa47 && c <= 0xa48) ||
481         (c >= 0xa4b && c <= 0xa4d) ||
482         (c >= 0xa70 && c <= 0xa71) ||
483         (c >= 0xa81 && c <= 0xa83) ||
484         c == 0xabc ||
485         (c >= 0xabe && c <= 0xac5) ||
486         (c >= 0xac7 && c <= 0xac9) ||
487         (c >= 0xacb && c <= 0xacd) ||
488         (c >= 0xb01 && c <= 0xb03) ||
489         c == 0xb3c ||
490         (c >= 0xb3e && c <= 0xb43) ||
491         (c >= 0xb47 && c <= 0xb48) ||
492         (c >= 0xb4b && c <= 0xb4d) ||
493         (c >= 0xb56 && c <= 0xb57) ||
494         (c >= 0xb82 && c <= 0xb83) ||
495         (c >= 0xbbe && c <= 0xbc2) ||
496         (c >= 0xbc6 && c <= 0xbc8) ||
497         (c >= 0xbca && c <= 0xbcd) ||
498         c == 0xbd7 ||
499         (c >= 0xc01 && c <= 0xc03) ||
500         (c >= 0xc3e && c <= 0xc44) ||
501         (c >= 0xc46 && c <= 0xc48) ||
502         (c >= 0xc4a && c <= 0xc4d) ||
503         (c >= 0xc55 && c <= 0xc56) ||
504         (c >= 0xc82 && c <= 0xc83) ||
505         (c >= 0xcbe && c <= 0xcc4) ||
506         (c >= 0xcc6 && c <= 0xcc8) ||
507         (c >= 0xcca && c <= 0xccd) ||
508         (c >= 0xcd5 && c <= 0xcd6) ||
509         (c >= 0xd02 && c <= 0xd03) ||
510         (c >= 0xd3e && c <= 0xd43) ||
511         (c >= 0xd46 && c <= 0xd48) ||
512         (c >= 0xd4a && c <= 0xd4d) ||
513         c == 0xd57 ||
514         c == 0xe31 ||
515         (c >= 0xe34 && c <= 0xe3a) ||
516         (c >= 0xe47 && c <= 0xe4e) ||
517         c == 0xeb1 ||
518         (c >= 0xeb4 && c <= 0xeb9) ||
519         (c >= 0xebb && c <= 0xebc) ||
520         (c >= 0xec8 && c <= 0xecd) ||
521         (c >= 0xf18 && c <= 0xf19) ||
522         c == 0xf35 ||
523         c == 0xf37 ||
524         c == 0xf39 ||
525         c == 0xf3e ||
526         c == 0xf3f ||
527         (c >= 0xf71 && c <= 0xf84) ||
528         (c >= 0xf86 && c <= 0xf8b) ||
529         (c >= 0xf90 && c <= 0xf95) ||
530         c == 0xf97 ||
531         (c >= 0xf99 && c <= 0xfad) ||
532         (c >= 0xfb1 && c <= 0xfb7) ||
533         c == 0xfb9 ||
534         (c >= 0x20d0 && c <= 0x20dc) ||
535         c == 0x20e1 ||
536         (c >= 0x302a && c <= 0x302f) ||
537         c == 0x3099 ||
538         c == 0x309a ||
539         (c >= 0x30 && c <= 0x39) ||
540         (c >= 0x660 && c <= 0x669) ||
541         (c >= 0x6f0 && c <= 0x6f9) ||
542         (c >= 0x966 && c <= 0x96f) ||
543         (c >= 0x9e6 && c <= 0x9ef) ||
544         (c >= 0xa66 && c <= 0xa6f) ||
545         (c >= 0xae6 && c <= 0xaef) ||
546         (c >= 0xb66 && c <= 0xb6f) ||
547         (c >= 0xbe7 && c <= 0xbef) ||
548         (c >= 0xc66 && c <= 0xc6f) ||
549         (c >= 0xce6 && c <= 0xcef) ||
550         (c >= 0xd66 && c <= 0xd6f) ||
551         (c >= 0xe50 && c <= 0xe59) ||
552         (c >= 0xed0 && c <= 0xed9) ||
553         (c >= 0xf20 && c <= 0xf29) ||
554         c == 0xb7 ||
555         c == 0x2d0 ||
556         c == 0x2d1 ||
557         c == 0x387 ||
558         c == 0x640 ||
559         c == 0xe46 ||
560         c == 0xec6 ||
561         c == 0x3005 ||
562         (c >= 0x3031 && c <= 0x3035) ||
563         (c >= 0x309d && c <= 0x309e) ||
564         (c >= 0x30fc && c <= 0x30fe));
565 }
566 
567 Bool IsLower(uint c)
568 {
569     uint map = MAP(c);
570 
571     return (map & lowercase)!=0;
572 }
573 
574 Bool IsUpper(uint c)
575 {
576     uint map = MAP(c);
577 
578     return (map & uppercase)!=0;
579 }
580 
581 uint ToLower(uint c)
582 {
583     uint map = MAP(c);
584 
585     if (map & uppercase)
586         c += 'a' - 'A';
587 
588     return c;
589 }
590 
591 uint ToUpper(uint c)
592 {
593     uint map = MAP(c);
594 
595     if (map & lowercase)
596         c += (uint) ('A' - 'a' );
597 
598     return c;
599 }
600 
601 char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps )
602 {
603     if ( !cfgBool(doc, TidyXmlTags) )
604     {
605         if ( tocaps )
606         {
607             c = (tmbchar) ToUpper(c);
608         }
609         else /* force to lower case */
610         {
611             c = (tmbchar) ToLower(c);
612         }
613     }
614     return c;
615 }
616 
617 
618 /*
619  return last character in string
620  this is useful when trailing quotemark
621  is missing on an attribute
622 */
623 static tmbchar LastChar( tmbstr str )
624 {
625     if ( str && *str )
626     {
627         int n = tmbstrlen(str);
628         return str[n-1];
629     }
630     return 0;
631 }
632 
633 /*
634    node->type is one of these:
635 
636     #define TextNode    1
637     #define StartTag    2
638     #define EndTag      3
639     #define StartEndTag 4
640 */
641 
642 Lexer* NewLexer( TidyDocImpl* doc )
643 {
644     Lexer* lexer = (Lexer*) MemAlloc( sizeof(Lexer) );
645 
646     if ( lexer != NULL )
647     {
648         ClearMemory( lexer, sizeof(Lexer) );
649 
650         lexer->lines = 1;
651         lexer->columns = 1;
652         lexer->state = LEX_CONTENT;
653 
654         lexer->versions = (VERS_ALL|VERS_PROPRIETARY);
655         lexer->doctype = VERS_UNKNOWN;
656         lexer->root = &doc->root;
657     }
658     return lexer;
659 }
660 
661 Bool EndOfInput( TidyDocImpl* doc )
662 {
663     assert( doc->docIn != NULL );
664     return ( !doc->docIn->pushed && IsEOF(doc->docIn) );
665 }
666 
667 void FreeLexer( TidyDocImpl* doc )
668 {
669     Lexer *lexer = doc->lexer;
670     if ( lexer )
671     {
672         FreeStyles( doc );
673 
674         if ( lexer->pushed )
675             FreeNode( doc, lexer->token );
676 
677         while ( lexer->istacksize > 0 )
678             PopInline( doc, NULL );
679 
680         MemFree( lexer->istack );
681         MemFree( lexer->lexbuf );
682         MemFree( lexer );
683         doc->lexer = NULL;
684     }
685 }
686 
687 /* Lexer uses bigger memory chunks than pprint as
688 ** it must hold the entire input document. not just
689 ** the last line or three.
690 */
691 void AddByte( Lexer *lexer, tmbchar ch )
692 {
693     if ( lexer->lexsize + 2 >= lexer->lexlength )
694     {
695         tmbstr buf = NULL;
696         uint allocAmt = lexer->lexlength;
697         while ( lexer->lexsize + 2 >= allocAmt )
698         {
699             if ( allocAmt == 0 )
700                 allocAmt = 8192;
701             else
702                 allocAmt *= 2;
703         }
704         buf = (tmbstr) MemRealloc( lexer->lexbuf, allocAmt );
705         if ( buf )
706         {
707           ClearMemory( buf + lexer->lexlength, 
708                        allocAmt - lexer->lexlength );
709           lexer->lexbuf = buf;
710           lexer->lexlength = allocAmt;
711         }
712     }
713 
714     lexer->lexbuf[ lexer->lexsize++ ] = ch;
715     lexer->lexbuf[ lexer->lexsize ]   = '\0';  /* debug */
716 }
717 
718 static void ChangeChar( Lexer *lexer, tmbchar c )
719 {
720     if ( lexer->lexsize > 0 )
721     {
722         lexer->lexbuf[ lexer->lexsize-1 ] = c;
723     }
724 }
725 
726 /* store character c as UTF-8 encoded byte stream */
727 void AddCharToLexer( Lexer *lexer, uint c )
728 {
729     int i, err, count = 0;
730     tmbchar buf[10] = {0};
731     
732     err = EncodeCharToUTF8Bytes( c, buf, NULL, &count );
733     if (err)
734     {
735 #if 0 && defined(_DEBUG)
736         fprintf( stderr, "lexer UTF-8 encoding error for U+%x : ", c );
737 #endif
738         /* replacement character 0xFFFD encoded as UTF-8 */
739         buf[0] = (byte) 0xEF;
740         buf[1] = (byte) 0xBF;
741         buf[2] = (byte) 0xBD;
742         count = 3;
743     }
744     
745     for ( i = 0; i < count; ++i )
746         AddByte( lexer, buf[i] );
747 }
748 
749 static void AddStringToLexer( Lexer *lexer, ctmbstr str )
750 {
751     uint c;
752 
753     /*  Many (all?) compilers will sign-extend signed chars (the default) when
754     **  converting them to unsigned integer values.  We must cast our char to
755     **  unsigned char before assigning it to prevent this from happening.
756     */
757     while( 0 != (c = (unsigned char) *str++ ))
758         AddCharToLexer( lexer, c );
759 }
760 
761 /*
762   No longer attempts to insert missing ';' for unknown
763   enitities unless one was present already, since this
764   gives unexpected results.
765 
766   For example:   <a href="something.htm?foo&bar&fred">
767   was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
768   rather than:   <a href="something.htm?foo&amp;bar&amp;fred">
769 
770   My thanks for Maurice Buxton for spotting this.
771 
772   Also Randy Waki pointed out the following case for the
773   04 Aug 00 version (bug #433012):
774   
775   For example:   <a href="something.htm?id=1&lang=en">
776   was tidied to: <a href="something.htm?id=1&lang;=en">
777   rather than:   <a href="something.htm?id=1&amp;lang=en">
778   
779   where "lang" is a known entity (#9001), but browsers would
780   misinterpret "&lang;" because it had a value > 256.
781   
782   So the case of an apparently known entity with a value > 256 and
783   missing a semicolon is handled specially.
784   
785   "ParseEntity" is also a bit of a misnomer - it handles entities and
786   numeric character references. Invalid NCR's are now reported.
787 */
788 static void ParseEntity( TidyDocImpl* doc, int mode )
789 {
790     uint start;
791     Bool first = yes, semicolon = no, found = no;
792     Bool isXml = cfgBool( doc, TidyXmlTags );
793     uint c, ch, startcol, entver = 0;
794     Lexer* lexer = doc->lexer;
795 
796     start = lexer->lexsize - 1;  /* to start at "&" */
797     startcol = doc->docIn->curcol - 1;
798 
799     while ( (c = ReadChar(doc->docIn)) != EndOfStream )
800     {
801         if ( c == ';' )
802         {
803             semicolon = yes;
804             break;
805         }
806 
807         if (first && c == '#')
808         {
809 #if SUPPORT_ASIAN_ENCODINGS
810             if ( !cfgBool(doc, TidyNCR) || 
811                  cfg(doc, TidyInCharEncoding) == BIG5 ||
812                  cfg(doc, TidyInCharEncoding) == SHIFTJIS )
813             {
814                 UngetChar('#', doc->docIn);
815                 return;
816             }
817 #endif
818             AddCharToLexer( lexer, c );
819             first = no;
820             continue;
821         }
822 
823         first = no;
824 
825         if ( IsNamechar(c) )
826         {
827             AddCharToLexer( lexer, c );
828             continue;
829         }
830 
831         /* otherwise put it back */
832 
833         UngetChar( c, doc->docIn );
834         break;
835     }
836 
837     /* make sure entity is NULL terminated */
838     lexer->lexbuf[lexer->lexsize] = '\0';
839 
840     /* Should contrain version to XML/XHTML if &apos; 
841     ** is encountered.  But this is not possible with
842     ** Tidy's content model bit mask.
843     */
844     if ( tmbstrcmp(lexer->lexbuf+start, "&apos") == 0
845          && !cfgBool(doc, TidyXmlOut)
846          && !lexer->isvoyager
847          && !cfgBool(doc, TidyXhtmlOut) )
848         ReportEntityError( doc, APOS_UNDEFINED, lexer->lexbuf+start, 39 );
849 
850     /* Lookup entity code and version
851     */
852     found = EntityInfo( lexer->lexbuf+start, isXml, &ch, &entver );
853 
854     /* deal with unrecognized or invalid entities */
855     /* #433012 - fix by Randy Waki 17 Feb 01 */
856     /* report invalid NCR's - Terry Teague 01 Sep 01 */
857     if ( !found || (ch >= 128 && ch <= 159) || (ch >= 256 && c != ';') )
858     {
859         /* set error position just before offending character */
860         lexer->lines = doc->docIn->curline;
861         lexer->columns = startcol;
862 
863         if (lexer->lexsize > start + 1)
864         {
865             if (ch >= 128 && ch <= 159)
866             {
867                 /* invalid numeric character reference */
868                 
869                 uint c1 = 0;
870                 int replaceMode = DISCARDED_CHAR;
871             
872                 if ( ReplacementCharEncoding == WIN1252 )
873                     c1 = DecodeWin1252( ch );
874                 else if ( ReplacementCharEncoding == MACROMAN )
875                     c1 = DecodeMacRoman( ch );
876 
877                 if ( c1 )
878                     replaceMode = REPLACED_CHAR;
879                 
880                 if ( c != ';' )  /* issue warning if not terminated by ';' */
881                     ReportEntityError( doc, MISSING_SEMICOLON_NCR,
882                                        lexer->lexbuf+start, c );
883  
884                 ReportEncodingError(doc, INVALID_NCR, ch, replaceMode == DISCARDED_CHAR);
885                 
886                 if ( c1 )
887                 {
888                     /* make the replacement */
889                     lexer->lexsize = start;
890                     AddCharToLexer( lexer, c1 );
891                     semicolon = no;
892                 }
893                 else
894                 {
895                     /* discard */
896                     lexer->lexsize = start;
897                     semicolon = no;
898                }
899                
900             }
901             else
902                 ReportEntityError( doc, UNKNOWN_ENTITY,
903                                    lexer->lexbuf+start, ch );
904 
905             if (semicolon)
906                 AddCharToLexer( lexer, ';' );
907         }
908         else /* naked & */
909             ReportEntityError( doc, UNESCAPED_AMPERSAND,
910                                lexer->lexbuf+start, ch );
911     }
912     else
913     {
914         if ( c != ';' )    /* issue warning if not terminated by ';' */
915         {
916             /* set error position just before offending chararcter */
917             lexer->lines = doc->docIn->curline;
918             lexer->columns = startcol;
919             ReportEntityError( doc, MISSING_SEMICOLON, lexer->lexbuf+start, c );
920         }
921 
922         lexer->lexsize = start;
923         if ( ch == 160 && (mode & Preformatted) )
924             ch = ' ';
925         AddCharToLexer( lexer, ch );
926 
927         if ( ch == '&' && !cfgBool(doc, TidyQuoteAmpersand) )
928             AddStringToLexer( lexer, "amp;" );
929 
930         /* Detect extended vs. basic entities */
931         ConstrainVersion( doc, entver );
932     }
933 }
934 
935 static tmbchar ParseTagName( TidyDocImpl* doc )
936 {
937     Lexer *lexer = doc->lexer;
938     uint c = lexer->lexbuf[ lexer->txtstart ];
939     Bool xml = cfgBool(doc, TidyXmlTags);
940 
941     /* fold case of first character in buffer */
942     if (!xml && IsUpper(c))
943         lexer->lexbuf[lexer->txtstart] = (tmbchar) ToLower(c);
944 
945     while ((c = ReadChar(doc->docIn)) != EndOfStream)
946     {
947         if ((!xml && !IsNamechar(c)) ||
948             (xml && !IsXMLNamechar(c)))
949             break;
950 
951         /* fold case of subsequent characters */
952         if (!xml && IsUpper(c))
953              c = ToLower(c);
954 
955         AddCharToLexer(lexer, c);
956     }
957 
958     lexer->txtend = lexer->lexsize;
959     return (tmbchar) c;
960 }
961 
962 /*
963   Used for elements and text nodes
964   element name is NULL for text nodes
965   start and end are offsets into lexbuf
966   which contains the textual content of
967   all elements in the parse tree.
968 
969   parent and content allow traversal
970   of the parse tree in any direction.
971   attributes are represented as a linked
972   list of AttVal nodes which hold the
973   strings for attribute/value pairs.
974 */
975 
976 
977 Node *NewNode(Lexer *lexer)
978 {
979     Node* node = (Node*) MemAlloc( sizeof(Node) );
980     ClearMemory( node, sizeof(Node) );
981     if ( lexer )
982     {
983         node->line = lexer->lines;
984         node->column = lexer->columns;
985     }
986     node->type = TextNode;
987     return node;
988 }
989 
990 /* used to clone heading nodes when split by an <HR> */
991 Node *CloneNode( TidyDocImpl* doc, Node *element )
992 {
993     Lexer* lexer = doc->lexer;
994     Node *node = NewNode( lexer );
995 
996     node->start = lexer->lexsize;
997     node->end   = lexer->lexsize;
998 
999     if ( element )
1000     {
1001         node->parent     = element->parent;
1002         node->type       = element->type;
1003         node->closed     = element->closed;
1004         node->implicit   = element->implicit;
1005         node->tag        = element->tag;
1006         node->element    = tmbstrdup( element->element );
1007         node->attributes = DupAttrs( doc, element->attributes );
1008     }
1009     return node;
1010 }
1011 
1012 /* free node's attributes */
1013 void FreeAttrs( TidyDocImpl* doc, Node *node )
1014 {
1015 
1016     while ( node->attributes )
1017     {
1018         AttVal *av = node->attributes;
1019 
1020         if ( av->attribute )
1021         {
1022             if ( (attrIsID(av) || attrIsNAME(av)) &&
1023                  IsAnchorElement(doc, node) )
1024             {
1025                 RemoveAnchorByNode( doc, node );
1026             }
1027         }
1028 
1029         node->attributes = av->next;
1030         FreeAttribute( doc, av );
1031     }
1032 }
1033 
1034 /* doesn't repair attribute list linkage */
1035 void FreeAttribute( TidyDocImpl* doc, AttVal *av )
1036 {
1037     FreeNode( doc, av->asp );
1038     FreeNode( doc, av->php );
1039     MemFree( av->attribute );
1040     MemFree( av->value );
1041     MemFree( av );
1042 }
1043 
1044 /* detach attribute from node
1045 */
1046 void DetachAttribute( Node *node, AttVal *attr )
1047 {
1048     AttVal *av, *prev = NULL;
1049 
1050     for ( av = node->attributes; av; av = av->next )
1051     {
1052         if ( av == attr )
1053         {
1054             if ( prev )
1055                 prev->next = attr->next;
1056             else
1057                 node->attributes = attr->next;
1058             break;
1059         }
1060         prev = av;
1061     }
1062 }
1063 
1064 /* detach attribute from node then free it
1065 */
1066 void RemoveAttribute( TidyDocImpl* doc, Node *node, AttVal *attr )
1067 {
1068     DetachAttribute( node, attr );
1069     FreeAttribute( doc, attr );
1070 }
1071 
1072 /*
1073   Free document nodes by iterating through peers and recursing
1074   through children. Set next to NULL before calling FreeNode()
1075   to avoid freeing peer nodes. Doesn't patch up prev/next links.
1076  */
1077 void FreeNode( TidyDocImpl* doc, Node *node )
1078 {
1079     while ( node )
1080     {
1081         Node* next = node->next;
1082 
1083         FreeAttrs( doc, node );
1084         FreeNode( doc, node->content );
1085         MemFree( node->element );
1086 #ifdef TIDY_STORE_ORIGINAL_TEXT
1087         if (node->otext)
1088             MemFree(node->otext);
1089 #endif
1090         if (RootNode != node->type)
1091             MemFree( node );
1092         else
1093             node->content = NULL;
1094 
1095         node = next;
1096     }
1097 }
1098 
#ifdef TIDY_STORE_ORIGINAL_TEXT
/*
  Moves captured original input text from the input stream's
  otextbuf into node->otext.

  Nothing happens when doc->storeText is off, when count is not
  smaller than the number of captured characters, or when the
  capture buffer has zero size.

  count == 0: the whole buffer is handed over to the node and the
              stream's capture buffer is reset to empty.
  count  > 0: the buffer is split; everything except the last
              count characters goes to the node, the last count
              characters stay with the stream in a new buffer.
*/
void StoreOriginalTextInToken(TidyDocImpl* doc, Node* node, uint count)
{
    if ( !doc->storeText ||
         count >= doc->docIn->otextlen ||
         !doc->docIn->otextsize )
        return;

    if ( count != 0 )
    {
        uint total = doc->docIn->otextlen;
        uint split = total - count;   /* characters that go to the node */
        tmbstr head = (tmbstr)MemAlloc(split + 1);
        tmbstr tail = (tmbstr)MemAlloc(count + 1);
        uint k;

        for (k = 0; k < split; ++k)
            head[k] = doc->docIn->otextbuf[k];
        head[split] = 0;

        for (k = 0; k < count; ++k)
            tail[k] = doc->docIn->otextbuf[split + k];
        tail[count] = 0;

        MemFree(doc->docIn->otextbuf);
        node->otext = head;
        doc->docIn->otextbuf = tail;
        doc->docIn->otextlen = count;
        doc->docIn->otextsize = count + 1;
        return;
    }

    /* take over the complete buffer without copying */
    node->otext = doc->docIn->otextbuf;
    doc->docIn->otextbuf = NULL;
    doc->docIn->otextlen = 0;
    doc->docIn->otextsize = 0;
}
#endif
1145 
1146 Node* TextToken( Lexer *lexer )
1147 {
1148     Node *node = NewNode( lexer );
1149     node->start = lexer->txtstart;
1150     node->end = lexer->txtend;
1151     return node;
1152 }
1153 
1154 /* used for creating preformatted text from Word2000 */
1155 Node *NewLineNode( Lexer *lexer )
1156 {
1157     Node *node = NewNode( lexer );
1158     node->start = lexer->lexsize;
1159     AddCharToLexer( lexer, (uint)'\n' );
1160     node->end = lexer->lexsize;
1161     return node;
1162 }
1163 
1164 /* used for adding a &nbsp; for Word2000 */
1165 Node* NewLiteralTextNode( Lexer *lexer, ctmbstr txt )
1166 {
1167     Node *node = NewNode( lexer );
1168     node->start = lexer->lexsize;
1169     AddStringToLexer( lexer, txt );
1170     node->end = lexer->lexsize;
1171     return node;
1172 }
1173 
1174 static Node* TagToken( TidyDocImpl* doc, NodeType type )
1175 {
1176     Lexer* lexer = doc->lexer;
1177     Node* node = NewNode( lexer );
1178     node->type = type;
1179     node->element = tmbstrndup( lexer->lexbuf + lexer->txtstart,
1180                                 lexer->txtend - lexer->txtstart );
1181     node->start = lexer->txtstart;
1182     node->end = lexer->txtstart;
1183 
1184     if ( type == StartTag || type == StartEndTag || type == EndTag )
1185         FindTag(doc, node);
1186 
1187     return node;
1188 }
1189 
1190 static Node* NewToken(TidyDocImpl* doc, NodeType type)
1191 {
1192     Lexer* lexer = doc->lexer;
1193     Node* node = NewNode(lexer);
1194     node->type = type;
1195     node->start = lexer->txtstart;
1196     node->end = lexer->txtend;
1197 #ifdef TIDY_STORE_ORIGINAL_TEXT
1198     StoreOriginalTextInToken(doc, node, 0);
1199 #endif
1200     return node;
1201 }
1202 
/* Shorthands that create a token of a fixed node type via NewToken() */
#define CommentToken(doc)  NewToken(doc, CommentTag)
#define DocTypeToken(doc)  NewToken(doc, DocTypeTag)
#define PIToken(doc)       NewToken(doc, ProcInsTag)
#define AspToken(doc)      NewToken(doc, AspTag)
#define JsteToken(doc)     NewToken(doc, JsteTag)
#define PhpToken(doc)      NewToken(doc, PhpTag)
#define XmlDeclToken(doc)  NewToken(doc, XmlDecl)
#define SectionToken(doc)  NewToken(doc, SectionTag)
#define CDATAToken(doc)    NewToken(doc, CDATATag)
1212 
1213 void AddStringLiteral( Lexer* lexer, ctmbstr str )
1214 {
1215     byte c;
1216     while(0 != (c = *str++) )
1217         AddCharToLexer( lexer, c );
1218 }
1219 
1220 void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len )
1221 {
1222     byte c;
1223     int ix;
1224 
1225     for ( ix=0; ix < len && (c = *str++); ++ix )
1226         AddCharToLexer(lexer, c);
1227 }
1228 
1229 /* find doctype element */
1230 Node *FindDocType( TidyDocImpl* doc )
1231 {
1232     Node* node;
1233     for ( node = (doc ? doc->root.content : NULL);
1234           node && node->type != DocTypeTag; 
1235           node = node->next )
1236         /**/;
1237     return node;
1238 }
1239 
1240 /* find parent container element */
1241 Node* FindContainer( Node* node )
1242 {
1243     for ( node = (node ? node->parent : NULL);
1244           node && nodeHasCM(node, CM_INLINE);
1245           node = node->parent )
1246         /**/;
1247 
1248     return node;
1249 }
1250 
1251 
1252 /* find html element */
1253 Node *FindHTML( TidyDocImpl* doc )
1254 {
1255     Node *node;
1256     for ( node = (doc ? doc->root.content : NULL);
1257           node && !nodeIsHTML(node); 
1258           node = node->next )
1259         /**/;
1260 
1261     return node;
1262 }
1263 
1264 /* find XML Declaration */
1265 Node *FindXmlDecl(TidyDocImpl* doc)
1266 {
1267     Node *node;
1268     for ( node = (doc ? doc->root.content : NULL);
1269           node && !(node->type == XmlDecl);
1270           node = node->next )
1271         /**/;
1272 
1273     return node;
1274 }
1275 
1276 
1277 Node *FindHEAD( TidyDocImpl* doc )
1278 {
1279     Node *node = FindHTML( doc );
1280 
1281     if ( node )
1282     {
1283         for ( node = node->content;
1284               node && !nodeIsHEAD(node); 
1285               node = node->next )
1286             /**/;
1287     }
1288 
1289     return node;
1290 }
1291 
1292 Node *FindTITLE(TidyDocImpl* doc)
1293 {
1294     Node *node = FindHEAD(doc);
1295 
1296     if (node)
1297         for (node = node->content;
1298              node && !nodeIsTITLE(node);
1299              node = node->next) {}
1300 
1301     return node;
1302 }
1303 
1304 Node *FindBody( TidyDocImpl* doc )
1305 {
1306     Node *node = ( doc ? doc->root.content : NULL );
1307 
1308     while ( node && !nodeIsHTML(node) )
1309         node = node->next;
1310 
1311     if (node == NULL)
1312         return NULL;
1313 
1314     node = node->content;
1315     while ( node && !nodeIsBODY(node) && !nodeIsFRAMESET(node) )
1316         node = node->next;
1317 
1318     if ( node && nodeIsFRAMESET(node) )
1319     {
1320         node = node->content;
1321         while ( node && !nodeIsNOFRAMES(node) )
1322             node = node->next;
1323 
1324         if ( node )
1325         {
1326             node = node->content;
1327             while ( node && !nodeIsBODY(node) )
1328                 node = node->next;
1329         }
1330     }
1331 
1332     return node;
1333 }
1334 
1335 /* add meta element for Tidy */
1336 Bool AddGenerator( TidyDocImpl* doc )
1337 {
1338     AttVal *attval;
1339     Node *node;
1340     Node *head = FindHEAD( doc );
1341     tmbchar buf[256];
1342     
1343     if (head)
1344     {
1345 #ifdef PLATFORM_NAME
1346         tmbsnprintf(buf, sizeof(buf), "HTML Tidy for "PLATFORM_NAME" (vers %s), see www.w3.org",
1347                  tidyReleaseDate());
1348 #else
1349         tmbsnprintf(buf, sizeof(buf), "HTML Tidy (vers %s), see www.w3.org", tidyReleaseDate());
1350 #endif
1351 
1352         for ( node = head->content; node; node = node->next )
1353         {
1354             if ( nodeIsMETA(node) )
1355             {