Version:
~ [ 1.0 ] ~
1 /* lexer.c -- Lexer for html parser
2
3 (c) 1998-2005 (W3C) MIT, ERCIM, Keio University
4 See tidy.h for the copyright notice.
5
6 CVS Info :
7
8 $Author: arnaud02 $
9 $Date: 2005/10/13 12:11:01 $
10 $Revision: 1.173 $
11
12 */
13
14 /*
15 Given a file stream fp it returns a sequence of tokens.
16
17 GetToken(fp) gets the next token
18 UngetToken(fp) provides one level undo
19
20 The tags include an attribute list:
21
22 - linked list of attribute/value nodes
23 - each node has 2 NULL-terminated strings.
24 - entities are replaced in attribute values
25
26 white space is compacted if not in preformatted mode
27 If not in preformatted mode then leading white space
28 is discarded and subsequent white space sequences
29 compacted to single space characters.
30
31 If XmlTags is no then Tag names are folded to upper
32 case and attribute names to lower case.
33
34 Not yet done:
35 - Doctype subset and marked sections
36 */
37
38 #include "tidy-int.h"
39 #include "lexer.h"
40 #include "parser.h"
41 #include "entities.h"
42 #include "streamio.h"
43 #include "message.h"
44 #include "tmbstr.h"
45 #include "clean.h"
46 #include "utf8.h"
47 #include "streamio.h"
48
49 /* Forward references
50 */
51 /* swallows closing '>' */
52 static AttVal *ParseAttrs( TidyDocImpl* doc, Bool *isempty );
53
54 static tmbstr ParseAttribute( TidyDocImpl* doc, Bool* isempty,
55 Node **asp, Node **php );
56
57 static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, Bool foldCase,
58 Bool *isempty, int *pdelim );
59
60 static Node *ParseDocTypeDecl(TidyDocImpl* doc);
61
62 static void AddAttrToList( AttVal** list, AttVal* av );
63
64 /* used to classify characters for lexical purposes */
65 #define MAP(c) ((unsigned)c < 128 ? lexmap[(unsigned)c] : 0)
66 static uint lexmap[128];
67
68 #define IsValidXMLAttrName(name) IsValidXMLID(name)
69 #define IsValidXMLElemName(name) IsValidXMLID(name)
70
71 static struct _doctypes
72 {
73 uint score;
74 uint vers;
75 ctmbstr name;
76 ctmbstr fpi;
77 ctmbstr si;
78 } const W3C_Doctypes[] =
79 {
80 { 2, HT20, "HTML 2.0", "-//IETF//DTD HTML 2.0//EN", NULL, },
81 { 2, HT20, "HTML 2.0", "-//IETF//DTD HTML//EN", NULL, },
82 { 2, HT20, "HTML 2.0", "-//W3C//DTD HTML 2.0//EN", NULL, },
83 { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2//EN", NULL, },
84 { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Final//EN", NULL, },
85 { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Draft//EN", NULL, },
86 { 6, H40S, "HTML 4.0 Strict", "-//W3C//DTD HTML 4.0//EN", "http://www.w3.org/TR/REC-html40/strict.dtd" },
87 { 8, H40T, "HTML 4.0 Transitional", "-//W3C//DTD HTML 4.0 Transitional//EN", "http://www.w3.org/TR/REC-html40/loose.dtd" },
88 { 7, H40F, "HTML 4.0 Frameset", "-//W3C//DTD HTML 4.0 Frameset//EN", "http://www.w3.org/TR/REC-html40/frameset.dtd" },
89 { 3, H41S, "HTML 4.01 Strict", "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd" },
90 { 5, H41T, "HTML 4.01 Transitional", "-//W3C//DTD HTML 4.01 Transitional//EN", "http://www.w3.org/TR/html4/loose.dtd" },
91 { 4, H41F, "HTML 4.01 Frameset", "-//W3C//DTD HTML 4.01 Frameset//EN", "http://www.w3.org/TR/html4/frameset.dtd" },
92 { 9, X10S, "XHTML 1.0 Strict", "-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" },
93 { 11, X10T, "XHTML 1.0 Transitional", "-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" },
94 { 10, X10F, "XHTML 1.0 Frameset", "-//W3C//DTD XHTML 1.0 Frameset//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd" },
95 { 12, XH11, "XHTML 1.1", "-//W3C//DTD XHTML 1.1//EN", "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd" },
96 { 13, XB10, "XHTML Basic 1.0", "-//W3C//DTD XHTML Basic 1.0//EN", "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd" },
97
98 /* reminder to add XHTML Print 1.0 support, see http://www.w3.org/TR/xhtml-print */
99 #if 0
100 { 14, XP10, "XHTML Print 1.0", "-//W3C//DTD XHTML-Print 1.0//EN", "http://www.w3.org/MarkUp/DTD/xhtml-print10.dtd" },
101 { 14, XP10, "XHTML Print 1.0", "-//PWG//DTD XHTML-Print 1.0//EN", "http://www.xhtml-print.org/xhtml-print/xhtml-print10.dtd" },
102 #endif
103 /* final entry */
104 { 0, 0, NULL, NULL, NULL }
105 };
106
107 int HTMLVersion(TidyDocImpl* doc)
108 {
109 uint i;
110 uint j = 0;
111 uint score = 0;
112 uint vers = doc->lexer->versions;
113 uint dtver = doc->lexer->doctype;
114 TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
115 Bool xhtml = (cfgBool(doc, TidyXmlOut) || doc->lexer->isvoyager) &&
116 !cfgBool(doc, TidyHtmlOut);
117 Bool html4 = dtmode == TidyDoctypeStrict || dtmode == TidyDoctypeLoose || VERS_FROM40 & dtver;
118
119 for (i = 0; W3C_Doctypes[i].name; ++i)
120 {
121 if ((xhtml && !(VERS_XHTML & W3C_Doctypes[i].vers)) ||
122 (html4 && !(VERS_FROM40 & W3C_Doctypes[i].vers)))
123 continue;
124
125 if (vers & W3C_Doctypes[i].vers &&
126 (W3C_Doctypes[i].score < score || !score))
127 {
128 score = W3C_Doctypes[i].score;
129 j = i;
130 }
131 }
132
133 if (score)
134 return W3C_Doctypes[j].vers;
135
136 return VERS_UNKNOWN;
137 }
138
139 ctmbstr GetFPIFromVers(uint vers)
140 {
141 uint i;
142
143 for (i = 0; W3C_Doctypes[i].name; ++i)
144 if (W3C_Doctypes[i].vers == vers)
145 return W3C_Doctypes[i].fpi;
146
147 return NULL;
148 }
149
150 static ctmbstr GetSIFromVers(uint vers)
151 {
152 uint i;
153
154 for (i = 0; W3C_Doctypes[i].name; ++i)
155 if (W3C_Doctypes[i].vers == vers)
156 return W3C_Doctypes[i].si;
157
158 return NULL;
159 }
160
161 static ctmbstr GetNameFromVers(uint vers)
162 {
163 uint i;
164
165 for (i = 0; W3C_Doctypes[i].name; ++i)
166 if (W3C_Doctypes[i].vers == vers)
167 return W3C_Doctypes[i].name;
168
169 return NULL;
170 }
171
172 static uint GetVersFromFPI(ctmbstr fpi)
173 {
174 uint i;
175
176 for (i = 0; W3C_Doctypes[i].name; ++i)
177 if (tmbstrcasecmp(W3C_Doctypes[i].fpi, fpi) == 0)
178 return W3C_Doctypes[i].vers;
179
180 return 0;
181 }
182
183 /* everything is allowed in proprietary version of HTML */
184 /* this is handled here rather than in the tag/attr dicts */
185 void ConstrainVersion(TidyDocImpl* doc, uint vers)
186 {
187 doc->lexer->versions &= (vers | VERS_PROPRIETARY);
188 }
189
190 Bool IsWhite(uint c)
191 {
192 uint map = MAP(c);
193
194 return (map & white)!=0;
195 }
196
197 Bool IsNewline(uint c)
198 {
199 uint map = MAP(c);
200 return (map & newline)!=0;
201 }
202
203 Bool IsDigit(uint c)
204 {
205 uint map;
206
207 map = MAP(c);
208
209 return (map & digit)!=0;
210 }
211
212 Bool IsLetter(uint c)
213 {
214 uint map;
215
216 map = MAP(c);
217
218 return (map & letter)!=0;
219 }
220
221 Bool IsNamechar(uint c)
222 {
223 uint map = MAP(c);
224 return (map & namechar)!=0;
225 }
226
227 Bool IsXMLLetter(uint c)
228 {
229 return ((c >= 0x41 && c <= 0x5a) ||
230 (c >= 0x61 && c <= 0x7a) ||
231 (c >= 0xc0 && c <= 0xd6) ||
232 (c >= 0xd8 && c <= 0xf6) ||
233 (c >= 0xf8 && c <= 0xff) ||
234 (c >= 0x100 && c <= 0x131) ||
235 (c >= 0x134 && c <= 0x13e) ||
236 (c >= 0x141 && c <= 0x148) ||
237 (c >= 0x14a && c <= 0x17e) ||
238 (c >= 0x180 && c <= 0x1c3) ||
239 (c >= 0x1cd && c <= 0x1f0) ||
240 (c >= 0x1f4 && c <= 0x1f5) ||
241 (c >= 0x1fa && c <= 0x217) ||
242 (c >= 0x250 && c <= 0x2a8) ||
243 (c >= 0x2bb && c <= 0x2c1) ||
244 c == 0x386 ||
245 (c >= 0x388 && c <= 0x38a) ||
246 c == 0x38c ||
247 (c >= 0x38e && c <= 0x3a1) ||
248 (c >= 0x3a3 && c <= 0x3ce) ||
249 (c >= 0x3d0 && c <= 0x3d6) ||
250 c == 0x3da ||
251 c == 0x3dc ||
252 c == 0x3de ||
253 c == 0x3e0 ||
254 (c >= 0x3e2 && c <= 0x3f3) ||
255 (c >= 0x401 && c <= 0x40c) ||
256 (c >= 0x40e && c <= 0x44f) ||
257 (c >= 0x451 && c <= 0x45c) ||
258 (c >= 0x45e && c <= 0x481) ||
259 (c >= 0x490 && c <= 0x4c4) ||
260 (c >= 0x4c7 && c <= 0x4c8) ||
261 (c >= 0x4cb && c <= 0x4cc) ||
262 (c >= 0x4d0 && c <= 0x4eb) ||
263 (c >= 0x4ee && c <= 0x4f5) ||
264 (c >= 0x4f8 && c <= 0x4f9) ||
265 (c >= 0x531 && c <= 0x556) ||
266 c == 0x559 ||
267 (c >= 0x561 && c <= 0x586) ||
268 (c >= 0x5d0 && c <= 0x5ea) ||
269 (c >= 0x5f0 && c <= 0x5f2) ||
270 (c >= 0x621 && c <= 0x63a) ||
271 (c >= 0x641 && c <= 0x64a) ||
272 (c >= 0x671 && c <= 0x6b7) ||
273 (c >= 0x6ba && c <= 0x6be) ||
274 (c >= 0x6c0 && c <= 0x6ce) ||
275 (c >= 0x6d0 && c <= 0x6d3) ||
276 c == 0x6d5 ||
277 (c >= 0x6e5 && c <= 0x6e6) ||
278 (c >= 0x905 && c <= 0x939) ||
279 c == 0x93d ||
280 (c >= 0x958 && c <= 0x961) ||
281 (c >= 0x985 && c <= 0x98c) ||
282 (c >= 0x98f && c <= 0x990) ||
283 (c >= 0x993 && c <= 0x9a8) ||
284 (c >= 0x9aa && c <= 0x9b0) ||
285 c == 0x9b2 ||
286 (c >= 0x9b6 && c <= 0x9b9) ||
287 (c >= 0x9dc && c <= 0x9dd) ||
288 (c >= 0x9df && c <= 0x9e1) ||
289 (c >= 0x9f0 && c <= 0x9f1) ||
290 (c >= 0xa05 && c <= 0xa0a) ||
291 (c >= 0xa0f && c <= 0xa10) ||
292 (c >= 0xa13 && c <= 0xa28) ||
293 (c >= 0xa2a && c <= 0xa30) ||
294 (c >= 0xa32 && c <= 0xa33) ||
295 (c >= 0xa35 && c <= 0xa36) ||
296 (c >= 0xa38 && c <= 0xa39) ||
297 (c >= 0xa59 && c <= 0xa5c) ||
298 c == 0xa5e ||
299 (c >= 0xa72 && c <= 0xa74) ||
300 (c >= 0xa85 && c <= 0xa8b) ||
301 c == 0xa8d ||
302 (c >= 0xa8f && c <= 0xa91) ||
303 (c >= 0xa93 && c <= 0xaa8) ||
304 (c >= 0xaaa && c <= 0xab0) ||
305 (c >= 0xab2 && c <= 0xab3) ||
306 (c >= 0xab5 && c <= 0xab9) ||
307 c == 0xabd ||
308 c == 0xae0 ||
309 (c >= 0xb05 && c <= 0xb0c) ||
310 (c >= 0xb0f && c <= 0xb10) ||
311 (c >= 0xb13 && c <= 0xb28) ||
312 (c >= 0xb2a && c <= 0xb30) ||
313 (c >= 0xb32 && c <= 0xb33) ||
314 (c >= 0xb36 && c <= 0xb39) ||
315 c == 0xb3d ||
316 (c >= 0xb5c && c <= 0xb5d) ||
317 (c >= 0xb5f && c <= 0xb61) ||
318 (c >= 0xb85 && c <= 0xb8a) ||
319 (c >= 0xb8e && c <= 0xb90) ||
320 (c >= 0xb92 && c <= 0xb95) ||
321 (c >= 0xb99 && c <= 0xb9a) ||
322 c == 0xb9c ||
323 (c >= 0xb9e && c <= 0xb9f) ||
324 (c >= 0xba3 && c <= 0xba4) ||
325 (c >= 0xba8 && c <= 0xbaa) ||
326 (c >= 0xbae && c <= 0xbb5) ||
327 (c >= 0xbb7 && c <= 0xbb9) ||
328 (c >= 0xc05 && c <= 0xc0c) ||
329 (c >= 0xc0e && c <= 0xc10) ||
330 (c >= 0xc12 && c <= 0xc28) ||
331 (c >= 0xc2a && c <= 0xc33) ||
332 (c >= 0xc35 && c <= 0xc39) ||
333 (c >= 0xc60 && c <= 0xc61) ||
334 (c >= 0xc85 && c <= 0xc8c) ||
335 (c >= 0xc8e && c <= 0xc90) ||
336 (c >= 0xc92 && c <= 0xca8) ||
337 (c >= 0xcaa && c <= 0xcb3) ||
338 (c >= 0xcb5 && c <= 0xcb9) ||
339 c == 0xcde ||
340 (c >= 0xce0 && c <= 0xce1) ||
341 (c >= 0xd05 && c <= 0xd0c) ||
342 (c >= 0xd0e && c <= 0xd10) ||
343 (c >= 0xd12 && c <= 0xd28) ||
344 (c >= 0xd2a && c <= 0xd39) ||
345 (c >= 0xd60 && c <= 0xd61) ||
346 (c >= 0xe01 && c <= 0xe2e) ||
347 c == 0xe30 ||
348 (c >= 0xe32 && c <= 0xe33) ||
349 (c >= 0xe40 && c <= 0xe45) ||
350 (c >= 0xe81 && c <= 0xe82) ||
351 c == 0xe84 ||
352 (c >= 0xe87 && c <= 0xe88) ||
353 c == 0xe8a ||
354 c == 0xe8d ||
355 (c >= 0xe94 && c <= 0xe97) ||
356 (c >= 0xe99 && c <= 0xe9f) ||
357 (c >= 0xea1 && c <= 0xea3) ||
358 c == 0xea5 ||
359 c == 0xea7 ||
360 (c >= 0xeaa && c <= 0xeab) ||
361 (c >= 0xead && c <= 0xeae) ||
362 c == 0xeb0 ||
363 (c >= 0xeb2 && c <= 0xeb3) ||
364 c == 0xebd ||
365 (c >= 0xec0 && c <= 0xec4) ||
366 (c >= 0xf40 && c <= 0xf47) ||
367 (c >= 0xf49 && c <= 0xf69) ||
368 (c >= 0x10a0 && c <= 0x10c5) ||
369 (c >= 0x10d0 && c <= 0x10f6) ||
370 c == 0x1100 ||
371 (c >= 0x1102 && c <= 0x1103) ||
372 (c >= 0x1105 && c <= 0x1107) ||
373 c == 0x1109 ||
374 (c >= 0x110b && c <= 0x110c) ||
375 (c >= 0x110e && c <= 0x1112) ||
376 c == 0x113c ||
377 c == 0x113e ||
378 c == 0x1140 ||
379 c == 0x114c ||
380 c == 0x114e ||
381 c == 0x1150 ||
382 (c >= 0x1154 && c <= 0x1155) ||
383 c == 0x1159 ||
384 (c >= 0x115f && c <= 0x1161) ||
385 c == 0x1163 ||
386 c == 0x1165 ||
387 c == 0x1167 ||
388 c == 0x1169 ||
389 (c >= 0x116d && c <= 0x116e) ||
390 (c >= 0x1172 && c <= 0x1173) ||
391 c == 0x1175 ||
392 c == 0x119e ||
393 c == 0x11a8 ||
394 c == 0x11ab ||
395 (c >= 0x11ae && c <= 0x11af) ||
396 (c >= 0x11b7 && c <= 0x11b8) ||
397 c == 0x11ba ||
398 (c >= 0x11bc && c <= 0x11c2) ||
399 c == 0x11eb ||
400 c == 0x11f0 ||
401 c == 0x11f9 ||
402 (c >= 0x1e00 && c <= 0x1e9b) ||
403 (c >= 0x1ea0 && c <= 0x1ef9) ||
404 (c >= 0x1f00 && c <= 0x1f15) ||
405 (c >= 0x1f18 && c <= 0x1f1d) ||
406 (c >= 0x1f20 && c <= 0x1f45) ||
407 (c >= 0x1f48 && c <= 0x1f4d) ||
408 (c >= 0x1f50 && c <= 0x1f57) ||
409 c == 0x1f59 ||
410 c == 0x1f5b ||
411 c == 0x1f5d ||
412 (c >= 0x1f5f && c <= 0x1f7d) ||
413 (c >= 0x1f80 && c <= 0x1fb4) ||
414 (c >= 0x1fb6 && c <= 0x1fbc) ||
415 c == 0x1fbe ||
416 (c >= 0x1fc2 && c <= 0x1fc4) ||
417 (c >= 0x1fc6 && c <= 0x1fcc) ||
418 (c >= 0x1fd0 && c <= 0x1fd3) ||
419 (c >= 0x1fd6 && c <= 0x1fdb) ||
420 (c >= 0x1fe0 && c <= 0x1fec) ||
421 (c >= 0x1ff2 && c <= 0x1ff4) ||
422 (c >= 0x1ff6 && c <= 0x1ffc) ||
423 c == 0x2126 ||
424 (c >= 0x212a && c <= 0x212b) ||
425 c == 0x212e ||
426 (c >= 0x2180 && c <= 0x2182) ||
427 (c >= 0x3041 && c <= 0x3094) ||
428 (c >= 0x30a1 && c <= 0x30fa) ||
429 (c >= 0x3105 && c <= 0x312c) ||
430 (c >= 0xac00 && c <= 0xd7a3) ||
431 (c >= 0x4e00 && c <= 0x9fa5) ||
432 c == 0x3007 ||
433 (c >= 0x3021 && c <= 0x3029) ||
434 (c >= 0x4e00 && c <= 0x9fa5) ||
435 c == 0x3007 ||
436 (c >= 0x3021 && c <= 0x3029));
437 }
438
439 Bool IsXMLNamechar(uint c)
440 {
441 return (IsXMLLetter(c) ||
442 c == '.' || c == '_' ||
443 c == ':' || c == '-' ||
444 (c >= 0x300 && c <= 0x345) ||
445 (c >= 0x360 && c <= 0x361) ||
446 (c >= 0x483 && c <= 0x486) ||
447 (c >= 0x591 && c <= 0x5a1) ||
448 (c >= 0x5a3 && c <= 0x5b9) ||
449 (c >= 0x5bb && c <= 0x5bd) ||
450 c == 0x5bf ||
451 (c >= 0x5c1 && c <= 0x5c2) ||
452 c == 0x5c4 ||
453 (c >= 0x64b && c <= 0x652) ||
454 c == 0x670 ||
455 (c >= 0x6d6 && c <= 0x6dc) ||
456 (c >= 0x6dd && c <= 0x6df) ||
457 (c >= 0x6e0 && c <= 0x6e4) ||
458 (c >= 0x6e7 && c <= 0x6e8) ||
459 (c >= 0x6ea && c <= 0x6ed) ||
460 (c >= 0x901 && c <= 0x903) ||
461 c == 0x93c ||
462 (c >= 0x93e && c <= 0x94c) ||
463 c == 0x94d ||
464 (c >= 0x951 && c <= 0x954) ||
465 (c >= 0x962 && c <= 0x963) ||
466 (c >= 0x981 && c <= 0x983) ||
467 c == 0x9bc ||
468 c == 0x9be ||
469 c == 0x9bf ||
470 (c >= 0x9c0 && c <= 0x9c4) ||
471 (c >= 0x9c7 && c <= 0x9c8) ||
472 (c >= 0x9cb && c <= 0x9cd) ||
473 c == 0x9d7 ||
474 (c >= 0x9e2 && c <= 0x9e3) ||
475 c == 0xa02 ||
476 c == 0xa3c ||
477 c == 0xa3e ||
478 c == 0xa3f ||
479 (c >= 0xa40 && c <= 0xa42) ||
480 (c >= 0xa47 && c <= 0xa48) ||
481 (c >= 0xa4b && c <= 0xa4d) ||
482 (c >= 0xa70 && c <= 0xa71) ||
483 (c >= 0xa81 && c <= 0xa83) ||
484 c == 0xabc ||
485 (c >= 0xabe && c <= 0xac5) ||
486 (c >= 0xac7 && c <= 0xac9) ||
487 (c >= 0xacb && c <= 0xacd) ||
488 (c >= 0xb01 && c <= 0xb03) ||
489 c == 0xb3c ||
490 (c >= 0xb3e && c <= 0xb43) ||
491 (c >= 0xb47 && c <= 0xb48) ||
492 (c >= 0xb4b && c <= 0xb4d) ||
493 (c >= 0xb56 && c <= 0xb57) ||
494 (c >= 0xb82 && c <= 0xb83) ||
495 (c >= 0xbbe && c <= 0xbc2) ||
496 (c >= 0xbc6 && c <= 0xbc8) ||
497 (c >= 0xbca && c <= 0xbcd) ||
498 c == 0xbd7 ||
499 (c >= 0xc01 && c <= 0xc03) ||
500 (c >= 0xc3e && c <= 0xc44) ||
501 (c >= 0xc46 && c <= 0xc48) ||
502 (c >= 0xc4a && c <= 0xc4d) ||
503 (c >= 0xc55 && c <= 0xc56) ||
504 (c >= 0xc82 && c <= 0xc83) ||
505 (c >= 0xcbe && c <= 0xcc4) ||
506 (c >= 0xcc6 && c <= 0xcc8) ||
507 (c >= 0xcca && c <= 0xccd) ||
508 (c >= 0xcd5 && c <= 0xcd6) ||
509 (c >= 0xd02 && c <= 0xd03) ||
510 (c >= 0xd3e && c <= 0xd43) ||
511 (c >= 0xd46 && c <= 0xd48) ||
512 (c >= 0xd4a && c <= 0xd4d) ||
513 c == 0xd57 ||
514 c == 0xe31 ||
515 (c >= 0xe34 && c <= 0xe3a) ||
516 (c >= 0xe47 && c <= 0xe4e) ||
517 c == 0xeb1 ||
518 (c >= 0xeb4 && c <= 0xeb9) ||
519 (c >= 0xebb && c <= 0xebc) ||
520 (c >= 0xec8 && c <= 0xecd) ||
521 (c >= 0xf18 && c <= 0xf19) ||
522 c == 0xf35 ||
523 c == 0xf37 ||
524 c == 0xf39 ||
525 c == 0xf3e ||
526 c == 0xf3f ||
527 (c >= 0xf71 && c <= 0xf84) ||
528 (c >= 0xf86 && c <= 0xf8b) ||
529 (c >= 0xf90 && c <= 0xf95) ||
530 c == 0xf97 ||
531 (c >= 0xf99 && c <= 0xfad) ||
532 (c >= 0xfb1 && c <= 0xfb7) ||
533 c == 0xfb9 ||
534 (c >= 0x20d0 && c <= 0x20dc) ||
535 c == 0x20e1 ||
536 (c >= 0x302a && c <= 0x302f) ||
537 c == 0x3099 ||
538 c == 0x309a ||
539 (c >= 0x30 && c <= 0x39) ||
540 (c >= 0x660 && c <= 0x669) ||
541 (c >= 0x6f0 && c <= 0x6f9) ||
542 (c >= 0x966 && c <= 0x96f) ||
543 (c >= 0x9e6 && c <= 0x9ef) ||
544 (c >= 0xa66 && c <= 0xa6f) ||
545 (c >= 0xae6 && c <= 0xaef) ||
546 (c >= 0xb66 && c <= 0xb6f) ||
547 (c >= 0xbe7 && c <= 0xbef) ||
548 (c >= 0xc66 && c <= 0xc6f) ||
549 (c >= 0xce6 && c <= 0xcef) ||
550 (c >= 0xd66 && c <= 0xd6f) ||
551 (c >= 0xe50 && c <= 0xe59) ||
552 (c >= 0xed0 && c <= 0xed9) ||
553 (c >= 0xf20 && c <= 0xf29) ||
554 c == 0xb7 ||
555 c == 0x2d0 ||
556 c == 0x2d1 ||
557 c == 0x387 ||
558 c == 0x640 ||
559 c == 0xe46 ||
560 c == 0xec6 ||
561 c == 0x3005 ||
562 (c >= 0x3031 && c <= 0x3035) ||
563 (c >= 0x309d && c <= 0x309e) ||
564 (c >= 0x30fc && c <= 0x30fe));
565 }
566
567 Bool IsLower(uint c)
568 {
569 uint map = MAP(c);
570
571 return (map & lowercase)!=0;
572 }
573
574 Bool IsUpper(uint c)
575 {
576 uint map = MAP(c);
577
578 return (map & uppercase)!=0;
579 }
580
581 uint ToLower(uint c)
582 {
583 uint map = MAP(c);
584
585 if (map & uppercase)
586 c += 'a' - 'A';
587
588 return c;
589 }
590
591 uint ToUpper(uint c)
592 {
593 uint map = MAP(c);
594
595 if (map & lowercase)
596 c += (uint) ('A' - 'a' );
597
598 return c;
599 }
600
601 char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps )
602 {
603 if ( !cfgBool(doc, TidyXmlTags) )
604 {
605 if ( tocaps )
606 {
607 c = (tmbchar) ToUpper(c);
608 }
609 else /* force to lower case */
610 {
611 c = (tmbchar) ToLower(c);
612 }
613 }
614 return c;
615 }
616
617
618 /*
619 return last character in string
620 this is useful when trailing quotemark
621 is missing on an attribute
622 */
623 static tmbchar LastChar( tmbstr str )
624 {
625 if ( str && *str )
626 {
627 int n = tmbstrlen(str);
628 return str[n-1];
629 }
630 return 0;
631 }
632
633 /*
634 node->type is one of these:
635
636 #define TextNode 1
637 #define StartTag 2
638 #define EndTag 3
639 #define StartEndTag 4
640 */
641
642 Lexer* NewLexer( TidyDocImpl* doc )
643 {
644 Lexer* lexer = (Lexer*) MemAlloc( sizeof(Lexer) );
645
646 if ( lexer != NULL )
647 {
648 ClearMemory( lexer, sizeof(Lexer) );
649
650 lexer->lines = 1;
651 lexer->columns = 1;
652 lexer->state = LEX_CONTENT;
653
654 lexer->versions = (VERS_ALL|VERS_PROPRIETARY);
655 lexer->doctype = VERS_UNKNOWN;
656 lexer->root = &doc->root;
657 }
658 return lexer;
659 }
660
661 Bool EndOfInput( TidyDocImpl* doc )
662 {
663 assert( doc->docIn != NULL );
664 return ( !doc->docIn->pushed && IsEOF(doc->docIn) );
665 }
666
667 void FreeLexer( TidyDocImpl* doc )
668 {
669 Lexer *lexer = doc->lexer;
670 if ( lexer )
671 {
672 FreeStyles( doc );
673
674 if ( lexer->pushed )
675 FreeNode( doc, lexer->token );
676
677 while ( lexer->istacksize > 0 )
678 PopInline( doc, NULL );
679
680 MemFree( lexer->istack );
681 MemFree( lexer->lexbuf );
682 MemFree( lexer );
683 doc->lexer = NULL;
684 }
685 }
686
687 /* Lexer uses bigger memory chunks than pprint as
688 ** it must hold the entire input document. not just
689 ** the last line or three.
690 */
691 void AddByte( Lexer *lexer, tmbchar ch )
692 {
693 if ( lexer->lexsize + 2 >= lexer->lexlength )
694 {
695 tmbstr buf = NULL;
696 uint allocAmt = lexer->lexlength;
697 while ( lexer->lexsize + 2 >= allocAmt )
698 {
699 if ( allocAmt == 0 )
700 allocAmt = 8192;
701 else
702 allocAmt *= 2;
703 }
704 buf = (tmbstr) MemRealloc( lexer->lexbuf, allocAmt );
705 if ( buf )
706 {
707 ClearMemory( buf + lexer->lexlength,
708 allocAmt - lexer->lexlength );
709 lexer->lexbuf = buf;
710 lexer->lexlength = allocAmt;
711 }
712 }
713
714 lexer->lexbuf[ lexer->lexsize++ ] = ch;
715 lexer->lexbuf[ lexer->lexsize ] = '\0'; /* debug */
716 }
717
718 static void ChangeChar( Lexer *lexer, tmbchar c )
719 {
720 if ( lexer->lexsize > 0 )
721 {
722 lexer->lexbuf[ lexer->lexsize-1 ] = c;
723 }
724 }
725
726 /* store character c as UTF-8 encoded byte stream */
727 void AddCharToLexer( Lexer *lexer, uint c )
728 {
729 int i, err, count = 0;
730 tmbchar buf[10] = {0};
731
732 err = EncodeCharToUTF8Bytes( c, buf, NULL, &count );
733 if (err)
734 {
735 #if 0 && defined(_DEBUG)
736 fprintf( stderr, "lexer UTF-8 encoding error for U+%x : ", c );
737 #endif
738 /* replacement character 0xFFFD encoded as UTF-8 */
739 buf[0] = (byte) 0xEF;
740 buf[1] = (byte) 0xBF;
741 buf[2] = (byte) 0xBD;
742 count = 3;
743 }
744
745 for ( i = 0; i < count; ++i )
746 AddByte( lexer, buf[i] );
747 }
748
749 static void AddStringToLexer( Lexer *lexer, ctmbstr str )
750 {
751 uint c;
752
753 /* Many (all?) compilers will sign-extend signed chars (the default) when
754 ** converting them to unsigned integer values. We must cast our char to
755 ** unsigned char before assigning it to prevent this from happening.
756 */
757 while( 0 != (c = (unsigned char) *str++ ))
758 AddCharToLexer( lexer, c );
759 }
760
761 /*
762 No longer attempts to insert missing ';' for unknown
763 enitities unless one was present already, since this
764 gives unexpected results.
765
766 For example: <a href="something.htm?foo&bar&fred">
767 was tidied to: <a href="something.htm?foo&bar;&fred;">
768 rather than: <a href="something.htm?foo&bar&fred">
769
770 My thanks for Maurice Buxton for spotting this.
771
772 Also Randy Waki pointed out the following case for the
773 04 Aug 00 version (bug #433012):
774
775 For example: <a href="something.htm?id=1&lang=en">
776 was tidied to: <a href="something.htm?id=1⟨=en">
777 rather than: <a href="something.htm?id=1&lang=en">
778
779 where "lang" is a known entity (#9001), but browsers would
780 misinterpret "⟨" because it had a value > 256.
781
782 So the case of an apparently known entity with a value > 256 and
783 missing a semicolon is handled specially.
784
785 "ParseEntity" is also a bit of a misnomer - it handles entities and
786 numeric character references. Invalid NCR's are now reported.
787 */
788 static void ParseEntity( TidyDocImpl* doc, int mode )
789 {
790 uint start;
791 Bool first = yes, semicolon = no, found = no;
792 Bool isXml = cfgBool( doc, TidyXmlTags );
793 uint c, ch, startcol, entver = 0;
794 Lexer* lexer = doc->lexer;
795
796 start = lexer->lexsize - 1; /* to start at "&" */
797 startcol = doc->docIn->curcol - 1;
798
799 while ( (c = ReadChar(doc->docIn)) != EndOfStream )
800 {
801 if ( c == ';' )
802 {
803 semicolon = yes;
804 break;
805 }
806
807 if (first && c == '#')
808 {
809 #if SUPPORT_ASIAN_ENCODINGS
810 if ( !cfgBool(doc, TidyNCR) ||
811 cfg(doc, TidyInCharEncoding) == BIG5 ||
812 cfg(doc, TidyInCharEncoding) == SHIFTJIS )
813 {
814 UngetChar('#', doc->docIn);
815 return;
816 }
817 #endif
818 AddCharToLexer( lexer, c );
819 first = no;
820 continue;
821 }
822
823 first = no;
824
825 if ( IsNamechar(c) )
826 {
827 AddCharToLexer( lexer, c );
828 continue;
829 }
830
831 /* otherwise put it back */
832
833 UngetChar( c, doc->docIn );
834 break;
835 }
836
837 /* make sure entity is NULL terminated */
838 lexer->lexbuf[lexer->lexsize] = '\0';
839
840 /* Should contrain version to XML/XHTML if '
841 ** is encountered. But this is not possible with
842 ** Tidy's content model bit mask.
843 */
844 if ( tmbstrcmp(lexer->lexbuf+start, "&apos") == 0
845 && !cfgBool(doc, TidyXmlOut)
846 && !lexer->isvoyager
847 && !cfgBool(doc, TidyXhtmlOut) )
848 ReportEntityError( doc, APOS_UNDEFINED, lexer->lexbuf+start, 39 );
849
850 /* Lookup entity code and version
851 */
852 found = EntityInfo( lexer->lexbuf+start, isXml, &ch, &entver );
853
854 /* deal with unrecognized or invalid entities */
855 /* #433012 - fix by Randy Waki 17 Feb 01 */
856 /* report invalid NCR's - Terry Teague 01 Sep 01 */
857 if ( !found || (ch >= 128 && ch <= 159) || (ch >= 256 && c != ';') )
858 {
859 /* set error position just before offending character */
860 lexer->lines = doc->docIn->curline;
861 lexer->columns = startcol;
862
863 if (lexer->lexsize > start + 1)
864 {
865 if (ch >= 128 && ch <= 159)
866 {
867 /* invalid numeric character reference */
868
869 uint c1 = 0;
870 int replaceMode = DISCARDED_CHAR;
871
872 if ( ReplacementCharEncoding == WIN1252 )
873 c1 = DecodeWin1252( ch );
874 else if ( ReplacementCharEncoding == MACROMAN )
875 c1 = DecodeMacRoman( ch );
876
877 if ( c1 )
878 replaceMode = REPLACED_CHAR;
879
880 if ( c != ';' ) /* issue warning if not terminated by ';' */
881 ReportEntityError( doc, MISSING_SEMICOLON_NCR,
882 lexer->lexbuf+start, c );
883
884 ReportEncodingError(doc, INVALID_NCR, ch, replaceMode == DISCARDED_CHAR);
885
886 if ( c1 )
887 {
888 /* make the replacement */
889 lexer->lexsize = start;
890 AddCharToLexer( lexer, c1 );
891 semicolon = no;
892 }
893 else
894 {
895 /* discard */
896 lexer->lexsize = start;
897 semicolon = no;
898 }
899
900 }
901 else
902 ReportEntityError( doc, UNKNOWN_ENTITY,
903 lexer->lexbuf+start, ch );
904
905 if (semicolon)
906 AddCharToLexer( lexer, ';' );
907 }
908 else /* naked & */
909 ReportEntityError( doc, UNESCAPED_AMPERSAND,
910 lexer->lexbuf+start, ch );
911 }
912 else
913 {
914 if ( c != ';' ) /* issue warning if not terminated by ';' */
915 {
916 /* set error position just before offending chararcter */
917 lexer->lines = doc->docIn->curline;
918 lexer->columns = startcol;
919 ReportEntityError( doc, MISSING_SEMICOLON, lexer->lexbuf+start, c );
920 }
921
922 lexer->lexsize = start;
923 if ( ch == 160 && (mode & Preformatted) )
924 ch = ' ';
925 AddCharToLexer( lexer, ch );
926
927 if ( ch == '&' && !cfgBool(doc, TidyQuoteAmpersand) )
928 AddStringToLexer( lexer, "amp;" );
929
930 /* Detect extended vs. basic entities */
931 ConstrainVersion( doc, entver );
932 }
933 }
934
935 static tmbchar ParseTagName( TidyDocImpl* doc )
936 {
937 Lexer *lexer = doc->lexer;
938 uint c = lexer->lexbuf[ lexer->txtstart ];
939 Bool xml = cfgBool(doc, TidyXmlTags);
940
941 /* fold case of first character in buffer */
942 if (!xml && IsUpper(c))
943 lexer->lexbuf[lexer->txtstart] = (tmbchar) ToLower(c);
944
945 while ((c = ReadChar(doc->docIn)) != EndOfStream)
946 {
947 if ((!xml && !IsNamechar(c)) ||
948 (xml && !IsXMLNamechar(c)))
949 break;
950
951 /* fold case of subsequent characters */
952 if (!xml && IsUpper(c))
953 c = ToLower(c);
954
955 AddCharToLexer(lexer, c);
956 }
957
958 lexer->txtend = lexer->lexsize;
959 return (tmbchar) c;
960 }
961
962 /*
963 Used for elements and text nodes
964 element name is NULL for text nodes
965 start and end are offsets into lexbuf
966 which contains the textual content of
967 all elements in the parse tree.
968
969 parent and content allow traversal
970 of the parse tree in any direction.
971 attributes are represented as a linked
972 list of AttVal nodes which hold the
973 strings for attribute/value pairs.
974 */
975
976
977 Node *NewNode(Lexer *lexer)
978 {
979 Node* node = (Node*) MemAlloc( sizeof(Node) );
980 ClearMemory( node, sizeof(Node) );
981 if ( lexer )
982 {
983 node->line = lexer->lines;
984 node->column = lexer->columns;
985 }
986 node->type = TextNode;
987 return node;
988 }
989
990 /* used to clone heading nodes when split by an <HR> */
991 Node *CloneNode( TidyDocImpl* doc, Node *element )
992 {
993 Lexer* lexer = doc->lexer;
994 Node *node = NewNode( lexer );
995
996 node->start = lexer->lexsize;
997 node->end = lexer->lexsize;
998
999 if ( element )
1000 {
1001 node->parent = element->parent;
1002 node->type = element->type;
1003 node->closed = element->closed;
1004 node->implicit = element->implicit;
1005 node->tag = element->tag;
1006 node->element = tmbstrdup( element->element );
1007 node->attributes = DupAttrs( doc, element->attributes );
1008 }
1009 return node;
1010 }
1011
1012 /* free node's attributes */
1013 void FreeAttrs( TidyDocImpl* doc, Node *node )
1014 {
1015
1016 while ( node->attributes )
1017 {
1018 AttVal *av = node->attributes;
1019
1020 if ( av->attribute )
1021 {
1022 if ( (attrIsID(av) || attrIsNAME(av)) &&
1023 IsAnchorElement(doc, node) )
1024 {
1025 RemoveAnchorByNode( doc, node );
1026 }
1027 }
1028
1029 node->attributes = av->next;
1030 FreeAttribute( doc, av );
1031 }
1032 }
1033
1034 /* doesn't repair attribute list linkage */
1035 void FreeAttribute( TidyDocImpl* doc, AttVal *av )
1036 {
1037 FreeNode( doc, av->asp );
1038 FreeNode( doc, av->php );
1039 MemFree( av->attribute );
1040 MemFree( av->value );
1041 MemFree( av );
1042 }
1043
1044 /* detach attribute from node
1045 */
1046 void DetachAttribute( Node *node, AttVal *attr )
1047 {
1048 AttVal *av, *prev = NULL;
1049
1050 for ( av = node->attributes; av; av = av->next )
1051 {
1052 if ( av == attr )
1053 {
1054 if ( prev )
1055 prev->next = attr->next;
1056 else
1057 node->attributes = attr->next;
1058 break;
1059 }
1060 prev = av;
1061 }
1062 }
1063
1064 /* detach attribute from node then free it
1065 */
1066 void RemoveAttribute( TidyDocImpl* doc, Node *node, AttVal *attr )
1067 {
1068 DetachAttribute( node, attr );
1069 FreeAttribute( doc, attr );
1070 }
1071
1072 /*
1073 Free document nodes by iterating through peers and recursing
1074 through children. Set next to NULL before calling FreeNode()
1075 to avoid freeing peer nodes. Doesn't patch up prev/next links.
1076 */
1077 void FreeNode( TidyDocImpl* doc, Node *node )
1078 {
1079 while ( node )
1080 {
1081 Node* next = node->next;
1082
1083 FreeAttrs( doc, node );
1084 FreeNode( doc, node->content );
1085 MemFree( node->element );
1086 #ifdef TIDY_STORE_ORIGINAL_TEXT
1087 if (node->otext)
1088 MemFree(node->otext);
1089 #endif
1090 if (RootNode != node->type)
1091 MemFree( node );
1092 else
1093 node->content = NULL;
1094
1095 node = next;
1096 }
1097 }
1098
#ifdef TIDY_STORE_ORIGINAL_TEXT
/* Hand the original text captured by the input stream to node.
**
** count is the number of trailing bytes of the captured text that stay
** with the stream; the leading part becomes node->otext.  With
** count == 0 the stream buffer itself is handed over and the stream is
** left without one.  Nothing happens when text storing is disabled,
** when count is not smaller than the captured length, or when the
** stream buffer has size 0.
*/
void StoreOriginalTextInToken(TidyDocImpl* doc, Node* node, uint count)
{
    if (!doc->storeText ||
        count >= doc->docIn->otextlen ||
        !doc->docIn->otextsize)
        return;

    if (count == 0)
    {
        /* the node takes over the whole buffer */
        node->otext = doc->docIn->otextbuf;
        doc->docIn->otextbuf = NULL;
        doc->docIn->otextlen = 0;
        doc->docIn->otextsize = 0;
    }
    else
    {
        uint total = doc->docIn->otextlen;
        uint keep = total - count;      /* bytes that go to the node */
        tmbstr head = (tmbstr)MemAlloc(keep + 1);
        tmbstr tail = (tmbstr)MemAlloc(count + 1);
        uint ix;

        for (ix = 0; ix < keep; ++ix)
            head[ix] = doc->docIn->otextbuf[ix];
        head[keep] = 0;

        for (ix = 0; ix < count; ++ix)
            tail[ix] = doc->docIn->otextbuf[keep + ix];
        tail[count] = 0;

        MemFree(doc->docIn->otextbuf);
        node->otext = head;
        doc->docIn->otextbuf = tail;
        doc->docIn->otextlen = count;
        doc->docIn->otextsize = count + 1;
    }
}
#endif
1145
1146 Node* TextToken( Lexer *lexer )
1147 {
1148 Node *node = NewNode( lexer );
1149 node->start = lexer->txtstart;
1150 node->end = lexer->txtend;
1151 return node;
1152 }
1153
1154 /* used for creating preformatted text from Word2000 */
1155 Node *NewLineNode( Lexer *lexer )
1156 {
1157 Node *node = NewNode( lexer );
1158 node->start = lexer->lexsize;
1159 AddCharToLexer( lexer, (uint)'\n' );
1160 node->end = lexer->lexsize;
1161 return node;
1162 }
1163
1164 /* used for adding a for Word2000 */
1165 Node* NewLiteralTextNode( Lexer *lexer, ctmbstr txt )
1166 {
1167 Node *node = NewNode( lexer );
1168 node->start = lexer->lexsize;
1169 AddStringToLexer( lexer, txt );
1170 node->end = lexer->lexsize;
1171 return node;
1172 }
1173
1174 static Node* TagToken( TidyDocImpl* doc, NodeType type )
1175 {
1176 Lexer* lexer = doc->lexer;
1177 Node* node = NewNode( lexer );
1178 node->type = type;
1179 node->element = tmbstrndup( lexer->lexbuf + lexer->txtstart,
1180 lexer->txtend - lexer->txtstart );
1181 node->start = lexer->txtstart;
1182 node->end = lexer->txtstart;
1183
1184 if ( type == StartTag || type == StartEndTag || type == EndTag )
1185 FindTag(doc, node);
1186
1187 return node;
1188 }
1189
1190 static Node* NewToken(TidyDocImpl* doc, NodeType type)
1191 {
1192 Lexer* lexer = doc->lexer;
1193 Node* node = NewNode(lexer);
1194 node->type = type;
1195 node->start = lexer->txtstart;
1196 node->end = lexer->txtend;
1197 #ifdef TIDY_STORE_ORIGINAL_TEXT
1198 StoreOriginalTextInToken(doc, node, 0);
1199 #endif
1200 return node;
1201 }
1202
1203 #define CommentToken(doc) NewToken(doc, CommentTag)
1204 #define DocTypeToken(doc) NewToken(doc, DocTypeTag)
1205 #define PIToken(doc) NewToken(doc, ProcInsTag)
1206 #define AspToken(doc) NewToken(doc, AspTag)
1207 #define JsteToken(doc) NewToken(doc, JsteTag)
1208 #define PhpToken(doc) NewToken(doc, PhpTag)
1209 #define XmlDeclToken(doc) NewToken(doc, XmlDecl)
1210 #define SectionToken(doc) NewToken(doc, SectionTag)
1211 #define CDATAToken(doc) NewToken(doc, CDATATag)
1212
1213 void AddStringLiteral( Lexer* lexer, ctmbstr str )
1214 {
1215 byte c;
1216 while(0 != (c = *str++) )
1217 AddCharToLexer( lexer, c );
1218 }
1219
1220 void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len )
1221 {
1222 byte c;
1223 int ix;
1224
1225 for ( ix=0; ix < len && (c = *str++); ++ix )
1226 AddCharToLexer(lexer, c);
1227 }
1228
1229 /* find doctype element */
1230 Node *FindDocType( TidyDocImpl* doc )
1231 {
1232 Node* node;
1233 for ( node = (doc ? doc->root.content : NULL);
1234 node && node->type != DocTypeTag;
1235 node = node->next )
1236 /**/;
1237 return node;
1238 }
1239
1240 /* find parent container element */
1241 Node* FindContainer( Node* node )
1242 {
1243 for ( node = (node ? node->parent : NULL);
1244 node && nodeHasCM(node, CM_INLINE);
1245 node = node->parent )
1246 /**/;
1247
1248 return node;
1249 }
1250
1251
1252 /* find html element */
1253 Node *FindHTML( TidyDocImpl* doc )
1254 {
1255 Node *node;
1256 for ( node = (doc ? doc->root.content : NULL);
1257 node && !nodeIsHTML(node);
1258 node = node->next )
1259 /**/;
1260
1261 return node;
1262 }
1263
1264 /* find XML Declaration */
1265 Node *FindXmlDecl(TidyDocImpl* doc)
1266 {
1267 Node *node;
1268 for ( node = (doc ? doc->root.content : NULL);
1269 node && !(node->type == XmlDecl);
1270 node = node->next )
1271 /**/;
1272
1273 return node;
1274 }
1275
1276
1277 Node *FindHEAD( TidyDocImpl* doc )
1278 {
1279 Node *node = FindHTML( doc );
1280
1281 if ( node )
1282 {
1283 for ( node = node->content;
1284 node && !nodeIsHEAD(node);
1285 node = node->next )
1286 /**/;
1287 }
1288
1289 return node;
1290 }
1291
1292 Node *FindTITLE(TidyDocImpl* doc)
1293 {
1294 Node *node = FindHEAD(doc);
1295
1296 if (node)
1297 for (node = node->content;
1298 node && !nodeIsTITLE(node);
1299 node = node->next) {}
1300
1301 return node;
1302 }
1303
1304 Node *FindBody( TidyDocImpl* doc )
1305 {
1306 Node *node = ( doc ? doc->root.content : NULL );
1307
1308 while ( node && !nodeIsHTML(node) )
1309 node = node->next;
1310
1311 if (node == NULL)
1312 return NULL;
1313
1314 node = node->content;
1315 while ( node && !nodeIsBODY(node) && !nodeIsFRAMESET(node) )
1316 node = node->next;
1317
1318 if ( node && nodeIsFRAMESET(node) )
1319 {
1320 node = node->content;
1321 while ( node && !nodeIsNOFRAMES(node) )
1322 node = node->next;
1323
1324 if ( node )
1325 {
1326 node = node->content;
1327 while ( node && !nodeIsBODY(node) )
1328 node = node->next;
1329 }
1330 }
1331
1332 return node;
1333 }
1334
1335 /* add meta element for Tidy */
1336 Bool AddGenerator( TidyDocImpl* doc )
1337 {
1338 AttVal *attval;
1339 Node *node;
1340 Node *head = FindHEAD( doc );
1341 tmbchar buf[256];
1342
1343 if (head)
1344 {
1345 #ifdef PLATFORM_NAME
1346 tmbsnprintf(buf, sizeof(buf), "HTML Tidy for "PLATFORM_NAME" (vers %s), see www.w3.org",
1347 tidyReleaseDate());
1348 #else
1349 tmbsnprintf(buf, sizeof(buf), "HTML Tidy (vers %s), see www.w3.org", tidyReleaseDate());
1350 #endif
1351
1352 for ( node = head->content; node; node = node->next )
1353 {
1354 if ( nodeIsMETA(node) )
1355 {