~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

TidyLib
tidy/src/lexer.c

Version: ~ [ 1.0 ] ~

** Warning: Cannot open xref database.

1 /* lexer.c -- Lexer for html parser 2 3 (c) 1998-2005 (W3C) MIT, ERCIM, Keio University 4 See tidy.h for the copyright notice. 5 6 CVS Info : 7 8 $Author: arnaud02 $ 9 $Date: 2005/10/13 12:11:01 $ 10 $Revision: 1.173 $ 11 12 */ 13 14 /* 15 Given a file stream fp it returns a sequence of tokens. 16 17 GetToken(fp) gets the next token 18 UngetToken(fp) provides one level undo 19 20 The tags include an attribute list: 21 22 - linked list of attribute/value nodes 23 - each node has 2 NULL-terminated strings. 24 - entities are replaced in attribute values 25 26 white space is compacted if not in preformatted mode 27 If not in preformatted mode then leading white space 28 is discarded and subsequent white space sequences 29 compacted to single space characters. 30 31 If XmlTags is no then Tag names are folded to upper 32 case and attribute names to lower case. 33 34 Not yet done: 35 - Doctype subset and marked sections 36 */ 37 38 #include "tidy-int.h" 39 #include "lexer.h" 40 #include "parser.h" 41 #include "entities.h" 42 #include "streamio.h" 43 #include "message.h" 44 #include "tmbstr.h" 45 #include "clean.h" 46 #include "utf8.h" 47 #include "streamio.h" 48 49 /* Forward references 50 */ 51 /* swallows closing '>' */ 52 static AttVal *ParseAttrs( TidyDocImpl* doc, Bool *isempty ); 53 54 static tmbstr ParseAttribute( TidyDocImpl* doc, Bool* isempty, 55 Node **asp, Node **php ); 56 57 static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, Bool foldCase, 58 Bool *isempty, int *pdelim ); 59 60 static Node *ParseDocTypeDecl(TidyDocImpl* doc); 61 62 static void AddAttrToList( AttVal** list, AttVal* av ); 63 64 /* used to classify characters for lexical purposes */ 65 #define MAP(c) ((unsigned)c < 128 ? lexmap[(unsigned)c] : 0) 66 static uint lexmap[128]; 67 68 #define IsValidXMLAttrName(name) IsValidXMLID(name) 69 #define IsValidXMLElemName(name) IsValidXMLID(name) 70 71 static struct _doctypes 72 { 73 uint score; 74 uint vers; 75 ctmbstr name; 76 ctmbstr fpi; 77 ctmbstr si; 78 } const W3C_Doctypes[] = 79 { 80 { 2, HT20, "HTML 2.0", "-//IETF//DTD HTML 2.0//EN", NULL, }, 81 { 2, HT20, "HTML 2.0", "-//IETF//DTD HTML//EN", NULL, }, 82 { 2, HT20, "HTML 2.0", "-//W3C//DTD HTML 2.0//EN", NULL, }, 83 { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2//EN", NULL, }, 84 { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Final//EN", NULL, }, 85 { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Draft//EN", NULL, }, 86 { 6, H40S, "HTML 4.0 Strict", "-//W3C//DTD HTML 4.0//EN", "http://www.w3.org/TR/REC-html40/strict.dtd" }, 87 { 8, H40T, "HTML 4.0 Transitional", "-//W3C//DTD HTML 4.0 Transitional//EN", "http://www.w3.org/TR/REC-html40/loose.dtd" }, 88 { 7, H40F, "HTML 4.0 Frameset", "-//W3C//DTD HTML 4.0 Frameset//EN", "http://www.w3.org/TR/REC-html40/frameset.dtd" }, 89 { 3, H41S, "HTML 4.01 Strict", "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd" }, 90 { 5, H41T, "HTML 4.01 Transitional", "-//W3C//DTD HTML 4.01 Transitional//EN", "http://www.w3.org/TR/html4/loose.dtd" }, 91 { 4, H41F, "HTML 4.01 Frameset", "-//W3C//DTD HTML 4.01 Frameset//EN", "http://www.w3.org/TR/html4/frameset.dtd" }, 92 { 9, X10S, "XHTML 1.0 Strict", "-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" }, 93 { 11, X10T, "XHTML 1.0 Transitional", "-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" }, 94 { 10, X10F, "XHTML 1.0 Frameset", "-//W3C//DTD XHTML 1.0 Frameset//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd" }, 95 { 12, XH11, "XHTML 1.1", "-//W3C//DTD XHTML 1.1//EN", "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd" }, 96 { 13, XB10, "XHTML Basic 1.0", "-//W3C//DTD XHTML Basic 1.0//EN", "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd" }, 97 98 /* reminder to add XHTML Print 1.0 support, see http://www.w3.org/TR/xhtml-print */ 99 #if 0 100 { 14, XP10, "XHTML Print 1.0", "-//W3C//DTD XHTML-Print 1.0//EN", "http://www.w3.org/MarkUp/DTD/xhtml-print10.dtd" }, 101 { 14, XP10, "XHTML Print 1.0", "-//PWG//DTD XHTML-Print 1.0//EN", "http://www.xhtml-print.org/xhtml-print/xhtml-print10.dtd" }, 102 #endif 103 /* final entry */ 104 { 0, 0, NULL, NULL, NULL } 105 }; 106 107 int HTMLVersion(TidyDocImpl* doc) 108 { 109 uint i; 110 uint j = 0; 111 uint score = 0; 112 uint vers = doc->lexer->versions; 113 uint dtver = doc->lexer->doctype; 114 TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode); 115 Bool xhtml = (cfgBool(doc, TidyXmlOut) || doc->lexer->isvoyager) && 116 !cfgBool(doc, TidyHtmlOut); 117 Bool html4 = dtmode == TidyDoctypeStrict || dtmode == TidyDoctypeLoose || VERS_FROM40 & dtver; 118 119 for (i = 0; W3C_Doctypes[i].name; ++i) 120 { 121 if ((xhtml && !(VERS_XHTML & W3C_Doctypes[i].vers)) || 122 (html4 && !(VERS_FROM40 & W3C_Doctypes[i].vers))) 123 continue; 124 125 if (vers & W3C_Doctypes[i].vers && 126 (W3C_Doctypes[i].score < score || !score)) 127 { 128 score = W3C_Doctypes[i].score; 129 j = i; 130 } 131 } 132 133 if (score) 134 return W3C_Doctypes[j].vers; 135 136 return VERS_UNKNOWN; 137 } 138 139 ctmbstr GetFPIFromVers(uint vers) 140 { 141 uint i; 142 143 for (i = 0; W3C_Doctypes[i].name; ++i) 144 if (W3C_Doctypes[i].vers == vers) 145 return W3C_Doctypes[i].fpi; 146 147 return NULL; 148 } 149 150 static ctmbstr GetSIFromVers(uint vers) 151 { 152 uint i; 153 154 for (i = 0; W3C_Doctypes[i].name; ++i) 155 if (W3C_Doctypes[i].vers == vers) 156 return W3C_Doctypes[i].si; 157 158 return NULL; 159 } 160 161 static ctmbstr GetNameFromVers(uint vers) 162 { 163 uint i; 164 165 for (i = 0; W3C_Doctypes[i].name; ++i) 166 if (W3C_Doctypes[i].vers == vers) 167 return W3C_Doctypes[i].name; 168 169 return NULL; 170 } 171 172 static uint GetVersFromFPI(ctmbstr fpi) 173 { 174 uint i; 175 176 for (i = 0; W3C_Doctypes[i].name; ++i) 177 if (tmbstrcasecmp(W3C_Doctypes[i].fpi, fpi) == 0) 178 return W3C_Doctypes[i].vers; 179 180 return 0; 181 } 182 183 /* everything is allowed in proprietary version of HTML */ 184 /* this is handled here rather than in the tag/attr dicts */ 185 void ConstrainVersion(TidyDocImpl* doc, uint vers) 186 { 187 doc->lexer->versions &= (vers | VERS_PROPRIETARY); 188 } 189 190 Bool IsWhite(uint c) 191 { 192 uint map = MAP(c); 193 194 return (map & white)!=0; 195 } 196 197 Bool IsNewline(uint c) 198 { 199 uint map = MAP(c); 200 return (map & newline)!=0; 201 } 202 203 Bool IsDigit(uint c) 204 { 205 uint map; 206 207 map = MAP(c); 208 209 return (map & digit)!=0; 210 } 211 212 Bool IsLetter(uint c) 213 { 214 uint map; 215 216 map = MAP(c); 217 218 return (map & letter)!=0; 219 } 220 221 Bool IsNamechar(uint c) 222 { 223 uint map = MAP(c); 224 return (map & namechar)!=0; 225 } 226 227 Bool IsXMLLetter(uint c) 228 { 229 return ((c >= 0x41 && c <= 0x5a) || 230 (c >= 0x61 && c <= 0x7a) || 231 (c >= 0xc0 && c <= 0xd6) || 232 (c >= 0xd8 && c <= 0xf6) || 233 (c >= 0xf8 && c <= 0xff) || 234 (c >= 0x100 && c <= 0x131) || 235 (c >= 0x134 && c <= 0x13e) || 236 (c >= 0x141 && c <= 0x148) || 237 (c >= 0x14a && c <= 0x17e) || 238 (c >= 0x180 && c <= 0x1c3) || 239 (c >= 0x1cd && c <= 0x1f0) || 240 (c >= 0x1f4 && c <= 0x1f5) || 241 (c >= 0x1fa && c <= 0x217) || 242 (c >= 0x250 && c <= 0x2a8) || 243 (c >= 0x2bb && c <= 0x2c1) || 244 c == 0x386 || 245 (c >= 0x388 && c <= 0x38a) || 246 c == 0x38c || 247 (c >= 0x38e && c <= 0x3a1) || 248 (c >= 0x3a3 && c <= 0x3ce) || 249 (c >= 0x3d0 && c <= 0x3d6) || 250 c == 0x3da || 251 c == 0x3dc || 252 c == 0x3de || 253 c == 0x3e0 || 254 (c >= 0x3e2 && c <= 0x3f3) || 255 (c >= 0x401 && c <= 0x40c) || 256 (c >= 0x40e && c <= 0x44f) || 257 (c >= 0x451 && c <= 0x45c) || 258 (c >= 0x45e && c <= 0x481) || 259 (c >= 0x490 && c <= 0x4c4) || 260 (c >= 0x4c7 && c <= 0x4c8) || 261 (c >= 0x4cb && c <= 0x4cc) || 262 (c >= 0x4d0 && c <= 0x4eb) || 263 (c >= 0x4ee && c <= 0x4f5) || 264 (c >= 0x4f8 && c <= 0x4f9) || 265 (c >= 0x531 && c <= 0x556) || 266 c == 0x559 || 267 (c >= 0x561 && c <= 0x586) || 268 (c >= 0x5d0 && c <= 0x5ea) || 269 (c >= 0x5f0 && c <= 0x5f2) || 270 (c >= 0x621 && c <= 0x63a) || 271 (c >= 0x641 && c <= 0x64a) || 272 (c >= 0x671 && c <= 0x6b7) || 273 (c >= 0x6ba && c <= 0x6be) || 274 (c >= 0x6c0 && c <= 0x6ce) || 275 (c >= 0x6d0 && c <= 0x6d3) || 276 c == 0x6d5 || 277 (c >= 0x6e5 && c <= 0x6e6) || 278 (c >= 0x905 && c <= 0x939) || 279 c == 0x93d || 280 (c >= 0x958 && c <= 0x961) || 281 (c >= 0x985 && c <= 0x98c) || 282 (c >= 0x98f && c <= 0x990) || 283 (c >= 0x993 && c <= 0x9a8) || 284 (c >= 0x9aa && c <= 0x9b0) || 285 c == 0x9b2 || 286 (c >= 0x9b6 && c <= 0x9b9) || 287 (c >= 0x9dc && c <= 0x9dd) || 288 (c >= 0x9df && c <= 0x9e1) || 289 (c >= 0x9f0 && c <= 0x9f1) || 290 (c >= 0xa05 && c <= 0xa0a) || 291 (c >= 0xa0f && c <= 0xa10) || 292 (c >= 0xa13 && c <= 0xa28) || 293 (c >= 0xa2a && c <= 0xa30) || 294 (c >= 0xa32 && c <= 0xa33) || 295 (c >= 0xa35 && c <= 0xa36) || 296 (c >= 0xa38 && c <= 0xa39) || 297 (c >= 0xa59 && c <= 0xa5c) || 298 c == 0xa5e || 299 (c >= 0xa72 && c <= 0xa74) || 300 (c >= 0xa85 && c <= 0xa8b) || 301 c == 0xa8d || 302 (c >= 0xa8f && c <= 0xa91) || 303 (c >= 0xa93 && c <= 0xaa8) || 304 (c >= 0xaaa && c <= 0xab0) || 305 (c >= 0xab2 && c <= 0xab3) || 306 (c >= 0xab5 && c <= 0xab9) || 307 c == 0xabd || 308 c == 0xae0 || 309 (c >= 0xb05 && c <= 0xb0c) || 310 (c >= 0xb0f && c <= 0xb10) || 311 (c >= 0xb13 && c <= 0xb28) || 312 (c >= 0xb2a && c <= 0xb30) || 313 (c >= 0xb32 && c <= 0xb33) || 314 (c >= 0xb36 && c <= 0xb39) || 315 c == 0xb3d || 316 (c >= 0xb5c && c <= 0xb5d) || 317 (c >= 0xb5f && c <= 0xb61) || 318 (c >= 0xb85 && c <= 0xb8a) || 319 (c >= 0xb8e && c <= 0xb90) || 320 (c >= 0xb92 && c <= 0xb95) || 321 (c >= 0xb99 && c <= 0xb9a) || 322 c == 0xb9c || 323 (c >= 0xb9e && c <= 0xb9f) || 324 (c >= 0xba3 && c <= 0xba4) || 325 (c >= 0xba8 && c <= 0xbaa) || 326 (c >= 0xbae && c <= 0xbb5) || 327 (c >= 0xbb7 && c <= 0xbb9) || 328 (c >= 0xc05 && c <= 0xc0c) || 329 (c >= 0xc0e && c <= 0xc10) || 330 (c >= 0xc12 && c <= 0xc28) || 331 (c >= 0xc2a && c <= 0xc33) || 332 (c >= 0xc35 && c <= 0xc39) || 333 (c >= 0xc60 && c <= 0xc61) || 334 (c >= 0xc85 && c <= 0xc8c) || 335 (c >= 0xc8e && c <= 0xc90) || 336 (c >= 0xc92 && c <= 0xca8) || 337 (c >= 0xcaa && c <= 0xcb3) || 338 (c >= 0xcb5 && c <= 0xcb9) || 339 c == 0xcde || 340 (c >= 0xce0 && c <= 0xce1) || 341 (c >= 0xd05 && c <= 0xd0c) || 342 (c >= 0xd0e && c <= 0xd10) || 343 (c >= 0xd12 && c <= 0xd28) || 344 (c >= 0xd2a && c <= 0xd39) || 345 (c >= 0xd60 && c <= 0xd61) || 346 (c >= 0xe01 && c <= 0xe2e) || 347 c == 0xe30 || 348 (c >= 0xe32 && c <= 0xe33) || 349 (c >= 0xe40 && c <= 0xe45) || 350 (c >= 0xe81 && c <= 0xe82) || 351 c == 0xe84 || 352 (c >= 0xe87 && c <= 0xe88) || 353 c == 0xe8a || 354 c == 0xe8d || 355 (c >= 0xe94 && c <= 0xe97) || 356 (c >= 0xe99 && c <= 0xe9f) || 357 (c >= 0xea1 && c <= 0xea3) || 358 c == 0xea5 || 359 c == 0xea7 || 360 (c >= 0xeaa && c <= 0xeab) || 361 (c >= 0xead && c <= 0xeae) || 362 c == 0xeb0 || 363 (c >= 0xeb2 && c <= 0xeb3) || 364 c == 0xebd || 365 (c >= 0xec0 && c <= 0xec4) || 366 (c >= 0xf40 && c <= 0xf47) || 367 (c >= 0xf49 && c <= 0xf69) || 368 (c >= 0x10a0 && c <= 0x10c5) || 369 (c >= 0x10d0 && c <= 0x10f6) || 370 c == 0x1100 || 371 (c >= 0x1102 && c <= 0x1103) || 372 (c >= 0x1105 && c <= 0x1107) || 373 c == 0x1109 || 374 (c >= 0x110b && c <= 0x110c) || 375 (c >= 0x110e && c <= 0x1112) || 376 c == 0x113c || 377 c == 0x113e || 378 c == 0x1140 || 379 c == 0x114c || 380 c == 0x114e || 381 c == 0x1150 || 382 (c >= 0x1154 && c <= 0x1155) || 383 c == 0x1159 || 384 (c >= 0x115f && c <= 0x1161) || 385 c == 0x1163 || 386 c == 0x1165 || 387 c == 0x1167 || 388 c == 0x1169 || 389 (c >= 0x116d && c <= 0x116e) || 390 (c >= 0x1172 && c <= 0x1173) || 391 c == 0x1175 || 392 c == 0x119e || 393 c == 0x11a8 || 394 c == 0x11ab || 395 (c >= 0x11ae && c <= 0x11af) || 396 (c >= 0x11b7 && c <= 0x11b8) || 397 c == 0x11ba || 398 (c >= 0x11bc && c <= 0x11c2) || 399 c == 0x11eb || 400 c == 0x11f0 || 401 c == 0x11f9 || 402 (c >= 0x1e00 && c <= 0x1e9b) || 403 (c >= 0x1ea0 && c <= 0x1ef9) || 404 (c >= 0x1f00 && c <= 0x1f15) || 405 (c >= 0x1f18 && c <= 0x1f1d) || 406 (c >= 0x1f20 && c <= 0x1f45) || 407 (c >= 0x1f48 && c <= 0x1f4d) || 408 (c >= 0x1f50 && c <= 0x1f57) || 409 c == 0x1f59 || 410 c == 0x1f5b || 411 c == 0x1f5d || 412 (c >= 0x1f5f && c <= 0x1f7d) || 413 (c >= 0x1f80 && c <= 0x1fb4) || 414 (c >= 0x1fb6 && c <= 0x1fbc) || 415 c == 0x1fbe || 416 (c >= 0x1fc2 && c <= 0x1fc4) || 417 (c >= 0x1fc6 && c <= 0x1fcc) || 418 (c >= 0x1fd0 && c <= 0x1fd3) || 419 (c >= 0x1fd6 && c <= 0x1fdb) || 420 (c >= 0x1fe0 && c <= 0x1fec) || 421 (c >= 0x1ff2 && c <= 0x1ff4) || 422 (c >= 0x1ff6 && c <= 0x1ffc) || 423 c == 0x2126 || 424 (c >= 0x212a && c <= 0x212b) || 425 c == 0x212e || 426 (c >= 0x2180 && c <= 0x2182) || 427 (c >= 0x3041 && c <= 0x3094) || 428 (c >= 0x30a1 && c <= 0x30fa) || 429 (c >= 0x3105 && c <= 0x312c) || 430 (c >= 0xac00 && c <= 0xd7a3) || 431 (c >= 0x4e00 && c <= 0x9fa5) || 432 c == 0x3007 || 433 (c >= 0x3021 && c <= 0x3029) || 434 (c >= 0x4e00 && c <= 0x9fa5) || 435 c == 0x3007 || 436 (c >= 0x3021 && c <= 0x3029)); 437 } 438 439 Bool IsXMLNamechar(uint c) 440 { 441 return (IsXMLLetter(c) || 442 c == '.' || c == '_' || 443 c == ':' || c == '-' || 444 (c >= 0x300 && c <= 0x345) || 445 (c >= 0x360 && c <= 0x361) || 446 (c >= 0x483 && c <= 0x486) || 447 (c >= 0x591 && c <= 0x5a1) || 448 (c >= 0x5a3 && c <= 0x5b9) || 449 (c >= 0x5bb && c <= 0x5bd) || 450 c == 0x5bf || 451 (c >= 0x5c1 && c <= 0x5c2) || 452 c == 0x5c4 || 453 (c >= 0x64b && c <= 0x652) || 454 c == 0x670 || 455 (c >= 0x6d6 && c <= 0x6dc) || 456 (c >= 0x6dd && c <= 0x6df) || 457 (c >= 0x6e0 && c <= 0x6e4) || 458 (c >= 0x6e7 && c <= 0x6e8) || 459 (c >= 0x6ea && c <= 0x6ed) || 460 (c >= 0x901 && c <= 0x903) || 461 c == 0x93c || 462 (c >= 0x93e && c <= 0x94c) || 463 c == 0x94d || 464 (c >= 0x951 && c <= 0x954) || 465 (c >= 0x962 && c <= 0x963) || 466 (c >= 0x981 && c <= 0x983) || 467 c == 0x9bc || 468 c == 0x9be || 469 c == 0x9bf || 470 (c >= 0x9c0 && c <= 0x9c4) || 471 (c >= 0x9c7 && c <= 0x9c8) || 472 (c >= 0x9cb && c <= 0x9cd) || 473 c == 0x9d7 || 474 (c >= 0x9e2 && c <= 0x9e3) || 475 c == 0xa02 || 476 c == 0xa3c || 477 c == 0xa3e || 478 c == 0xa3f || 479 (c >= 0xa40 && c <= 0xa42) || 480 (c >= 0xa47 && c <= 0xa48) || 481 (c >= 0xa4b && c <= 0xa4d) || 482 (c >= 0xa70 && c <= 0xa71) || 483 (c >= 0xa81 && c <= 0xa83) || 484 c == 0xabc || 485 (c >= 0xabe && c <= 0xac5) || 486 (c >= 0xac7 && c <= 0xac9) || 487 (c >= 0xacb && c <= 0xacd) || 488 (c >= 0xb01 && c <= 0xb03) || 489 c == 0xb3c || 490 (c >= 0xb3e && c <= 0xb43) || 491 (c >= 0xb47 && c <= 0xb48) || 492 (c >= 0xb4b && c <= 0xb4d) || 493 (c >= 0xb56 && c <= 0xb57) || 494 (c >= 0xb82 && c <= 0xb83) || 495 (c >= 0xbbe && c <= 0xbc2) || 496 (c >= 0xbc6 && c <= 0xbc8) || 497 (c >= 0xbca && c <= 0xbcd) || 498 c == 0xbd7 || 499 (c >= 0xc01 && c <= 0xc03) || 500 (c >= 0xc3e && c <= 0xc44) || 501 (c >= 0xc46 && c <= 0xc48) || 502 (c >= 0xc4a && c <= 0xc4d) || 503 (c >= 0xc55 && c <= 0xc56) || 504 (c >= 0xc82 && c <= 0xc83) || 505 (c >= 0xcbe && c <= 0xcc4) || 506 (c >= 0xcc6 && c <= 0xcc8) || 507 (c >= 0xcca && c <= 0xccd) || 508 (c >= 0xcd5 && c <= 0xcd6) || 509 (c >= 0xd02 && c <= 0xd03) || 510 (c >= 0xd3e && c <= 0xd43) || 511 (c >= 0xd46 && c <= 0xd48) || 512 (c >= 0xd4a && c <= 0xd4d) || 513 c == 0xd57 || 514 c == 0xe31 || 515 (c >= 0xe34 && c <= 0xe3a) || 516 (c >= 0xe47 && c <= 0xe4e) || 517 c == 0xeb1 || 518 (c >= 0xeb4 && c <= 0xeb9) || 519 (c >= 0xebb && c <= 0xebc) || 520 (c >= 0xec8 && c <= 0xecd) || 521 (c >= 0xf18 && c <= 0xf19) || 522 c == 0xf35 || 523 c == 0xf37 || 524 c == 0xf39 || 525 c == 0xf3e || 526 c == 0xf3f || 527 (c >= 0xf71 && c <= 0xf84) || 528 (c >= 0xf86 && c <= 0xf8b) || 529 (c >= 0xf90 && c <= 0xf95) || 530 c == 0xf97 || 531 (c >= 0xf99 && c <= 0xfad) || 532 (c >= 0xfb1 && c <= 0xfb7) || 533 c == 0xfb9 || 534 (c >= 0x20d0 && c <= 0x20dc) || 535 c == 0x20e1 || 536 (c >= 0x302a && c <= 0x302f) || 537 c == 0x3099 || 538 c == 0x309a || 539 (c >= 0x30 && c <= 0x39) || 540 (c >= 0x660 && c <= 0x669) || 541 (c >= 0x6f0 && c <= 0x6f9) || 542 (c >= 0x966 && c <= 0x96f) || 543 (c >= 0x9e6 && c <= 0x9ef) || 544 (c >= 0xa66 && c <= 0xa6f) || 545 (c >= 0xae6 && c <= 0xaef) || 546 (c >= 0xb66 && c <= 0xb6f) || 547 (c >= 0xbe7 && c <= 0xbef) || 548 (c >= 0xc66 && c <= 0xc6f) || 549 (c >= 0xce6 && c <= 0xcef) || 550 (c >= 0xd66 && c <= 0xd6f) || 551 (c >= 0xe50 && c <= 0xe59) || 552 (c >= 0xed0 && c <= 0xed9) || 553 (c >= 0xf20 && c <= 0xf29) || 554 c == 0xb7 || 555 c == 0x2d0 || 556 c == 0x2d1 || 557 c == 0x387 || 558 c == 0x640 || 559 c == 0xe46 || 560 c == 0xec6 || 561 c == 0x3005 || 562 (c >= 0x3031 && c <= 0x3035) || 563 (c >= 0x309d && c <= 0x309e) || 564 (c >= 0x30fc && c <= 0x30fe)); 565 } 566 567 Bool IsLower(uint c) 568 { 569 uint map = MAP(c); 570 571 return (map & lowercase)!=0; 572 } 573 574 Bool IsUpper(uint c) 575 { 576 uint map = MAP(c); 577 578 return (map & uppercase)!=0; 579 } 580 581 uint ToLower(uint c) 582 { 583 uint map = MAP(c); 584 585 if (map & uppercase) 586 c += 'a' - 'A'; 587 588 return c; 589 } 590 591 uint ToUpper(uint c) 592 { 593 uint map = MAP(c); 594 595 if (map & lowercase) 596 c += (uint) ('A' - 'a' ); 597 598 return c; 599 } 600 601 char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps ) 602 { 603 if ( !cfgBool(doc, TidyXmlTags) ) 604 { 605 if ( tocaps ) 606 { 607 c = (tmbchar) ToUpper(c); 608 } 609 else /* force to lower case */ 610 { 611 c = (tmbchar) ToLower(c); 612 } 613 } 614 return c; 615 } 616 617 618 /* 619 return last character in string 620 this is useful when trailing quotemark 621 is missing on an attribute 622 */ 623 static tmbchar LastChar( tmbstr str ) 624 { 625 if ( str && *str ) 626 { 627 int n = tmbstrlen(str); 628 return str[n-1]; 629 } 630 return 0; 631 } 632 633 /* 634 node->type is one of these: 635 636 #define TextNode 1 637 #define StartTag 2 638 #define EndTag 3 639 #define StartEndTag 4 640 */ 641 642 Lexer* NewLexer( TidyDocImpl* doc ) 643 { 644 Lexer* lexer = (Lexer*) MemAlloc( sizeof(Lexer) ); 645 646 if ( lexer != NULL ) 647 { 648 ClearMemory( lexer, sizeof(Lexer) ); 649 650 lexer->lines = 1; 651 lexer->columns = 1; 652 lexer->state = LEX_CONTENT; 653 654 lexer->versions = (VERS_ALL|VERS_PROPRIETARY); 655 lexer->doctype = VERS_UNKNOWN; 656 lexer->root = &doc->root; 657 } 658 return lexer; 659 } 660 661 Bool EndOfInput( TidyDocImpl* doc ) 662 { 663 assert( doc->docIn != NULL ); 664 return ( !doc->docIn->pushed && IsEOF(doc->docIn) ); 665 } 666 667 void FreeLexer( TidyDocImpl* doc ) 668 { 669 Lexer *lexer = doc->lexer; 670 if ( lexer ) 671 { 672 FreeStyles( doc ); 673 674 if ( lexer->pushed ) 675 FreeNode( doc, lexer->token ); 676 677 while ( lexer->istacksize > 0 ) 678 PopInline( doc, NULL ); 679 680 MemFree( lexer->istack ); 681 MemFree( lexer->lexbuf ); 682 MemFree( lexer ); 683 doc->lexer = NULL; 684 } 685 } 686 687 /* Lexer uses bigger memory chunks than pprint as 688 ** it must hold the entire input document. not just 689 ** the last line or three. 690 */ 691 void AddByte( Lexer *lexer, tmbchar ch ) 692 { 693 if ( lexer->lexsize + 2 >= lexer->lexlength ) 694 { 695 tmbstr buf = NULL; 696 uint allocAmt = lexer->lexlength; 697 while ( lexer->lexsize + 2 >= allocAmt ) 698 { 699 if ( allocAmt == 0 ) 700 allocAmt = 8192; 701 else 702 allocAmt *= 2; 703 } 704 buf = (tmbstr) MemRealloc( lexer->lexbuf, allocAmt ); 705 if ( buf ) 706 { 707 ClearMemory( buf + lexer->lexlength, 708 allocAmt - lexer->lexlength ); 709 lexer->lexbuf = buf; 710 lexer->lexlength = allocAmt; 711 } 712 } 713 714 lexer->lexbuf[ lexer->lexsize++ ] = ch; 715 lexer->lexbuf[ lexer->lexsize ] = '\0'; /* debug */ 716 } 717 718 static void ChangeChar( Lexer *lexer, tmbchar c ) 719 { 720 if ( lexer->lexsize > 0 ) 721 { 722 lexer->lexbuf[ lexer->lexsize-1 ] = c; 723 } 724 } 725 726 /* store character c as UTF-8 encoded byte stream */ 727 void AddCharToLexer( Lexer *lexer, uint c ) 728 { 729 int i, err, count = 0; 730 tmbchar buf[10] = {0}; 731 732 err = EncodeCharToUTF8Bytes( c, buf, NULL, &count ); 733 if (err) 734 { 735 #if 0 && defined(_DEBUG) 736 fprintf( stderr, "lexer UTF-8 encoding error for U+%x : ", c ); 737 #endif 738 /* replacement character 0xFFFD encoded as UTF-8 */ 739 buf[0] = (byte) 0xEF; 740 buf[1] = (byte) 0xBF; 741 buf[2] = (byte) 0xBD; 742 count = 3; 743 } 744 745 for ( i = 0; i < count; ++i ) 746 AddByte( lexer, buf[i] ); 747 } 748 749 static void AddStringToLexer( Lexer *lexer, ctmbstr str ) 750 { 751 uint c; 752 753 /* Many (all?) compilers will sign-extend signed chars (the default) when 754 ** converting them to unsigned integer values. We must cast our char to 755 ** unsigned char before assigning it to prevent this from happening. 756 */ 757 while( 0 != (c = (unsigned char) *str++ )) 758 AddCharToLexer( lexer, c ); 759 } 760 761 /* 762 No longer attempts to insert missing ';' for unknown 763 enitities unless one was present already, since this 764 gives unexpected results. 765 766 For example: <a href="something.htm?foo&bar&fred"> 767 was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;"> 768 rather than: <a href="something.htm?foo&amp;bar&amp;fred"> 769 770 My thanks for Maurice Buxton for spotting this. 771 772 Also Randy Waki pointed out the following case for the 773 04 Aug 00 version (bug #433012): 774 775 For example: <a href="something.htm?id=1&lang=en"> 776 was tidied to: <a href="something.htm?id=1&lang;=en"> 777 rather than: <a href="something.htm?id=1&amp;lang=en"> 778 779 where "lang" is a known entity (#9001), but browsers would 780 misinterpret "&lang;" because it had a value > 256. 781 782 So the case of an apparently known entity with a value > 256 and 783 missing a semicolon is handled specially. 784 785 "ParseEntity" is also a bit of a misnomer - it handles entities and 786 numeric character references. Invalid NCR's are now reported. 787 */ 788 static void ParseEntity( TidyDocImpl* doc, int mode ) 789 { 790 uint start; 791 Bool first = yes, semicolon = no, found = no; 792 Bool isXml = cfgBool( doc, TidyXmlTags ); 793 uint c, ch, startcol, entver = 0; 794 Lexer* lexer = doc->lexer; 795 796 start = lexer->lexsize - 1; /* to start at "&" */ 797 startcol = doc->docIn->curcol - 1; 798 799 while ( (c = ReadChar(doc->docIn)) != EndOfStream ) 800 { 801 if ( c == ';' ) 802 { 803 semicolon = yes; 804 break; 805 } 806 807 if (first && c == '#') 808 { 809 #if SUPPORT_ASIAN_ENCODINGS 810 if ( !cfgBool(doc, TidyNCR) || 811 cfg(doc, TidyInCharEncoding) == BIG5 || 812 cfg(doc, TidyInCharEncoding) == SHIFTJIS ) 813 { 814 UngetChar('#', doc->docIn); 815 return; 816 } 817 #endif 818 AddCharToLexer( lexer, c ); 819 first = no; 820 continue; 821 } 822 823 first = no; 824 825 if ( IsNamechar(c) ) 826 { 827 AddCharToLexer( lexer, c ); 828 continue; 829 } 830 831 /* otherwise put it back */ 832 833 UngetChar( c, doc->docIn ); 834 break; 835 } 836 837 /* make sure entity is NULL terminated */ 838 lexer->lexbuf[lexer->lexsize] = '\0'; 839 840 /* Should contrain version to XML/XHTML if &apos; 841 ** is encountered. But this is not possible with 842 ** Tidy's content model bit mask. 843 */ 844 if ( tmbstrcmp(lexer->lexbuf+start, "&apos") == 0 845 && !cfgBool(doc, TidyXmlOut) 846 && !lexer->isvoyager 847 && !cfgBool(doc, TidyXhtmlOut) ) 848 ReportEntityError( doc, APOS_UNDEFINED, lexer->lexbuf+start, 39 ); 849 850 /* Lookup entity code and version 851 */ 852 found = EntityInfo( lexer->lexbuf+start, isXml, &ch, &entver ); 853 854 /* deal with unrecognized or invalid entities */ 855 /* #433012 - fix by Randy Waki 17 Feb 01 */ 856 /* report invalid NCR's - Terry Teague 01 Sep 01 */ 857 if ( !found || (ch >= 128 && ch <= 159) || (ch >= 256 && c != ';') ) 858 { 859 /* set error position just before offending character */ 860 lexer->lines = doc->docIn->curline; 861 lexer->columns = startcol; 862 863 if (lexer->lexsize > start + 1) 864 { 865 if (ch >= 128 && ch <= 159) 866 { 867 /* invalid numeric character reference */ 868 869 uint c1 = 0; 870 int replaceMode = DISCARDED_CHAR; 871 872 if ( ReplacementCharEncoding == WIN1252 ) 873 c1 = DecodeWin1252( ch ); 874 else if ( ReplacementCharEncoding == MACROMAN ) 875 c1 = DecodeMacRoman( ch ); 876 877 if ( c1 ) 878 replaceMode = REPLACED_CHAR; 879 880 if ( c != ';' ) /* issue warning if not terminated by ';' */ 881 ReportEntityError( doc, MISSING_SEMICOLON_NCR, 882 lexer->lexbuf+start, c ); 883 884 ReportEncodingError(doc, INVALID_NCR, ch, replaceMode == DISCARDED_CHAR); 885 886 if ( c1 ) 887 { 888 /* make the replacement */ 889 lexer->lexsize = start; 890 AddCharToLexer( lexer, c1 ); 891 semicolon = no; 892 } 893 else 894 { 895 /* discard */ 896 lexer->lexsize = start; 897 semicolon = no; 898 } 899 900 } 901 else 902 ReportEntityError( doc, UNKNOWN_ENTITY, 903 lexer->lexbuf+start, ch ); 904 905 if (semicolon) 906 AddCharToLexer( lexer, ';' ); 907 } 908 else /* naked & */ 909 ReportEntityError( doc, UNESCAPED_AMPERSAND, 910 lexer->lexbuf+start, ch ); 911 } 912 else 913 { 914 if ( c != ';' ) /* issue warning if not terminated by ';' */ 915 { 916 /* set error position just before offending chararcter */ 917 lexer->lines = doc->docIn->curline; 918 lexer->columns = startcol; 919 ReportEntityError( doc, MISSING_SEMICOLON, lexer->lexbuf+start, c ); 920 } 921 922 lexer->lexsize = start; 923 if ( ch == 160 && (mode & Preformatted) ) 924 ch = ' '; 925 AddCharToLexer( lexer, ch ); 926 927 if ( ch == '&' && !cfgBool(doc, TidyQuoteAmpersand) ) 928 AddStringToLexer( lexer, "amp;" ); 929 930 /* Detect extended vs. basic entities */ 931 ConstrainVersion( doc, entver ); 932 } 933 } 934 935 static tmbchar ParseTagName( TidyDocImpl* doc ) 936 { 937 Lexer *lexer = doc->lexer; 938 uint c = lexer->lexbuf[ lexer->txtstart ]; 939 Bool xml = cfgBool(doc, TidyXmlTags); 940 941 /* fold case of first character in buffer */ 942 if (!xml && IsUpper(c)) 943 lexer->lexbuf[lexer->txtstart] = (tmbchar) ToLower(c); 944 945 while ((c = ReadChar(doc->docIn)) != EndOfStream) 946 { 947 if ((!xml && !IsNamechar(c)) || 948 (xml && !IsXMLNamechar(c))) 949 break; 950 951 /* fold case of subsequent characters */ 952 if (!xml && IsUpper(c)) 953 c = ToLower(c); 954 955 AddCharToLexer(lexer, c); 956 } 957 958 lexer->txtend = lexer->lexsize; 959 return (tmbchar) c; 960 } 961 962 /* 963 Used for elements and text nodes 964 element name is NULL for text nodes 965 start and end are offsets into lexbuf 966 which contains the textual content of 967 all elements in the parse tree. 968 969 parent and content allow traversal 970 of the parse tree in any direction. 971 attributes are represented as a linked 972 list of AttVal nodes which hold the 973 strings for attribute/value pairs. 974 */ 975 976 977 Node *NewNode(Lexer *lexer) 978 { 979 Node* node = (Node*) MemAlloc( sizeof(Node) ); 980 ClearMemory( node, sizeof(Node) ); 981 if ( lexer ) 982 { 983 node->line = lexer->lines; 984 node->column = lexer->columns; 985 } 986 node->type = TextNode; 987 return node; 988 } 989 990 /* used to clone heading nodes when split by an <HR> */ 991 Node *CloneNode( TidyDocImpl* doc, Node *element ) 992 { 993 Lexer* lexer = doc->lexer; 994 Node *node = NewNode( lexer ); 995 996 node->start = lexer->lexsize; 997 node->end = lexer->lexsize; 998 999 if ( element ) 1000 { 1001 node->parent = element->parent; 1002 node->type = element->type; 1003 node->closed = element->closed; 1004 node->implicit = element->implicit; 1005 node->tag = element->tag; 1006 node->element = tmbstrdup( element->element ); 1007 node->attributes = DupAttrs( doc, element->attributes ); 1008 } 1009 return node; 1010 } 1011 1012 /* free node's attributes */ 1013 void FreeAttrs( TidyDocImpl* doc, Node *node ) 1014 { 1015 1016 while ( node->attributes ) 1017 { 1018 AttVal *av = node->attributes; 1019 1020 if ( av->attribute ) 1021 { 1022 if ( (attrIsID(av) || attrIsNAME(av)) && 1023 IsAnchorElement(doc, node) ) 1024 { 1025 RemoveAnchorByNode( doc, node ); 1026 } 1027 } 1028 1029 node->attributes = av->next; 1030 FreeAttribute( doc, av ); 1031 } 1032 } 1033 1034 /* doesn't repair attribute list linkage */ 1035 void FreeAttribute( TidyDocImpl* doc, AttVal *av ) 1036 { 1037 FreeNode( doc, av->asp ); 1038 FreeNode( doc, av->php ); 1039 MemFree( av->attribute ); 1040 MemFree( av->value ); 1041 MemFree( av ); 1042 } 1043 1044 /* detach attribute from node 1045 */ 1046 void DetachAttribute( Node *node, AttVal *attr ) 1047 { 1048 AttVal *av, *prev = NULL; 1049 1050 for ( av = node->attributes; av; av = av->next ) 1051 { 1052 if ( av == attr ) 1053 { 1054 if ( prev ) 1055 prev->next = attr->next; 1056 else 1057 node->attributes = attr->next; 1058 break; 1059 } 1060 prev = av; 1061 } 1062 } 1063 1064 /* detach attribute from node then free it 1065 */ 1066 void RemoveAttribute( TidyDocImpl* doc, Node *node, AttVal *attr ) 1067 { 1068 DetachAttribute( node, attr ); 1069 FreeAttribute( doc, attr ); 1070 } 1071 1072 /* 1073 Free document nodes by iterating through peers and recursing 1074 through children. Set next to NULL before calling FreeNode() 1075 to avoid freeing peer nodes. Doesn't patch up prev/next links. 1076 */ 1077 void FreeNode( TidyDocImpl* doc, Node *node ) 1078 { 1079 while ( node ) 1080 { 1081 Node* next = node->next; 1082 1083 FreeAttrs( doc, node ); 1084 FreeNode( doc, node->content ); 1085 MemFree( node->element ); 1086 #ifdef TIDY_STORE_ORIGINAL_TEXT 1087 if (node->otext) 1088 MemFree(node->otext); 1089 #endif 1090 if (RootNode != node->type) 1091 MemFree( node ); 1092 else 1093 node->content = NULL; 1094 1095 node = next; 1096 } 1097 } 1098 1099 #ifdef TIDY_STORE_ORIGINAL_TEXT 1100 void StoreOriginalTextInToken(TidyDocImpl* doc, Node* node, uint count) 1101 { 1102 if (!doc->storeText) 1103 return; 1104 1105 if (count >= doc->docIn->otextlen) 1106 return; 1107 1108 if (!doc->docIn->otextsize) 1109 return; 1110 1111 if (count == 0) 1112 { 1113 node->otext = doc->docIn->otextbuf; 1114 doc->docIn->otextbuf = NULL; 1115 doc->docIn->otextlen = 0; 1116 doc->docIn->otextsize = 0; 1117 } 1118 else 1119 { 1120 uint len = doc->docIn->otextlen; 1121 tmbstr buf1 = (tmbstr)MemAlloc(len - count + 1); 1122 tmbstr buf2 = (tmbstr)MemAlloc(count + 1); 1123 uint i, j; 1124 1125 /* strncpy? */ 1126 1127 for (i = 0; i < len - count; ++i) 1128 buf1[i] = doc->docIn->otextbuf[i]; 1129 1130 buf1[i] = 0; 1131 1132 for (j = 0; j + i < len; ++j) 1133 buf2[j] = doc->docIn->otextbuf[j + i]; 1134 1135 buf2[j] = 0; 1136 1137 MemFree(doc->docIn->otextbuf); 1138 node->otext = buf1; 1139 doc->docIn->otextbuf = buf2; 1140 doc->docIn->otextlen = count; 1141 doc->docIn->otextsize = count + 1; 1142 } 1143 } 1144 #endif 1145 1146 Node* TextToken( Lexer *lexer ) 1147 { 1148 Node *node = NewNode( lexer ); 1149 node->start = lexer->txtstart; 1150 node->end = lexer->txtend; 1151 return node; 1152 } 1153 1154 /* used for creating preformatted text from Word2000 */ 1155 Node *NewLineNode( Lexer *lexer ) 1156 { 1157 Node *node = NewNode( lexer ); 1158 node->start = lexer->lexsize; 1159 AddCharToLexer( lexer, (uint)'\n' ); 1160 node->end = lexer->lexsize; 1161 return node; 1162 } 1163 1164 /* used for adding a &nbsp; for Word2000 */ 1165 Node* NewLiteralTextNode( Lexer *lexer, ctmbstr txt ) 1166 { 1167 Node *node = NewNode( lexer ); 1168 node->start = lexer->lexsize; 1169 AddStringToLexer( lexer, txt ); 1170 node->end = lexer->lexsize; 1171 return node; 1172 } 1173 1174 static Node* TagToken( TidyDocImpl* doc, NodeType type ) 1175 { 1176 Lexer* lexer = doc->lexer; 1177 Node* node = NewNode( lexer ); 1178 node->type = type; 1179 node->element = tmbstrndup( lexer->lexbuf + lexer->txtstart, 1180 lexer->txtend - lexer->txtstart ); 1181 node->start = lexer->txtstart; 1182 node->end = lexer->txtstart; 1183 1184 if ( type == StartTag || type == StartEndTag || type == EndTag ) 1185 FindTag(doc, node); 1186 1187 return node; 1188 } 1189 1190 static Node* NewToken(TidyDocImpl* doc, NodeType type) 1191 { 1192 Lexer* lexer = doc->lexer; 1193 Node* node = NewNode(lexer); 1194 node->type = type; 1195 node->start = lexer->txtstart; 1196 node->end = lexer->txtend; 1197 #ifdef TIDY_STORE_ORIGINAL_TEXT 1198 StoreOriginalTextInToken(doc, node, 0); 1199 #endif 1200 return node; 1201 } 1202 1203 #define CommentToken(doc) NewToken(doc, CommentTag) 1204 #define DocTypeToken(doc) NewToken(doc, DocTypeTag) 1205 #define PIToken(doc) NewToken(doc, ProcInsTag) 1206 #define AspToken(doc) NewToken(doc, AspTag) 1207 #define JsteToken(doc) NewToken(doc, JsteTag) 1208 #define PhpToken(doc) NewToken(doc, PhpTag) 1209 #define XmlDeclToken(doc) NewToken(doc, XmlDecl) 1210 #define SectionToken(doc) NewToken(doc, SectionTag) 1211 #define CDATAToken(doc) NewToken(doc, CDATATag) 1212 1213 void AddStringLiteral( Lexer* lexer, ctmbstr str ) 1214 { 1215 byte c; 1216 while(0 != (c = *str++) ) 1217 AddCharToLexer( lexer, c ); 1218 } 1219 1220 void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ) 1221 { 1222 byte c; 1223 int ix; 1224 1225 for ( ix=0; ix < len && (c = *str++); ++ix ) 1226 AddCharToLexer(lexer, c); 1227 } 1228 1229 /* find doctype element */ 1230 Node *FindDocType( TidyDocImpl* doc ) 1231 { 1232 Node* node; 1233 for ( node = (doc ? doc->root.content : NULL); 1234 node && node->type != DocTypeTag; 1235 node = node->next ) 1236 /**/; 1237 return node; 1238 } 1239 1240 /* find parent container element */ 1241 Node* FindContainer( Node* node ) 1242 { 1243 for ( node = (node ? node->parent : NULL); 1244 node && nodeHasCM(node, CM_INLINE); 1245 node = node->parent ) 1246 /**/; 1247 1248 return node; 1249 } 1250 1251 1252 /* find html element */ 1253 Node *FindHTML( TidyDocImpl* doc ) 1254 { 1255 Node *node; 1256 for ( node = (doc ? doc->root.content : NULL); 1257 node && !nodeIsHTML(node); 1258 node = node->next ) 1259 /**/; 1260 1261 return node; 1262 } 1263 1264 /* find XML Declaration */ 1265 Node *FindXmlDecl(TidyDocImpl* doc) 1266 { 1267 Node *node; 1268 for ( node = (doc ? doc->root.content : NULL); 1269 node && !(node->type == XmlDecl); 1270 node = node->next ) 1271 /**/; 1272 1273 return node; 1274 } 1275 1276 1277 Node *FindHEAD( TidyDocImpl* doc ) 1278 { 1279 Node *node = FindHTML( doc ); 1280 1281 if ( node ) 1282 { 1283 for ( node = node->content; 1284 node && !nodeIsHEAD(node); 1285 node = node->next ) 1286 /**/; 1287 } 1288 1289 return node; 1290 } 1291 1292 Node *FindTITLE(TidyDocImpl* doc) 1293 { 1294 Node *node = FindHEAD(doc); 1295 1296 if (node) 1297 for (node = node->content; 1298 node && !nodeIsTITLE(node); 1299 node = node->next) {} 1300 1301 return node; 1302 } 1303 1304 Node *FindBody( TidyDocImpl* doc ) 1305 { 1306 Node *node = ( doc ? doc->root.content : NULL ); 1307 1308 while ( node && !nodeIsHTML(node) ) 1309 node = node->next; 1310 1311 if (node == NULL) 1312 return NULL; 1313 1314 node = node->content; 1315 while ( node && !nodeIsBODY(node) && !nodeIsFRAMESET(node) ) 1316 node = node->next; 1317 1318 if ( node && nodeIsFRAMESET(node) ) 1319 { 1320 node = node->content; 1321 while ( node && !nodeIsNOFRAMES(node) ) 1322 node = node->next; 1323 1324 if ( node ) 1325 { 1326 node = node->content; 1327 while ( node && !nodeIsBODY(node) ) 1328 node = node->next; 1329 } 1330 } 1331 1332 return node; 1333 } 1334 1335 /* add meta element for Tidy */ 1336 Bool AddGenerator( TidyDocImpl* doc ) 1337 { 1338 AttVal *attval; 1339 Node *node; 1340 Node *head = FindHEAD( doc ); 1341 tmbchar buf[256]; 1342 1343 if (head) 1344 { 1345 #ifdef PLATFORM_NAME 1346 tmbsnprintf(buf, sizeof(buf), "HTML Tidy for "PLATFORM_NAME" (vers %s), see www.w3.org", 1347 tidyReleaseDate()); 1348 #else 1349 tmbsnprintf(buf, sizeof(buf), "HTML Tidy (vers %s), see www.w3.org", tidyReleaseDate()); 1350 #endif 1351 1352 for ( node = head->content; node; node = node->next ) 1353 { 1354 if ( nodeIsMETA(node) ) 1355 { 1356 attval = AttrGetById(node, TidyAttr_NAME); 1357 1358 if (AttrValueIs(attval, "generator")) 1359 { 1360 attval = AttrGetById(node, TidyAttr_CONTENT); 1361 1362 if (AttrHasValue(attval) && 1363 tmbstrncasecmp(attval->value, "HTML Tidy", 9) == 0) 1364 { 1365 /* update the existing content to reflect the */ 1366 /* actual version of Tidy currently being used */ 1367 1368 MemFree(attval->value); 1369 attval->value = tmbstrdup(buf); 1370 return no; 1371 } 1372 } 1373 } 1374 } 1375 1376 if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 ) 1377 { 1378 node = InferredTag(doc, TidyTag_META); 1379 AddAttribute( doc, node, "name", "generator" ); 1380 AddAttribute( doc, node, "content", buf ); 1381 InsertNodeAtStart( head, node ); 1382 return yes; 1383 } 1384 } 1385 1386 return no; 1387 } 1388 1389 /* examine <!DOCTYPE> to identify version */ 1390 uint FindGivenVersion( TidyDocImpl* doc, Node* doctype ) 1391 { 1392 AttVal * fpi = GetAttrByName(doctype, "PUBLIC"); 1393 uint vers; 1394 1395 if (!fpi || !fpi->value) 1396 return VERS_UNKNOWN; 1397 1398 vers = GetVersFromFPI(fpi->value); 1399 1400 if (VERS_XHTML & vers) 1401 { 1402 SetOptionBool(doc, TidyXmlOut, yes); 1403 SetOptionBool(doc, TidyXhtmlOut, yes); 1404 doc->lexer->isvoyager = yes; 1405 } 1406 1407 /* todo: add a warning if case does not match? */ 1408 MemFree(fpi->value); 1409 fpi->value = tmbstrdup(GetFPIFromVers(vers)); 1410 1411 return vers; 1412 } 1413 1414 /* return guessed version */ 1415 uint ApparentVersion( TidyDocImpl* doc ) 1416 { 1417 if ((doc->lexer->doctype == XH11 || 1418 doc->lexer->doctype == XB10) && 1419 (doc->lexer->versions & doc->lexer->doctype)) 1420 return doc->lexer->doctype; 1421 else 1422 return HTMLVersion(doc); 1423 } 1424 1425 ctmbstr HTMLVersionNameFromCode( uint vers, Bool ARG_UNUSED(isXhtml) ) 1426 { 1427 ctmbstr name = GetNameFromVers(vers); 1428 1429 /* this test has moved to ReportMarkupVersion() in localize.c, for localization reasons */ 1430 /* 1431 if (!name) 1432 name = "HTML Proprietary"; 1433 */ 1434 1435 return name; 1436 } 1437 1438 /* Put DOCTYPE declaration between the 1439 ** <?xml version "1.0" ... ?> declaration, if any, 1440 ** and the <html> tag. Should also work for any comments, 1441 ** etc. that may precede the <html> tag. 1442 */ 1443 1444 static Node* NewDocTypeNode( TidyDocImpl* doc ) 1445 { 1446 Node* doctype = NULL; 1447 Node* html = FindHTML( doc ); 1448 1449 if ( !html ) 1450 return NULL; 1451 1452 doctype = NewNode( NULL ); 1453 doctype->type = DocTypeTag; 1454 InsertNodeBeforeElement(html, doctype); 1455 return doctype; 1456 } 1457 1458 Bool SetXHTMLDocType( TidyDocImpl* doc ) 1459 { 1460 Lexer *lexer = doc->lexer; 1461 Node *doctype = FindDocType( doc ); 1462 TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode); 1463 ctmbstr pub = "PUBLIC"; 1464 ctmbstr sys = "SYSTEM"; 1465 1466 lexer->versionEmitted = ApparentVersion( doc ); 1467 1468 if (dtmode == TidyDoctypeOmit) 1469 { 1470 if (doctype) 1471 DiscardElement(doc, doctype); 1472 return yes; 1473 } 1474 1475 if (dtmode == TidyDoctypeUser && !cfgStr(doc, TidyDoctype)) 1476 return no; 1477 1478 if (!doctype) 1479 { 1480 doctype = NewDocTypeNode(doc); 1481 doctype->element = tmbstrdup("html"); 1482 } 1483 else 1484 { 1485 doctype->element = tmbstrtolower(doctype->element); 1486 } 1487 1488 switch(dtmode) 1489 { 1490 case TidyDoctypeStrict: 1491 /* XHTML 1.0 Strict */ 1492 RepairAttrValue(doc, doctype, pub, GetFPIFromVers(X10S)); 1493 RepairAttrValue(doc, doctype, sys, GetSIFromVers(X10S)); 1494 lexer->versionEmitted = X10S; 1495 break; 1496 case TidyDoctypeLoose: 1497 /* XHTML 1.0 Transitional */ 1498 RepairAttrValue(doc, doctype, pub, GetFPIFromVers(X10T)); 1499 RepairAttrValue(doc, doctype, sys, GetSIFromVers(X10T)); 1500 lexer->versionEmitted = X10T; 1501 break; 1502 case TidyDoctypeUser: 1503 /* user defined document type declaration */ 1504 RepairAttrValue(doc, doctype, pub, cfgStr(doc, TidyDoctype)); 1505 RepairAttrValue(doc, doctype, sys, ""); 1506 break; 1507 case TidyDoctypeAuto: 1508 if (lexer->versions & XH11 && lexer->doctype == XH11) 1509 { 1510 if (!GetAttrByName(doctype, sys)) 1511 RepairAttrValue(doc, doctype, sys, GetSIFromVers(XH11)); 1512 lexer->versionEmitted = XH11; 1513 return yes; 1514 } 1515 else if (lexer->versions & XH11 && !(lexer->versions & VERS_HTML40)) 1516 { 1517 RepairAttrValue(doc, doctype, pub, GetFPIFromVers(XH11)); 1518 RepairAttrValue(doc, doctype, sys, GetSIFromVers(XH11)); 1519 lexer->versionEmitted = XH11; 1520 } 1521 else if (lexer->versions & XB10 && lexer->doctype == XB10) 1522 { 1523 if (!GetAttrByName(doctype, sys)) 1524 RepairAttrValue(doc, doctype, sys, GetSIFromVers(XB10)); 1525 lexer->versionEmitted = XB10; 1526 return yes; 1527 } 1528 else if (lexer->versions & VERS_HTML40_STRICT) 1529 { 1530 RepairAttrValue(doc, doctype, pub, GetFPIFromVers(X10S)); 1531 RepairAttrValue(doc, doctype, sys, GetSIFromVers(X10S)); 1532 lexer->versionEmitted = X10S; 1533 } 1534 else if (lexer->versions & VERS_FRAMESET) 1535 { 1536 RepairAttrValue(doc, doctype, pub, GetFPIFromVers(X10F)); 1537 RepairAttrValue(doc, doctype, sys, GetSIFromVers(X10F)); 1538 lexer->versionEmitted = X10F; 1539 } 1540 else if (lexer->versions & VERS_LOOSE) 1541 { 1542 RepairAttrValue(doc, doctype, pub, GetFPIFromVers(X10T)); 1543 RepairAttrValue(doc, doctype, sys, GetSIFromVers(X10T)); 1544 lexer->versionEmitted = X10T; 1545 } 1546 else 1547 { 1548 if (doctype) 1549 DiscardElement(doc, doctype); 1550 return no; 1551 } 1552 break; 1553 } 1554 1555 return no; 1556 } 1557 1558 /* fixup doctype if missing */ 1559 Bool FixDocType( TidyDocImpl* doc ) 1560 { 1561 Lexer* lexer = doc->lexer; 1562 Node* doctype = FindDocType( doc ); 1563 uint dtmode = cfg( doc, TidyDoctypeMode ); 1564 uint guessed = VERS_UNKNOWN; 1565 Bool hadSI = no; 1566 1567 if (dtmode == TidyDoctypeAuto && 1568 lexer->versions & lexer->doctype && 1569 !(VERS_XHTML & lexer->doctype && !lexer->isvoyager) 1570 && FindDocType(doc)) 1571 { 1572 lexer->versionEmitted = lexer->doctype; 1573 return yes; 1574 } 1575 1576 if (dtmode == TidyDoctypeOmit) 1577 { 1578 if (doctype) 1579 DiscardElement( doc, doctype ); 1580 lexer->versionEmitted = ApparentVersion( doc ); 1581 return yes; 1582 } 1583 1584 if (cfgBool(doc, TidyXmlOut)) 1585 return yes; 1586 1587 if (doctype) 1588 hadSI = GetAttrByName(doctype, "SYSTEM") != NULL; 1589 1590 if ((dtmode == TidyDoctypeStrict || 1591 dtmode == TidyDoctypeLoose) && doctype) 1592 { 1593 DiscardElement(doc, doctype); 1594 doctype = NULL; 1595 } 1596 1597 switch (dtmode) 1598 { 1599 case TidyDoctypeStrict: 1600 guessed = H41S; 1601 break; 1602 case TidyDoctypeLoose: 1603 guessed = H41T; 1604 break; 1605 case TidyDoctypeAuto: 1606 guessed = HTMLVersion(doc); 1607 break; 1608 } 1609 1610 lexer->versionEmitted = guessed; 1611 if (guessed == VERS_UNKNOWN) 1612 return no; 1613 1614 if (doctype) 1615 { 1616 doctype->element = tmbstrtolower(doctype->element); 1617 } 1618 else 1619 { 1620 doctype = NewDocTypeNode(doc); 1621 doctype->element = tmbstrdup("html"); 1622 } 1623 1624 RepairAttrValue(doc, doctype, "PUBLIC", GetFPIFromVers(guessed)); 1625 1626 if (hadSI) 1627 RepairAttrValue(doc, doctype, "SYSTEM", GetSIFromVers(guessed)); 1628 1629 return yes; 1630 } 1631 1632 /* ensure XML document starts with <?xml version="1.0"?> */ 1633 /* add encoding attribute if not using ASCII or UTF-8 output */ 1634 Bool FixXmlDecl( TidyDocImpl* doc ) 1635 { 1636 Node* xml; 1637 AttVal *version, *encoding; 1638 Lexer*lexer = doc->lexer; 1639 Node* root = &doc->root; 1640 1641 if ( root->content && root->content->type == XmlDecl ) 1642 { 1643 xml = root->content; 1644 } 1645 else 1646 { 1647 xml = NewNode(lexer); 1648 xml->type = XmlDecl; 1649 if ( root->content ) 1650 InsertNodeBeforeElement(root->content, xml); 1651 else 1652 root->content = xml; 1653 } 1654 1655 version = GetAttrByName(xml, "version"); 1656 encoding = GetAttrByName(xml, "encoding"); 1657 1658 /* 1659 We need to insert a check if declared encoding 1660 and output encoding mismatch and fix the XML 1661 declaration accordingly!!! 1662 */ 1663 1664 if ( encoding == NULL && cfg(doc, TidyOutCharEncoding) != UTF8 ) 1665 { 1666 ctmbstr enc = GetEncodingNameFromTidyId(cfg(doc, TidyOutCharEncoding)); 1667 if ( enc ) 1668 AddAttribute( doc, xml, "encoding", enc ); 1669 } 1670 1671 if ( version == NULL ) 1672 AddAttribute( doc, xml, "version", "1.0" ); 1673 return yes; 1674 } 1675 1676 Node* InferredTag(TidyDocImpl* doc, TidyTagId id) 1677 { 1678 Lexer *lexer = doc->lexer; 1679 Node *node = NewNode( lexer ); 1680 const Dict* dict = LookupTagDef(id); 1681 1682 assert( dict != NULL ); 1683 1684 node->type = StartTag; 1685 node->implicit = yes; 1686 node->element = tmbstrdup(dict->name); 1687 node->tag = dict; 1688 node->start = lexer->txtstart; 1689 node->end = lexer->txtend; 1690 1691 return node; 1692 } 1693 1694 Bool ExpectsContent(Node *node) 1695 { 1696 if (node->type != StartTag) 1697 return no; 1698 1699 /* unknown element? */ 1700 if (node->tag == NULL) 1701 return yes; 1702 1703 if (node->tag->model & CM_EMPTY) 1704 return no; 1705 1706 return yes; 1707 } 1708 1709 /* 1710 create a text node for the contents of 1711 a CDATA element like style or script 1712 which ends with </foo> for some foo. 1713 */ 1714 1715 #define CDATA_INTERMEDIATE 1 1716 #define CDATA_STARTTAG 2 1717 #define CDATA_ENDTAG 3 1718 1719 Node *GetCDATA( TidyDocImpl* doc, Node *container ) 1720 { 1721 Lexer* lexer = doc->lexer; 1722 uint start = 0; 1723 int nested = 0; 1724 int state = CDATA_INTERMEDIATE; 1725 uint i; 1726 Bool isEmpty = yes; 1727 Bool matches = no; 1728 uint c; 1729 Bool hasSrc = AttrGetById(container, TidyAttr_SRC) != NULL; 1730 1731 lexer->lines = doc->docIn->curline; 1732 lexer->columns = doc->docIn->curcol; 1733 lexer->waswhite = no; 1734 lexer->txtstart = lexer->txtend = lexer->lexsize; 1735 1736 /* seen start tag, look for matching end tag */ 1737 while ((c = ReadChar(doc->docIn)) != EndOfStream) 1738 { 1739 AddCharToLexer(lexer, c); 1740 lexer->txtend = lexer->lexsize; 1741 1742 if (state == CDATA_INTERMEDIATE) 1743 { 1744 if (c != '<') 1745 { 1746 if (isEmpty && !IsWhite(c)) 1747 isEmpty = no; 1748 continue; 1749 } 1750 1751 c = ReadChar(doc->docIn); 1752 1753 if (IsLetter(c)) 1754 { 1755 /* <head><script src=foo><meta name=foo content=bar>*/ 1756 if (hasSrc && isEmpty && nodeIsSCRIPT(container)) 1757 { 1758 /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */ 1759 lexer->lexsize = lexer->txtstart; 1760 UngetChar(c, doc->docIn); 1761 UngetChar('<', doc->docIn); 1762 return NULL; 1763 } 1764 AddCharToLexer(lexer, c); 1765 start = lexer->lexsize - 1; 1766 state = CDATA_STARTTAG; 1767 } 1768 else if (c == '/') 1769 { 1770 AddCharToLexer(lexer, c); 1771 1772 c = ReadChar(doc->docIn); 1773 1774 if (!IsLetter(c)) 1775 { 1776 UngetChar(c, doc->docIn); 1777 continue; 1778 } 1779 UngetChar(c, doc->docIn); 1780 1781 start = lexer->lexsize; 1782 state = CDATA_ENDTAG; 1783 } 1784 else if (c == '\\') 1785 { 1786 /* recognize document.write("<script><\/script>") */ 1787 AddCharToLexer(lexer, c); 1788 1789 c = ReadChar(doc->docIn); 1790 1791 if (c != '/') 1792 { 1793 UngetChar(c, doc->docIn); 1794 continue; 1795 } 1796 1797 AddCharToLexer(lexer, c); 1798 c = ReadChar(doc->docIn); 1799 1800 if (!IsLetter(c)) 1801 { 1802 UngetChar(c, doc->docIn); 1803 continue; 1804 } 1805 UngetChar(c, doc->docIn); 1806 1807 start = lexer->lexsize; 1808 state = CDATA_ENDTAG; 1809 } 1810 else 1811 { 1812 UngetChar(c, doc->docIn); 1813 } 1814 } 1815 /* '<' + Letter found */ 1816 else if (state == CDATA_STARTTAG) 1817 { 1818 if (IsLetter(c)) 1819 continue; 1820 1821 matches = tmbstrncasecmp(container->element, lexer->lexbuf + start, 1822 tmbstrlen(container->element)) == 0; 1823 if (matches) 1824 nested++; 1825 1826 state = CDATA_INTERMEDIATE; 1827 } 1828 /* '<' + '/' + Letter found */ 1829 else if (state == CDATA_ENDTAG) 1830 { 1831 if (IsLetter(c)) 1832 continue; 1833 1834 matches = tmbstrncasecmp(container->element, lexer->lexbuf + start, 1835 tmbstrlen(container->element)) == 0; 1836 1837 if (isEmpty && !matches) 1838 { 1839 /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */ 1840 1841 for (i = lexer->lexsize - 1; i >= start; --i) 1842 UngetChar((uint)lexer->lexbuf[i], doc->docIn); 1843 UngetChar('/', doc->docIn); 1844 UngetChar('<', doc->docIn); 1845 break; 1846 } 1847 1848 if (matches && nested-- <= 0) 1849 { 1850 for (i = lexer->lexsize - 1; i >= start; --i) 1851 UngetChar((uint)lexer->lexbuf[i], doc->docIn); 1852 UngetChar('/', doc->docIn); 1853 UngetChar('<', doc->docIn); 1854 lexer->lexsize -= (lexer->lexsize - start) + 2; 1855 break; 1856 } 1857 else if (lexer->lexbuf[start - 2] != '\\') 1858 { 1859 /* if the end tag is not already escaped using backslash */ 1860 lexer->lines = doc->docIn->curline; 1861 lexer->columns = doc->docIn->curcol - 3; 1862 ReportError(doc, NULL, NULL, BAD_CDATA_CONTENT); 1863 1864 /* if javascript insert backslash before / */ 1865 if (IsJavaScript(container)) 1866 { 1867 for (i = lexer->lexsize; i > start-1; --i) 1868 lexer->lexbuf[i] = lexer->lexbuf[i-1]; 1869 1870 lexer->lexbuf[start-1] = '\\'; 1871 lexer->lexsize++; 1872 } 1873 } 1874 state = CDATA_INTERMEDIATE; 1875 } 1876 } 1877 if (isEmpty) 1878 lexer->lexsize = lexer->txtstart = lexer->txtend; 1879 else 1880 lexer->txtend = lexer->lexsize; 1881 1882 if (c == EndOfStream) 1883 ReportError(doc, container, NULL, MISSING_ENDTAG_FOR ); 1884 1885 /* if (lexer->txtend > lexer->txtstart) */ 1886 return TextToken(lexer); 1887 1888 return NULL; 1889 } 1890 1891 void UngetToken( TidyDocImpl* doc ) 1892 { 1893 doc->lexer->pushed = yes; 1894 } 1895 1896 #ifdef TIDY_STORE_ORIGINAL_TEXT 1897 #define CondReturnTextNode(doc, skip) \ 1898 if (lexer->txtend > lexer->txtstart) \ 1899 { \ 1900 lexer->token = TextToken(lexer); \ 1901 StoreOriginalTextInToken(doc, lexer->token, skip); \ 1902 return lexer->token; \ 1903 } 1904 #else 1905 #define CondReturnTextNode(doc, skip) \ 1906 if (lexer->txtend > lexer->txtstart) \ 1907 { \ 1908 lexer->token = TextToken(lexer); \ 1909 return lexer->token; \ 1910 } 1911 #endif 1912 1913 /* 1914 modes for GetToken() 1915 1916 MixedContent -- for elements which don't accept PCDATA 1917 Preformatted -- white space preserved as is 1918 IgnoreMarkup -- for CDATA elements such as script, style 1919 */ 1920 1921 Node* GetToken( TidyDocImpl* doc, uint mode ) 1922 { 1923 Lexer* lexer = doc->lexer; 1924 uint c, badcomment = 0; 1925 Bool isempty = no; 1926 AttVal *attributes = NULL; 1927 1928 if (lexer->pushed) 1929 { 1930 /* duplicate inlines in preference to pushed text nodes when appropriate */ 1931 if (lexer->token->type != TextNode || (!lexer->insert && !lexer->inode)) 1932 { 1933 lexer->pushed = no; 1934 return lexer->token; 1935 } 1936 } 1937 1938 /* at start of block elements, unclosed inline 1939 elements are inserted into the token stream */ 1940 1941 if (lexer->insert || lexer->inode) 1942 { 1943 if (lexer->pushed) 1944 { 1945 lexer->pushed = no; 1946 FreeNode( doc, lexer->token ); 1947 } 1948 return lexer->token = InsertedToken( doc ); 1949 } 1950 1951 if (mode == CdataContent) 1952 { 1953 assert( lexer->parent != NULL ); 1954 if (lexer->pushed) 1955 { 1956 lexer->pushed = no; 1957 FreeNode( doc, lexer->token ); 1958 } 1959 return lexer->token = GetCDATA(doc, lexer->parent); 1960 } 1961 1962 lexer->lines = doc->docIn->curline; 1963 lexer->columns = doc->docIn->curcol; 1964 lexer->waswhite = no; 1965 1966 lexer->txtstart = lexer->txtend = lexer->lexsize; 1967 1968 while ((c = ReadChar(doc->docIn)) != EndOfStream) 1969 { 1970 if (lexer->insertspace && !(mode & IgnoreWhitespace)) 1971 { 1972 AddCharToLexer(lexer, ' '); 1973 lexer->waswhite = yes; 1974 lexer->insertspace = no; 1975 } 1976 1977 if (c == 160 && (mode & Preformatted)) 1978 c = ' '; 1979 1980 AddCharToLexer(lexer, c); 1981 1982 switch (lexer->state) 1983 { 1984 case LEX_CONTENT: /* element content */ 1985 1986 /* 1987 Discard white space if appropriate. Its cheaper 1988 to do this here rather than in parser methods 1989 for elements that don't have mixed content. 1990 */ 1991 if (IsWhite(c) && (mode == IgnoreWhitespace) 1992 && lexer->lexsize == lexer->txtstart + 1) 1993 { 1994 --(lexer->lexsize); 1995 lexer->waswhite = no; 1996 lexer->lines = doc->docIn->curline; 1997 lexer->columns = doc->docIn->curcol; 1998 continue; 1999 } 2000 2001 if (c == '<') 2002 { 2003 lexer->state = LEX_GT; 2004 continue; 2005 } 2006 2007 if (IsWhite(c)) 2008 { 2009 /* was previous character white? */ 2010 if (lexer->waswhite) 2011 { 2012 if (mode != Preformatted && mode != IgnoreMarkup) 2013 { 2014 --(lexer->lexsize); 2015 lexer->lines = doc->docIn->curline; 2016 lexer->columns = doc->docIn->curcol; 2017 } 2018 } 2019 else /* prev character wasn't white */ 2020 { 2021 lexer->waswhite = yes; 2022 2023 if (mode != Preformatted && mode != IgnoreMarkup && c != ' ') 2024 ChangeChar(lexer, ' '); 2025 } 2026 2027 continue; 2028 } 2029 else if (c == '&' && mode != IgnoreMarkup) 2030 ParseEntity( doc, mode ); 2031 2032 /* this is needed to avoid trimming trailing whitespace */ 2033 if (mode == IgnoreWhitespace) 2034 mode = MixedContent; 2035 2036 lexer->waswhite = no; 2037 continue; 2038 2039 case LEX_GT: /* < */ 2040 2041 /* check for endtag */ 2042 if (c == '/') 2043 { 2044 if ((c = ReadChar(doc->docIn)) == EndOfStream) 2045 { 2046 UngetChar(c, doc->docIn); 2047 continue; 2048 } 2049 2050 AddCharToLexer(lexer, c); 2051 2052 if (IsLetter(c)) 2053 { 2054 lexer->lexsize -= 3; 2055 lexer->txtend = lexer->lexsize; 2056 UngetChar(c, doc->docIn); 2057 lexer->state = LEX_ENDTAG; 2058 lexer->lexbuf[lexer->lexsize] = '\0'; /* debug */ 2059 doc->docIn->curcol -= 2; 2060 2061 /* if some text before the </ return it now */ 2062 if (lexer->txtend > lexer->txtstart) 2063 { 2064 /* trim space character before end tag */ 2065 if (mode == IgnoreWhitespace && lexer->lexbuf[lexer->lexsize - 1] == ' ') 2066 { 2067 lexer->lexsize -= 1; 2068 lexer->txtend = lexer->lexsize; 2069 } 2070 lexer->token = TextToken(lexer); 2071 #ifdef TIDY_STORE_ORIGINAL_TEXT 2072 StoreOriginalTextInToken(doc, lexer->token, 3); 2073 #endif 2074 return lexer->token; 2075 } 2076 2077 continue; /* no text so keep going */ 2078 } 2079 2080 /* otherwise treat as CDATA */ 2081 lexer->waswhite = no; 2082 lexer->state = LEX_CONTENT; 2083 continue; 2084 } 2085 2086 if (mode == IgnoreMarkup) 2087 { 2088 /* otherwise treat as CDATA */ 2089 lexer->waswhite = no; 2090 lexer->state = LEX_CONTENT; 2091 continue; 2092 } 2093 2094 /* 2095 look out for comments, doctype or marked sections 2096 this isn't quite right, but its getting there ... 2097 */ 2098 if (c == '!') 2099 { 2100 c = ReadChar(doc->docIn); 2101 2102 if (c == '-') 2103 { 2104 c = ReadChar(doc->docIn); 2105 2106 if (c == '-') 2107 { 2108 lexer->state = LEX_COMMENT; /* comment */ 2109 lexer->lexsize -= 2; 2110 lexer->txtend = lexer->lexsize; 2111 2112 CondReturnTextNode(doc, 4) 2113 2114 lexer->txtstart = lexer->lexsize; 2115 continue; 2116 } 2117 2118 ReportError(doc, NULL, NULL, MALFORMED_COMMENT ); 2119 } 2120 else if (c == 'd' || c == 'D') 2121 { 2122 /* todo: check for complete "<!DOCTYPE" not just <!D */ 2123 2124 uint skip = 0; 2125 2126 lexer->state = LEX_DOCTYPE; /* doctype */ 2127 lexer->lexsize -= 2; 2128 lexer->txtend = lexer->lexsize; 2129 mode = IgnoreWhitespace; 2130 2131 /* skip until white space or '>' */ 2132 2133 for (;;) 2134 { 2135 c = ReadChar(doc->docIn); 2136 ++skip; 2137 2138 if (c == EndOfStream || c == '>') 2139 { 2140 UngetChar(c, doc->docIn); 2141 break; 2142 } 2143 2144 2145 if (!IsWhite(c)) 2146 continue; 2147 2148 /* and skip to end of whitespace */ 2149 2150 for (;;) 2151 { 2152 c = ReadChar(doc->docIn); 2153 ++skip; 2154 2155 if (c == EndOfStream || c == '>') 2156 { 2157 UngetChar(c, doc->docIn); 2158 break; 2159 } 2160 2161 2162 if (IsWhite(c)) 2163 continue; 2164 2165 UngetChar(c, doc->docIn); 2166 break; 2167 } 2168 2169 break; 2170 } 2171 2172 CondReturnTextNode(doc, (skip + 3)) 2173 2174 lexer->txtstart = lexer->lexsize; 2175 continue; 2176 } 2177 else if (c == '[') 2178 { 2179 /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */ 2180 lexer->lexsize -= 2; 2181 lexer->state = LEX_SECTION; 2182 lexer->txtend = lexer->lexsize; 2183 2184 CondReturnTextNode(doc, 2) 2185 2186 lexer->txtstart = lexer->lexsize; 2187 continue; 2188 } 2189 2190 2191 2192 /* else swallow characters up to and including next '>' */ 2193 while ((c = ReadChar(doc->docIn)) != '>') 2194 { 2195 if (c == EndOfStream) 2196 { 2197 UngetChar(c, doc->docIn); 2198 break; 2199 } 2200 } 2201 2202 lexer->lexsize -= 2; 2203 lexer->lexbuf[lexer->lexsize] = '\0'; 2204 lexer->state = LEX_CONTENT; 2205 continue; 2206 } 2207 2208 /* 2209 processing instructions 2210 */ 2211 2212 if (c == '?') 2213 { 2214 lexer->lexsize -= 2; 2215 lexer->state = LEX_PROCINSTR; 2216 lexer->txtend = lexer->lexsize; 2217 2218 CondReturnTextNode(doc, 2) 2219 2220 lexer->txtstart = lexer->lexsize; 2221 continue; 2222 } 2223 2224 /* Microsoft ASP's e.g. <% ... server-code ... %> */ 2225 if (c == '%') 2226 { 2227 lexer->lexsize -= 2; 2228 lexer->state = LEX_ASP; 2229 lexer->txtend = lexer->lexsize; 2230 2231 CondReturnTextNode(doc, 2) 2232 2233 lexer->txtstart = lexer->lexsize; 2234 continue; 2235 } 2236 2237 /* Netscapes JSTE e.g. <# ... server-code ... #> */ 2238 if (c == '#') 2239 { 2240 lexer->lexsize -= 2; 2241 lexer->state = LEX_JSTE; 2242 lexer->txtend = lexer->lexsize; 2243 2244 CondReturnTextNode(doc, 2) 2245 2246 lexer->txtstart = lexer->lexsize; 2247 continue; 2248 } 2249 2250 /* check for start tag */ 2251 if (IsLetter(c)) 2252 { 2253 UngetChar(c, doc->docIn); /* push back letter */ 2254 UngetChar('<', doc->docIn); 2255 --(doc->docIn->curcol); 2256 lexer->lexsize -= 2; /* discard "<" + letter */ 2257 lexer->txtend = lexer->lexsize; 2258 lexer->state = LEX_STARTTAG; /* ready to read tag name */ 2259 2260 CondReturnTextNode(doc, 2) 2261 2262 /* lexer->txtstart = lexer->lexsize; missing here? */ 2263 continue; /* no text so keep going */ 2264 } 2265 2266 /* fix for bug 762102 */ 2267 if (c == '&') 2268 { 2269 UngetChar(c, doc->docIn); 2270 --(lexer->lexsize); 2271 } 2272 2273 /* otherwise treat as CDATA */ 2274 lexer->state = LEX_CONTENT; 2275 lexer->waswhite = no; 2276 continue; 2277 2278 case LEX_ENDTAG: /* </letter */ 2279 lexer->txtstart = lexer->lexsize - 1; 2280 doc->docIn->curcol += 2; 2281 c = ParseTagName( doc ); 2282 lexer->token = TagToken( doc, EndTag ); /* create endtag token */ 2283 lexer->lexsize = lexer->txtend = lexer->txtstart; 2284 2285 /* skip to '>' */ 2286 while ( c != '>' && c != EndOfStream ) 2287 { 2288 c = ReadChar(doc->docIn); 2289 } 2290 2291 if (c == EndOfStream) 2292 { 2293 FreeNode( doc, lexer->token ); 2294 continue; 2295 } 2296 2297 lexer->state = LEX_CONTENT; 2298 lexer->waswhite = no; 2299 #ifdef TIDY_STORE_ORIGINAL_TEXT 2300 StoreOriginalTextInToken(doc, lexer->token, 0); /* hmm... */ 2301 #endif 2302 return lexer->token; /* the endtag token */ 2303 2304 case LEX_STARTTAG: /* first letter of tagname */ 2305 c = ReadChar(doc->docIn); 2306 ChangeChar(lexer, (tmbchar)c); 2307 lexer->txtstart = lexer->lexsize - 1; /* set txtstart to first letter */ 2308 c = ParseTagName( doc ); 2309 isempty = no; 2310 attributes = NULL; 2311 lexer->token = TagToken( doc, (isempty ? StartEndTag : StartTag) ); 2312 2313 /* parse attributes, consuming closing ">" */ 2314 if (c != '>') 2315 { 2316 if (c == '/') 2317 UngetChar(c, doc->docIn); 2318 2319 attributes = ParseAttrs( doc, &isempty ); 2320 } 2321 2322 if (isempty) 2323 lexer->token->type = StartEndTag; 2324 2325 lexer->token->attributes = attributes; 2326 lexer->lexsize = lexer->txtend = lexer->txtstart; 2327 2328 /* swallow newline following start tag */ 2329 /* special check needed for CRLF sequence */ 2330 /* this doesn't apply to empty elements */ 2331 /* nor to preformatted content that needs escaping */ 2332 2333 if ((mode != Preformatted && ExpectsContent(lexer->token)) 2334 || nodeIsBR(lexer->token) || nodeIsHR(lexer->token)) 2335 { 2336 c = ReadChar(doc->docIn); 2337 2338 if (c != '\n' && c != '\f') 2339 UngetChar(c, doc->docIn); 2340 2341 lexer->waswhite = yes; /* to swallow leading whitespace */ 2342 } 2343 else 2344 lexer->waswhite = no; 2345 2346 lexer->state = LEX_CONTENT; 2347 if (lexer->token->tag == NULL) 2348 ReportFatal( doc, NULL, lexer->token, UNKNOWN_ELEMENT ); 2349 else if ( !cfgBool(doc, TidyXmlTags) ) 2350 { 2351 Node* curr = lexer->token; 2352 ConstrainVersion( doc, curr->tag->versions ); 2353 2354 if ( curr->tag->versions & VERS_PROPRIETARY ) 2355 { 2356 if ( !cfgBool(doc, TidyMakeClean) || 2357 ( !nodeIsNOBR(curr) && !nodeIsWBR(curr) ) ) 2358 { 2359 ReportError(doc, NULL, curr, PROPRIETARY_ELEMENT ); 2360 2361 if ( nodeIsLAYER(curr) ) 2362 doc->badLayout |= USING_LAYER; 2363 else if ( nodeIsSPACER(curr) ) 2364 doc->badLayout |= USING_SPACER; 2365 else if ( nodeIsNOBR(curr) ) 2366 doc->badLayout |= USING_NOBR; 2367 } 2368 } 2369 2370 RepairDuplicateAttributes( doc, curr ); 2371 } 2372 #ifdef TIDY_STORE_ORIGINAL_TEXT 2373 StoreOriginalTextInToken(doc, lexer->token, 0); 2374 #endif 2375 return lexer->token; /* return start tag */ 2376 2377 case LEX_COMMENT: /* seen <!-- so look for --> */ 2378 2379 if (c != '-') 2380 continue; 2381 2382 c = ReadChar(doc->docIn); 2383 AddCharToLexer(lexer, c); 2384 2385 if (c != '-') 2386 continue; 2387 2388 end_comment: 2389 c = ReadChar(doc->docIn); 2390 2391 if (c == '>') 2392 { 2393 if (badcomment) 2394 ReportError(doc, NULL, NULL, MALFORMED_COMMENT ); 2395 2396 /* do not store closing -- in lexbuf */ 2397 lexer->lexsize -= 2; 2398 lexer->txtend = lexer->lexsize; 2399 lexer->lexbuf[lexer->lexsize] = '\0'; 2400 lexer->state = LEX_CONTENT; 2401 lexer->waswhite = no; 2402 lexer->token = CommentToken(doc); 2403 2404 /* now look for a line break */ 2405 2406 c = ReadChar(doc->docIn); 2407 2408 if (c == '\n') 2409 lexer->token->linebreak = yes; 2410 else 2411 UngetChar(c, doc->docIn); 2412 2413 return lexer->token; 2414 } 2415 2416 /* note position of first such error in the comment */ 2417 if (!badcomment) 2418 { 2419 lexer->lines = doc->docIn->curline; 2420 lexer->columns = doc->docIn->curcol - 3; 2421 } 2422 2423 badcomment++; 2424 2425 if ( cfgBool(doc, TidyFixComments) ) 2426 lexer->lexbuf[lexer->lexsize - 2] = '='; 2427 2428 /* if '-' then look for '>' to end the comment */ 2429 if (c == '-') 2430 { 2431 AddCharToLexer(lexer, c); 2432 goto end_comment; 2433 } 2434 2435 /* otherwise continue to look for --> */ 2436 lexer->lexbuf[lexer->lexsize - 1] = '='; 2437 2438 /* http://tidy.sf.net/bug/1266647 */ 2439 AddCharToLexer(lexer, c); 2440 2441 continue; 2442 2443 case LEX_DOCTYPE: /* seen <!d so look for '>' munging whitespace */ 2444 2445 /* use ParseDocTypeDecl() to tokenize doctype declaration */ 2446 UngetChar(c, doc->docIn); 2447 lexer->lexsize -= 1; 2448 lexer->token = ParseDocTypeDecl(doc); 2449 2450 lexer->txtend = lexer->lexsize; 2451 lexer->lexbuf[lexer->lexsize] = '\0'; 2452 lexer->state = LEX_CONTENT; 2453 lexer->waswhite = no; 2454 2455 /* make a note of the version named by the 1st doctype */ 2456 if (lexer->doctype == VERS_UNKNOWN && lexer->token && !cfgBool(doc, TidyXmlTags)) 2457 lexer->doctype = FindGivenVersion(doc, lexer->token); 2458 return lexer->token; 2459 2460 case LEX_PROCINSTR: /* seen <? so look for '>' */ 2461 /* check for PHP preprocessor instructions <?php ... ?> */ 2462 2463 if (lexer->lexsize - lexer->txtstart == 3) 2464 { 2465 if (tmbstrncmp(lexer->lexbuf + lexer->txtstart, "php", 3) == 0) 2466 { 2467 lexer->state = LEX_PHP; 2468 continue; 2469 } 2470 } 2471 2472 if (lexer->lexsize - lexer->txtstart == 4) 2473 { 2474 if (tmbstrncmp(lexer->lexbuf + lexer->txtstart, "xml", 3) == 0 && 2475 IsWhite(lexer->lexbuf[lexer->txtstart + 3])) 2476 { 2477 lexer->state = LEX_XMLDECL; 2478 attributes = NULL; 2479 continue; 2480 } 2481 } 2482 2483 if (cfgBool(doc, TidyXmlPIs) || lexer->isvoyager) /* insist on ?> as terminator */ 2484 { 2485 if (c != '?') 2486 continue; 2487 2488 /* now look for '>' */ 2489 c = ReadChar(doc->docIn); 2490 2491 if (c == EndOfStream) 2492 { 2493 ReportError(doc, NULL, NULL, UNEXPECTED_END_OF_FILE ); 2494 UngetChar(c, doc->docIn); 2495 continue; 2496 } 2497 2498 AddCharToLexer(lexer, c); 2499 } 2500 2501 2502 if (c != '>') 2503 continue; 2504 2505 lexer->lexsize -= 1; 2506 2507 if (lexer->lexsize) 2508 { 2509 uint i; 2510 Bool closed; 2511 2512 for (i = 0; i < lexer->lexsize - lexer->txtstart && 2513 !IsWhite(lexer->lexbuf[i + lexer->txtstart]); ++i) 2514 /**/; 2515 2516 closed = lexer->lexbuf[lexer->lexsize - 1] == '?'; 2517 2518 if (closed) 2519 lexer->lexsize -= 1; 2520 2521 lexer->txtstart += i; 2522 lexer->txtend = lexer->lexsize; 2523 lexer->lexbuf[lexer->lexsize] = '\0'; 2524 2525 lexer->token = PIToken(doc); 2526 lexer->token->closed = closed; 2527 lexer->token->element = tmbstrndup(lexer->lexbuf + 2528 lexer->txtstart - i, i); 2529 } 2530 else 2531 { 2532 lexer->txtend = lexer->lexsize; 2533 lexer->lexbuf[lexer->lexsize] = '\0'; 2534 lexer->token = PIToken(doc); 2535 } 2536 2537 lexer->state = LEX_CONTENT; 2538 lexer->waswhite = no; 2539 return lexer->token; 2540 2541 case LEX_ASP: /* seen <% so look for "%>" */ 2542 if (c != '%') 2543 continue; 2544 2545 /* now look for '>' */ 2546 c = ReadChar(doc->docIn); 2547 2548 2549 if (c != '>') 2550 { 2551 UngetChar(c, doc->docIn); 2552 continue; 2553 } 2554 2555 lexer->lexsize -= 1; 2556 lexer->txtend = lexer->lexsize; 2557 lexer->lexbuf[lexer->lexsize] = '\0'; 2558 lexer->state = LEX_CONTENT; 2559 lexer->waswhite = no; 2560 return lexer->token = AspToken(doc); 2561 2562 case LEX_JSTE: /* seen <# so look for "#>" */ 2563 if (c != '#') 2564 continue; 2565 2566 /* now look for '>' */ 2567 c = ReadChar(doc->docIn); 2568 2569 2570 if (c != '>') 2571 { 2572 UngetChar(c, doc->docIn); 2573 continue; 2574 } 2575 2576 lexer->lexsize -= 1; 2577 lexer->txtend = lexer->lexsize; 2578 lexer->lexbuf[lexer->lexsize] = '\0'; 2579 lexer->state = LEX_CONTENT; 2580 lexer->waswhite = no; 2581 return lexer->token = JsteToken(doc); 2582 2583 case LEX_PHP: /* seen "<?php" so look for "?>" */ 2584 if (c != '?') 2585 continue; 2586 2587 /* now look for '>' */ 2588 c = ReadChar(doc->docIn); 2589 2590 if (c != '>') 2591 { 2592 UngetChar(c, doc->docIn); 2593 continue; 2594 } 2595 2596 lexer->lexsize -= 1; 2597 lexer->txtend = lexer->lexsize; 2598 lexer->lexbuf[lexer->lexsize] = '\0'; 2599 lexer->state = LEX_CONTENT; 2600 lexer->waswhite = no; 2601 return lexer->token = PhpToken(doc); 2602 2603 case LEX_XMLDECL: /* seen "<?xml" so look for "?>" */ 2604 2605 if (IsWhite(c) && c != '?') 2606 continue; 2607 2608 /* get pseudo-attribute */ 2609 if (c != '?') 2610 { 2611 tmbstr name; 2612 Node *asp, *php; 2613 AttVal *av = NULL; 2614 int pdelim = 0; 2615 isempty = no; 2616 2617 UngetChar(c, doc->docIn); 2618 2619 name = ParseAttribute( doc, &isempty, &asp, &php ); 2620 2621 if (!name) 2622 { 2623 /* fix for http://tidy.sf.net/bug/788031 */ 2624 lexer->lexsize -= 1; 2625 lexer->txtend = lexer->txtstart; 2626 lexer->lexbuf[lexer->txtend] = '\0'; 2627 lexer->state = LEX_CONTENT; 2628 lexer->waswhite = no; 2629 lexer->token = XmlDeclToken(doc); 2630 lexer->token->attributes = attributes; 2631 return lexer->token; 2632 } 2633 2634 av = NewAttribute(); 2635 av->attribute = name; 2636 av->value = ParseValue( doc, name, yes, &isempty, &pdelim ); 2637 av->delim = pdelim; 2638 av->dict = FindAttribute( doc, av ); 2639 2640 AddAttrToList( &attributes, av ); 2641 /* continue; */ 2642 } 2643 2644 /* now look for '>' */ 2645 c = ReadChar(doc->docIn); 2646 2647 if (c != '>') 2648 { 2649 UngetChar(c, doc->docIn); 2650 continue; 2651 } 2652 lexer->lexsize -= 1; 2653 lexer->txtend = lexer->txtstart; 2654 lexer->lexbuf[lexer->txtend] = '\0'; 2655 lexer->state = LEX_CONTENT; 2656 lexer->waswhite = no; 2657 lexer->token = XmlDeclToken(doc); 2658 lexer->token->attributes = attributes; 2659 return lexer->token; 2660 2661 case LEX_SECTION: /* seen "<![" so look for "]>" */ 2662 if (c == '[') 2663 { 2664 if (lexer->lexsize == (lexer->txtstart + 6) && 2665 tmbstrncmp(lexer->lexbuf+lexer->txtstart, "CDATA[", 6) == 0) 2666 { 2667 lexer->state = LEX_CDATA; 2668 lexer->lexsize -= 6; 2669 continue; 2670 } 2671 } 2672 2673 if (c != ']') 2674 continue; 2675 2676 /* now look for '>' */ 2677 c = ReadChar(doc->docIn); 2678 2679 if (c != '>') 2680 { 2681 UngetChar(c, doc->docIn); 2682 continue; 2683 } 2684 2685 lexer->lexsize -= 1; 2686 lexer->txtend = lexer->lexsize; 2687 lexer->lexbuf[lexer->lexsize] = '\0'; 2688 lexer->state = LEX_CONTENT; 2689 lexer->waswhite = no; 2690 return lexer->token = SectionToken(doc); 2691 2692 case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */ 2693 if (c != ']') 2694 continue; 2695 2696 /* now look for ']' */ 2697 c = ReadChar(doc->docIn); 2698 2699 if (c != ']') 2700 { 2701 UngetChar(c, doc->docIn); 2702 continue; 2703 } 2704 2705 /* now look for '>' */ 2706 c = ReadChar(doc->docIn); 2707 2708 if (c != '>') 2709 { 2710 UngetChar(c, doc->docIn); 2711 continue; 2712 } 2713 2714 lexer->lexsize -= 1; 2715 lexer->txtend = lexer->lexsize; 2716 lexer->lexbuf[lexer->lexsize] = '\0'; 2717 lexer->state = LEX_CONTENT; 2718 lexer->waswhite = no; 2719 return lexer->token = CDATAToken(doc); 2720 } 2721 } 2722 2723 if (lexer->state == LEX_CONTENT) /* text string */ 2724 { 2725 lexer->txtend = lexer->lexsize; 2726 2727 if (lexer->txtend > lexer->txtstart) 2728 { 2729 UngetChar(c, doc->docIn); 2730 2731 if (lexer->lexbuf[lexer->lexsize - 1] == ' ') 2732 { 2733 lexer->lexsize -= 1; 2734 lexer->txtend = lexer->lexsize; 2735 } 2736 lexer->token = TextToken(lexer); 2737 #ifdef TIDY_STORE_ORIGINAL_TEXT 2738 StoreOriginalTextInToken(doc, lexer->token, 0); /* ? */ 2739 #endif 2740 return lexer->token; 2741 } 2742 } 2743 else if (lexer->state == LEX_COMMENT) /* comment */ 2744 { 2745 if (c == EndOfStream) 2746 ReportError(doc, NULL, NULL, MALFORMED_COMMENT ); 2747 2748 lexer->txtend = lexer->lexsize; 2749 lexer->lexbuf[lexer->lexsize] = '\0'; 2750 lexer->state = LEX_CONTENT; 2751 lexer->waswhite = no; 2752 return lexer->token = CommentToken(doc); 2753 } 2754 2755 return 0; 2756 } 2757 2758 static void MapStr( ctmbstr str, uint code ) 2759 { 2760 while ( *str ) 2761 { 2762 uint i = (byte) *str++; 2763 lexmap[i] |= code; 2764 } 2765 } 2766 2767 void InitMap(void) 2768 { 2769 MapStr("\r\n\f", newline|white); 2770 MapStr(" \t", white); 2771 MapStr("-.:_", namechar); 2772 MapStr("0123456789", digit|namechar); 2773 MapStr("abcdefghijklmnopqrstuvwxyz", lowercase|letter|namechar); 2774 MapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", uppercase|letter|namechar); 2775 } 2776 2777 /* 2778 parser for ASP within start tags 2779 2780 Some people use ASP for to customize attributes 2781 Tidy isn't really well suited to dealing with ASP 2782 This is a workaround for attributes, but won't 2783 deal with the case where the ASP is used to tailor 2784 the attribute value. Here is an example of a work 2785 around for using ASP in attribute values: 2786 2787 href='<%=rsSchool.Fields("ID").Value%>' 2788 2789 where the ASP that generates the attribute value 2790 is masked from Tidy by the quotemarks. 2791 2792 */ 2793 2794 static Node *ParseAsp( TidyDocImpl* doc ) 2795 { 2796 Lexer* lexer = doc->lexer; 2797 uint c; 2798 Node *asp = NULL; 2799 2800 lexer->txtstart = lexer->lexsize; 2801 2802 for (;;) 2803 { 2804 if ((c = ReadChar(doc->docIn)) == EndOfStream) 2805 break; 2806 2807 AddCharToLexer(lexer, c); 2808 2809 2810 if (c != '%') 2811 continue; 2812 2813 if ((c = ReadChar(doc->docIn)) == EndOfStream) 2814 break; 2815 2816 AddCharToLexer(lexer, c); 2817 2818 if (c == '>') 2819 { 2820 lexer->lexsize -= 2; 2821 break; 2822 } 2823 } 2824 2825 lexer->txtend = lexer->lexsize; 2826 if (lexer->txtend > lexer->txtstart) 2827 asp = AspToken(doc); 2828 2829 lexer->txtstart = lexer->txtend; 2830 return asp; 2831 } 2832 2833 2834 /* 2835 PHP is like ASP but is based upon XML 2836 processing instructions, e.g. <?php ... ?> 2837 */ 2838 static Node *ParsePhp( TidyDocImpl* doc ) 2839 { 2840 Lexer* lexer = doc->lexer; 2841 uint c; 2842 Node *php = NULL; 2843 2844 lexer->txtstart = lexer->lexsize; 2845 2846 for (;;) 2847 { 2848 if ((c = ReadChar(doc->docIn)) == EndOfStream) 2849 break; 2850 2851 AddCharToLexer(lexer, c); 2852 2853 2854 if (c != '?') 2855 continue; 2856 2857 if ((c = ReadChar(doc->docIn)) == EndOfStream) 2858 break; 2859 2860 AddCharToLexer(lexer, c); 2861 2862 if (c == '>') 2863 { 2864 lexer->lexsize -= 2; 2865 break; 2866 } 2867 } 2868 2869 lexer->txtend = lexer->lexsize; 2870 if (lexer->txtend > lexer->txtstart) 2871 php = PhpToken(doc); 2872 2873 lexer->txtstart = lexer->txtend; 2874 return php; 2875 } 2876 2877 /* consumes the '>' terminating start tags */ 2878 static tmbstr ParseAttribute( TidyDocImpl* doc, Bool *isempty, 2879 Node **asp, Node **php) 2880 { 2881 Lexer* lexer = doc->lexer; 2882 int start, len = 0; 2883 tmbstr attr = NULL; 2884 uint c, lastc; 2885 2886 *asp = NULL; /* clear asp pointer */ 2887 *php = NULL; /* clear php pointer */ 2888 2889 /* skip white space before the attribute */ 2890 2891 for (;;) 2892 { 2893 c = ReadChar( doc->docIn ); 2894 2895 2896 if (c == '/') 2897 { 2898 c = ReadChar( doc->docIn ); 2899 2900 if (c == '>') 2901 { 2902 *isempty = yes; 2903 return NULL; 2904 } 2905 2906 UngetChar(c, doc->docIn); 2907 c = '/'; 2908 break; 2909 } 2910 2911 if (c == '>') 2912 return NULL; 2913 2914 if (c =='<') 2915 { 2916 c = ReadChar(doc->docIn); 2917 2918 if (c == '%') 2919 { 2920 *asp = ParseAsp( doc ); 2921 return NULL; 2922 } 2923 else if (c == '?') 2924 { 2925 *php = ParsePhp( doc ); 2926 return NULL; 2927 } 2928 2929 UngetChar(c, doc->docIn); 2930 UngetChar('<', doc->docIn); 2931 ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_GT ); 2932 return NULL; 2933 } 2934 2935 if (c == '=') 2936 { 2937 ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_EQUALSIGN ); 2938 continue; 2939 } 2940 2941 if (c == '"' || c == '\'') 2942 { 2943 ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK ); 2944 continue; 2945 } 2946 2947 if (c == EndOfStream) 2948 { 2949 ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR ); 2950 UngetChar(c, doc->docIn); 2951 return NULL; 2952 } 2953 2954 2955 if (!IsWhite(c)) 2956 break; 2957 } 2958 2959 start = lexer->lexsize; 2960 lastc = c; 2961 2962 for (;;) 2963 { 2964 /* but push back '=' for parseValue() */ 2965 if (c == '=' || c == '>') 2966 { 2967 UngetChar(c, doc->docIn); 2968 break; 2969 } 2970 2971 if (c == '<' || c == EndOfStream) 2972 { 2973 UngetChar(c, doc->docIn); 2974 break; 2975 } 2976 2977 if (lastc == '-' && (c == '"' || c == '\'')) 2978 { 2979 lexer->lexsize--; 2980 --len; 2981 UngetChar(c, doc->docIn); 2982 break; 2983 } 2984 2985 if (IsWhite(c)) 2986 break; 2987 2988 /* what should be done about non-namechar characters? */ 2989 /* currently these are incorporated into the attr name */ 2990 2991 if ( !cfgBool(doc, TidyXmlTags) && IsUpper(c) ) 2992 c = ToLower(c); 2993 2994 AddCharToLexer( lexer, c ); 2995 lastc = c; 2996 c = ReadChar(doc->docIn); 2997 } 2998 2999 /* handle attribute names with multibyte chars */ 3000 len = lexer->lexsize - start; 3001 attr = (len > 0 ? tmbstrndup(lexer->lexbuf+start, len) : NULL); 3002 lexer->lexsize = start; 3003 return attr; 3004 } 3005 3006 /* 3007 invoked when < is seen in place of attribute value 3008 but terminates on whitespace if not ASP, PHP or Tango 3009 this routine recognizes ' and " quoted strings 3010 */ 3011 static int ParseServerInstruction( TidyDocImpl* doc ) 3012 { 3013 Lexer* lexer = doc->lexer; 3014 uint c; 3015 int delim = '"'; 3016 Bool isrule = no; 3017 3018 c = ReadChar(doc->docIn); 3019 AddCharToLexer(lexer, c); 3020 3021 /* check for ASP, PHP or Tango */ 3022 if (c == '%' || c == '?' || c == '@') 3023 isrule = yes; 3024 3025 for (;;) 3026 { 3027 c = ReadChar(doc->docIn); 3028 3029 if (c == EndOfStream) 3030 break; 3031 3032 if (c == '>') 3033 { 3034 if (isrule) 3035 AddCharToLexer(lexer, c); 3036 else 3037 UngetChar(c, doc->docIn); 3038 3039 break; 3040 } 3041 3042 /* if not recognized as ASP, PHP or Tango */ 3043 /* then also finish value on whitespace */ 3044 if (!isrule) 3045 { 3046 if (IsWhite(c)) 3047 break; 3048 } 3049 3050 AddCharToLexer(lexer, c); 3051 3052 if (c == '"') 3053 { 3054 do 3055 { 3056 c = ReadChar(doc->docIn); 3057 if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */ 3058 { 3059 ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR ); 3060 UngetChar(c, doc->docIn); 3061 return 0; 3062 } 3063 if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */ 3064 { 3065 UngetChar(c, doc->docIn); 3066 ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_GT ); 3067 return 0; 3068 } 3069 AddCharToLexer(lexer, c); 3070 } 3071 while (c != '"'); 3072 delim = '\''; 3073 continue; 3074 } 3075 3076 if (c == '\'') 3077 { 3078 do 3079 { 3080 c = ReadChar(doc->docIn); 3081 if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */ 3082 { 3083 ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR ); 3084 UngetChar(c, doc->docIn); 3085 return 0; 3086 } 3087 if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */ 3088 { 3089 UngetChar(c, doc->docIn); 3090 ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_GT ); 3091 return 0; 3092 } 3093 AddCharToLexer(lexer, c); 3094 } 3095 while (c != '\''); 3096 } 3097 } 3098 3099 return delim; 3100 } 3101 3102 /* values start with "=" or " = " etc. */ 3103 /* doesn't consume the ">" at end of start tag */ 3104 3105 static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, 3106 Bool foldCase, Bool *isempty, int *pdelim) 3107 { 3108 Lexer* lexer = doc->lexer; 3109 int len = 0, start; 3110 Bool seen_gt = no; 3111 Bool munge = yes; 3112 uint c, lastc, delim, quotewarning; 3113 tmbstr value; 3114 3115 delim = (tmbchar) 0; 3116 *pdelim = '"'; 3117 3118 /* 3119 Henry Zrepa reports that some folk are using the 3120 embed element with script attributes where newlines 3121 are significant and must be preserved 3122 */ 3123 if ( cfgBool(doc, TidyLiteralAttribs) ) 3124 munge = no; 3125 3126 /* skip white space before the '=' */ 3127 3128 for (;;) 3129 { 3130 c = ReadChar(doc->docIn); 3131 3132 if (c == EndOfStream) 3133 { 3134 UngetChar(c, doc->docIn); 3135 break; 3136 } 3137 3138 if (!IsWhite(c)) 3139 break; 3140 } 3141 3142 /* 3143 c should be '=' if there is a value 3144 other legal possibilities are white 3145 space, '/' and '>' 3146 */ 3147 3148 if (c != '=' && c != '"' && c != '\'') 3149 { 3150 UngetChar(c, doc->docIn); 3151 return NULL; 3152 } 3153 3154 /* skip white space after '=' */ 3155 3156 for (;;) 3157 { 3158 c = ReadChar(doc->docIn); 3159 3160 if (c == EndOfStream) 3161 { 3162 UngetChar(c, doc->docIn); 3163 break; 3164 } 3165 3166 if (!IsWhite(c)) 3167 break; 3168 } 3169 3170 /* check for quote marks */ 3171 3172 if (c == '"' || c == '\'') 3173 delim = c; 3174 else if (c == '<') 3175 { 3176 start = lexer->lexsize; 3177 AddCharToLexer(lexer, c); 3178 *pdelim = ParseServerInstruction( doc ); 3179 len = lexer->lexsize - start; 3180 lexer->lexsize = start; 3181 return (len > 0 ? tmbstrndup(lexer->lexbuf+start, len) : NULL); 3182 } 3183 else 3184 UngetChar(c, doc->docIn); 3185 3186 /* 3187 and read the value string 3188 check for quote mark if needed 3189 */ 3190 3191 quotewarning = 0; 3192 start = lexer->lexsize; 3193 c = '\0'; 3194 3195 for (;;) 3196 { 3197 lastc = c; /* track last character */ 3198 c = ReadChar(doc->docIn); 3199 3200 if (c == EndOfStream) 3201 { 3202 ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR ); 3203 UngetChar(c, doc->docIn); 3204 break; 3205 } 3206 3207 if (delim == (tmbchar)0) 3208 { 3209 if (c == '>') 3210 { 3211 UngetChar(c, doc->docIn); 3212 break; 3213 } 3214 3215 if (c == '"' || c == '\'') 3216 { 3217 uint q = c; 3218 3219 ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK ); 3220 3221 /* handle <input onclick=s("btn1")> and <a title=foo""">...</a> */ 3222 /* this doesn't handle <a title=foo"/> which browsers treat as */ 3223 /* 'foo"/' nor <a title=foo" /> which browser treat as 'foo"' */ 3224 3225 c = ReadChar(doc->docIn); 3226 if (c == '>') 3227 { 3228 AddCharToLexer(lexer, q); 3229 UngetChar(c, doc->docIn); 3230 break; 3231 } 3232 else 3233 { 3234 UngetChar(c, doc->docIn); 3235 c = q; 3236 } 3237 } 3238 3239 if (c == '<') 3240 { 3241 UngetChar(c, doc->docIn); 3242 c = '>'; 3243 UngetChar(c, doc->docIn); 3244 ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_GT ); 3245 break; 3246 } 3247 3248 /* 3249 For cases like <br clear=all/> need to avoid treating /> as 3250 part of the attribute value, however care is needed to avoid 3251 so treating <a href=http://www.acme.com/> in this way, which 3252 would map the <a> tag to <a href="http://www.acme.com"/> 3253 */ 3254 if (c == '/') 3255 { 3256 /* peek ahead in case of /> */ 3257 c = ReadChar(doc->docIn); 3258 3259 if ( c == '>' && !IsUrl(doc, name) ) 3260 { 3261 *isempty = yes; 3262 UngetChar(c, doc->docIn); 3263 break; 3264 } 3265 3266 /* unget peeked character */ 3267 UngetChar(c, doc->docIn); 3268 c = '/'; 3269 } 3270 } 3271 else /* delim is '\'' or '"' */ 3272 { 3273 if (c == delim) 3274 break; 3275 3276 if (c == '\n' || c == '<' || c == '>') 3277 ++quotewarning; 3278 3279 if (c == '>') 3280 seen_gt = yes; 3281 } 3282 3283 if (c == '&') 3284 { 3285 AddCharToLexer(lexer, c); 3286 ParseEntity( doc, 0 ); 3287 if (lexer->lexbuf[lexer->lexsize - 1] == '\n' && munge) 3288 ChangeChar(lexer, ' '); 3289 continue; 3290 } 3291 3292 /* 3293 kludge for JavaScript attribute values 3294 with line continuations in string literals 3295 */ 3296 if (c == '\\') 3297 { 3298 c = ReadChar(doc->docIn); 3299 3300 if (c != '\n') 3301 { 3302 UngetChar(c, doc->docIn); 3303 c = '\\'; 3304 } 3305 } 3306 3307 if (IsWhite(c)) 3308 { 3309 if ( delim == 0 ) 3310 break; 3311 3312 if (munge) 3313 { 3314 /* discard line breaks in quoted URLs */ 3315 /* #438650 - fix by Randy Waki */ 3316 if ( c == '\n' && IsUrl(doc, name) ) 3317 { 3318 /* warn that we discard this newline */ 3319 ReportAttrError( doc, lexer->token, NULL, NEWLINE_IN_URI); 3320 continue; 3321 } 3322 3323 c = ' '; 3324 3325 if (lastc == ' ') 3326 continue; 3327 } 3328 } 3329 else if (foldCase && IsUpper(c)) 3330 c = ToLower(c); 3331 3332 AddCharToLexer(lexer, c); 3333 } 3334 3335 if (quotewarning > 10 && seen_gt && munge) 3336 { 3337 /* 3338 there is almost certainly a missing trailing quote mark 3339 as we have see too many newlines, < or > characters. 3340 3341 an exception is made for Javascript attributes and the 3342 javascript URL scheme which may legitimately include < and >, 3343 and for attributes starting with "<xml " as generated by 3344 Microsoft Office. 3345 */ 3346 if ( !IsScript(doc, name) && 3347 !(IsUrl(doc, name) && tmbstrncmp(lexer->lexbuf+start, "javascript:", 11) == 0) && 3348 !(tmbstrncmp(lexer->lexbuf+start, "<xml ", 5) == 0) 3349 ) 3350 ReportFatal( doc, NULL, NULL, SUSPECTED_MISSING_QUOTE ); 3351 } 3352 3353 len = lexer->lexsize - start; 3354 lexer->lexsize = start; 3355 3356 3357 if (len > 0 || delim) 3358 { 3359 /* ignore leading and trailing white space for all but title, alt, value */ 3360 /* and prompts attributes unless --literal-attributes is set to yes */ 3361 /* #994841 - Whitespace is removed from value attributes */ 3362 3363 if (munge && 3364 tmbstrcasecmp(name, "alt") && 3365 tmbstrcasecmp(name, "title") && 3366 tmbstrcasecmp(name, "value") && 3367 tmbstrcasecmp(name, "prompt")) 3368 { 3369 while (IsWhite(lexer->lexbuf[start+len-1])) 3370 --len; 3371 3372 while (IsWhite(lexer->lexbuf[start]) && start < len) 3373 { 3374 ++start; 3375 --len; 3376 } 3377 } 3378 3379 value = tmbstrndup(lexer->lexbuf + start, len); 3380 } 3381 else 3382 value = NULL; 3383 3384 /* note delimiter if given */ 3385 *pdelim = (delim ? delim : '"'); 3386 3387 return value; 3388 } 3389 3390 /* attr must be non-NULL */ 3391 Bool IsValidAttrName( ctmbstr attr ) 3392 { 3393 uint i, c = attr[0]; 3394 3395 /* first character should be a letter */ 3396 if (!IsLetter(c)) 3397 return no; 3398 3399 /* remaining characters should be namechars */ 3400 for( i = 1; i < tmbstrlen(attr); i++) 3401 { 3402 c = attr[i]; 3403 3404 if (IsNamechar(c)) 3405 continue; 3406 3407 return no; 3408 } 3409 3410 return yes; 3411 } 3412 3413 /* create a new attribute */ 3414 AttVal *NewAttribute(void) 3415 { 3416 AttVal *av = (AttVal*) MemAlloc( sizeof(AttVal) ); 3417 ClearMemory( av, sizeof(AttVal) ); 3418 return av; 3419 } 3420 3421 /* create a new attribute with given name and value */ 3422 AttVal* NewAttributeEx( TidyDocImpl* doc, ctmbstr name, ctmbstr value, 3423 int delim ) 3424 { 3425 AttVal *av = NewAttribute(); 3426 av->attribute = tmbstrdup(name); 3427 av->value = tmbstrdup(value); 3428 av->delim = delim; 3429 av->dict = FindAttribute( doc, av ); 3430 return av; 3431 } 3432 3433 static void AddAttrToList( AttVal** list, AttVal* av ) 3434 { 3435 if ( *list == NULL ) 3436 *list = av; 3437 else 3438 { 3439 AttVal* here = *list; 3440 while ( here->next ) 3441 here = here->next; 3442 here->next = av; 3443 } 3444 } 3445 3446 void InsertAttributeAtEnd( Node *node, AttVal *av ) 3447 { 3448 AddAttrToList(&node->attributes, av); 3449 } 3450 3451 void InsertAttributeAtStart( Node *node, AttVal *av ) 3452 { 3453 av->next = node->attributes; 3454 node->attributes = av; 3455 } 3456 3457 /* swallows closing '>' */ 3458 3459 static AttVal* ParseAttrs( TidyDocImpl* doc, Bool *isempty ) 3460 { 3461 Lexer* lexer = doc->lexer; 3462 AttVal *av, *list; 3463 tmbstr value; 3464 int delim; 3465 Node *asp, *php; 3466 3467 list = NULL; 3468 3469 while ( !EndOfInput(doc) ) 3470 { 3471 tmbstr attribute = ParseAttribute( doc, isempty, &asp, &php ); 3472 3473 if (attribute == NULL) 3474 { 3475 /* check if attributes are created by ASP markup */ 3476 if (asp) 3477 { 3478 av = NewAttribute(); 3479 av->asp = asp; 3480 AddAttrToList( &list, av ); 3481 continue; 3482 } 3483 3484 /* check if attributes are created by PHP markup */ 3485 if (php) 3486 { 3487 av = NewAttribute(); 3488 av->php = php; 3489 AddAttrToList( &list, av ); 3490 continue; 3491 } 3492 3493 break; 3494 } 3495 3496 value = ParseValue( doc, attribute, no, isempty, &delim ); 3497 3498 if (attribute && (IsValidAttrName(attribute) || 3499 (cfgBool(doc, TidyXmlTags) && IsValidXMLAttrName(attribute)))) 3500 { 3501 av = NewAttribute(); 3502 av->delim = delim; 3503 av->attribute = attribute; 3504 av->value = value; 3505 av->dict = FindAttribute( doc, av ); 3506 AddAttrToList( &list, av ); 3507 } 3508 else 3509 { 3510 av = NewAttribute(); 3511 av->attribute = attribute; 3512 av->value = value; 3513 3514 if (LastChar(attribute) == '"') 3515 ReportAttrError( doc, lexer->token, av, MISSING_QUOTEMARK); 3516 else if (value == NULL) 3517 ReportAttrError(doc, lexer->token, av, MISSING_ATTR_VALUE); 3518 else 3519 ReportAttrError(doc, lexer->token, av, INVALID_ATTRIBUTE); 3520 3521 FreeAttribute( doc, av ); 3522 } 3523 } 3524 3525 return list; 3526 } 3527 3528 /* 3529 Returns document type declarations like 3530 3531 <!DOCTYPE foo PUBLIC "fpi" "sysid"> 3532 <!DOCTYPE bar SYSTEM "sysid"> 3533 <!DOCTYPE baz [ <!ENTITY ouml "&#246"> ]> 3534 3535 as 3536 3537 <foo PUBLIC="fpi" SYSTEM="sysid" /> 3538 <bar SYSTEM="sysid" /> 3539 <baz> &lt;!ENTITY ouml &quot;&amp;#246&quot;&gt; </baz> 3540 */ 3541 static Node *ParseDocTypeDecl(TidyDocImpl* doc) 3542 { 3543 Lexer *lexer = doc->lexer; 3544 int start = lexer->lexsize; 3545 ParseDocTypeDeclState state = DT_DOCTYPENAME; 3546 uint c; 3547 uint delim = 0; 3548 Bool hasfpi = yes; 3549 3550 Node* node = NewNode(lexer); 3551 node->type = DocTypeTag; 3552 node->start = lexer->txtstart; 3553 node->end = lexer->txtend; 3554 3555 lexer->waswhite = no; 3556 3557 /* todo: reset lexer->lexsize when appropriate to avoid wasting memory */ 3558 3559 while ((c = ReadChar(doc->docIn)) != EndOfStream) 3560 { 3561 /* convert newlines to spaces */ 3562 if (state != DT_INTSUBSET) 3563 c = c == '\n' ? ' ' : c; 3564 3565 /* convert white-space sequences to single space character */ 3566 if (IsWhite(c) && state != DT_INTSUBSET) 3567 { 3568 if (!lexer->waswhite) 3569 { 3570 AddCharToLexer(lexer, c); 3571 lexer->waswhite = yes; 3572 } 3573 else 3574 { 3575 /* discard space */ 3576 continue; 3577 } 3578 } 3579 else 3580 { 3581 AddCharToLexer(lexer, c); 3582 lexer->waswhite = no; 3583 } 3584 3585 switch(state) 3586 { 3587 case DT_INTERMEDIATE: 3588 /* determine what's next */ 3589 if (ToUpper(c) == 'P' || ToUpper(c) == 'S') 3590 { 3591 start = lexer->lexsize - 1; 3592 state = DT_PUBLICSYSTEM; 3593 continue; 3594 } 3595 else if (c == '[') 3596 { 3597 start = lexer->lexsize; 3598 state = DT_INTSUBSET; 3599 continue; 3600 } 3601 else if (c == '\'' || c == '"') 3602 { 3603 start = lexer->lexsize; 3604 delim = c; 3605 state = DT_QUOTEDSTRING; 3606 continue; 3607 } 3608 else if (c == '>') 3609 { 3610 AttVal* si; 3611 3612 node->end = --(lexer->lexsize); 3613 3614 si = GetAttrByName(node, "SYSTEM"); 3615 if (si) 3616 CheckUrl(doc, node, si); 3617 3618 if (!node->element || !IsValidXMLElemName(node->element)) 3619 { 3620 ReportError(doc, NULL, NULL, MALFORMED_DOCTYPE); 3621 FreeNode(doc, node); 3622 return NULL; 3623 } 3624 #ifdef TIDY_STORE_ORIGINAL_TEXT 3625 StoreOriginalTextInToken(doc, node, 0); 3626 #endif 3627 return node; 3628 } 3629 else 3630 { 3631 /* error */ 3632 } 3633 break; 3634 case DT_DOCTYPENAME: 3635 /* read document type name */ 3636 if (IsWhite(c) || c == '>' || c == '[') 3637 { 3638 node->element = tmbstrndup(lexer->lexbuf + start, 3639 lexer->lexsize - start - 1); 3640 if (c == '>' || c == '[') 3641 { 3642 --(lexer->lexsize); 3643 UngetChar(c, doc->docIn); 3644 } 3645 3646 state = DT_INTERMEDIATE; 3647 continue; 3648 } 3649 break; 3650 case DT_PUBLICSYSTEM: 3651 /* read PUBLIC/SYSTEM */ 3652 if (IsWhite(c) || c == '>') 3653 { 3654 char *attname = tmbstrndup(lexer->lexbuf + start, 3655 lexer->lexsize - start - 1); 3656 hasfpi = !(tmbstrcasecmp(attname, "SYSTEM") == 0); 3657 3658 MemFree(attname); 3659 3660 /* todo: report an error if SYSTEM/PUBLIC not uppercase */ 3661 3662 if (c == '>') 3663 { 3664 --(lexer->lexsize); 3665 UngetChar(c, doc->docIn); 3666 } 3667 3668 state = DT_INTERMEDIATE; 3669 continue; 3670 } 3671 break; 3672 case DT_QUOTEDSTRING: 3673 /* read quoted string */ 3674 if (c == delim) 3675 { 3676 char *value = tmbstrndup(lexer->lexbuf + start, 3677 lexer->lexsize - start - 1); 3678 AttVal* att = AddAttribute(doc, node, hasfpi ? "PUBLIC" : "SYSTEM", value); 3679 MemFree(value); 3680 att->delim = delim; 3681 hasfpi = no; 3682 state = DT_INTERMEDIATE; 3683 delim = 0; 3684 continue; 3685 } 3686 break; 3687 case DT_INTSUBSET: 3688 /* read internal subset */ 3689 if (c == ']') 3690 { 3691 Node* subset; 3692 lexer->txtstart = start; 3693 lexer->txtend = lexer->lexsize - 1; 3694 subset = TextToken(lexer); 3695 InsertNodeAtEnd(node, subset); 3696 state = DT_INTERMEDIATE; 3697 } 3698 break; 3699 } 3700 } 3701 3702 /* document type declaration not finished */ 3703 ReportError(doc, NULL, NULL, MALFORMED_DOCTYPE); 3704 FreeNode(doc, node); 3705 return NULL; 3706 } 3707

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.