~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

TidyLib
tidy/src/parser.c

Version: ~ [ 1.0 ] ~

** Warning: Cannot open xref database.

1 /* parser.c -- HTML Parser 2 3 (c) 1998-2005 (W3C) MIT, ERCIM, Keio University 4 See tidy.h for the copyright notice. 5 6 CVS Info : 7 8 $Author: arnaud02 $ 9 $Date: 2005/10/21 12:54:15 $ 10 $Revision: 1.150 $ 11 12 */ 13 14 #include "tidy-int.h" 15 #include "lexer.h" 16 #include "parser.h" 17 #include "message.h" 18 #include "clean.h" 19 #include "tags.h" 20 #include "tmbstr.h" 21 22 #ifdef AUTO_INPUT_ENCODING 23 #include "charsets.h" 24 #endif 25 26 Bool CheckNodeIntegrity(Node *node) 27 { 28 #ifndef NO_NODE_INTEGRITY_CHECK 29 if (node->prev) 30 { 31 if (node->prev->next != node) 32 return no; 33 } 34 35 if (node->next) 36 { 37 if (node->next->prev != node) 38 return no; 39 } 40 41 if (node->parent) 42 { 43 Node *child = NULL; 44 if (node->prev == NULL && node->parent->content != node) 45 return no; 46 47 if (node->next == NULL && node->parent->last != node) 48 return no; 49 50 for (child = node->parent->content; child; child = child->next) 51 { 52 if (child == node) 53 break; 54 } 55 if ( node != child ) 56 return no; 57 } 58 59 for (node = node->content; node; node = node->next) 60 if ( !CheckNodeIntegrity(node) ) 61 return no; 62 63 #endif 64 return yes; 65 } 66 67 /* 68 used to determine how attributes 69 without values should be printed 70 this was introduced to deal with 71 user defined tags e.g. Cold Fusion 72 */ 73 Bool IsNewNode(Node *node) 74 { 75 if (node && node->tag) 76 { 77 return (node->tag->model & CM_NEW); 78 } 79 return yes; 80 } 81 82 void CoerceNode(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected) 83 { 84 const Dict* tag = LookupTagDef(tid); 85 Node* tmp = InferredTag(doc, tag->id); 86 87 if (obsolete) 88 ReportWarning(doc, node, tmp, OBSOLETE_ELEMENT); 89 else if (unexpected) 90 ReportError(doc, node, tmp, REPLACING_UNEX_ELEMENT); 91 else 92 ReportNotice(doc, node, tmp, REPLACING_ELEMENT); 93 94 MemFree(tmp->element); 95 MemFree(tmp); 96 97 node->was = node->tag; 98 node->tag = tag; 99 node->type = StartTag; 100 node->implicit = yes; 101 MemFree(node->element); 102 node->element = tmbstrdup(tag->name); 103 } 104 105 /* extract a node and its children from a markup tree */ 106 Node *RemoveNode(Node *node) 107 { 108 if (node->prev) 109 node->prev->next = node->next; 110 111 if (node->next) 112 node->next->prev = node->prev; 113 114 if (node->parent) 115 { 116 if (node->parent->content == node) 117 node->parent->content = node->next; 118 119 if (node->parent->last == node) 120 node->parent->last = node->prev; 121 } 122 123 node->parent = node->prev = node->next = NULL; 124 return node; 125 } 126 127 /* remove node from markup tree and discard it */ 128 Node *DiscardElement( TidyDocImpl* doc, Node *element ) 129 { 130 Node *next = NULL; 131 132 if (element) 133 { 134 next = element->next; 135 RemoveNode(element); 136 FreeNode( doc, element); 137 } 138 139 return next; 140 } 141 142 /* 143 insert "node" into markup tree as the firt element 144 of content of "element" 145 */ 146 void InsertNodeAtStart(Node *element, Node *node) 147 { 148 node->parent = element; 149 150 if (element->content == NULL) 151 element->last = node; 152 else 153 element->content->prev = node; 154 155 node->next = element->content; 156 node->prev = NULL; 157 element->content = node; 158 } 159 160 /* 161 insert "node" into markup tree as the last element 162 of content of "element" 163 */ 164 void InsertNodeAtEnd(Node *element, Node *node) 165 { 166 node->parent = element; 167 node->prev = element->last; 168 169 if (element->last != NULL) 170 element->last->next = node; 171 else 172 element->content = node; 173 174 element->last = node; 175 } 176 177 /* 178 insert "node" into markup tree in place of "element" 179 which is moved to become the child of the node 180 */ 181 static void InsertNodeAsParent(Node *element, Node *node) 182 { 183 node->content = element; 184 node->last = element; 185 node->parent = element->parent; 186 element->parent = node; 187 188 if (node->parent->content == element) 189 node->parent->content = node; 190 191 if (node->parent->last == element) 192 node->parent->last = node; 193 194 node->prev = element->prev; 195 element->prev = NULL; 196 197 if (node->prev) 198 node->prev->next = node; 199 200 node->next = element->next; 201 element->next = NULL; 202 203 if (node->next) 204 node->next->prev = node; 205 } 206 207 /* insert "node" into markup tree before "element" */ 208 void InsertNodeBeforeElement(Node *element, Node *node) 209 { 210 Node *parent; 211 212 parent = element->parent; 213 node->parent = parent; 214 node->next = element; 215 node->prev = element->prev; 216 element->prev = node; 217 218 if (node->prev) 219 node->prev->next = node; 220 221 if (parent->content == element) 222 parent->content = node; 223 } 224 225 /* insert "node" into markup tree after "element" */ 226 void InsertNodeAfterElement(Node *element, Node *node) 227 { 228 Node *parent; 229 230 parent = element->parent; 231 node->parent = parent; 232 233 /* AQ - 13 Jan 2000 fix for parent == NULL */ 234 if (parent != NULL && parent->last == element) 235 parent->last = node; 236 else 237 { 238 node->next = element->next; 239 /* AQ - 13 Jan 2000 fix for node->next == NULL */ 240 if (node->next != NULL) 241 node->next->prev = node; 242 } 243 244 element->next = node; 245 node->prev = element; 246 } 247 248 static Bool CanPrune( TidyDocImpl* doc, Node *element ) 249 { 250 if ( nodeIsText(element) ) 251 return yes; 252 253 if ( element->content ) 254 return no; 255 256 if ( element->tag == NULL ) 257 return no; 258 259 if ( element->tag->model & CM_BLOCK && element->attributes != NULL ) 260 return no; 261 262 if ( nodeIsA(element) && element->attributes != NULL ) 263 return no; 264 265 if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) ) 266 return no; 267 268 if ( element->tag->model & CM_ROW ) 269 return no; 270 271 if ( element->tag->model & CM_EMPTY ) 272 return no; 273 274 if ( nodeIsAPPLET(element) ) 275 return no; 276 277 if ( nodeIsOBJECT(element) ) 278 return no; 279 280 if ( nodeIsSCRIPT(element) && attrGetSRC(element) ) 281 return no; 282 283 if ( nodeIsTITLE(element) ) 284 return no; 285 286 /* #433359 - fix by Randy Waki 12 Mar 01 */ 287 if ( nodeIsIFRAME(element) ) 288 return no; 289 290 /* fix for bug 770297 */ 291 if (nodeIsTEXTAREA(element)) 292 return no; 293 294 if ( attrGetID(element) || attrGetNAME(element) ) 295 return no; 296 297 /* fix for bug 695408; a better fix would look for unknown and */ 298 /* known proprietary attributes that make the element significant */ 299 if (attrGetDATAFLD(element)) 300 return no; 301 302 /* fix for bug 723772, don't trim new-...-tags */ 303 if (element->tag->id == TidyTag_UNKNOWN) 304 return no; 305 306 if (nodeIsBODY(element)) 307 return no; 308 309 if (nodeIsCOLGROUP(element)) 310 return no; 311 312 return yes; 313 } 314 315 Node *TrimEmptyElement( TidyDocImpl* doc, Node *element ) 316 { 317 if ( CanPrune(doc, element) ) 318 { 319 if (element->type != TextNode) 320 ReportNotice(doc, element, NULL, TRIM_EMPTY_ELEMENT); 321 322 return DiscardElement(doc, element); 323 } 324 return element; 325 } 326 327 Node* DropEmptyElements(TidyDocImpl* doc, Node* node) 328 { 329 Node* next; 330 331 while (node) 332 { 333 next = node->next; 334 335 if (node->content) 336 DropEmptyElements(doc, node->content); 337 338 if (!nodeIsElement(node) && 339 !(nodeIsText(node) && !(node->start < node->end))) 340 { 341 node = next; 342 continue; 343 } 344 345 next = TrimEmptyElement(doc, node); 346 node = node == next ? node->next : next; 347 } 348 349 return node; 350 } 351 352 /* 353 errors in positioning of form start or end tags 354 generally require human intervention to fix 355 */ 356 static void BadForm( TidyDocImpl* doc ) 357 { 358 doc->badForm = yes; 359 /* doc->errors++; */ 360 } 361 362 /* 363 This maps 364 <em>hello </em><strong>world</strong> 365 to 366 <em>hello</em> <strong>world</strong> 367 368 If last child of element is a text node 369 then trim trailing white space character 370 moving it to after element's end tag. 371 */ 372 static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last ) 373 { 374 Lexer* lexer = doc->lexer; 375 byte c; 376 377 if (nodeIsText(last)) 378 { 379 if (last->end > last->start) 380 { 381 c = (byte) lexer->lexbuf[ last->end - 1 ]; 382 383 if ( c == ' ' 384 #ifdef COMMENT_NBSP_FIX 385 || c == 160 386 #endif 387 ) 388 { 389 #ifdef COMMENT_NBSP_FIX 390 /* take care with <td>&nbsp;</td> */ 391 if ( c == 160 && 392 ( element->tag == doc->tags.tag_td || 393 element->tag == doc->tags.tag_th ) 394 ) 395 { 396 if (last->end > last->start + 1) 397 last->end -= 1; 398 } 399 else 400 #endif 401 { 402 last->end -= 1; 403 if ( (element->tag->model & CM_INLINE) && 404 !(element->tag->model & CM_FIELD) ) 405 lexer->insertspace = yes; 406 } 407 } 408 } 409 } 410 } 411 412 #if 0 413 static Node *EscapeTag(Lexer *lexer, Node *element) 414 { 415 Node *node = NewNode(lexer); 416 417 node->start = lexer->lexsize; 418 AddByte(lexer, '<'); 419 420 if (element->type == EndTag) 421 AddByte(lexer, '/'); 422 423 if (element->element) 424 { 425 char *p; 426 for (p = element->element; *p != '\0'; ++p) 427 AddByte(lexer, *p); 428 } 429 else if (element->type == DocTypeTag) 430 { 431 uint i; 432 AddStringLiteral( lexer, "!DOCTYPE " ); 433 for (i = element->start; i < element->end; ++i) 434 AddByte(lexer, lexer->lexbuf[i]); 435 } 436 437 if (element->type == StartEndTag) 438 AddByte(lexer, '/'); 439 440 AddByte(lexer, '>'); 441 node->end = lexer->lexsize; 442 443 return node; 444 } 445 #endif /* 0 */ 446 447 /* Only true for text nodes. */ 448 Bool IsBlank(Lexer *lexer, Node *node) 449 { 450 Bool isBlank = nodeIsText(node); 451 if ( isBlank ) 452 isBlank = ( node->end == node->start || /* Zero length */ 453 ( node->end == node->start+1 /* or one blank. */ 454 && lexer->lexbuf[node->start] == ' ' ) ); 455 return isBlank; 456 } 457 458 /* 459 This maps 460 <p>hello<em> world</em> 461 to 462 <p>hello <em>world</em> 463 464 Trims initial space, by moving it before the 465 start tag, or if this element is the first in 466 parent's content, then by discarding the space 467 */ 468 static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text ) 469 { 470 Lexer* lexer = doc->lexer; 471 Node *prev, *node; 472 473 if ( nodeIsText(text) && 474 lexer->lexbuf[text->start] == ' ' && 475 text->start < text->end ) 476 { 477 if ( (element->tag->model & CM_INLINE) && 478 !(element->tag->model & CM_FIELD) ) 479 { 480 prev = element->prev; 481 482 if (nodeIsText(prev)) 483 { 484 if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ') 485 lexer->lexbuf[(prev->end)++] = ' '; 486 487 ++(element->start); 488 } 489 else /* create new node */ 490 { 491 node = NewNode(lexer); 492 node->start = (element->start)++; 493 node->end = element->start; 494 lexer->lexbuf[node->start] = ' '; 495 InsertNodeBeforeElement(element ,node); 496 } 497 } 498 499 /* discard the space in current node */ 500 ++(text->start); 501 } 502 } 503 504 static Bool IsPreDescendant(Node* node) 505 { 506 Node *parent = node->parent; 507 508 while (parent) 509 { 510 if (parent->tag && parent->tag->parser == ParsePre) 511 return yes; 512 513 parent = parent->parent; 514 } 515 516 return no; 517 } 518 519 static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node) 520 { 521 Node* next; 522 523 if (!nodeIsText(node)) 524 return no; 525 526 if (node->parent->type == DocTypeTag) 527 return no; 528 529 if (IsPreDescendant(node)) 530 return no; 531 532 if (node->parent->tag->parser == ParseScript) 533 return no; 534 535 next = node->next; 536 537 /* <p>... </p> */ 538 if (!next && !nodeHasCM(node->parent, CM_INLINE)) 539 return yes; 540 541 /* <div><small>... </small><h3>...</h3></div> */ 542 if (!next && node->parent->next && !nodeHasCM(node->parent->next, CM_INLINE)) 543 return yes; 544 545 if (!next) 546 return no; 547 548 if (nodeIsBR(next)) 549 return yes; 550 551 if (nodeHasCM(next, CM_INLINE)) 552 return no; 553 554 /* <a href='/'>...</a> <p>...</p> */ 555 if (next->type == StartTag) 556 return yes; 557 558 /* <strong>...</strong> <hr /> */ 559 if (next->type == StartEndTag) 560 return yes; 561 562 /* evil adjacent text nodes, Tidy should not generate these :-( */ 563 if (nodeIsText(next) && next->start < next->end 564 && IsWhite(doc->lexer->lexbuf[next->start])) 565 return yes; 566 567 return no; 568 } 569 570 static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node) 571 { 572 if (!nodeIsText(node)) 573 return no; 574 575 if (node->parent->type == DocTypeTag) 576 return no; 577 578 if (IsPreDescendant(node)) 579 return no; 580 581 if (node->parent->tag->parser == ParseScript) 582 return no; 583 584 /* <p>...<br> <em>...</em>...</p> */ 585 if (nodeIsBR(node->prev)) 586 return yes; 587 588 /* <p> ...</p> */ 589 if (node->prev == NULL && !nodeHasCM(node->parent, CM_INLINE)) 590 return yes; 591 592 /* <h4>...</h4> <em>...</em> */ 593 if (node->prev && !nodeHasCM(node->prev, CM_INLINE) && 594 nodeIsElement(node->prev)) 595 return yes; 596 597 /* <p><span> ...</span></p> */ 598 if (!node->prev && !node->parent->prev && !nodeHasCM(node->parent->parent, CM_INLINE)) 599 return yes; 600 601 return no; 602 } 603 604 static void CleanSpaces(TidyDocImpl* doc, Node* node) 605 { 606 Node* next; 607 608 while (node) 609 { 610 next = node->next; 611 612 if (nodeIsText(node) && CleanLeadingWhitespace(doc, node)) 613 while (node->start < node->end && IsWhite(doc->lexer->lexbuf[node->start])) 614 ++(node->start); 615 616 if (nodeIsText(node) && CleanTrailingWhitespace(doc, node)) 617 while (node->end > node->start && IsWhite(doc->lexer->lexbuf[node->end - 1])) 618 --(node->end); 619 620 if (nodeIsText(node) && !(node->start < node->end)) 621 { 622 RemoveNode(node); 623 FreeNode(doc, node); 624 node = next; 625 626 continue; 627 } 628 629 if (node->content) 630 CleanSpaces(doc, node->content); 631 632 node = next; 633 } 634 } 635 636 /* 637 Move initial and trailing space out. 638 This routine maps: 639 640 hello<em> world</em> 641 to 642 hello <em>world</em> 643 and 644 <em>hello </em><strong>world</strong> 645 to 646 <em>hello</em> <strong>world</strong> 647 */ 648 static void TrimSpaces( TidyDocImpl* doc, Node *element) 649 { 650 Node* text = element->content; 651 652 if (nodeIsPRE(element) || IsPreDescendant(element)) 653 return; 654 655 if (nodeIsText(text)) 656 TrimInitialSpace(doc, element, text); 657 658 text = element->last; 659 660 if (nodeIsText(text)) 661 TrimTrailingSpace(doc, element, text); 662 } 663 664 Bool DescendantOf( Node *element, TidyTagId tid ) 665 { 666 Node *parent; 667 for ( parent = element->parent; 668 parent != NULL; 669 parent = parent->parent ) 670 { 671 if ( TagIsId(parent, tid) ) 672 return yes; 673 } 674 return no; 675 } 676 677 static Bool InsertMisc(Node *element, Node *node) 678 { 679 if (node->type == CommentTag || 680 node->type == ProcInsTag || 681 node->type == CDATATag || 682 node->type == SectionTag || 683 node->type == AspTag || 684 node->type == JsteTag || 685 node->type == PhpTag ) 686 { 687 InsertNodeAtEnd(element, node); 688 return yes; 689 } 690 691 if ( node->type == XmlDecl ) 692 { 693 Node* root = element; 694 while ( root && root->parent ) 695 root = root->parent; 696 if ( root ) 697 { 698 InsertNodeAtStart( root, node ); 699 return yes; 700 } 701 } 702 703 /* Declared empty tags seem to be slipping through 704 ** the cracks. This is an experiment to figure out 705 ** a decent place to pick them up. 706 */ 707 if ( node->tag && 708 nodeIsElement(node) && 709 nodeCMIsEmpty(node) && TagId(node) == TidyTag_UNKNOWN && 710 (node->tag->versions & VERS_PROPRIETARY) != 0 ) 711 { 712 InsertNodeAtEnd(element, node); 713 return yes; 714 } 715 716 return no; 717 } 718 719 720 static void ParseTag( TidyDocImpl* doc, Node *node, uint mode ) 721 { 722 Lexer* lexer = doc->lexer; 723 /* 724 Fix by GLP 2000-12-21. Need to reset insertspace if this 725 is both a non-inline and empty tag (base, link, meta, isindex, hr, area). 726 */ 727 if (node->tag->model & CM_EMPTY) 728 { 729 lexer->waswhite = no; 730 if (node->tag->parser == NULL) 731 return; 732 } 733 else if (!(node->tag->model & CM_INLINE)) 734 lexer->insertspace = no; 735 736 if (node->tag->parser == NULL) 737 return; 738 739 if (node->type == StartEndTag) 740 return; 741 742 (*node->tag->parser)( doc, node, mode ); 743 } 744 745 /* 746 the doctype has been found after other tags, 747 and needs moving to before the html element 748 */ 749 static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype ) 750 { 751 Node* existing = FindDocType( doc ); 752 if ( existing ) 753 { 754 ReportError(doc, element, doctype, DISCARDING_UNEXPECTED ); 755 FreeNode( doc, doctype ); 756 } 757 else 758 { 759 ReportError(doc, element, doctype, DOCTYPE_AFTER_TAGS ); 760 while ( !nodeIsHTML(element) ) 761 element = element->parent; 762 InsertNodeBeforeElement( element, doctype ); 763 } 764 } 765 766 /* 767 move node to the head, where element is used as starting 768 point in hunt for head. normally called during parsing 769 */ 770 static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node ) 771 { 772 Node *head; 773 774 RemoveNode( node ); /* make sure that node is isolated */ 775 776 if ( nodeIsElement(node) ) 777 { 778 ReportError(doc, element, node, TAG_NOT_ALLOWED_IN ); 779 780 head = FindHEAD(doc); 781 assert(head != NULL); 782 783 InsertNodeAtEnd(head, node); 784 785 if ( node->tag->parser ) 786 ParseTag( doc, node, IgnoreWhitespace ); 787 } 788 else 789 { 790 ReportError(doc, element, node, DISCARDING_UNEXPECTED); 791 FreeNode( doc, node ); 792 } 793 } 794 795 /* moves given node to end of body element */ 796 static void MoveNodeToBody( TidyDocImpl* doc, Node* node ) 797 { 798 Node* body = FindBody( doc ); 799 if ( body ) 800 { 801 RemoveNode( node ); 802 InsertNodeAtEnd( body, node ); 803 } 804 } 805 806 /* 807 element is node created by the lexer 808 upon seeing the start tag, or by the 809 parser when the start tag is inferred 810 */ 811 void ParseBlock( TidyDocImpl* doc, Node *element, uint mode) 812 { 813 Lexer* lexer = doc->lexer; 814 Node *node; 815 Bool checkstack = yes; 816 uint istackbase = 0; 817 818 if ( element->tag->model & CM_EMPTY ) 819 return; 820 821 if ( nodeIsFORM(element) && 822 DescendantOf(element, TidyTag_FORM) ) 823 ReportError(doc, element, NULL, ILLEGAL_NESTING ); 824 825 /* 826 InlineDup() asks the lexer to insert inline emphasis tags 827 currently pushed on the istack, but take care to avoid 828 propagating inline emphasis inside OBJECT or APPLET. 829 For these elements a fresh inline stack context is created 830 and disposed of upon reaching the end of the element. 831 They thus behave like table cells in this respect. 832 */ 833 if (element->tag->model & CM_OBJECT) 834 { 835 istackbase = lexer->istackbase; 836 lexer->istackbase = lexer->istacksize; 837 } 838 839 if (!(element->tag->model & CM_MIXED)) 840 InlineDup( doc, NULL ); 841 842 mode = IgnoreWhitespace; 843 844 while ((node = GetToken(doc, mode /*MixedContent*/)) != NULL) 845 { 846 /* end tag for this element */ 847 if (node->type == EndTag && node->tag && 848 (node->tag == element->tag || element->was == node->tag)) 849 { 850 FreeNode( doc, node ); 851 852 if (element->tag->model & CM_OBJECT) 853 { 854 /* pop inline stack */ 855 while (lexer->istacksize > lexer->istackbase) 856 PopInline( doc, NULL ); 857 lexer->istackbase = istackbase; 858 } 859 860 element->closed = yes; 861 TrimSpaces( doc, element ); 862 return; 863 } 864 865 if ( nodeIsBODY( node ) && DescendantOf( element, TidyTag_HEAD )) 866 { 867 /* If we're in the HEAD, close it before proceeding. 868 This is an extremely rare occurance, but has been observed. 869 */ 870 UngetToken( doc ); 871 break; 872 } 873 874 if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) ) 875 { 876 if ( nodeIsElement(node) ) 877 ReportError(doc, element, node, DISCARDING_UNEXPECTED ); 878 FreeNode( doc, node ); 879 continue; 880 } 881 882 883 if (node->type == EndTag) 884 { 885 if (node->tag == NULL) 886 { 887 ReportError(doc, element, node, DISCARDING_UNEXPECTED ); 888 FreeNode( doc, node ); 889 continue; 890 } 891 else if ( nodeIsBR(node) ) 892 node->type = StartTag; 893 else if ( nodeIsP(node) ) 894 { 895 /* Cannot have a block inside a paragraph, so no checking 896 for an ancestor is necessary -- but we _can_ have 897 paragraphs inside a block, so change it to an implicit 898 empty paragraph, to be dealt with according to the user's 899 options 900 */ 901 node->type = StartEndTag; 902 node->implicit = yes; 903 #if OBSOLETE 904 CoerceNode(doc, node, TidyTag_BR, no, no); 905 FreeAttrs( doc, node ); /* discard align attribute etc. */ 906 InsertNodeAtEnd( element, node ); 907 node = InferredTag(doc, TidyTag_BR); 908 #endif 909 } 910 else if (DescendantOf( element, node->tag->id )) 911 { 912 /* 913 if this is the end tag for an ancestor element 914 then infer end tag for this element 915 */ 916 UngetToken( doc ); 917 break; 918 #if OBSOLETE 919 Node *parent; 920 for ( parent = element->parent; 921 parent != NULL; 922 parent = parent->parent ) 923 { 924 if (node->tag == parent->tag) 925 { 926 if (!(element->tag->model & CM_OPT)) 927 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE ); 928 929 UngetToken( doc ); 930 931 if (element->tag->model & CM_OBJECT) 932 { 933 /* pop inline stack */ 934 while (lexer->istacksize > lexer->istackbase) 935 PopInline( doc, NULL ); 936 lexer->istackbase = istackbase; 937 } 938 939 TrimSpaces( doc, element ); 940 return; 941 } 942 } 943 #endif 944 } 945 else 946 { 947 /* special case </tr> etc. for stuff moved in front of table */ 948 if ( lexer->exiled 949 && node->tag->model 950 && (node->tag->model & CM_TABLE) ) 951 { 952 UngetToken( doc ); 953 TrimSpaces( doc, element ); 954 return; 955 } 956 } 957 } 958 959 /* mixed content model permits text */ 960 if (nodeIsText(node)) 961 { 962 if ( checkstack ) 963 { 964 checkstack = no; 965 if (!(element->tag->model & CM_MIXED)) 966 { 967 if ( InlineDup(doc, node) > 0 ) 968 continue; 969 } 970 } 971 972 InsertNodeAtEnd(element, node); 973 mode = MixedContent; 974 975 /* 976 HTML4 strict doesn't allow mixed content for 977 elements with %block; as their content model 978 */ 979 /* 980 But only body, map, blockquote, form and 981 noscript have content model %block; 982 */ 983 if ( nodeIsBODY(element) || 984 nodeIsMAP(element) || 985 nodeIsBLOCKQUOTE(element) || 986 nodeIsFORM(element) || 987 nodeIsNOSCRIPT(element) ) 988 ConstrainVersion( doc, ~VERS_HTML40_STRICT ); 989 continue; 990 } 991 992 if ( InsertMisc(element, node) ) 993 continue; 994 995 /* allow PARAM elements? */ 996 if ( nodeIsPARAM(node) ) 997 { 998 if ( nodeHasCM(element, CM_PARAM) && nodeIsElement(node) ) 999 { 1000 InsertNodeAtEnd(element, node); 1001 continue; 1002 } 1003 1004 /* otherwise discard it */ 1005 ReportError(doc, element, node, DISCARDING_UNEXPECTED ); 1006 FreeNode( doc, node ); 1007 continue; 1008 } 1009 1010 /* allow AREA elements? */ 1011 if ( nodeIsAREA(node) ) 1012 { 1013 if ( nodeIsMAP(element) && nodeIsElement(node) ) 1014 { 1015 InsertNodeAtEnd(element, node); 1016 continue; 1017 } 1018 1019 /* otherwise discard it */ 1020 ReportError(doc, element, node, DISCARDING_UNEXPECTED ); 1021 FreeNode( doc, node ); 1022 continue; 1023 } 1024 1025 /* ignore unknown start/end tags */ 1026 if ( node->tag == NULL ) 1027 { 1028 ReportError(doc, element, node, DISCARDING_UNEXPECTED ); 1029 FreeNode( doc, node ); 1030 continue; 1031 } 1032 1033 /* 1034 Allow CM_INLINE elements here. 1035 1036 Allow CM_BLOCK elements here unless 1037 lexer->excludeBlocks is yes. 1038 1039 LI and DD are special cased. 1040 1041 Otherwise infer end tag for this element. 1042 */ 1043 1044 if ( !nodeHasCM(node, CM_INLINE) ) 1045 { 1046 if ( !nodeIsElement(node) ) 1047 { 1048 if ( nodeIsFORM(node) ) 1049 BadForm( doc ); 1050 1051 ReportError(doc, element, node, DISCARDING_UNEXPECTED ); 1052 FreeNode( doc, node ); 1053 continue; 1054 } 1055 1056 /* #427671 - Fix by Randy Waki - 10 Aug 00 */ 1057 /* 1058 If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION 1059 start tag, discard the start tag and let the subsequent content get 1060 parsed as content of the enclosing LI. This seems to mimic IE and 1061 Netscape, and avoids an infinite loop: without this check, 1062 ParseBlock (which is parsing the LI's content) and ParseList (which 1063 is parsing the LI's parent's content) repeatedly defer to each 1064 other to parse the illegal start tag, each time inferring a missing 1065 </li> or <li> respectively. 1066 1067 NOTE: This check is a bit fragile. It specifically checks for the 1068 four tags that happen to weave their way through the current series 1069 of tests performed by ParseBlock and ParseList to trigger the 1070 infinite loop. 1071 */ 1072 if ( nodeIsLI(element) ) 1073 { 1074 if ( nodeIsFRAME(node) || 1075 nodeIsFRAMESET(node) || 1076 nodeIsOPTGROUP(node) || 1077 nodeIsOPTION(node) ) 1078 { 1079 ReportError(doc, element, node, DISCARDING_UNEXPECTED ); 1080 FreeNode( doc, node ); /* DSR - 27Apr02 avoid memory leak */ 1081 continue; 1082 } 1083 } 1084 1085 if ( nodeIsTD(element) || nodeIsTH(element) ) 1086 { 1087 /* if parent is a table cell, avoid inferring the end of the cell */ 1088 1089 if ( nodeHasCM(node, CM_HEAD) ) 1090 { 1091 MoveToHead( doc, element, node ); 1092 continue; 1093 } 1094 1095 if ( nodeHasCM(node, CM_LIST) ) 1096 { 1097 UngetToken( doc ); 1098 node = InferredTag(doc, TidyTag_UL); 1099 /* AddClass( doc, node, "noindent" ); */ 1100 lexer->excludeBlocks = yes; 1101 } 1102 else if ( nodeHasCM(node, CM_DEFLIST) ) 1103 { 1104 UngetToken( doc ); 1105 node = InferredTag(doc, TidyTag_DL); 1106 lexer->excludeBlocks = yes; 1107 } 1108 1109 /* infer end of current table cell */ 1110 if ( !nodeHasCM(node, CM_BLOCK) ) 1111 { 1112 UngetToken( doc ); 1113 TrimSpaces( doc, element ); 1114 return; 1115 } 1116 } 1117 else if ( nodeHasCM(node, CM_BLOCK) ) 1118 { 1119 if ( lexer->excludeBlocks ) 1120 { 1121 if ( !nodeHasCM(element, CM_OPT) ) 1122 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE ); 1123 1124 UngetToken( doc ); 1125 1126 if ( nodeHasCM(element, CM_OBJECT) ) 1127 lexer->istackbase = istackbase; 1128 1129 TrimSpaces( doc, element ); 1130 return; 1131 } 1132 } 1133 else /* things like list items */ 1134 { 1135 if (node->tag->model & CM_HEAD) 1136 { 1137 MoveToHead( doc, element, node ); 1138 continue; 1139 } 1140 1141 /* 1142 special case where a form start tag 1143 occurs in a tr and is followed by td or th 1144 */ 1145 1146 if ( nodeIsFORM(element) && 1147 nodeIsTD(element->parent) && 1148 element->parent->implicit ) 1149 { 1150 if ( nodeIsTD(node) ) 1151 { 1152 ReportError(doc, element, node, DISCARDING_UNEXPECTED ); 1153 FreeNode( doc, node ); 1154 continue; 1155 } 1156 1157 if ( nodeIsTH(node) ) 1158 { 1159 ReportError(doc, element, node, DISCARDING_UNEXPECTED ); 1160 FreeNode( doc, node ); 1161 node = element->parent; 1162 MemFree(node->element); 1163 node->element = tmbstrdup("th"); 1164 node->tag = LookupTagDef( TidyTag_TH ); 1165 continue; 1166 } 1167 } 1168 1169 if ( !nodeHasCM(element, CM_OPT) && !element->implicit ) 1170 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE ); 1171 1172 UngetToken( doc ); 1173 1174 if ( nodeHasCM(node, CM_LIST) ) 1175 { 1176 if ( element->parent && element->parent->tag && 1177 element->parent->tag->parser == ParseList ) 1178 { 1179 TrimSpaces( doc, element ); 1180 return; 1181 } 1182 1183 node = InferredTag(doc, TidyTag_UL); 1184 /* AddClass( doc, node, "noindent" ); */ 1185 } 1186 else if ( nodeHasCM(node, CM_DEFLIST) ) 1187 { 1188 if ( nodeIsDL(element->parent) ) 1189 { 1190 TrimSpaces( doc, element ); 1191 return; 1192 } 1193 1194 node = InferredTag(doc, TidyTag_DL); 1195 } 1196 else if ( nodeHasCM(node, CM_TABLE) || nodeHasCM(node, CM_ROW) ) 1197 { 1198 node = InferredTag(doc, TidyTag_TABLE); 1199 } 1200 else if ( nodeHasCM(element, CM_OBJECT) ) 1201 { 1202 /* pop inline stack */ 1203 while ( lexer->istacksize > lexer->istackbase ) 1204 PopInline( doc, NULL ); 1205 lexer->istackbase = istackbase; 1206 TrimSpaces( doc, element ); 1207 return; 1208 1209 } 1210 else 1211 { 1212 TrimSpaces( doc, element ); 1213 return; 1214 } 1215 } 1216 } 1217 1218 /* parse known element */ 1219 if (nodeIsElement(node)) 1220 { 1221 if (node->tag->model & CM_INLINE) 1222 { 1223 if (checkstack && !node->implicit) 1224 { 1225 checkstack = no; 1226 1227 if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */ 1228 { 1229 if ( InlineDup(doc, node) > 0 ) 1230 continue; 1231 } 1232 } 1233 1234 mode = MixedContent; 1235 } 1236 else 1237 { 1238 checkstack = yes; 1239 mode = IgnoreWhitespace; 1240 } 1241 1242 /* trim white space before <br> */ 1243 if ( nodeIsBR(node) ) 1244 TrimSpaces( doc, element ); 1245 1246 InsertNodeAtEnd(element, node); 1247 1248 if (node->implicit) 1249 ReportError(doc, element, node, INSERTING_TAG ); 1250 1251 ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ ); 1252 continue; 1253 } 1254 1255 /* discard unexpected tags */ 1256 if (node->type == EndTag) 1257 PopInline( doc, node ); /* if inline end tag */ 1258 1259 ReportError(doc, element, node, DISCARDING_UNEXPECTED ); 1260 FreeNode( doc, node ); 1261 continue; 1262 } 1263 1264 if (!(element->tag->model & CM_OPT)) 1265 ReportError(doc, element, node, MISSING_ENDTAG_FOR); 1266 1267 if (element->tag->model & CM_OBJECT) 1268 { 1269 /* pop inline stack */ 1270 while ( lexer->istacksize > lexer->istackbase ) 1271 PopInline( doc, NULL ); 1272 lexer->istackbase = istackbase; 1273 } 1274 1275 TrimSpaces( doc, element ); 1276 } 1277 1278 void ParseInline( TidyDocImpl* doc, Node *element, uint mode ) 1279 { 1280 Lexer* lexer = doc->lexer; 1281 Node *node, *parent; 1282 1283 if (element->tag->model & CM_EMPTY) 1284 return; 1285 1286 /* 1287 ParseInline is used for some block level elements like H1 to H6 1288 For such elements we need to insert inline emphasis tags currently 1289 on the inline stack. For Inline elements, we normally push them 1290 onto the inline stack provided they aren't implicit or OBJECT/APPLET. 1291 This test is carried out in PushInline and PopInline, see istack.c 1292 1293 InlineDup(...) is not called for elements with a CM_MIXED (inline and 1294 block) content model, e.g. <del> or <ins>, otherwise constructs like 1295 1296 <p>111<a name='foo'>222<del>333</del>444</a>555</p> 1297 <p>111<span>222<del>333</del>444</span>555</p> 1298 <p>111<em>222<del>333</del>444</em>555</p> 1299 1300 will get corrupted. 1301 */ 1302 if ((nodeHasCM(element, CM_BLOCK) || nodeIsDT(element)) && 1303 !nodeHasCM(element, CM_MIXED)) 1304 InlineDup(doc, NULL); 1305 else if (nodeHasCM(element, CM_INLINE)) 1306 PushInline(doc, element); 1307 1308 if ( nodeIsNOBR(element) ) 1309 doc->badLayout |= USING_NOBR; 1310 else if ( nodeIsFONT(element) ) 1311 doc->badLayout |= USING_FONT; 1312 1313 /* Inline elements may or may not be within a preformatted element */ 1314 if (mode != Preformatted) 1315 mode = MixedContent; 1316 1317 while ((node = GetToken(doc, mode)) != NULL) 1318 { 1319 /* end tag for current element */ 1320 if (node->tag == element->tag && node->type == EndTag) 1321 { 1322 if (element->tag->model & CM_INLINE) 1323 PopInline( doc, node ); 1324 1325 FreeNode( doc, node ); 1326 1327 if (!(mode & Preformatted)) 1328 TrimSpaces(doc, element); 1329 1330 /* 1331 if a font element wraps an anchor and nothing else 1332 then move the font element inside the anchor since 1333 otherwise it won't alter the anchor text color 1334 */ 1335 if ( nodeIsFONT(element) && 1336 element->content && element->content == element->last ) 1337 { 1338 Node *child = element->content; 1339 1340 if ( nodeIsA(child) ) 1341 { 1342 child->parent = element->parent; 1343 child->next = element->next; 1344 child->prev = element->prev; 1345 1346 element->next = NULL; 1347 element->prev = NULL; 1348 element->parent = child; 1349 1350 element->content = child->content; 1351 element->last = child->last; 1352 child->content = element; 1353 1354 FixNodeLinks(child); 1355 FixNodeLinks(element); 1356 } 1357 } 1358 1359 element->closed = yes; 1360 TrimSpaces( doc, element ); 1361 return; 1362 } 1363 1364 /* <u>...<u> map 2nd <u> to </u> if 1st is explicit */ 1365 /* otherwise emphasis nesting is probably unintentional */ 1366 /* big, small, sub, sup have cumulative effect to leave them alone */ 1367 if ( node->type == StartTag 1368 && node->tag == element->tag 1369 && IsPushed( doc, node ) 1370 && !node->implicit 1371 && !element->implicit 1372 && node->tag && (node->tag->model & CM_INLINE) 1373 && !nodeIsA(node) 1374 && !nodeIsFONT(node) 1375 && !nodeIsBIG(node) 1376 && !nodeIsSMALL(node) 1377 && !nodeIsSUB(node) 1378 && !nodeIsSUP(node) 1379 && !nodeIsQ(node) 1380 && !nodeIsSPAN(node) 1381 ) 1382 { 1383 if (element->content != NULL && node->attributes == NULL) 1384 { 1385 ReportWarning(doc, element, node, COERCE_TO_ENDTAG_WARN); 1386 node->type = EndTag; 1387 UngetToken(doc); 1388 continue; 1389 } 1390 1391 if (node->attributes == NULL || element->attributes == NULL) 1392 ReportWarning(doc, element, node, NESTED_EMPHASIS); 1393 } 1394 else if ( IsPushed(doc, node) && node->type == StartTag && 1395 nodeIsQ(node) ) 1396 { 1397 ReportWarning(doc, element, node, NESTED_QUOTATION); 1398 } 1399 1400 if ( nodeIsText(node) ) 1401 { 1402 /* only called for 1st child */ 1403 if ( element->content == NULL && !(mode & Preformatted) ) 1404 TrimSpaces( doc, element ); 1405 1406 if ( node->start >= node->end ) 1407 { 1408 FreeNode( doc, node ); 1409 continue; 1410 } 1411 1412 InsertNodeAtEnd(element, node); 1413 continue; 1414 } 1415 1416 /* mixed content model so allow text */ 1417 if (InsertMisc(element, node)) 1418 continue; 1419 1420 /* deal with HTML tags */ 1421 if ( nodeIsHTML(node) ) 1422 { 1423 if ( nodeIsElement(node) ) 1424 { 1425 ReportError(doc, element, node, DISCARDING_UNEXPECTED ); 1426 FreeNode( doc, node ); 1427 continue; 1428 } 1429 1430 /* otherwise infer end of inline element */ 1431 UngetToken( doc ); 1432 1433 if (!(mode & Preformatted)) 1434 TrimSpaces(doc, element); 1435 1436 return; 1437 } 1438 1439 /* within <dt> or <pre> map <p> to <br> */ 1440 if ( nodeIsP(node) && 1441 node->type == StartTag && 1442 ( (mode & Preformatted) || 1443 nodeIsDT(element) || 1444 DescendantOf(element, TidyTag_DT ) 1445 ) 1446 ) 1447 { 1448 node->tag = LookupTagDef( TidyTag_BR ); 1449 MemFree(node->element); 1450 node->element = tmbstrdup("br"); 1451 TrimSpaces(doc, element); 1452 InsertNodeAtEnd(element, node); 1453 continue; 1454 } 1455 1456 /* <p> allowed within <address> in HTML 4.01 Transitional */ 1457 if ( nodeIsP(node) && 1458 node->type == StartTag && 1459 nodeIsADDRESS(element) ) 1460 { 1461 ConstrainVersion( doc, ~VERS_HTML40_STRICT ); 1462 InsertNodeAtEnd(element, node); 1463 (*node->tag->parser)( doc, node, mode ); 1464 continue; 1465 } 1466 1467 /* ignore unknown and PARAM tags */ 1468 if ( node->tag == NULL || nodeIsPARAM(node) ) 1469 { 1470 ReportError(doc, element, node, DISCARDING_UNEXPECTED); 1471 FreeNode( doc, node ); 1472 continue; 1473 } 1474 1475 if ( nodeIsBR(node) && node->type == EndTag ) 1476 node->type = StartTag; 1477 1478 if ( node->type == EndTag ) 1479 { 1480 /* coerce </br> to <br> */ 1481 if ( nodeIsBR(node) ) 1482 node->type = StartTag; 1483 else if ( nodeIsP(node) ) 1484 { 1485 /* coerce unmatched </p> to <br><br> */ 1486 if ( !DescendantOf(element, TidyTag_P) ) 1487 { 1488 CoerceNode(doc, node, TidyTag_BR, no, no); 1489 TrimSpaces( doc, element ); 1490 InsertNodeAtEnd( element, node ); 1491 node = InferredTag(doc, TidyTag_BR); 1492 InsertNodeAtEnd( element, node ); /* todo: check this */ 1493 continue; 1494 } 1495 } 1496 else if ( nodeHasCM(node, CM_INLINE) 1497 && !nodeIsA(node) 1498 && !nodeHasCM(node, CM_OBJECT) 1499 && nodeHasCM(element, CM_INLINE) ) 1500 { 1501 /* allow any inline end tag to end current element */ 1502 PopInline( doc, element ); 1503 1504 if ( !nodeIsA(element) ) 1505 { 1506 if ( nodeIsA(node) && node->tag != element->tag ) 1507 { 1508 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE ); 1509 UngetToken( doc ); 1510 } 1511 else 1512 { 1513 ReportError(doc, element, node, NON_MATCHING_ENDTAG); 1514 FreeNode( doc, node); 1515 } 1516 1517 if (!(mode & Preformatted)) 1518 TrimSpaces(doc, element); 1519 1520 return; 1521 } 1522 1523 /* if parent is <a> then discard unexpected inline end tag */ 1524 ReportError(doc, element, node, DISCARDING_UNEXPECTED); 1525 FreeNode( doc, node); 1526 continue; 1527 } /* special case </tr> etc. for stuff moved in front of table */ 1528 else if ( lexer->exiled 1529 && node->tag->model 1530 && (node->tag->model & CM_TABLE) ) 1531 { 1532 UngetToken( doc ); 1533 TrimSpaces(doc, element); 1534 return; 1535 } 1536 } 1537 1538 /* allow any header tag to end current header */ 1539 if ( nodeHasCM(node, CM_HEADING) && nodeHasCM(element, CM_HEADING) ) 1540 { 1541 1542 if ( node->tag == element->tag ) 1543 { 1544 ReportError(doc, element, node, NON_MATCHING_ENDTAG ); 1545 FreeNode( doc, node); 1546 } 1547 else 1548 { 1549 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE ); 1550 UngetToken( doc ); 1551 } 1552 1553 if (!(mode & Preformatted)) 1554 TrimSpaces(doc, element); 1555 1556 return; 1557 } 1558 1559 /* 1560 an <A> tag to ends any open <A> element 1561 but <A href=...> is mapped to </A><A href=...> 1562 */ 1563 /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */ 1564 /* if (node->tag == doc->tags.tag_a && !node->implicit && IsPushed(doc, node)) */ 1565 if ( nodeIsA(node) && !node->implicit && 1566 (nodeIsA(element) || DescendantOf(element, TidyTag_A)) ) 1567 { 1568 /* coerce <a> to </a> unless it has some attributes */ 1569 /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */ 1570 /* other fixes by Dave Raggett */ 1571 /* if (node->attributes == NULL) */ 1572 if (node->type != EndTag && node->attributes == NULL) 1573 { 1574 node->type = EndTag; 1575 ReportError(doc, element, node, COERCE_TO_ENDTAG); 1576 /* PopInline( doc, node ); */ 1577 UngetToken( doc ); 1578 continue; 1579 } 1580 1581 UngetToken( doc ); 1582 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE); 1583 /* PopInline( doc, element ); */ 1584 1585 if (!(mode & Preformatted)) 1586 TrimSpaces(doc, element); 1587 1588 return; 1589 } 1590 1591 if (element->tag->model & CM_HEADING) 1592 { 1593 if ( nodeIsCENTER(node) || nodeIsDIV(node) ) 1594 { 1595 if (!nodeIsElement(node)) 1596 { 1597 ReportError(doc, element, node, DISCARDING_UNEXPECTED); 1598 FreeNode( doc, node); 1599 continue; 1600 } 1601 1602 ReportError(doc, element, node, TAG_NOT_ALLOWED_IN); 1603 1604 /* insert center as parent if heading is empty */ 1605 if (element->content == NULL) 1606 { 1607 InsertNodeAsParent(element, node); 1608 continue; 1609 } 1610 1611 /* split heading and make center parent of 2nd part */ 1612 InsertNodeAfterElement(element, node); 1613 1614 if (!(mode & Preformatted)) 1615 TrimSpaces(doc, element); 1616 1617 element = CloneNode( doc, element ); 1618 InsertNodeAtEnd(node, element); 1619 continue; 1620 } 1621 1622 if ( nodeIsHR(node) ) 1623 { 1624 if ( !nodeIsElement(node) ) 1625 { 1626 ReportError(doc, element, node, DISCARDING_UNEXPECTED); 1627 FreeNode( doc, node); 1628 continue; 1629 } 1630 1631 ReportError(doc, element, node, TAG_NOT_ALLOWED_IN); 1632 1633 /* insert hr before heading if heading is empty */ 1634 if (element->content == NULL) 1635 { 1636 InsertNodeBeforeElement(element, node); 1637 continue; 1638 } 1639 1640 /* split heading and insert hr before 2nd part */ 1641 InsertNodeAfterElement(element, node); 1642 1643 if (!(mode & Preformatted)) 1644 TrimSpaces(doc, element); 1645 1646 element = CloneNode( doc, element ); 1647 InsertNodeAfterElement(node, element); 1648 continue; 1649 } 1650 } 1651 1652 if ( nodeIsDT(element) ) 1653 { 1654 if ( nodeIsHR(node) ) 1655 { 1656 Node *dd; 1657 if ( !nodeIsElement(node) ) 1658 { 1659 ReportError(doc, element, node, DISCARDING_UNEXPECTED); 1660 FreeNode( doc, node); 1661 continue; 1662 } 1663 1664 ReportError(doc, element, node, TAG_NOT_ALLOWED_IN); 1665 dd = InferredTag(doc, TidyTag_DD); 1666 1667 /* insert hr within dd before dt if dt is empty */ 1668 if (element->content == NULL) 1669 { 1670 InsertNodeBeforeElement(element, dd); 1671 InsertNodeAtEnd(dd, node); 1672 continue; 1673 } 1674 1675 /* split dt and insert hr within dd before 2nd part */ 1676 InsertNodeAfterElement(element, dd); 1677 InsertNodeAtEnd(dd, node); 1678 1679 if (!(mode & Preformatted)) 1680 TrimSpaces(doc, element); 1681 1682 element = CloneNode( doc, element ); 1683 InsertNodeAfterElement(dd, element); 1684 continue; 1685 } 1686 } 1687 1688 1689 /* 1690 if this is the end tag for an ancestor element 1691 then infer end tag for this element 1692 */ 1693 if (node->type == EndTag) 1694 { 1695 for (parent = element->parent; 1696 parent != NULL; parent = parent->parent) 1697 { 1698 if (node->tag == parent->tag) 1699 { 1700 if (!(element->tag->model & CM_OPT) && !element->implicit) 1701 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE); 1702 1703 PopInline( doc, element ); 1704 UngetToken( doc ); 1705 1706 if (!(mode & Preformatted)) 1707 TrimSpaces(doc, element); 1708 1709 return; 1710 } 1711 } 1712 } 1713 1714 /* block level tags end this element */ 1715 if (!(node->tag->model & CM_INLINE) && 1716 !(element->tag->model & CM_MIXED)) 1717 { 1718 if ( !nodeIsElement(node) ) 1719 { 1720 ReportError(doc, element, node, DISCARDING_UNEXPECTED); 1721 FreeNode( doc, node); 1722 continue; 1723 } 1724 1725 if (!(element->tag->model & CM_OPT)) 1726 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE); 1727 1728 if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK)) 1729 { 1730 MoveToHead(doc, element, node); 1731 continue; 1732 } 1733 1734 /* 1735 prevent anchors from propagating into block tags 1736 except for headings h1 to h6 1737 */ 1738 if ( nodeIsA(element) ) 1739 { 1740 if (node->tag && !(node->tag->model & CM_HEADING)) 1741 PopInline( doc, element ); 1742 else if (!(element->content)) 1743 { 1744 DiscardElement( doc, element ); 1745 UngetToken( doc ); 1746 return; 1747 } 1748 } 1749 1750 UngetToken( doc ); 1751 1752 if (!(mode & Preformatted)) 1753 TrimSpaces(doc, element); 1754 1755 return; 1756 } 1757 1758 /* parse inline element */ 1759 if (nodeIsElement(node)) 1760 { 1761 if (node->implicit) 1762 ReportError(doc, element, node, INSERTING_TAG); 1763 1764 /* trim white space before <br> */ 1765 if ( nodeIsBR(node) ) 1766 TrimSpaces(doc, element); 1767 1768 InsertNodeAtEnd(element, node); 1769 ParseTag(doc, node, mode); 1770 continue; 1771 } 1772 1773 /* discard unexpected tags */ 1774 ReportError(doc, element, node, DISCARDING_UNEXPECTED); 1775 FreeNode( doc, node ); 1776 continue; 1777 } 1778 1779 if (!(element->tag->model & CM_OPT)) 1780 ReportError(doc, element, node, MISSING_ENDTAG_FOR); 1781 1782 } 1783 1784 void ParseEmpty(TidyDocImpl* doc, Node *element, uint mode) 1785 { 1786 Lexer* lexer = doc->lexer; 1787 if ( lexer->isvoyager ) 1788 { 1789 Node *node = GetToken( doc, mode); 1790 if ( node ) 1791 { 1792 if ( !(node->type == EndTag && node->tag == element->tag) ) 1793 { 1794 ReportError(doc, element, node, ELEMENT_NOT_EMPTY); 1795 UngetToken( doc ); 1796 } 1797 else 1798 { 1799 FreeNode( doc, node ); 1800 } 1801 } 1802 } 1803 } 1804 1805 void ParseDefList(TidyDocImpl* doc, Node *list, uint mode) 1806 { 1807 Lexer* lexer = doc->lexer; 1808 Node *node, *parent; 1809 1810 if (list->tag->model & CM_EMPTY) 1811 return; 1812 1813 lexer->insert = NULL; /* defer implicit inline start tags */ 1814 1815 while ((node = GetToken( doc, IgnoreWhitespace)) != NULL) 1816 { 1817 if (node->tag == list->tag && node->type == EndTag) 1818 { 1819 FreeNode( doc, node); 1820 list->closed = yes; 1821 return; 1822 } 1823 1824 /* deal with comments etc. */ 1825 if (InsertMisc(list, node)) 1826 continue; 1827 1828 if (nodeIsText(node)) 1829 { 1830 UngetToken( doc ); 1831 node = InferredTag(doc, TidyTag_DT); 1832 ReportError(doc, list, node, MISSING_STARTTAG); 1833 } 1834 1835 if (node->tag == NULL) 1836 { 1837 ReportError(doc, list, node, DISCARDING_UNEXPECTED); 1838 FreeNode( doc, node); 1839 continue; 1840 } 1841 1842 /* 1843 if this is the end tag for an ancestor element 1844 then infer end tag for this element 1845 */ 1846 if (node->type == EndTag) 1847 { 1848 Bool discardIt = no; 1849 if ( nodeIsFORM(node) ) 1850 { 1851 BadForm( doc ); 1852 ReportError(doc, list, node, DISCARDING_UNEXPECTED); 1853 FreeNode( doc, node ); 1854 continue; 1855 } 1856 1857 for (parent = list->parent; 1858 parent != NULL; parent = parent->parent) 1859 { 1860 /* Do not match across BODY to avoid infinite loop 1861 between ParseBody and this parser, 1862 See http://tidy.sf.net/bug/1098012. */ 1863 if (nodeIsBODY(parent)) 1864 { 1865 discardIt = yes; 1866 break; 1867 } 1868 if (node->tag == parent->tag) 1869 { 1870 ReportError(doc, list, node, MISSING_ENDTAG_BEFORE); 1871 1872 UngetToken( doc ); 1873 return; 1874 } 1875 } 1876 if (discardIt) 1877 { 1878 ReportError(doc, list, node, DISCARDING_UNEXPECTED); 1879 FreeNode( doc, node); 1880 continue; 1881 } 1882 } 1883 1884 /* center in a dt or a dl breaks the dl list in two */ 1885 if ( nodeIsCENTER(node) ) 1886 { 1887 if (list->content) 1888 InsertNodeAfterElement(list, node); 1889 else /* trim empty dl list */ 1890 { 1891 InsertNodeBeforeElement(list, node); 1892 1893 /* #540296 tidy dumps with empty definition list */ 1894 #if 0 1895 DiscardElement(list); 1896 #endif 1897 } 1898 1899 /* #426885 - fix by Glenn Carroll 19 Apr 00, and 1900 Gary Dechaines 11 Aug 00 */ 1901 /* ParseTag can destroy node, if it finds that 1902 * this <center> is followed immediately by </center>. 1903 * It's awkward but necessary to determine if this 1904 * has happened. 1905 */ 1906 parent = node->parent; 1907 1908 /* and parse contents of center */ 1909 lexer->excludeBlocks = no; 1910 ParseTag( doc, node, mode); 1911 lexer->excludeBlocks = yes; 1912 1913 /* now create a new dl element, 1914 * unless node has been blown away because the 1915 * center was empty, as above. 1916 */ 1917 if (parent->last == node) 1918 { 1919 list = InferredTag(doc, TidyTag_DL); 1920 InsertNodeAfterElement(node, list); 1921 } 1922 continue; 1923 } 1924 1925 if ( !(nodeIsDT(node) || nodeIsDD(node)) ) 1926 { 1927 UngetToken( doc ); 1928 1929 if (!(node->tag->model & (CM_BLOCK | CM_INLINE))) 1930 { 1931 ReportError(doc, list, node, TAG_NOT_ALLOWED_IN); 1932 return; 1933 } 1934 1935 /* if DD appeared directly in BODY then exclude blocks */ 1936 if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks) 1937 return; 1938 1939 node = InferredTag(doc, TidyTag_DD); 1940 ReportError(doc, list, node, MISSING_STARTTAG); 1941 } 1942 1943 if (node->type == EndTag) 1944 { 1945 ReportError(doc, list, node, DISCARDING_UNEXPECTED); 1946 FreeNode( doc, node); 1947 continue; 1948 } 1949 1950 /* node should be <DT> or <DD>*/ 1951 InsertNodeAtEnd(list, node); 1952 ParseTag( doc, node, IgnoreWhitespace); 1953 } 1954 1955 ReportError(doc, list, node, MISSING_ENDTAG_FOR); 1956 } 1957 1958 void ParseList(TidyDocImpl* doc, Node *list, uint ARG_UNUSED(mode)) 1959 { 1960 Lexer* lexer = doc->lexer; 1961 Node *node, *parent; 1962 1963 if (list->tag->model & CM_EMPTY) 1964 return; 1965 1966 lexer->insert = NULL; /* defer implicit inline start tags */ 1967 1968 while ((node = GetToken( doc, IgnoreWhitespace)) != NULL) 1969 { 1970 if (node->tag == list->tag && node->type == EndTag) 1971 { 1972 FreeNode( doc, node); 1973 list->closed = yes; 1974 return; 1975 } 1976 1977 /* deal with comments etc. */ 1978 if (InsertMisc(list, node)) 1979 continue; 1980 1981 if (node->type != TextNode && node->tag == NULL) 1982 { 1983 ReportError(doc, list, node, DISCARDING_UNEXPECTED); 1984 FreeNode( doc, node); 1985 continue; 1986 } 1987 1988 /* 1989 if this is the end tag for an ancestor element 1990 then infer end tag for this element 1991 */ 1992 if (node->type == EndTag) 1993 { 1994 if ( nodeIsFORM(node) ) 1995 { 1996 BadForm( doc ); 1997 ReportError(doc, list, node, DISCARDING_UNEXPECTED); 1998 FreeNode( doc, node ); 1999 continue; 2000 } 2001 2002 if (node->tag && node->tag->model & CM_INLINE) 2003 { 2004 ReportError(doc, list, node, DISCARDING_UNEXPECTED); 2005 PopInline( doc, node ); 2006 FreeNode( doc, node); 2007 continue; 2008 } 2009 2010 for ( parent = list->parent; 2011 parent != NULL; parent = parent->parent ) 2012 { 2013 /* Do not match across BODY to avoid infinite loop 2014 between ParseBody and this parser, 2015 See http://tidy.sf.net/bug/1053626. */ 2016 if (nodeIsBODY(parent)) 2017 break; 2018 if (node->tag == parent->tag) 2019 { 2020 ReportError(doc, list, node, MISSING_ENDTAG_BEFORE); 2021 UngetToken( doc ); 2022 return; 2023 } 2024 } 2025 2026 ReportError(doc, list, node, DISCARDING_UNEXPECTED); 2027 FreeNode( doc, node); 2028 continue; 2029 } 2030 2031 if ( !nodeIsLI(node) ) 2032 { 2033 UngetToken( doc ); 2034 2035 if (node->tag && (node->tag->model & CM_BLOCK) && lexer->excludeBlocks) 2036 { 2037 ReportError(doc, list, node, MISSING_ENDTAG_BEFORE); 2038 return; 2039 } 2040 2041 node = InferredTag(doc, TidyTag_LI); 2042 AddAttribute( doc, node, "style", "list-style: none" ); 2043 ReportError(doc, list, node, MISSING_STARTTAG ); 2044 } 2045 2046 /* node should be <LI> */ 2047 InsertNodeAtEnd(list,node); 2048 ParseTag( doc, node, IgnoreWhitespace); 2049 } 2050 2051 ReportError(doc, list, node, MISSING_ENDTAG_FOR); 2052 } 2053 2054 /* 2055 unexpected content in table row is moved to just before 2056 the table in accordance with Netscape and IE. This code 2057 assumes that node hasn't been inserted into the row. 2058 */ 2059 static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row, 2060 Node *node ) 2061 { 2062 Node *table; 2063 2064 /* first find the table element */ 2065 for (table = row->parent; table; table = table->parent) 2066 { 2067 if ( nodeIsTABLE(table) ) 2068 { 2069 InsertNodeBeforeElement( table, node ); 2070 return; 2071 } 2072 } 2073 /* No table element */ 2074 InsertNodeBeforeElement( row->parent, node ); 2075 } 2076 2077 /* 2078 if a table row is empty then insert an empty cell 2079 this practice is consistent with browser behavior 2080 and avoids potential problems with row spanning cells 2081 */ 2082 static void FixEmptyRow(TidyDocImpl* doc, Node *row) 2083 { 2084 Node *cell; 2085 2086 if (row->content == NULL) 2087 { 2088 cell = InferredTag(doc, TidyTag_TD); 2089 InsertNodeAtEnd(row, cell); 2090 ReportError(doc, row, cell, MISSING_STARTTAG); 2091 } 2092 } 2093 2094 void ParseRow(TidyDocImpl* doc, Node *row, uint ARG_UNUSED(mode)) 2095 { 2096 Lexer* lexer = doc->lexer; 2097 Node *node; 2098 Bool exclude_state; 2099 2100 if (row->tag->model & CM_EMPTY) 2101 return; 2102 2103 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL) 2104 { 2105 if (node->tag == row->tag) 2106 { 2107 if (node->type == EndTag) 2108 { 2109 FreeNode( doc, node); 2110 row->closed = yes; 2111 FixEmptyRow( doc, row); 2112 return; 2113 } 2114 2115 /* New row start implies end of current row */ 2116 UngetToken( doc ); 2117 FixEmptyRow( doc, row); 2118 return; 2119 } 2120 2121 /* 2122 if this is the end tag for an ancestor element 2123 then infer end tag for this element 2124 */ 2125 if ( node->type == EndTag ) 2126 { 2127 if ( DescendantOf(row, TagId(node)) ) 2128 { 2129 UngetToken( doc ); 2130 return; 2131 } 2132 2133 if ( nodeIsFORM(node) || nodeHasCM(node, CM_BLOCK|CM_INLINE) ) 2134 { 2135 if ( nodeIsFORM(node) ) 2136 BadForm( doc ); 2137 2138 ReportError(doc, row, node, DISCARDING_UNEXPECTED); 2139 FreeNode( doc, node); 2140 continue; 2141 } 2142 2143 if ( nodeIsTD(node) || nodeIsTH(node) ) 2144 { 2145 ReportError(doc, row, node, DISCARDING_UNEXPECTED); 2146 FreeNode( doc, node); 2147 continue; 2148 } 2149 } 2150 2151 /* deal with comments etc. */ 2152 if (InsertMisc(row, node)) 2153 continue; 2154 2155 /* discard unknown tags */ 2156 if (node->tag == NULL && node->type != TextNode) 2157 { 2158 ReportError(doc, row, node, DISCARDING_UNEXPECTED); 2159 FreeNode( doc, node); 2160 continue; 2161 } 2162 2163 /* discard unexpected <table> element */ 2164 if ( nodeIsTABLE(node) ) 2165 { 2166 ReportError(doc, row, node, DISCARDING_UNEXPECTED); 2167 FreeNode( doc, node); 2168 continue; 2169 } 2170 2171 /* THEAD, TFOOT or TBODY */ 2172 if ( nodeHasCM(node, CM_ROWGRP) ) 2173 { 2174 UngetToken( doc ); 2175 return; 2176 } 2177 2178 if (node->type == EndTag) 2179 { 2180 ReportError(doc, row, node, DISCARDING_UNEXPECTED); 2181 FreeNode( doc, node); 2182 continue; 2183 } 2184 2185 /* 2186 if text or inline or block move before table 2187 if head content move to head 2188 */ 2189 2190 if (node->type != EndTag) 2191 { 2192 if ( nodeIsFORM(node) ) 2193 { 2194 UngetToken( doc ); 2195 node = InferredTag(doc, TidyTag_TD); 2196 ReportError(doc, row, node, MISSING_STARTTAG); 2197 } 2198 else if ( nodeIsText(node) 2199 || nodeHasCM(node, CM_BLOCK | CM_INLINE) ) 2200 { 2201 MoveBeforeTable( doc, row, node ); 2202 ReportError(doc, row, node, TAG_NOT_ALLOWED_IN); 2203 lexer->exiled = yes; 2204 2205 if (node->type != TextNode) 2206 ParseTag( doc, node, IgnoreWhitespace); 2207 2208 lexer->exiled = no; 2209 continue; 2210 } 2211 else if (node->tag->model & CM_HEAD) 2212 { 2213 ReportError(doc, row, node, TAG_NOT_ALLOWED_IN); 2214 MoveToHead( doc, row, node); 2215 continue; 2216 } 2217 } 2218 2219 if ( !(nodeIsTD(node) || nodeIsTH(node)) ) 2220 { 2221 ReportError(doc, row, node, TAG_NOT_ALLOWED_IN); 2222 FreeNode( doc, node); 2223 continue; 2224 } 2225 2226 /* node should be <TD> or <TH> */ 2227 InsertNodeAtEnd(row, node); 2228 exclude_state = lexer->excludeBlocks; 2229 lexer->excludeBlocks = no; 2230 ParseTag( doc, node, IgnoreWhitespace); 2231 lexer->excludeBlocks = exclude_state; 2232 2233 /* pop inline stack */ 2234 2235 while ( lexer->istacksize > lexer->istackbase ) 2236 PopInline( doc, NULL ); 2237 } 2238 2239 } 2240 2241 void ParseRowGroup(TidyDocImpl* doc, Node *rowgroup, uint ARG_UNUSED(mode)) 2242 { 2243 Lexer* lexer = doc->lexer; 2244 Node *node, *parent; 2245 2246 if (rowgroup->tag->model & CM_EMPTY) 2247 return; 2248 2249 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL) 2250 { 2251 if (node->tag == rowgroup->tag) 2252 { 2253 if (node->type == EndTag) 2254 { 2255 rowgroup->closed = yes; 2256 FreeNode( doc, node); 2257 return; 2258 } 2259 2260 UngetToken( doc ); 2261 return; 2262 } 2263 2264 /* if </table> infer end tag */ 2265 if ( nodeIsTABLE(node) && node->type == EndTag ) 2266 { 2267 UngetToken( doc ); 2268 return; 2269 } 2270 2271 /* deal with comments etc. */ 2272 if (InsertMisc(rowgroup, node)) 2273 continue; 2274 2275 /* discard unknown tags */ 2276 if (node->tag == NULL && node->type != TextNode) 2277 { 2278 ReportError(doc, rowgroup, node, DISCARDING_UNEXPECTED); 2279 FreeNode( doc, node); 2280 continue; 2281 } 2282 2283 /* 2284 if TD or TH then infer <TR> 2285 if text or inline or block move before table 2286 if head content move to head 2287 */ 2288 2289 if (node->type != EndTag) 2290 { 2291 if ( nodeIsTD(node) || nodeIsTH(node) ) 2292 { 2293 UngetToken( doc ); 2294 node = InferredTag(doc, TidyTag_TR); 2295 ReportError(doc, rowgroup, node, MISSING_STARTTAG); 2296 } 2297 else if ( nodeIsText(node) 2298 || nodeHasCM(node, CM_BLOCK|CM_INLINE) ) 2299 { 2300 MoveBeforeTable( doc, rowgroup, node ); 2301 ReportError(doc, rowgroup, node, TAG_NOT_ALLOWED_IN); 2302 lexer->exiled = yes; 2303 2304 if (node->type != TextNode) 2305 ParseTag(doc, node, IgnoreWhitespace); 2306 2307 lexer->exiled = no; 2308 continue; 2309 } 2310 else if (node->tag->model & CM_HEAD) 2311 { 2312 ReportError(doc, rowgroup, node, TAG_NOT_ALLOWED_IN); 2313 MoveToHead(doc, rowgroup, node); 2314 continue; 2315 } 2316 } 2317 2318 /* 2319 if this is the end tag for ancestor element 2320 then infer end tag for this element 2321 */ 2322 if (node->type == EndTag) 2323 { 2324 if ( nodeIsFORM(node) || nodeHasCM(node, CM_BLOCK|CM_INLINE) ) 2325 { 2326 if ( nodeIsFORM(node) ) 2327 BadForm( doc ); 2328 2329 ReportError(doc, rowgroup, node, DISCARDING_UNEXPECTED); 2330 FreeNode( doc, node); 2331 continue; 2332 } 2333 2334 if ( nodeIsTR(node) || nodeIsTD(node) || nodeIsTH(node) ) 2335 { 2336 ReportError(doc, rowgroup, node, DISCARDING_UNEXPECTED); 2337 FreeNode( doc, node); 2338 continue; 2339 } 2340 2341 for ( parent = rowgroup->parent; 2342 parent != NULL; 2343 parent = parent->parent ) 2344 { 2345 if (node->tag == parent->tag) 2346 { 2347 UngetToken( doc ); 2348 return; 2349 } 2350 } 2351 } 2352 2353 /* 2354 if THEAD, TFOOT or TBODY then implied end tag 2355 2356 */ 2357 if (node->tag->model & CM_ROWGRP) 2358 { 2359 if (node->type != EndTag) 2360 { 2361 UngetToken( doc ); 2362 return; 2363 } 2364 } 2365 2366 if (node->type == EndTag) 2367 { 2368 ReportError(doc, rowgroup, node, DISCARDING_UNEXPECTED); 2369 FreeNode( doc, node); 2370 continue; 2371 } 2372 2373 if ( !nodeIsTR(node) ) 2374 { 2375 node = InferredTag(doc, TidyTag_TR); 2376 ReportError(doc, rowgroup, node, MISSING_STARTTAG); 2377 UngetToken( doc ); 2378 } 2379 2380 /* node should be <TR> */ 2381 InsertNodeAtEnd(rowgroup, node); 2382 ParseTag(doc, node, IgnoreWhitespace); 2383 } 2384 2385 } 2386 2387 void ParseColGroup(TidyDocImpl* doc, Node *colgroup, uint ARG_UNUSED(mode)) 2388 { 2389 Node *node, *parent; 2390 2391 if (colgroup->tag->model & CM_EMPTY) 2392 return; 2393 2394 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL) 2395 { 2396 if (node->tag == colgroup->tag && node->type == EndTag) 2397 { 2398 FreeNode( doc, node); 2399 colgroup->closed = yes; 2400 return; 2401 } 2402 2403 /* 2404 if this is the end tag for an ancestor element 2405 then infer end tag for this element 2406 */ 2407 if (node->type == EndTag) 2408 { 2409 if ( nodeIsFORM(node) ) 2410 { 2411 BadForm( doc ); 2412 ReportError(doc, colgroup, node, DISCARDING_UNEXPECTED); 2413 FreeNode( doc, node); 2414 continue; 2415 } 2416 2417 for ( parent = colgroup->parent; 2418 parent != NULL; 2419 parent = parent->parent ) 2420 { 2421 if (node->tag == parent->tag) 2422 { 2423 UngetToken( doc ); 2424 return; 2425 } 2426 } 2427 } 2428 2429 if (nodeIsText(node)) 2430 { 2431 UngetToken( doc ); 2432 return; 2433 } 2434 2435 /* deal with comments etc. */ 2436 if (InsertMisc(colgroup, node)) 2437 continue; 2438 2439 /* discard unknown tags */ 2440 if (node->tag == NULL) 2441 { 2442 ReportError(doc, colgroup, node, DISCARDING_UNEXPECTED); 2443 FreeNode( doc, node); 2444 continue; 2445 } 2446 2447 if ( !nodeIsCOL(node) ) 2448 { 2449 UngetToken( doc ); 2450 return; 2451 } 2452 2453 if (node->type == EndTag) 2454 { 2455 ReportError(doc, colgroup, node, DISCARDING_UNEXPECTED); 2456 FreeNode( doc, node); 2457 continue; 2458 } 2459 2460 /* node should be <COL> */ 2461 InsertNodeAtEnd(colgroup, node); 2462 ParseTag(doc, node, IgnoreWhitespace); 2463 } 2464 } 2465 2466 void ParseTableTag(TidyDocImpl* doc, Node *table, uint ARG_UNUSED(mode)) 2467 { 2468 Lexer* lexer = doc->lexer; 2469 Node *node, *parent; 2470 uint istackbase; 2471 2472 DeferDup( doc ); 2473 istackbase = lexer->istackbase; 2474 lexer->istackbase = lexer->istacksize; 2475 2476 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL) 2477 { 2478 if (node->tag == table->tag && node->type == EndTag) 2479 { 2480 FreeNode( doc, node); 2481 lexer->istackbase = istackbase; 2482 table->closed = yes; 2483 return; 2484 } 2485 2486 /* deal with comments etc. */ 2487 if (InsertMisc(table, node)) 2488 continue; 2489 2490 /* discard unknown tags */ 2491 if (node->tag == NULL && node->type != TextNode) 2492 { 2493 ReportError(doc, table, node, DISCARDING_UNEXPECTED); 2494 FreeNode( doc, node); 2495 continue; 2496 } 2497 2498 /* if TD or TH or text or inline or block then infer <TR> */ 2499 2500 if (node->type != EndTag) 2501 { 2502 if ( nodeIsTD(node) || nodeIsTH(node) || nodeIsTABLE(node) ) 2503 { 2504 UngetToken( doc ); 2505 node = InferredTag(doc, TidyTag_TR); 2506 ReportError(doc, table, node, MISSING_STARTTAG); 2507 } 2508 else if ( nodeIsText(node) ||nodeHasCM(node,CM_BLOCK|CM_INLINE) ) 2509 { 2510 InsertNodeBeforeElement(table, node); 2511 ReportError(doc, table, node, TAG_NOT_ALLOWED_IN); 2512 lexer->exiled = yes; 2513 2514 if (node->type != TextNode) 2515 ParseTag(doc, node, IgnoreWhitespace); 2516 2517 lexer->exiled = no; 2518 continue; 2519 } 2520 else if (node->tag->model & CM_HEAD) 2521 { 2522 MoveToHead(doc, table, node); 2523 continue; 2524 } 2525 } 2526 2527 /* 2528 if this is the end tag for an ancestor element 2529 then infer end tag for this element 2530 */ 2531 if (node->type == EndTag) 2532 { 2533 if ( nodeIsFORM(node) ) 2534 { 2535 BadForm( doc ); 2536 ReportError(doc, table, node, DISCARDING_UNEXPECTED); 2537 FreeNode( doc, node); 2538 continue; 2539 } 2540 2541 /* best to discard unexpected block/inline end tags */ 2542 if ( nodeHasCM(node, CM_TABLE|CM_ROW) || 2543 nodeHasCM(node, CM_BLOCK|CM_INLINE) ) 2544 { 2545 ReportError(doc, table, node, DISCARDING_UNEXPECTED); 2546 FreeNode( doc, node); 2547 continue; 2548 } 2549 2550 for ( parent = table->parent; 2551 parent != NULL; 2552 parent = parent->parent ) 2553 { 2554 if (node->tag == parent->tag) 2555 { 2556 ReportError(doc, table, node, MISSING_ENDTAG_BEFORE ); 2557 UngetToken( doc ); 2558 lexer->istackbase = istackbase; 2559 return; 2560 } 2561 } 2562 } 2563 2564 if (!(node->tag->model & CM_TABLE)) 2565 { 2566 UngetToken( doc ); 2567 ReportError(doc, table, node, TAG_NOT_ALLOWED_IN); 2568 lexer->istackbase = istackbase; 2569 return; 2570 } 2571 2572 if (nodeIsElement(node)) 2573 { 2574 InsertNodeAtEnd(table, node); 2575 ParseTag(doc, node, IgnoreWhitespace); 2576 continue; 2577 } 2578 2579 /* discard unexpected text nodes and end tags */ 2580 ReportError(doc, table, node, DISCARDING_UNEXPECTED); 2581 FreeNode( doc, node); 2582 } 2583 2584 ReportError(doc, table, node, MISSING_ENDTAG_FOR); 2585 lexer->istackbase = istackbase; 2586 } 2587 2588 /* acceptable content for pre elements */ 2589 Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node ) 2590 { 2591 /* p is coerced to br's, Text OK too */ 2592 if ( nodeIsP(node) || nodeIsText(node) ) 2593 return yes; 2594 2595 if ( node->tag == NULL || 2596 nodeIsPARAM(node) || 2597 !nodeHasCM(node, CM_INLINE|CM_NEW) ) 2598 return no; 2599 2600 return yes; 2601 } 2602 2603 void ParsePre( TidyDocImpl* doc, Node *pre, uint ARG_UNUSED(mode) ) 2604 { 2605 Node *node; 2606 2607 if (pre->tag->model & CM_EMPTY) 2608 return; 2609 2610 InlineDup( doc, NULL ); /* tell lexer to insert inlines if needed */ 2611 2612 while ((node = GetToken(doc, Preformatted)) != NULL) 2613 { 2614 if ( node->type == EndTag && 2615 (node->tag == pre->tag || DescendantOf(pre, TagId(node))) ) 2616 { 2617 if (nodeIsBODY(node) || nodeIsHTML(node)) 2618 { 2619 ReportError(doc, pre, node, DISCARDING_UNEXPECTED); 2620 FreeNode(doc, node); 2621 continue; 2622 } 2623 if (node->tag == pre->tag) 2624 { 2625 FreeNode(doc, node); 2626 } 2627 else 2628 { 2629 ReportError(doc, pre, node, MISSING_ENDTAG_BEFORE ); 2630 UngetToken( doc ); 2631 } 2632 pre->closed = yes; 2633 TrimSpaces(doc, pre); 2634 return; 2635 } 2636 2637 if (nodeIsText(node)) 2638 { 2639 InsertNodeAtEnd(pre, node); 2640 continue; 2641 } 2642 2643 /* deal with comments etc. */ 2644 if (InsertMisc(pre, node)) 2645 continue; 2646 2647 if (node->tag == NULL) 2648 { 2649 ReportError(doc, pre, node, DISCARDING_UNEXPECTED); 2650 FreeNode(doc, node); 2651 continue; 2652 } 2653 2654 /* strip unexpected tags */ 2655 if ( !PreContent(doc, node) ) 2656 { 2657 Node *newnode; 2658 2659 /* fix for http://tidy.sf.net/bug/772205 */ 2660 if (node->type == EndTag) 2661 { 2662 ReportError(doc, pre, node, DISCARDING_UNEXPECTED); 2663 FreeNode(doc, node); 2664 continue; 2665 } 2666 /* 2667 This is basically what Tidy 04 August 2000 did and far more accurate 2668 with respect to browser behaivour than the code commented out above. 2669 Tidy could try to propagate the <pre> into each disallowed child where 2670 <pre> is allowed in order to replicate some browsers behaivour, but 2671 there are a lot of exceptions, e.g. Internet Explorer does not propagate 2672 <pre> into table cells while Mozilla does. Opera 6 never propagates 2673 <pre> into blocklevel elements while Opera 7 behaves much like Mozilla. 2674 2675 Tidy behaves thus mostly like Opera 6 except for nested <pre> elements 2676 which are handled like Mozilla takes them (Opera6 closes all <pre> after 2677 the first </pre>). 2678 2679 There are similar issues like replacing <p> in <pre> with <br>, for 2680 example 2681 2682 <pre>...<p>...</pre> (Input) 2683 <pre>...<br>...</pre> (Tidy) 2684 <pre>...<br>...</pre> (Opera 7 and Internet Explorer) 2685 <pre>...<br><br>...</pre> (Opera 6 and Mozilla) 2686 2687 <pre>...<p>...</p>...</pre> (Input) 2688 <pre>...<br>......</pre> (Tidy, BUG!) 2689 <pre>...<br>...<br>...</pre> (Internet Explorer) 2690 <pre>...<br><br>...<br><br>...</pre> (Mozilla, Opera 6) 2691 <pre>...<br>...<br><br>...</pre> (Opera 7) 2692 2693 or something similar, they could also be closing the <pre> and propagate 2694 the <pre> into the newly opened <p>. 2695 2696 Todo: IMG, OBJECT, APPLET, BIG, SMALL, SUB, SUP, FONT, and BASEFONT are 2697 dissallowed in <pre>, Tidy neither detects this nor does it perform any 2698 cleanup operation. Tidy should at least issue a warning if it encounters 2699 such constructs. 2700 2701 Todo: discarding </p> is abviously a bug, it should be replaced by <br>. 2702 */ 2703 InsertNodeAfterElement(pre, node); 2704 ReportError(doc, pre, node, MISSING_ENDTAG_BEFORE); 2705 ParseTag(doc, node, IgnoreWhitespace); 2706 2707 newnode = InferredTag(doc, TidyTag_PRE); 2708 ReportError(doc, pre, newnode, INSERTING_TAG); 2709 pre = newnode; 2710 InsertNodeAfterElement(node, pre); 2711 2712 continue; 2713 } 2714 2715 if ( nodeIsP(node) ) 2716 { 2717 if (node->type == StartTag) 2718 { 2719 ReportError(doc, pre, node, USING_BR_INPLACE_OF); 2720 2721 /* trim white space before <p> in <pre>*/ 2722 TrimSpaces(doc, pre); 2723 2724 /* coerce both <p> and </p> to <br> */ 2725 CoerceNode(doc, node, TidyTag_BR, no, no); 2726 FreeAttrs( doc, node ); /* discard align attribute etc. */ 2727 InsertNodeAtEnd( pre, node ); 2728 } 2729 else 2730 { 2731 ReportError(doc, pre, node, DISCARDING_UNEXPECTED); 2732 FreeNode( doc, node); 2733 } 2734 continue; 2735 } 2736 2737 if ( nodeIsElement(node) ) 2738 { 2739 /* trim white space before <br> */ 2740 if ( nodeIsBR(node) ) 2741 TrimSpaces(doc, pre); 2742 2743 InsertNodeAtEnd(pre, node); 2744 ParseTag(doc, node, Preformatted); 2745 continue; 2746 } 2747 2748 /* discard unexpected tags */ 2749 ReportError(doc, pre, node, DISCARDING_UNEXPECTED); 2750 FreeNode( doc, node); 2751 } 2752 2753 ReportError(doc, pre, node, MISSING_ENDTAG_FOR); 2754 } 2755 2756 void ParseOptGroup(TidyDocImpl* doc, Node *field, uint ARG_UNUSED(mode)) 2757 { 2758 Lexer* lexer = doc->lexer; 2759 Node *node; 2760 2761 lexer->insert = NULL; /* defer implicit inline start tags */ 2762 2763 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL) 2764 { 2765 if (node->tag == field->tag && node->type == EndTag) 2766 { 2767 FreeNode( doc, node); 2768 field->closed = yes; 2769 TrimSpaces(doc, field); 2770 return; 2771 } 2772 2773 /* deal with comments etc. */ 2774 if (InsertMisc(field, node)) 2775 continue; 2776 2777 if ( node->type == StartTag && 2778 (nodeIsOPTION(node) || nodeIsOPTGROUP(node)) ) 2779 { 2780 if ( nodeIsOPTGROUP(node) ) 2781 ReportError(doc, field, node, CANT_BE_NESTED); 2782 2783 InsertNodeAtEnd(field, node); 2784 ParseTag(doc, node, MixedContent); 2785 continue; 2786 } 2787 2788 /* discard unexpected tags */ 2789 ReportError(doc, field, node, DISCARDING_UNEXPECTED ); 2790 FreeNode( doc, node); 2791 } 2792 } 2793 2794 2795 void ParseSelect(TidyDocImpl* doc, Node *field, uint ARG_UNUSED(mode)) 2796 { 2797 Lexer* lexer = doc->lexer; 2798 Node *node; 2799 2800 lexer->insert = NULL; /* defer implicit inline start tags */ 2801 2802 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL) 2803 { 2804 if (node->tag == field->tag && node->type == EndTag) 2805 { 2806 FreeNode( doc, node); 2807 field->closed = yes; 2808 TrimSpaces(doc, field); 2809 return; 2810 } 2811 2812 /* deal with comments etc. */ 2813 if (InsertMisc(field, node)) 2814 continue; 2815 2816 if ( node->type == StartTag && 2817 ( nodeIsOPTION(node) || 2818 nodeIsOPTGROUP(node) || 2819 nodeIsSCRIPT(node)) 2820 ) 2821 { 2822 InsertNodeAtEnd(field, node); 2823 ParseTag(doc, node, IgnoreWhitespace); 2824 continue; 2825 } 2826 2827 /* discard unexpected tags */ 2828 ReportError(doc, field, node, DISCARDING_UNEXPECTED); 2829 FreeNode( doc, node); 2830 } 2831 2832 ReportError(doc, field, node, MISSING_ENDTAG_FOR); 2833 } 2834 2835 void ParseText(TidyDocImpl* doc, Node *field, uint mode) 2836 { 2837 Lexer* lexer = doc->lexer; 2838 Node *node; 2839 2840 lexer->insert = NULL; /* defer implicit inline start tags */ 2841 2842 if ( nodeIsTEXTAREA(field) ) 2843 mode = Preformatted; 2844 else 2845 mode = MixedContent; /* kludge for font tags */ 2846 2847 while ((node = GetToken(doc, mode)) != NULL) 2848 { 2849 if (node->tag == field->tag && node->type == EndTag) 2850 { 2851 FreeNode( doc, node); 2852 field->closed = yes; 2853 TrimSpaces(doc, field); 2854 return; 2855 } 2856 2857 /* deal with comments etc. */ 2858 if (InsertMisc(field, node)) 2859 continue; 2860 2861 if (nodeIsText(node)) 2862 { 2863 /* only called for 1st child */ 2864 if (field->content == NULL && !(mode & Preformatted)) 2865 TrimSpaces(doc, field); 2866 2867 if (node->start >= node->end) 2868 { 2869 FreeNode( doc, node); 2870 continue; 2871 } 2872 2873 InsertNodeAtEnd(field, node); 2874 continue; 2875 } 2876 2877 /* for textarea should all cases of < and & be escaped? */ 2878 2879 /* discard inline tags e.g. font */ 2880 if ( node->tag 2881 && node->tag->model & CM_INLINE 2882 && !(node->tag->model & CM_FIELD)) /* #487283 - fix by Lee Passey 25 Jan 02 */ 2883 { 2884 ReportError(doc, field, node, DISCARDING_UNEXPECTED); 2885 FreeNode( doc, node); 2886 continue; 2887 } 2888 2889 /* terminate element on other tags */ 2890 if (!(field->tag->model & CM_OPT)) 2891 ReportError(doc, field, node, MISSING_ENDTAG_BEFORE); 2892 2893 UngetToken( doc ); 2894 TrimSpaces(doc, field); 2895 return; 2896 } 2897 2898 if (!(field->tag->model & CM_OPT)) 2899 ReportError(doc, field, node, MISSING_ENDTAG_FOR); 2900 } 2901 2902 2903 void ParseTitle(TidyDocImpl* doc, Node *title, uint ARG_UNUSED(mode)) 2904 { 2905 Node *node; 2906 while ((node = GetToken(doc, MixedContent)) != NULL) 2907 { 2908 if (node->tag == title->tag && node->type == StartTag) 2909 { 2910 ReportError(doc, title, node, COERCE_TO_ENDTAG); 2911 node->type = EndTag; 2912 UngetToken( doc ); 2913 continue; 2914 } 2915 else if (node->tag == title->tag && node->type == EndTag) 2916 { 2917 FreeNode( doc, node); 2918 title->closed = yes; 2919 TrimSpaces(doc, title); 2920 return; 2921 } 2922 2923 if (nodeIsText(node)) 2924 { 2925 /* only called for 1st child */ 2926 if (title->content == NULL) 2927 TrimInitialSpace(doc, title, node); 2928 2929 if (node->start >= node->end) 2930 { 2931 FreeNode( doc, node); 2932 continue; 2933 } 2934 2935 InsertNodeAtEnd(title, node); 2936 continue; 2937 } 2938 2939 /* deal with comments etc. */ 2940 if (InsertMisc(title, node)) 2941 continue; 2942 2943 /* discard unknown tags */ 2944 if (node->tag == NULL) 2945 { 2946 ReportError(doc, title, node, DISCARDING_UNEXPECTED); 2947 FreeNode( doc, node); 2948 continue; 2949 } 2950 2951 /* pushback unexpected tokens */ 2952 ReportError(doc, title, node, MISSING_ENDTAG_BEFORE); 2953 UngetToken( doc ); 2954 TrimSpaces(doc, title); 2955 return; 2956 } 2957 2958 ReportError(doc, title, node, MISSING_ENDTAG_FOR); 2959 } 2960 2961 /* 2962 This isn't quite right for CDATA content as it recognises 2963 tags within the content and parses them accordingly. 2964 This will unfortunately screw up scripts which include 2965 < + letter, < + !, < + ? or < + / + letter 2966 */ 2967 2968 void ParseScript(TidyDocImpl* doc, Node *script, uint ARG_UNUSED(mode)) 2969 { 2970 Node *node; 2971 2972 doc->lexer->parent = script; 2973 node = GetToken(doc, CdataContent); 2974 doc->lexer->parent = NULL; 2975 2976 if (node) 2977 { 2978 InsertNodeAtEnd(script, node); 2979 } 2980 else 2981 { 2982 /* handle e.g. a document like "<script>" */ 2983 ReportError(doc, script, NULL, MISSING_ENDTAG_FOR); 2984 return; 2985 } 2986 2987 node = GetToken(doc, IgnoreWhitespace); 2988 2989 if (!(node && node->type == EndTag && node->tag && 2990 node->tag->id == script->tag->id)) 2991 { 2992 ReportError(doc, script, node, MISSING_ENDTAG_FOR); 2993 2994 if (node) 2995 UngetToken(doc); 2996 } 2997 else 2998 { 2999 FreeNode(doc, node); 3000 } 3001 } 3002 3003 Bool IsJavaScript(Node *node) 3004 { 3005 Bool result = no; 3006 AttVal *attr; 3007 3008 if (node->attributes == NULL) 3009 return yes; 3010 3011 for (attr = node->attributes; attr; attr = attr->next) 3012 { 3013 if ( (attrIsLANGUAGE(attr) || attrIsTYPE(attr)) 3014 && AttrContains(attr, "javascript") ) 3015 { 3016 result = yes; 3017 break; 3018 } 3019 } 3020 3021 return result; 3022 } 3023 3024 void ParseHead(TidyDocImpl* doc, Node *head, uint ARG_UNUSED(mode)) 3025 { 3026 Lexer* lexer = doc->lexer; 3027 Node *node; 3028 int HasTitle = 0; 3029 int HasBase = 0; 3030 3031 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL) 3032 { 3033 if (node->tag == head->tag && node->type == EndTag) 3034 { 3035 FreeNode( doc, node); 3036 head->closed = yes; 3037 break; 3038 } 3039 3040 /* find and discard multiple <head> elements */ 3041 /* find and discard <html> in <head> elements */ 3042 if ((node->tag == head->tag || nodeIsHTML(node)) && node->type == StartTag) 3043 { 3044 ReportError(doc, head, node, DISCARDING_UNEXPECTED); 3045 FreeNode(doc, node); 3046 continue; 3047 } 3048 3049 if (nodeIsText(node)) 3050 { 3051 ReportError(doc, head, node, TAG_NOT_ALLOWED_IN); 3052 UngetToken( doc ); 3053 break; 3054 } 3055 3056 if (node->type == ProcInsTag && node->element && 3057 tmbstrcmp(node->element, "xml-stylesheet") == 0) 3058 { 3059 ReportError(doc, head, node, TAG_NOT_ALLOWED_IN); 3060 InsertNodeBeforeElement(FindHTML(doc), node); 3061 continue; 3062 } 3063 3064 /* deal with comments etc. */ 3065 if (InsertMisc(head, node)) 3066 continue; 3067 3068 if (node->type == DocTypeTag) 3069 { 3070 InsertDocType(doc, head, node); 3071 continue; 3072 } 3073 3074 /* discard unknown tags */ 3075 if (node->tag == NULL) 3076 { 3077 ReportError(doc, head, node, DISCARDING_UNEXPECTED); 3078 FreeNode( doc, node); 3079 continue; 3080 } 3081 3082 /* 3083 if it doesn't belong in the head then 3084 treat as implicit end of head and deal 3085 with as part of the body 3086 */ 3087 if (!(node->tag->model & CM_HEAD)) 3088 { 3089 /* #545067 Implicit closing of head broken - warn only for XHTML input */ 3090 if ( lexer->isvoyager ) 3091 ReportError(doc, head, node, TAG_NOT_ALLOWED_IN ); 3092 UngetToken( doc ); 3093 break; 3094 } 3095 3096 if (nodeIsElement(node)) 3097 { 3098 if ( nodeIsTITLE(node) ) 3099 { 3100 ++HasTitle; 3101 3102 if (HasTitle > 1) 3103 if (head) 3104 ReportError(doc, head, node, TOO_MANY_ELEMENTS_IN); 3105 else 3106 ReportError(doc, head, node, TOO_MANY_ELEMENTS); 3107 } 3108 else if ( nodeIsBASE(node) ) 3109 { 3110 ++HasBase; 3111 3112 if (HasBase > 1) 3113 if (head) 3114 ReportError(doc, head, node, TOO_MANY_ELEMENTS_IN); 3115 else 3116 ReportError(doc, head, node, TOO_MANY_ELEMENTS); 3117 } 3118 else if ( nodeIsNOSCRIPT(node) ) 3119 { 3120 ReportError(doc, head, node, TAG_NOT_ALLOWED_IN); 3121 } 3122 3123 #ifdef AUTO_INPUT_ENCODING 3124 else if (nodeIsMETA(node)) 3125 { 3126 AttVal * httpEquiv = AttrGetById(node, TidyAttr_HTTP_EQUIV); 3127 AttVal * content = AttrGetById(node, TidyAttr_CONTENT); 3128 if (httpEquiv && AttrValueIs(httpEquiv, "Content-Type") && AttrHasValue(content)) 3129 { 3130 tmbstr val, charset; 3131 uint end = 0; 3132 val = charset = tmbstrdup(content->value); 3133 val = tmbstrtolower(val); 3134 val = strstr(content->value, "charset"); 3135 3136 if (val) 3137 val += 7; 3138 3139 while(val && *val && (IsWhite((tchar)*val) || 3140 *val == '=' || *val == '"' || *val == '\'')) 3141 ++val; 3142 3143 while(val && val[end] && !(IsWhite((tchar)val[end]) || 3144 val[end] == '"' || val[end] == '\'' || val[end] == ';')) 3145 ++end; 3146 3147 if (val && end) 3148 { 3149 tmbstr encoding = tmbstrndup(val, end); 3150 uint id = GetEncodingIdFromName(encoding); 3151 3152 /* todo: detect mismatch with BOM/XMLDecl/declared */ 3153 /* todo: error for unsupported encodings */ 3154 /* todo: try to re-init transcoder */ 3155 /* todo: change input/output encoding settings */ 3156 /* todo: store id in StreamIn */ 3157 3158 MemFree(encoding); 3159 } 3160 3161 MemFree(charset); 3162 } 3163 } 3164 #endif /* AUTO_INPUT_ENCODING */ 3165 3166 InsertNodeAtEnd(head, node); 3167 ParseTag(doc, node, IgnoreWhitespace); 3168 continue; 3169 } 3170 3171 /* discard unexpected text nodes and end tags */ 3172 ReportError(doc, head, node, DISCARDING_UNEXPECTED); 3173 FreeNode( doc, node); 3174 } 3175 } 3176 3177 void ParseBody(TidyDocImpl* doc, Node *body, uint mode) 3178 { 3179 Lexer* lexer = doc->lexer; 3180 Node *node; 3181 Bool checkstack, iswhitenode; 3182 3183 mode = IgnoreWhitespace; 3184 checkstack = yes; 3185 3186 BumpObject( doc, body->parent ); 3187 3188 while ((node = GetToken(doc, mode)) != NULL) 3189 { 3190 /* find and discard multiple <body> elements */ 3191 if (node->tag == body->tag && node->type == StartTag) 3192 { 3193 ReportError(doc, body, node, DISCARDING_UNEXPECTED); 3194 FreeNode(doc, node); 3195 continue; 3196 } 3197 3198 /* #538536 Extra endtags not detected */ 3199 if ( nodeIsHTML(node) ) 3200 { 3201 if (nodeIsElement(node) || lexer->seenEndHtml) 3202 ReportError(doc, body, node, DISCARDING_UNEXPECTED); 3203 else 3204 lexer->seenEndHtml = 1; 3205 3206 FreeNode( doc, node); 3207 continue; 3208 } 3209 3210 if ( lexer->seenEndBody && 3211 ( node->type == StartTag || 3212 node->type == EndTag || 3213 node->type == StartEndTag ) ) 3214 { 3215 ReportError(doc, body, node, CONTENT_AFTER_BODY ); 3216 } 3217 3218 if ( node->tag == body->tag && node->type == EndTag ) 3219 { 3220 body->closed = yes; 3221 TrimSpaces(doc, body); 3222 FreeNode( doc, node); 3223 lexer->seenEndBody = 1; 3224 mode = IgnoreWhitespace; 3225 3226 if ( nodeIsNOFRAMES(body->parent) ) 3227 break; 3228 3229 continue; 3230 } 3231 3232 if ( nodeIsNOFRAMES(node) ) 3233 { 3234 if (node->type == StartTag) 3235 { 3236 InsertNodeAtEnd(body, node); 3237 ParseBlock(doc, node, mode); 3238 continue; 3239 } 3240 3241 if (node->type == EndTag && nodeIsNOFRAMES(body->parent) ) 3242 { 3243 TrimSpaces(doc, body); 3244 UngetToken( doc ); 3245 break; 3246 } 3247 } 3248 3249 if ( (nodeIsFRAME(node) || nodeIsFRAMESET(node)) 3250 && nodeIsNOFRAMES(body->parent) ) 3251 { 3252 TrimSpaces(doc, body); 3253 UngetToken( doc ); 3254 break; 3255 } 3256 3257 iswhitenode = no; 3258 3259 if ( nodeIsText(node) && 3260 node->end <= node->start + 1 && 3261 lexer->lexbuf[node->start] == ' ' ) 3262 iswhitenode = yes; 3263 3264 /* deal with comments etc. */ 3265 if (InsertMisc(body, node)) 3266 continue; 3267 3268 /* #538536 Extra endtags not detected */ 3269 #if 0 3270 if ( lexer->seenEndBody == 1 && !iswhitenode ) 3271 { 3272 ++lexer->seenEndBody; 3273 ReportError(doc, body, node, CONTENT_AFTER_BODY); 3274 } 3275 #endif 3276 3277 /* mixed content model permits text */ 3278 if (nodeIsText(node)) 3279 { 3280 if (iswhitenode && mode == IgnoreWhitespace) 3281 { 3282 FreeNode( doc, node); 3283 continue; 3284 } 3285 3286 /* HTML 2 and HTML4 strict don't allow text here */ 3287 ConstrainVersion(doc, ~(VERS_HTML40_STRICT | VERS_HTML20)); 3288 3289 if (checkstack) 3290 { 3291 checkstack = no; 3292 3293 if ( InlineDup(doc, node) > 0 ) 3294 continue; 3295 } 3296 3297 InsertNodeAtEnd(body, node); 3298 mode = MixedContent; 3299 continue; 3300 } 3301 3302 if (node->type == DocTypeTag) 3303 { 3304 InsertDocType(doc, body, node); 3305 continue; 3306 } 3307 /* discard unknown and PARAM tags */ 3308 if ( node->tag == NULL || nodeIsPARAM(node) ) 3309 { 3310 ReportError(doc, body, node, DISCARDING_UNEXPECTED); 3311 FreeNode( doc, node); 3312 continue; 3313 } 3314 3315 /* 3316 Netscape allows LI and DD directly in BODY 3317 We infer UL or DL respectively and use this 3318 Bool to exclude block-level elements so as 3319 to match Netscape's observed behaviour. 3320 */ 3321 lexer->excludeBlocks = no; 3322 3323 if ( nodeIsINPUT(node) || 3324 (!nodeHasCM(node, CM_BLOCK) && !nodeHasCM(node, CM_INLINE)) 3325 ) 3326 { 3327 /* avoid this error message being issued twice */ 3328 if (!(node->tag->model & CM_HEAD)) 3329 ReportError(doc, body, node, TAG_NOT_ALLOWED_IN); 3330 3331 if (node->tag->model & CM_HTML) 3332 { 3333 /* copy body attributes if current body was inferred */ 3334 if ( nodeIsBODY(node) && body->implicit 3335 && body->attributes == NULL ) 3336 { 3337 body->attributes = node->attributes; 3338 node->attributes = NULL; 3339 } 3340 3341 FreeNode( doc, node); 3342 continue; 3343 } 3344 3345 if (node->tag->model & CM_HEAD) 3346 { 3347 MoveToHead(doc, body, node); 3348 continue; 3349 } 3350 3351 if (node->tag->model & CM_LIST) 3352 { 3353 UngetToken( doc ); 3354 node = InferredTag(doc, TidyTag_UL); 3355 /* AddClass( doc, node, "noindent" ); */ 3356 lexer->excludeBlocks = yes; 3357 } 3358 else if (node->tag->model & CM_DEFLIST) 3359 { 3360 UngetToken( doc ); 3361 node = InferredTag(doc, TidyTag_DL); 3362 lexer->excludeBlocks = yes; 3363 } 3364 else if (node->tag->model & (CM_TABLE | CM_ROWGRP | CM_ROW)) 3365 { 3366 UngetToken( doc ); 3367 node = InferredTag(doc, TidyTag_TABLE); 3368 lexer->excludeBlocks = yes; 3369 } 3370 else if ( nodeIsINPUT(node) ) 3371 { 3372 UngetToken( doc ); 3373 node = InferredTag(doc, TidyTag_FORM); 3374 lexer->excludeBlocks = yes; 3375 } 3376 else 3377 { 3378 if ( !nodeHasCM(node, CM_ROW | CM_FIELD) ) 3379 { 3380 UngetToken( doc ); 3381 return; 3382 } 3383 3384 /* ignore </td> </th> <option> etc. */ 3385 FreeNode( doc, node ); 3386 continue; 3387 } 3388 } 3389 3390 if (node->type == EndTag) 3391 { 3392 if ( nodeIsBR(node) ) 3393 node->type = StartTag; 3394 else if ( nodeIsP(node) ) 3395 { 3396 node->type = StartEndTag; 3397 node->implicit = yes; 3398 #if OBSOLETE 3399 CoerceNode(doc, node, TidyTag_BR, no, no); 3400 FreeAttrs( doc, node ); /* discard align attribute etc. */ 3401 InsertNodeAtEnd(body, node); 3402 node = InferredTag(doc, TidyTag_BR); 3403 #endif 3404 } 3405 else if ( nodeHasCM(node, CM_INLINE) ) 3406 PopInline( doc, node ); 3407 } 3408 3409 if (nodeIsElement(node)) 3410 { 3411 if ( nodeHasCM(node, CM_INLINE) && !nodeHasCM(node, CM_MIXED) ) 3412 { 3413 /* HTML4 strict doesn't allow inline content here */ 3414 /* but HTML2 does allow img elements as children of body */ 3415 if ( nodeIsIMG(node) ) 3416 ConstrainVersion(doc, ~VERS_HTML40_STRICT); 3417 else 3418 ConstrainVersion(doc, ~(VERS_HTML40_STRICT|VERS_HTML20)); 3419 3420 if (checkstack && !node->implicit) 3421 { 3422 checkstack = no; 3423 3424 if ( InlineDup(doc, node) > 0 ) 3425 continue; 3426 } 3427 3428 mode = MixedContent; 3429 } 3430 else 3431 { 3432 checkstack = yes; 3433 mode = IgnoreWhitespace; 3434 } 3435 3436 if (node->implicit) 3437 ReportError(doc, body, node, INSERTING_TAG); 3438 3439 InsertNodeAtEnd(body, node); 3440 ParseTag(doc, node, mode); 3441 continue; 3442 } 3443 3444 /* discard unexpected tags */ 3445 ReportError(doc, body, node, DISCARDING_UNEXPECTED); 3446 FreeNode( doc, node); 3447 } 3448 } 3449 3450 void ParseNoFrames(TidyDocImpl* doc, Node *noframes, uint mode) 3451 { 3452 Lexer* lexer = doc->lexer; 3453 Node *node; 3454 3455 if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 ) 3456 { 3457 doc->badAccess |= USING_NOFRAMES; 3458 } 3459 mode = IgnoreWhitespace; 3460 3461 while ( (node = GetToken(doc, mode)) != NULL ) 3462 { 3463 if ( node->tag == noframes->tag && node->type == EndTag ) 3464 { 3465 FreeNode( doc, node); 3466 noframes->closed = yes; 3467 TrimSpaces(doc, noframes); 3468 return; 3469 } 3470 3471 if ( nodeIsFRAME(node) || nodeIsFRAMESET(node) ) 3472 { 3473 TrimSpaces(doc, noframes); 3474 if (node->type == EndTag) 3475 { 3476 ReportError(doc, noframes, node, DISCARDING_UNEXPECTED); 3477 FreeNode( doc, node); /* Throw it away */ 3478 } 3479 else 3480 { 3481 ReportError(doc, noframes, node, MISSING_ENDTAG_BEFORE); 3482 UngetToken( doc ); 3483 } 3484 return; 3485 } 3486 3487 if ( nodeIsHTML(node) ) 3488 { 3489 if (nodeIsElement(node)) 3490 ReportError(doc, noframes, node, DISCARDING_UNEXPECTED); 3491 3492 FreeNode( doc, node); 3493 continue; 3494 } 3495 3496 /* deal with comments etc. */ 3497 if (InsertMisc(noframes, node)) 3498 continue; 3499 3500 if ( nodeIsBODY(node) && node->type == StartTag ) 3501 { 3502 Bool seen_body = lexer->seenEndBody; 3503 InsertNodeAtEnd(noframes, node); 3504 ParseTag(doc, node, IgnoreWhitespace /*MixedContent*/); 3505 3506 /* fix for bug http://tidy.sf.net/bug/887259 */ 3507 if (seen_body && FindBody(doc) != node) 3508 { 3509 CoerceNode(doc, node, TidyTag_DIV, no, no); 3510 MoveNodeToBody(doc, node); 3511 } 3512 continue; 3513 } 3514 3515 /* implicit body element inferred */ 3516 if (nodeIsText(node) || (node->tag && node->type != EndTag)) 3517 { 3518 if ( lexer->seenEndBody ) 3519 { 3520 Node *body = FindBody( doc ); 3521 if ( body == NULL ) 3522 { 3523 ReportError(doc, noframes, node, DISCARDING_UNEXPECTED); 3524 FreeNode( doc, node); 3525 continue; 3526 } 3527 if ( nodeIsText(node) ) 3528 { 3529 UngetToken( doc ); 3530 node = InferredTag(doc, TidyTag_P); 3531 ReportError(doc, noframes, node, CONTENT_AFTER_BODY ); 3532 } 3533 InsertNodeAtEnd( body, node ); 3534 } 3535 else 3536 { 3537 UngetToken( doc ); 3538 node = InferredTag(doc, TidyTag_BODY); 3539 if ( cfgBool(doc, TidyXmlOut) ) 3540 ReportError(doc, noframes, node, INSERTING_TAG); 3541 InsertNodeAtEnd( noframes, node ); 3542 } 3543 3544 ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ ); 3545 continue; 3546 } 3547 3548 /* discard unexpected end tags */ 3549 ReportError(doc, noframes, node, DISCARDING_UNEXPECTED); 3550 FreeNode( doc, node); 3551 } 3552 3553 ReportError(doc, noframes, node, MISSING_ENDTAG_FOR); 3554 } 3555 3556 void ParseFrameSet(TidyDocImpl* doc, Node *frameset, uint ARG_UNUSED(mode)) 3557 { 3558 Lexer* lexer = doc->lexer; 3559 Node *node; 3560 3561 if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 ) 3562 { 3563 doc->badAccess |= USING_FRAMES; 3564 } 3565 3566 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL) 3567 { 3568 if (node->tag == frameset->tag && node->type == EndTag) 3569 { 3570 FreeNode( doc, node); 3571 frameset->closed = yes; 3572 TrimSpaces(doc, frameset); 3573 return; 3574 } 3575 3576 /* deal with comments etc. */ 3577 if (InsertMisc(frameset, node)) 3578 continue; 3579 3580 if (node->tag == NULL) 3581 { 3582 ReportError(doc, frameset, node, DISCARDING_UNEXPECTED); 3583 FreeNode( doc, node); 3584 continue; 3585 } 3586 3587 if (nodeIsElement(node)) 3588 { 3589 if (node->tag && node->tag->model & CM_HEAD) 3590 { 3591 MoveToHead(doc, frameset, node); 3592 continue; 3593 } 3594 } 3595 3596 if ( nodeIsBODY(node) ) 3597 { 3598 UngetToken( doc ); 3599 node = InferredTag(doc, TidyTag_NOFRAMES); 3600 ReportError(doc, frameset, node, INSERTING_TAG); 3601 } 3602 3603 if (node->type == StartTag && (node->tag->model & CM_FRAMES)) 3604 { 3605 InsertNodeAtEnd(frameset, node); 3606 lexer->excludeBlocks = no; 3607 ParseTag(doc, node, MixedContent); 3608 continue; 3609 } 3610 else if (node->type == StartEndTag && (node->tag->model & CM_FRAMES)) 3611 { 3612 InsertNodeAtEnd(frameset, node); 3613 continue; 3614 } 3615 3616 /* discard unexpected tags */ 3617 ReportError(doc, frameset, node, DISCARDING_UNEXPECTED); 3618 FreeNode( doc, node); 3619 } 3620 3621 ReportError(doc, frameset, node, MISSING_ENDTAG_FOR); 3622 } 3623 3624 void ParseHTML(TidyDocImpl* doc, Node *html, uint mode) 3625 { 3626 Node *node, *head; 3627 Node *frameset = NULL; 3628 Node *noframes = NULL; 3629 3630 SetOptionBool( doc, TidyXmlTags, no ); 3631 3632 for (;;) 3633 { 3634 node = GetToken(doc, IgnoreWhitespace); 3635 3636 if (node == NULL) 3637 { 3638 node = InferredTag(doc, TidyTag_HEAD); 3639 break; 3640 } 3641 3642 if ( nodeIsHEAD(node) ) 3643 break; 3644 3645 if (node->tag == html->tag && node->type == EndTag) 3646 { 3647 ReportError(doc, html, node, DISCARDING_UNEXPECTED); 3648 FreeNode( doc, node); 3649 continue; 3650 } 3651 3652 /* find and discard multiple <html> elements */ 3653 if (node->tag == html->tag && node->type == StartTag) 3654 { 3655 ReportError(doc, html, node, DISCARDING_UNEXPECTED); 3656 FreeNode(doc, node); 3657 continue; 3658 } 3659 3660 /* deal with comments etc. */ 3661 if (InsertMisc(html, node)) 3662 continue; 3663 3664 UngetToken( doc ); 3665 node = InferredTag(doc, TidyTag_HEAD); 3666 break; 3667 } 3668 3669 head = node; 3670 InsertNodeAtEnd(html, head); 3671 ParseHead(doc, head, mode); 3672 3673 for (;;) 3674 { 3675 node = GetToken(doc, IgnoreWhitespace); 3676 3677 if (node == NULL) 3678 { 3679 if (frameset == NULL) /* implied body */ 3680 { 3681 node = InferredTag(doc, TidyTag_BODY); 3682 InsertNodeAtEnd(html, node); 3683 ParseBody(doc, node, mode); 3684 } 3685 3686 return; 3687 } 3688 3689 /* robustly handle html tags */ 3690 if (node->tag == html->tag) 3691 { 3692 if (node->type != StartTag && frameset == NULL) 3693 ReportError(doc, html, node, DISCARDING_UNEXPECTED); 3694 3695 FreeNode( doc, node); 3696 continue; 3697 } 3698 3699 /* deal with comments etc. */ 3700 if (InsertMisc(html, node)) 3701 continue; 3702 3703 /* if frameset document coerce <body> to <noframes> */ 3704 if ( nodeIsBODY(node) ) 3705 { 3706 if (node->type != StartTag) 3707 { 3708 ReportError(doc, html, node, DISCARDING_UNEXPECTED); 3709 FreeNode( doc, node); 3710 continue; 3711 } 3712 3713 if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 ) 3714 { 3715 if (frameset != NULL) 3716 { 3717 UngetToken( doc ); 3718 3719 if (noframes == NULL) 3720 { 3721 noframes = InferredTag(doc, TidyTag_NOFRAMES); 3722 InsertNodeAtEnd(frameset, noframes); 3723 ReportError(doc, html, noframes, INSERTING_TAG); 3724 } 3725 3726 ParseTag(doc, noframes, mode); 3727 continue; 3728 } 3729 } 3730 3731 ConstrainVersion(doc, ~VERS_FRAMESET); 3732 break; /* to parse body */ 3733 } 3734 3735 /* flag an error if we see more than one frameset */ 3736 if ( nodeIsFRAMESET(node) ) 3737 { 3738 if (node->type != StartTag) 3739 { 3740 ReportError(doc, html, node, DISCARDING_UNEXPECTED); 3741 FreeNode( doc, node); 3742 continue; 3743 } 3744 3745 if (frameset != NULL) 3746 ReportFatal(doc, html, node, DUPLICATE_FRAMESET); 3747 else 3748 frameset = node; 3749 3750 InsertNodeAtEnd(html, node); 3751 ParseTag(doc, node, mode); 3752 3753 /* 3754 see if it includes a noframes element so 3755 that we can merge subsequent noframes elements 3756 */ 3757 3758 for (node = frameset->content; node; node = node->next) 3759 { 3760 if ( nodeIsNOFRAMES(node) ) 3761 noframes = node; 3762 } 3763 continue; 3764 } 3765 3766 /* if not a frameset document coerce <noframes> to <body> */ 3767 if ( nodeIsNOFRAMES(node) ) 3768 { 3769 if (node->type != StartTag) 3770 { 3771 ReportError(doc, html, node, DISCARDING_UNEXPECTED); 3772 FreeNode( doc, node); 3773 continue; 3774 } 3775 3776 if (frameset == NULL) 3777 { 3778 ReportError(doc, html, node, DISCARDING_UNEXPECTED); 3779 FreeNode( doc, node); 3780 node = InferredTag(doc, TidyTag_BODY); 3781 break; 3782 } 3783 3784 if (noframes == NULL) 3785 { 3786 noframes = node; 3787 InsertNodeAtEnd(frameset, noframes); 3788 } 3789 else 3790 FreeNode( doc, node); 3791 3792 ParseTag(doc, noframes, mode); 3793 continue; 3794 } 3795 3796 if (nodeIsElement(node)) 3797 { 3798 if (node->tag && node->tag->model & CM_HEAD) 3799 { 3800 MoveToHead(doc, html, node); 3801 continue; 3802 } 3803 3804 /* discard illegal frame element following a frameset */ 3805 if ( frameset != NULL && nodeIsFRAME(node) ) 3806 { 3807 ReportError(doc, html, node, DISCARDING_UNEXPECTED); 3808 FreeNode(doc, node); 3809 continue; 3810 } 3811 } 3812 3813 UngetToken( doc ); 3814 3815 /* insert other content into noframes element */ 3816 3817 if (frameset) 3818 { 3819 if (noframes == NULL) 3820 { 3821 noframes = InferredTag(doc, TidyTag_NOFRAMES); 3822 InsertNodeAtEnd(frameset, noframes); 3823 } 3824 else 3825 ReportError(doc, html, node, NOFRAMES_CONTENT); 3826 3827 ConstrainVersion(doc, VERS_FRAMESET); 3828 ParseTag(doc, noframes, mode); 3829 continue; 3830 } 3831 3832 node = InferredTag(doc, TidyTag_BODY); 3833 ConstrainVersion(doc, ~VERS_FRAMESET); 3834 break; 3835 } 3836 3837 /* node must be body */ 3838 3839 InsertNodeAtEnd(html, node); 3840 ParseTag(doc, node, mode); 3841 } 3842 3843 static Bool nodeCMIsOnlyInline( Node* node ) 3844 { 3845 return nodeHasCM( node, CM_INLINE ) && !nodeHasCM( node, CM_BLOCK ); 3846 } 3847 3848 static void EncloseBodyText(TidyDocImpl* doc) 3849 { 3850 Node* node; 3851 Node* body = FindBody(doc); 3852 3853 if (!body) 3854 return; 3855 3856 node = body->content; 3857 3858 while (node) 3859 { 3860 if ((nodeIsText(node) && !IsBlank(doc->lexer, node)) || 3861 (nodeIsElement(node) && nodeCMIsOnlyInline(node))) 3862 { 3863 Node* p = InferredTag(doc, TidyTag_P); 3864 InsertNodeBeforeElement(node, p); 3865 while (node && (!nodeIsElement(node) || nodeCMIsOnlyInline(node))) 3866 { 3867 Node* next = node->next; 3868 RemoveNode(node); 3869 InsertNodeAtEnd(p, node); 3870 node = next; 3871 } 3872 TrimSpaces(doc, p); 3873 continue; 3874 } 3875 node = node->next; 3876 } 3877 } 3878 3879 /* <form>, <blockquote> and <noscript> do not allow #PCDATA in 3880 HTML 4.01 Strict (%block; model instead of %flow;). 3881 When requested, text nodes in these elements are wrapped in <p>. */ 3882 static void EncloseBlockText(TidyDocImpl* doc, Node* node) 3883 { 3884 Node *next; 3885 Node *block; 3886 3887 while (node) 3888 { 3889 next = node->next; 3890 3891 if (node->content) 3892 EncloseBlockText(doc, node->content); 3893 3894 if (!(nodeIsFORM(node) || nodeIsNOSCRIPT(node) || 3895 nodeIsBLOCKQUOTE(node)) 3896 || !node->content) 3897 { 3898 node = next; 3899 continue; 3900 } 3901 3902 block = node->content; 3903 3904 if ((nodeIsText(block) && !IsBlank(doc->lexer, block)) || 3905 (nodeIsElement(block) && nodeCMIsOnlyInline(block))) 3906 { 3907 Node* p = InferredTag(doc, TidyTag_P); 3908 InsertNodeBeforeElement(block, p); 3909 while (block && 3910 (!nodeIsElement(block) || nodeCMIsOnlyInline(block))) 3911 { 3912 Node* tempNext = block->next; 3913 RemoveNode(block); 3914 InsertNodeAtEnd(p, block); 3915 block = tempNext; 3916 } 3917 TrimSpaces(doc, p); 3918 continue; 3919 } 3920 3921 node = next; 3922 } 3923 } 3924 3925 static void ReplaceObsoleteElements(TidyDocImpl* doc, Node* node) 3926 { 3927 Node *next; 3928 3929 while (node) 3930 { 3931 next = node->next; 3932 3933 if (nodeIsDIR(node) || nodeIsMENU(node)) 3934 CoerceNode(doc, node, TidyTag_UL, yes, yes); 3935 3936 if (nodeIsXMP(node) || nodeIsLISTING(node) || 3937 (node->tag && node->tag->id == TidyTag_PLAINTEXT)) 3938 CoerceNode(doc, node, TidyTag_PRE, yes, yes); 3939 3940 if (node->content) 3941 ReplaceObsoleteElements(doc, node->content); 3942 3943 node = next; 3944 } 3945 } 3946 3947 static void AttributeChecks(TidyDocImpl* doc, Node* node) 3948 { 3949 Node *next; 3950 3951 while (node) 3952 { 3953 next = node->next; 3954 3955 if (nodeIsElement(node)) 3956 { 3957 if (node->tag->chkattrs) 3958 node->tag->chkattrs(doc, node); 3959 else 3960 CheckAttributes(doc, node); 3961 } 3962 3963 if (node->content) 3964 AttributeChecks(doc, node->content); 3965 3966 node = next; 3967 } 3968 } 3969 3970 /* 3971 HTML is the top level element 3972 */ 3973 void ParseDocument(TidyDocImpl* doc) 3974 { 3975 Node *node, *html, *doctype = NULL; 3976 3977 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL) 3978 { 3979 if (node->type == XmlDecl) 3980 { 3981 if (FindXmlDecl(doc) && doc->root.content) 3982 { 3983 ReportError(doc, &doc->root, node, DISCARDING_UNEXPECTED); 3984 FreeNode(doc, node); 3985 continue; 3986 } 3987 if (node->line != 1 || (node->line == 1 && node->column != 1)) 3988 { 3989 ReportError(doc, &doc->root, node, SPACE_PRECEDING_XMLDECL); 3990 } 3991 } 3992 #ifdef AUTO_INPUT_ENCODING 3993 if (node->type == XmlDecl) 3994 { 3995 AttVal* encoding = GetAttrByName(node, "encoding"); 3996 if (AttrHasValue(encoding)) 3997 { 3998 uint id = GetEncodingIdFromName(encoding->value); 3999 4000 /* todo: detect mismatch with BOM/XMLDecl/declared */ 4001 /* todo: error for unsupported encodings */ 4002 /* todo: try to re-init transcoder */ 4003 /* todo: change input/output encoding settings */ 4004 /* todo: store id in StreamIn */ 4005 } 4006 } 4007 #endif /* AUTO_INPUT_ENCODING */ 4008 4009 /* deal with comments etc. */ 4010 if (InsertMisc( &doc->root, node )) 4011 continue; 4012 4013 if (node->type == DocTypeTag) 4014 { 4015 if (doctype == NULL) 4016 { 4017 InsertNodeAtEnd( &doc->root, node); 4018 doctype = node; 4019 } 4020 else 4021 { 4022 ReportError(doc, &doc->root, node, DISCARDING_UNEXPECTED); 4023 FreeNode( doc, node); 4024 } 4025 continue; 4026 } 4027 4028 if (node->type == EndTag) 4029 { 4030 ReportError(doc, &doc->root, node, DISCARDING_UNEXPECTED); 4031 FreeNode( doc, node); 4032 continue; 4033 } 4034 4035 if (node->type == StartTag && nodeIsHTML(node)) 4036 { 4037 AttVal *xmlns; 4038 4039 xmlns = AttrGetById(node, TidyAttr_XMLNS); 4040 4041 if (AttrValueIs(xmlns, XHTML_NAMESPACE)) 4042 { 4043 Bool htmlOut = cfgBool( doc, TidyHtmlOut ); 4044 doc->lexer->isvoyager = yes; /* Unless plain HTML */ 4045 SetOptionBool( doc, TidyXhtmlOut, !htmlOut ); /* is specified, output*/ 4046 SetOptionBool( doc, TidyXmlOut, !htmlOut ); /* will be XHTML. */ 4047 4048 /* adjust other config options, just as in config.c */ 4049 if ( !htmlOut ) 4050 { 4051 SetOptionBool( doc, TidyUpperCaseTags, no ); 4052 SetOptionBool( doc, TidyUpperCaseAttrs, no ); 4053 } 4054 } 4055 } 4056 4057 if ( node->type != StartTag || !nodeIsHTML(node) ) 4058 { 4059 UngetToken( doc ); 4060 html = InferredTag(doc, TidyTag_HTML); 4061 } 4062 else 4063 html = node; 4064 4065 if (!FindDocType(doc)) 4066 ReportError(doc, NULL, NULL, MISSING_DOCTYPE); 4067 4068 InsertNodeAtEnd( &doc->root, html); 4069 ParseHTML( doc, html, IgnoreWhitespace ); 4070 break; 4071 } 4072 4073 if (!FindHTML(doc)) 4074 { 4075 /* a later check should complain if <body> is empty */ 4076 html = InferredTag(doc, TidyTag_HTML); 4077 InsertNodeAtEnd( &doc->root, html); 4078 ParseHTML(doc, html, IgnoreWhitespace); 4079 } 4080 4081 if (!FindTITLE(doc)) 4082 { 4083 Node* head = FindHEAD(doc); 4084 ReportError(doc, head, NULL, MISSING_TITLE_ELEMENT); 4085 InsertNodeAtEnd(head, InferredTag(doc, TidyTag_TITLE)); 4086 } 4087 4088 AttributeChecks(doc, &doc->root); 4089 ReplaceObsoleteElements(doc, &doc->root); 4090 DropEmptyElements(doc, &doc->root); 4091 CleanSpaces(doc, &doc->root); 4092 4093 if (cfgBool(doc, TidyEncloseBodyText)) 4094 EncloseBodyText(doc); 4095 if (cfgBool(doc, TidyEncloseBlockText)) 4096 EncloseBlockText(doc, &doc->root); 4097 } 4098 4099 Bool XMLPreserveWhiteSpace( TidyDocImpl* doc, Node *element) 4100 { 4101 AttVal *attribute; 4102 4103 /* search attributes for xml:space */ 4104 for (attribute = element->attributes; attribute; attribute = attribute->next) 4105 { 4106 if (AttrValueIs(attribute, "xml:space")) 4107 { 4108 if (AttrValueIs(attribute, "preserve")) 4109 return yes; 4110 4111 return no; 4112 } 4113 } 4114 4115 if (element->element == NULL) 4116 return no; 4117 4118 /* kludge for html docs without explicit xml:space attribute */ 4119 if (nodeIsPRE(element) || 4120 nodeIsSCRIPT(element) || 4121 nodeIsSTYLE(element) || 4122 FindParser(doc, element) == ParsePre) 4123 return yes; 4124 4125 /* kludge for XSL docs */ 4126 if ( tmbstrcasecmp(element->element, "xsl:text") == 0 ) 4127 return yes; 4128 4129 return no; 4130 } 4131 4132 /* 4133 XML documents 4134 */ 4135 static void ParseXMLElement(TidyDocImpl* doc, Node *element, uint mode) 4136 { 4137 Lexer* lexer = doc->lexer; 4138 Node *node; 4139 4140 /* if node is pre or has xml:space="preserve" then do so */ 4141 4142 if ( XMLPreserveWhiteSpace(doc, element) ) 4143 mode = Preformatted; 4144 4145 while ((node = GetToken(doc, mode)) != NULL) 4146 { 4147 if (node->type == EndTag && 4148 node->element && element->element && 4149 tmbstrcmp(node->element, element->element) == 0) 4150 { 4151 FreeNode( doc, node); 4152 element->closed = yes; 4153 break; 4154 } 4155 4156 /* discard unexpected end tags */ 4157 if (node->type == EndTag) 4158 { 4159 if (element) 4160 ReportFatal(doc, element, node, UNEXPECTED_ENDTAG_IN); 4161 else 4162 ReportFatal(doc, element, node, UNEXPECTED_ENDTAG); 4163 4164 FreeNode( doc, node); 4165 continue; 4166 } 4167 4168 /* parse content on seeing start tag */ 4169 if (node->type == StartTag) 4170 ParseXMLElement( doc, node, mode ); 4171 4172 InsertNodeAtEnd(element, node); 4173 } 4174 4175 /* 4176 if first child is text then trim initial space and 4177 delete text node if it is empty. 4178 */ 4179 4180 node = element->content; 4181 4182 if (nodeIsText(node) && mode != Preformatted) 4183 { 4184 if ( lexer->lexbuf[node->start] == ' ' ) 4185 { 4186 node->start++; 4187 4188 if (node->start >= node->end) 4189 DiscardElement( doc, node ); 4190 } 4191 } 4192 4193 /* 4194 if last child is text then trim final space and 4195 delete the text node if it is empty 4196 */ 4197 4198 node = element->last; 4199 4200 if (nodeIsText(node) && mode != Preformatted) 4201 { 4202 if ( lexer->lexbuf[node->end - 1] == ' ' ) 4203 { 4204 node->end--; 4205 4206 if (node->start >= node->end) 4207 DiscardElement( doc, node ); 4208 } 4209 } 4210 } 4211 4212 void ParseXMLDocument(TidyDocImpl* doc) 4213 { 4214 Node *node, *doctype = NULL; 4215 4216 SetOptionBool( doc, TidyXmlTags, yes ); 4217 4218 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL) 4219 { 4220 /* discard unexpected end tags */ 4221 if (node->type == EndTag) 4222 { 4223 ReportError(doc, NULL, node, UNEXPECTED_ENDTAG); 4224 FreeNode( doc, node); 4225 continue; 4226 } 4227 4228 /* deal with comments etc. */ 4229 if (InsertMisc( &doc->root, node)) 4230 continue; 4231 4232 if (node->type == DocTypeTag) 4233 { 4234 if (doctype == NULL) 4235 { 4236 InsertNodeAtEnd( &doc->root, node); 4237 doctype = node; 4238 } 4239 else 4240 { 4241 ReportError(doc, &doc->root, node, DISCARDING_UNEXPECTED); 4242 FreeNode( doc, node); 4243 } 4244 continue; 4245 } 4246 4247 if (node->type == StartEndTag) 4248 { 4249 InsertNodeAtEnd( &doc->root, node); 4250 continue; 4251 } 4252 4253 /* if start tag then parse element's content */ 4254 if (node->type == StartTag) 4255 { 4256 InsertNodeAtEnd( &doc->root, node ); 4257 ParseXMLElement( doc, node, IgnoreWhitespace ); 4258 continue; 4259 } 4260 4261 ReportError(doc, &doc->root, node, DISCARDING_UNEXPECTED); 4262 FreeNode( doc, node); 4263 } 4264 4265 /* ensure presence of initial <?xml version="1.0"?> */ 4266 if ( cfgBool(doc, TidyXmlDecl) ) 4267 FixXmlDecl( doc ); 4268 } 4269 4270 /* 4271 * local variables: 4272 * mode: c 4273 * indent-tabs-mode: nil 4274 * c-basic-offset: 4 4275 * eval: (c-set-offset 'substatement-open 0) 4276 * end: 4277 */ 4278

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.