Version:
~ [ 1.0 ] ~
1 /* parser.c -- HTML Parser
2
3 (c) 1998-2005 (W3C) MIT, ERCIM, Keio University
4 See tidy.h for the copyright notice.
5
6 CVS Info :
7
8 $Author: arnaud02 $
9 $Date: 2005/10/21 12:54:15 $
10 $Revision: 1.150 $
11
12 */
13
14 #include "tidy-int.h"
15 #include "lexer.h"
16 #include "parser.h"
17 #include "message.h"
18 #include "clean.h"
19 #include "tags.h"
20 #include "tmbstr.h"
21
22 #ifdef AUTO_INPUT_ENCODING
23 #include "charsets.h"
24 #endif
25
26 Bool CheckNodeIntegrity(Node *node)
27 {
28 #ifndef NO_NODE_INTEGRITY_CHECK
29 if (node->prev)
30 {
31 if (node->prev->next != node)
32 return no;
33 }
34
35 if (node->next)
36 {
37 if (node->next->prev != node)
38 return no;
39 }
40
41 if (node->parent)
42 {
43 Node *child = NULL;
44 if (node->prev == NULL && node->parent->content != node)
45 return no;
46
47 if (node->next == NULL && node->parent->last != node)
48 return no;
49
50 for (child = node->parent->content; child; child = child->next)
51 {
52 if (child == node)
53 break;
54 }
55 if ( node != child )
56 return no;
57 }
58
59 for (node = node->content; node; node = node->next)
60 if ( !CheckNodeIntegrity(node) )
61 return no;
62
63 #endif
64 return yes;
65 }
66
67 /*
68 used to determine how attributes
69 without values should be printed
70 this was introduced to deal with
71 user defined tags e.g. Cold Fusion
72 */
73 Bool IsNewNode(Node *node)
74 {
75 if (node && node->tag)
76 {
77 return (node->tag->model & CM_NEW);
78 }
79 return yes;
80 }
81
82 void CoerceNode(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected)
83 {
84 const Dict* tag = LookupTagDef(tid);
85 Node* tmp = InferredTag(doc, tag->id);
86
87 if (obsolete)
88 ReportWarning(doc, node, tmp, OBSOLETE_ELEMENT);
89 else if (unexpected)
90 ReportError(doc, node, tmp, REPLACING_UNEX_ELEMENT);
91 else
92 ReportNotice(doc, node, tmp, REPLACING_ELEMENT);
93
94 MemFree(tmp->element);
95 MemFree(tmp);
96
97 node->was = node->tag;
98 node->tag = tag;
99 node->type = StartTag;
100 node->implicit = yes;
101 MemFree(node->element);
102 node->element = tmbstrdup(tag->name);
103 }
104
105 /* extract a node and its children from a markup tree */
106 Node *RemoveNode(Node *node)
107 {
108 if (node->prev)
109 node->prev->next = node->next;
110
111 if (node->next)
112 node->next->prev = node->prev;
113
114 if (node->parent)
115 {
116 if (node->parent->content == node)
117 node->parent->content = node->next;
118
119 if (node->parent->last == node)
120 node->parent->last = node->prev;
121 }
122
123 node->parent = node->prev = node->next = NULL;
124 return node;
125 }
126
127 /* remove node from markup tree and discard it */
128 Node *DiscardElement( TidyDocImpl* doc, Node *element )
129 {
130 Node *next = NULL;
131
132 if (element)
133 {
134 next = element->next;
135 RemoveNode(element);
136 FreeNode( doc, element);
137 }
138
139 return next;
140 }
141
142 /*
143 insert "node" into markup tree as the firt element
144 of content of "element"
145 */
146 void InsertNodeAtStart(Node *element, Node *node)
147 {
148 node->parent = element;
149
150 if (element->content == NULL)
151 element->last = node;
152 else
153 element->content->prev = node;
154
155 node->next = element->content;
156 node->prev = NULL;
157 element->content = node;
158 }
159
160 /*
161 insert "node" into markup tree as the last element
162 of content of "element"
163 */
164 void InsertNodeAtEnd(Node *element, Node *node)
165 {
166 node->parent = element;
167 node->prev = element->last;
168
169 if (element->last != NULL)
170 element->last->next = node;
171 else
172 element->content = node;
173
174 element->last = node;
175 }
176
177 /*
178 insert "node" into markup tree in place of "element"
179 which is moved to become the child of the node
180 */
181 static void InsertNodeAsParent(Node *element, Node *node)
182 {
183 node->content = element;
184 node->last = element;
185 node->parent = element->parent;
186 element->parent = node;
187
188 if (node->parent->content == element)
189 node->parent->content = node;
190
191 if (node->parent->last == element)
192 node->parent->last = node;
193
194 node->prev = element->prev;
195 element->prev = NULL;
196
197 if (node->prev)
198 node->prev->next = node;
199
200 node->next = element->next;
201 element->next = NULL;
202
203 if (node->next)
204 node->next->prev = node;
205 }
206
207 /* insert "node" into markup tree before "element" */
208 void InsertNodeBeforeElement(Node *element, Node *node)
209 {
210 Node *parent;
211
212 parent = element->parent;
213 node->parent = parent;
214 node->next = element;
215 node->prev = element->prev;
216 element->prev = node;
217
218 if (node->prev)
219 node->prev->next = node;
220
221 if (parent->content == element)
222 parent->content = node;
223 }
224
225 /* insert "node" into markup tree after "element" */
226 void InsertNodeAfterElement(Node *element, Node *node)
227 {
228 Node *parent;
229
230 parent = element->parent;
231 node->parent = parent;
232
233 /* AQ - 13 Jan 2000 fix for parent == NULL */
234 if (parent != NULL && parent->last == element)
235 parent->last = node;
236 else
237 {
238 node->next = element->next;
239 /* AQ - 13 Jan 2000 fix for node->next == NULL */
240 if (node->next != NULL)
241 node->next->prev = node;
242 }
243
244 element->next = node;
245 node->prev = element;
246 }
247
248 static Bool CanPrune( TidyDocImpl* doc, Node *element )
249 {
250 if ( nodeIsText(element) )
251 return yes;
252
253 if ( element->content )
254 return no;
255
256 if ( element->tag == NULL )
257 return no;
258
259 if ( element->tag->model & CM_BLOCK && element->attributes != NULL )
260 return no;
261
262 if ( nodeIsA(element) && element->attributes != NULL )
263 return no;
264
265 if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) )
266 return no;
267
268 if ( element->tag->model & CM_ROW )
269 return no;
270
271 if ( element->tag->model & CM_EMPTY )
272 return no;
273
274 if ( nodeIsAPPLET(element) )
275 return no;
276
277 if ( nodeIsOBJECT(element) )
278 return no;
279
280 if ( nodeIsSCRIPT(element) && attrGetSRC(element) )
281 return no;
282
283 if ( nodeIsTITLE(element) )
284 return no;
285
286 /* #433359 - fix by Randy Waki 12 Mar 01 */
287 if ( nodeIsIFRAME(element) )
288 return no;
289
290 /* fix for bug 770297 */
291 if (nodeIsTEXTAREA(element))
292 return no;
293
294 if ( attrGetID(element) || attrGetNAME(element) )
295 return no;
296
297 /* fix for bug 695408; a better fix would look for unknown and */
298 /* known proprietary attributes that make the element significant */
299 if (attrGetDATAFLD(element))
300 return no;
301
302 /* fix for bug 723772, don't trim new-...-tags */
303 if (element->tag->id == TidyTag_UNKNOWN)
304 return no;
305
306 if (nodeIsBODY(element))
307 return no;
308
309 if (nodeIsCOLGROUP(element))
310 return no;
311
312 return yes;
313 }
314
315 Node *TrimEmptyElement( TidyDocImpl* doc, Node *element )
316 {
317 if ( CanPrune(doc, element) )
318 {
319 if (element->type != TextNode)
320 ReportNotice(doc, element, NULL, TRIM_EMPTY_ELEMENT);
321
322 return DiscardElement(doc, element);
323 }
324 return element;
325 }
326
327 Node* DropEmptyElements(TidyDocImpl* doc, Node* node)
328 {
329 Node* next;
330
331 while (node)
332 {
333 next = node->next;
334
335 if (node->content)
336 DropEmptyElements(doc, node->content);
337
338 if (!nodeIsElement(node) &&
339 !(nodeIsText(node) && !(node->start < node->end)))
340 {
341 node = next;
342 continue;
343 }
344
345 next = TrimEmptyElement(doc, node);
346 node = node == next ? node->next : next;
347 }
348
349 return node;
350 }
351
352 /*
353 errors in positioning of form start or end tags
354 generally require human intervention to fix
355 */
356 static void BadForm( TidyDocImpl* doc )
357 {
358 doc->badForm = yes;
359 /* doc->errors++; */
360 }
361
362 /*
363 This maps
364 <em>hello </em><strong>world</strong>
365 to
366 <em>hello</em> <strong>world</strong>
367
368 If last child of element is a text node
369 then trim trailing white space character
370 moving it to after element's end tag.
371 */
372 static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last )
373 {
374 Lexer* lexer = doc->lexer;
375 byte c;
376
377 if (nodeIsText(last))
378 {
379 if (last->end > last->start)
380 {
381 c = (byte) lexer->lexbuf[ last->end - 1 ];
382
383 if ( c == ' '
384 #ifdef COMMENT_NBSP_FIX
385 || c == 160
386 #endif
387 )
388 {
389 #ifdef COMMENT_NBSP_FIX
390 /* take care with <td> </td> */
391 if ( c == 160 &&
392 ( element->tag == doc->tags.tag_td ||
393 element->tag == doc->tags.tag_th )
394 )
395 {
396 if (last->end > last->start + 1)
397 last->end -= 1;
398 }
399 else
400 #endif
401 {
402 last->end -= 1;
403 if ( (element->tag->model & CM_INLINE) &&
404 !(element->tag->model & CM_FIELD) )
405 lexer->insertspace = yes;
406 }
407 }
408 }
409 }
410 }
411
#if 0
/*
  Disabled helper.  Creates a new node whose start/end range covers
  the literal markup of "element" appended to the lexer buffer:
  '<', an optional '/', the element name (or "!DOCTYPE " followed
  by the doctype's own buffer range), an optional trailing '/' for
  start-end tags, and '>'.
*/
static Node *EscapeTag(Lexer *lexer, Node *element)
{
    Node *escaped = NewNode(lexer);

    escaped->start = lexer->lexsize;
    AddByte(lexer, '<');

    if (element->type == EndTag)
        AddByte(lexer, '/');

    if (element->element)
    {
        char *cp = element->element;

        while (*cp != '\0')
        {
            AddByte(lexer, *cp);
            ++cp;
        }
    }
    else if (element->type == DocTypeTag)
    {
        uint pos = element->start;

        AddStringLiteral( lexer, "!DOCTYPE " );

        while (pos < element->end)
        {
            AddByte(lexer, lexer->lexbuf[pos]);
            ++pos;
        }
    }

    if (element->type == StartEndTag)
        AddByte(lexer, '/');

    AddByte(lexer, '>');
    escaped->end = lexer->lexsize;

    return escaped;
}
#endif /* 0 */
446
447 /* Only true for text nodes. */
448 Bool IsBlank(Lexer *lexer, Node *node)
449 {
450 Bool isBlank = nodeIsText(node);
451 if ( isBlank )
452 isBlank = ( node->end == node->start || /* Zero length */
453 ( node->end == node->start+1 /* or one blank. */
454 && lexer->lexbuf[node->start] == ' ' ) );
455 return isBlank;
456 }
457
458 /*
459 This maps
460 <p>hello<em> world</em>
461 to
462 <p>hello <em>world</em>
463
464 Trims initial space, by moving it before the
465 start tag, or if this element is the first in
466 parent's content, then by discarding the space
467 */
468 static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text )
469 {
470 Lexer* lexer = doc->lexer;
471 Node *prev, *node;
472
473 if ( nodeIsText(text) &&
474 lexer->lexbuf[text->start] == ' ' &&
475 text->start < text->end )
476 {
477 if ( (element->tag->model & CM_INLINE) &&
478 !(element->tag->model & CM_FIELD) )
479 {
480 prev = element->prev;
481
482 if (nodeIsText(prev))
483 {
484 if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ')
485 lexer->lexbuf[(prev->end)++] = ' ';
486
487 ++(element->start);
488 }
489 else /* create new node */
490 {
491 node = NewNode(lexer);
492 node->start = (element->start)++;
493 node->end = element->start;
494 lexer->lexbuf[node->start] = ' ';
495 InsertNodeBeforeElement(element ,node);
496 }
497 }
498
499 /* discard the space in current node */
500 ++(text->start);
501 }
502 }
503
504 static Bool IsPreDescendant(Node* node)
505 {
506 Node *parent = node->parent;
507
508 while (parent)
509 {
510 if (parent->tag && parent->tag->parser == ParsePre)
511 return yes;
512
513 parent = parent->parent;
514 }
515
516 return no;
517 }
518
519 static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node)
520 {
521 Node* next;
522
523 if (!nodeIsText(node))
524 return no;
525
526 if (node->parent->type == DocTypeTag)
527 return no;
528
529 if (IsPreDescendant(node))
530 return no;
531
532 if (node->parent->tag->parser == ParseScript)
533 return no;
534
535 next = node->next;
536
537 /* <p>... </p> */
538 if (!next && !nodeHasCM(node->parent, CM_INLINE))
539 return yes;
540
541 /* <div><small>... </small><h3>...</h3></div> */
542 if (!next && node->parent->next && !nodeHasCM(node->parent->next, CM_INLINE))
543 return yes;
544
545 if (!next)
546 return no;
547
548 if (nodeIsBR(next))
549 return yes;
550
551 if (nodeHasCM(next, CM_INLINE))
552 return no;
553
554 /* <a href='/'>...</a> <p>...</p> */
555 if (next->type == StartTag)
556 return yes;
557
558 /* <strong>...</strong> <hr /> */
559 if (next->type == StartEndTag)
560 return yes;
561
562 /* evil adjacent text nodes, Tidy should not generate these :-( */
563 if (nodeIsText(next) && next->start < next->end
564 && IsWhite(doc->lexer->lexbuf[next->start]))
565 return yes;
566
567 return no;
568 }
569
570 static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node)
571 {
572 if (!nodeIsText(node))
573 return no;
574
575 if (node->parent->type == DocTypeTag)
576 return no;
577
578 if (IsPreDescendant(node))
579 return no;
580
581 if (node->parent->tag->parser == ParseScript)
582 return no;
583
584 /* <p>...<br> <em>...</em>...</p> */
585 if (nodeIsBR(node->prev))
586 return yes;
587
588 /* <p> ...</p> */
589 if (node->prev == NULL && !nodeHasCM(node->parent, CM_INLINE))
590 return yes;
591
592 /* <h4>...</h4> <em>...</em> */
593 if (node->prev && !nodeHasCM(node->prev, CM_INLINE) &&
594 nodeIsElement(node->prev))
595 return yes;
596
597 /* <p><span> ...</span></p> */
598 if (!node->prev && !node->parent->prev && !nodeHasCM(node->parent->parent, CM_INLINE))
599 return yes;
600
601 return no;
602 }
603
604 static void CleanSpaces(TidyDocImpl* doc, Node* node)
605 {
606 Node* next;
607
608 while (node)
609 {
610 next = node->next;
611
612 if (nodeIsText(node) && CleanLeadingWhitespace(doc, node))
613 while (node->start < node->end && IsWhite(doc->lexer->lexbuf[node->start]))
614 ++(node->start);
615
616 if (nodeIsText(node) && CleanTrailingWhitespace(doc, node))
617 while (node->end > node->start && IsWhite(doc->lexer->lexbuf[node->end - 1]))
618 --(node->end);
619
620 if (nodeIsText(node) && !(node->start < node->end))
621 {
622 RemoveNode(node);
623 FreeNode(doc, node);
624 node = next;
625
626 continue;
627 }
628
629 if (node->content)
630 CleanSpaces(doc, node->content);
631
632 node = next;
633 }
634 }
635
636 /*
637 Move initial and trailing space out.
638 This routine maps:
639
640 hello<em> world</em>
641 to
642 hello <em>world</em>
643 and
644 <em>hello </em><strong>world</strong>
645 to
646 <em>hello</em> <strong>world</strong>
647 */
648 static void TrimSpaces( TidyDocImpl* doc, Node *element)
649 {
650 Node* text = element->content;
651
652 if (nodeIsPRE(element) || IsPreDescendant(element))
653 return;
654
655 if (nodeIsText(text))
656 TrimInitialSpace(doc, element, text);
657
658 text = element->last;
659
660 if (nodeIsText(text))
661 TrimTrailingSpace(doc, element, text);
662 }
663
664 Bool DescendantOf( Node *element, TidyTagId tid )
665 {
666 Node *parent;
667 for ( parent = element->parent;
668 parent != NULL;
669 parent = parent->parent )
670 {
671 if ( TagIsId(parent, tid) )
672 return yes;
673 }
674 return no;
675 }
676
677 static Bool InsertMisc(Node *element, Node *node)
678 {
679 if (node->type == CommentTag ||
680 node->type == ProcInsTag ||
681 node->type == CDATATag ||
682 node->type == SectionTag ||
683 node->type == AspTag ||
684 node->type == JsteTag ||
685 node->type == PhpTag )
686 {
687 InsertNodeAtEnd(element, node);
688 return yes;
689 }
690
691 if ( node->type == XmlDecl )
692 {
693 Node* root = element;
694 while ( root && root->parent )
695 root = root->parent;
696 if ( root )
697 {
698 InsertNodeAtStart( root, node );
699 return yes;
700 }
701 }
702
703 /* Declared empty tags seem to be slipping through
704 ** the cracks. This is an experiment to figure out
705 ** a decent place to pick them up.
706 */
707 if ( node->tag &&
708 nodeIsElement(node) &&
709 nodeCMIsEmpty(node) && TagId(node) == TidyTag_UNKNOWN &&
710 (node->tag->versions & VERS_PROPRIETARY) != 0 )
711 {
712 InsertNodeAtEnd(element, node);
713 return yes;
714 }
715
716 return no;
717 }
718
719
720 static void ParseTag( TidyDocImpl* doc, Node *node, uint mode )
721 {
722 Lexer* lexer = doc->lexer;
723 /*
724 Fix by GLP 2000-12-21. Need to reset insertspace if this
725 is both a non-inline and empty tag (base, link, meta, isindex, hr, area).
726 */
727 if (node->tag->model & CM_EMPTY)
728 {
729 lexer->waswhite = no;
730 if (node->tag->parser == NULL)
731 return;
732 }
733 else if (!(node->tag->model & CM_INLINE))
734 lexer->insertspace = no;
735
736 if (node->tag->parser == NULL)
737 return;
738
739 if (node->type == StartEndTag)
740 return;
741
742 (*node->tag->parser)( doc, node, mode );
743 }
744
745 /*
746 the doctype has been found after other tags,
747 and needs moving to before the html element
748 */
749 static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype )
750 {
751 Node* existing = FindDocType( doc );
752 if ( existing )
753 {
754 ReportError(doc, element, doctype, DISCARDING_UNEXPECTED );
755 FreeNode( doc, doctype );
756 }
757 else
758 {
759 ReportError(doc, element, doctype, DOCTYPE_AFTER_TAGS );
760 while ( !nodeIsHTML(element) )
761 element = element->parent;
762 InsertNodeBeforeElement( element, doctype );
763 }
764 }
765
766 /*
767 move node to the head, where element is used as starting
768 point in hunt for head. normally called during parsing
769 */
770 static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node )
771 {
772 Node *head;
773
774 RemoveNode( node ); /* make sure that node is isolated */
775
776 if ( nodeIsElement(node) )
777 {
778 ReportError(doc, element, node, TAG_NOT_ALLOWED_IN );
779
780 head = FindHEAD(doc);
781 assert(head != NULL);
782
783 InsertNodeAtEnd(head, node);
784
785 if ( node->tag->parser )
786 ParseTag( doc, node, IgnoreWhitespace );
787 }
788 else
789 {
790 ReportError(doc, element, node, DISCARDING_UNEXPECTED);
791 FreeNode( doc, node );
792 }
793 }
794
795 /* moves given node to end of body element */
796 static void MoveNodeToBody( TidyDocImpl* doc, Node* node )
797 {
798 Node* body = FindBody( doc );
799 if ( body )
800 {
801 RemoveNode( node );
802 InsertNodeAtEnd( body, node );
803 }
804 }
805
806 /*
807 element is node created by the lexer
808 upon seeing the start tag, or by the
809 parser when the start tag is inferred

 ParseBlock reads tokens with GetToken() and builds the content of
 "element" until one of the following happens:
   - the matching end tag is seen (element->closed is set),
   - a token shows that the element must end implicitly (the token
     is pushed back with UngetToken() for the caller to re-read),
   - the input runs out.
 Elements with the CM_EMPTY model are returned immediately.

 The "mode" argument is overwritten with IgnoreWhitespace before
 the first token is read, so the caller's value has no effect.

 For CM_OBJECT elements the inline stack base is moved up on entry
 and put back on the way out.
810 */
811 void ParseBlock( TidyDocImpl* doc, Node *element, uint mode)
812 {
813 Lexer* lexer = doc->lexer;
814 Node *node;
/* checkstack: yes while the pending inline stack still has to be replayed before the next text/inline node */
815 Bool checkstack = yes;
/* istackbase: saved lexer->istackbase, only meaningful for CM_OBJECT elements */
816 uint istackbase = 0;
817
818 if ( element->tag->model & CM_EMPTY )
819 return;
820
/* a form nested inside another form is reported but still parsed */
821 if ( nodeIsFORM(element) &&
822 DescendantOf(element, TidyTag_FORM) )
823 ReportError(doc, element, NULL, ILLEGAL_NESTING );
824
825 /*
826 InlineDup() asks the lexer to insert inline emphasis tags
827 currently pushed on the istack, but take care to avoid
828 propagating inline emphasis inside OBJECT or APPLET.
829 For these elements a fresh inline stack context is created
830 and disposed of upon reaching the end of the element.
831 They thus behave like table cells in this respect.
832 */
833 if (element->tag->model & CM_OBJECT)
834 {
835 istackbase = lexer->istackbase;
836 lexer->istackbase = lexer->istacksize;
837 }
838
839 if (!(element->tag->model & CM_MIXED))
840 InlineDup( doc, NULL );
841
/* the caller-supplied mode is discarded here */
842 mode = IgnoreWhitespace;
843
844 while ((node = GetToken(doc, mode /*MixedContent*/)) != NULL)
845 {
846 /* end tag for this element */
/* element->was is also accepted, so an end tag for the tag the element had before being coerced closes it too */
847 if (node->type == EndTag && node->tag &&
848 (node->tag == element->tag || element->was == node->tag))
849 {
850 FreeNode( doc, node );
851
852 if (element->tag->model & CM_OBJECT)
853 {
854 /* pop inline stack */
855 while (lexer->istacksize > lexer->istackbase)
856 PopInline( doc, NULL );
857 lexer->istackbase = istackbase;
858 }
859
860 element->closed = yes;
861 TrimSpaces( doc, element );
862 return;
863 }
864
865 if ( nodeIsBODY( node ) && DescendantOf( element, TidyTag_HEAD ))
866 {
867 /* If we're in the HEAD, close it before proceeding.
868 This is an extremely rare occurrence, but has been observed.
869 */
870 UngetToken( doc );
871 break;
872 }
873
/* html, head and body tokens are dropped here; only start tags are reported */
874 if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) )
875 {
876 if ( nodeIsElement(node) )
877 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
878 FreeNode( doc, node );
879 continue;
880 }
881
882
883 if (node->type == EndTag)
884 {
885 if (node->tag == NULL)
886 {
887 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
888 FreeNode( doc, node );
889 continue;
890 }
/* </br> is treated as <br> */
891 else if ( nodeIsBR(node) )
892 node->type = StartTag;
893 else if ( nodeIsP(node) )
894 {
895 /* Cannot have a block inside a paragraph, so no checking
896 for an ancestor is necessary -- but we _can_ have
897 paragraphs inside a block, so change it to an implicit
898 empty paragraph, to be dealt with according to the user's
899 options
900 */
901 node->type = StartEndTag;
902 node->implicit = yes;
903 #if OBSOLETE
904 CoerceNode(doc, node, TidyTag_BR, no, no);
905 FreeAttrs( doc, node ); /* discard align attribute etc. */
906 InsertNodeAtEnd( element, node );
907 node = InferredTag(doc, TidyTag_BR);
908 #endif
909 }
910 else if (DescendantOf( element, node->tag->id ))
911 {
912 /*
913 if this is the end tag for an ancestor element
914 then infer end tag for this element
915 */
916 UngetToken( doc );
917 break;
/* the OBSOLETE code below follows the break above and is therefore unreachable even when enabled */
918 #if OBSOLETE
919 Node *parent;
920 for ( parent = element->parent;
921 parent != NULL;
922 parent = parent->parent )
923 {
924 if (node->tag == parent->tag)
925 {
926 if (!(element->tag->model & CM_OPT))
927 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE );
928
929 UngetToken( doc );
930
931 if (element->tag->model & CM_OBJECT)
932 {
933 /* pop inline stack */
934 while (lexer->istacksize > lexer->istackbase)
935 PopInline( doc, NULL );
936 lexer->istackbase = istackbase;
937 }
938
939 TrimSpaces( doc, element );
940 return;
941 }
942 }
943 #endif
944 }
945 else
946 {
947 /* special case </tr> etc. for stuff moved in front of table */
948 if ( lexer->exiled
949 && node->tag->model
950 && (node->tag->model & CM_TABLE) )
951 {
952 UngetToken( doc );
953 TrimSpaces( doc, element );
954 return;
955 }
956 }
957 }
958
959 /* mixed content model permits text */
960 if (nodeIsText(node))
961 {
962 if ( checkstack )
963 {
964 checkstack = no;
965 if (!(element->tag->model & CM_MIXED))
966 {
/* a positive result from InlineDup() means the text node is not inserted on this pass */
967 if ( InlineDup(doc, node) > 0 )
968 continue;
969 }
970 }
971
972 InsertNodeAtEnd(element, node);
973 mode = MixedContent;
974
975 /*
976 HTML4 strict doesn't allow mixed content for
977 elements with %block; as their content model
978 */
979 /*
980 But only body, map, blockquote, form and
981 noscript have content model %block;
982 */
983 if ( nodeIsBODY(element) ||
984 nodeIsMAP(element) ||
985 nodeIsBLOCKQUOTE(element) ||
986 nodeIsFORM(element) ||
987 nodeIsNOSCRIPT(element) )
988 ConstrainVersion( doc, ~VERS_HTML40_STRICT );
989 continue;
990 }
991
992 if ( InsertMisc(element, node) )
993 continue;
994
995 /* allow PARAM elements? */
996 if ( nodeIsPARAM(node) )
997 {
998 if ( nodeHasCM(element, CM_PARAM) && nodeIsElement(node) )
999 {
1000 InsertNodeAtEnd(element, node);
1001 continue;
1002 }
1003
1004 /* otherwise discard it */
1005 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1006 FreeNode( doc, node );
1007 continue;
1008 }
1009
1010 /* allow AREA elements? */
1011 if ( nodeIsAREA(node) )
1012 {
1013 if ( nodeIsMAP(element) && nodeIsElement(node) )
1014 {
1015 InsertNodeAtEnd(element, node);
1016 continue;
1017 }
1018
1019 /* otherwise discard it */
1020 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1021 FreeNode( doc, node );
1022 continue;
1023 }
1024
1025 /* ignore unknown start/end tags */
1026 if ( node->tag == NULL )
1027 {
1028 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1029 FreeNode( doc, node );
1030 continue;
1031 }
1032
1033 /*
1034 Allow CM_INLINE elements here.
1035
1036 Allow CM_BLOCK elements here unless
1037 lexer->excludeBlocks is yes.
1038
1039 LI and DD are special cased.
1040
1041 Otherwise infer end tag for this element.
1042 */
1043
1044 if ( !nodeHasCM(node, CM_INLINE) )
1045 {
/* non-inline tokens that are not start tags are discarded; a stray form tag additionally flags the document */
1046 if ( !nodeIsElement(node) )
1047 {
1048 if ( nodeIsFORM(node) )
1049 BadForm( doc );
1050
1051 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1052 FreeNode( doc, node );
1053 continue;
1054 }
1055
1056 /* #427671 - Fix by Randy Waki - 10 Aug 00 */
1057 /*
1058 If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION
1059 start tag, discard the start tag and let the subsequent content get
1060 parsed as content of the enclosing LI. This seems to mimic IE and
1061 Netscape, and avoids an infinite loop: without this check,
1062 ParseBlock (which is parsing the LI's content) and ParseList (which
1063 is parsing the LI's parent's content) repeatedly defer to each
1064 other to parse the illegal start tag, each time inferring a missing
1065 </li> or <li> respectively.
1066
1067 NOTE: This check is a bit fragile. It specifically checks for the
1068 four tags that happen to weave their way through the current series
1069 of tests performed by ParseBlock and ParseList to trigger the
1070 infinite loop.
1071 */
1072 if ( nodeIsLI(element) )
1073 {
1074 if ( nodeIsFRAME(node) ||
1075 nodeIsFRAMESET(node) ||
1076 nodeIsOPTGROUP(node) ||
1077 nodeIsOPTION(node) )
1078 {
1079 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1080 FreeNode( doc, node ); /* DSR - 27Apr02 avoid memory leak */
1081 continue;
1082 }
1083 }
1084
1085 if ( nodeIsTD(element) || nodeIsTH(element) )
1086 {
1087 /* if parent is a table cell, avoid inferring the end of the cell */
1088
1089 if ( nodeHasCM(node, CM_HEAD) )
1090 {
1091 MoveToHead( doc, element, node );
1092 continue;
1093 }
1094
/* a list item directly in a cell: push the token back and continue with an inferred list element instead */
1095 if ( nodeHasCM(node, CM_LIST) )
1096 {
1097 UngetToken( doc );
1098 node = InferredTag(doc, TidyTag_UL);
1099 /* AddClass( doc, node, "noindent" ); */
1100 lexer->excludeBlocks = yes;
1101 }
1102 else if ( nodeHasCM(node, CM_DEFLIST) )
1103 {
1104 UngetToken( doc );
1105 node = InferredTag(doc, TidyTag_DL);
1106 lexer->excludeBlocks = yes;
1107 }
1108
1109 /* infer end of current table cell */
1110 if ( !nodeHasCM(node, CM_BLOCK) )
1111 {
1112 UngetToken( doc );
1113 TrimSpaces( doc, element );
1114 return;
1115 }
1116 }
1117 else if ( nodeHasCM(node, CM_BLOCK) )
1118 {
1119 if ( lexer->excludeBlocks )
1120 {
1121 if ( !nodeHasCM(element, CM_OPT) )
1122 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE );
1123
1124 UngetToken( doc );
1125
/* NOTE(review): unlike the other CM_OBJECT exits, this path restores istackbase without first popping the inline stack -- confirm intended */
1126 if ( nodeHasCM(element, CM_OBJECT) )
1127 lexer->istackbase = istackbase;
1128
1129 TrimSpaces( doc, element );
1130 return;
1131 }
1132 }
1133 else /* things like list items */
1134 {
1135 if (node->tag->model & CM_HEAD)
1136 {
1137 MoveToHead( doc, element, node );
1138 continue;
1139 }
1140
1141 /*
1142 special case where a form start tag
1143 occurs in a tr and is followed by td or th
1144 */
1145
1146 if ( nodeIsFORM(element) &&
1147 nodeIsTD(element->parent) &&
1148 element->parent->implicit )
1149 {
1150 if ( nodeIsTD(node) )
1151 {
1152 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1153 FreeNode( doc, node );
1154 continue;
1155 }
1156
/* a th token is discarded too, but the implicit td that holds the form is renamed to th */
1157 if ( nodeIsTH(node) )
1158 {
1159 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1160 FreeNode( doc, node );
1161 node = element->parent;
1162 MemFree(node->element);
1163 node->element = tmbstrdup("th");
1164 node->tag = LookupTagDef( TidyTag_TH );
1165 continue;
1166 }
1167 }
1168
1169 if ( !nodeHasCM(element, CM_OPT) && !element->implicit )
1170 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE );
1171
/* the token is pushed back in every case below; some cases return, others continue with an inferred wrapper element */
1172 UngetToken( doc );
1173
1174 if ( nodeHasCM(node, CM_LIST) )
1175 {
1176 if ( element->parent && element->parent->tag &&
1177 element->parent->tag->parser == ParseList )
1178 {
1179 TrimSpaces( doc, element );
1180 return;
1181 }
1182
1183 node = InferredTag(doc, TidyTag_UL);
1184 /* AddClass( doc, node, "noindent" ); */
1185 }
1186 else if ( nodeHasCM(node, CM_DEFLIST) )
1187 {
1188 if ( nodeIsDL(element->parent) )
1189 {
1190 TrimSpaces( doc, element );
1191 return;
1192 }
1193
1194 node = InferredTag(doc, TidyTag_DL);
1195 }
1196 else if ( nodeHasCM(node, CM_TABLE) || nodeHasCM(node, CM_ROW) )
1197 {
1198 node = InferredTag(doc, TidyTag_TABLE);
1199 }
1200 else if ( nodeHasCM(element, CM_OBJECT) )
1201 {
1202 /* pop inline stack */
1203 while ( lexer->istacksize > lexer->istackbase )
1204 PopInline( doc, NULL );
1205 lexer->istackbase = istackbase;
1206 TrimSpaces( doc, element );
1207 return;
1208
1209 }
1210 else
1211 {
1212 TrimSpaces( doc, element );
1213 return;
1214 }
1215 }
1216 }
1217
1218 /* parse known element */
1219 if (nodeIsElement(node))
1220 {
1221 if (node->tag->model & CM_INLINE)
1222 {
1223 if (checkstack && !node->implicit)
1224 {
1225 checkstack = no;
1226
1227 if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */
1228 {
1229 if ( InlineDup(doc, node) > 0 )
1230 continue;
1231 }
1232 }
1233
1234 mode = MixedContent;
1235 }
1236 else
1237 {
/* after a non-inline child the inline stack has to be checked again */
1238 checkstack = yes;
1239 mode = IgnoreWhitespace;
1240 }
1241
1242 /* trim white space before <br> */
1243 if ( nodeIsBR(node) )
1244 TrimSpaces( doc, element );
1245
1246 InsertNodeAtEnd(element, node);
1247
1248 if (node->implicit)
1249 ReportError(doc, element, node, INSERTING_TAG );
1250
1251 ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
1252 continue;
1253 }
1254
1255 /* discard unexpected tags */
1256 if (node->type == EndTag)
1257 PopInline( doc, node ); /* if inline end tag */
1258
1259 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1260 FreeNode( doc, node );
1261 continue;
1262 }
1263
/* reached via break or when GetToken() returned NULL: the element was never explicitly closed */
1264 if (!(element->tag->model & CM_OPT))
1265 ReportError(doc, element, node, MISSING_ENDTAG_FOR);
1266
1267 if (element->tag->model & CM_OBJECT)
1268 {
1269 /* pop inline stack */
1270 while ( lexer->istacksize > lexer->istackbase )
1271 PopInline( doc, NULL );
1272 lexer->istackbase = istackbase;
1273 }
1274
1275 TrimSpaces( doc, element );
1276 }
1277
1278 void ParseInline( TidyDocImpl* doc, Node *element, uint mode )
1279 {
1280 Lexer* lexer = doc->lexer;
1281 Node *node, *parent;
1282
1283 if (element->tag->model & CM_EMPTY)
1284 return;
1285
1286 /*
1287 ParseInline is used for some block level elements like H1 to H6
1288 For such elements we need to insert inline emphasis tags currently
1289 on the inline stack. For Inline elements, we normally push them
1290 onto the inline stack provided they aren't implicit or OBJECT/APPLET.
1291 This test is carried out in PushInline and PopInline, see istack.c
1292
1293 InlineDup(...) is not called for elements with a CM_MIXED (inline and
1294 block) content model, e.g. <del> or <ins>, otherwise constructs like
1295
1296 <p>111<a name='foo'>222<del>333</del>444</a>555</p>
1297 <p>111<span>222<del>333</del>444</span>555</p>
1298 <p>111<em>222<del>333</del>444</em>555</p>
1299
1300 will get corrupted.
1301 */
1302 if ((nodeHasCM(element, CM_BLOCK) || nodeIsDT(element)) &&
1303 !nodeHasCM(element, CM_MIXED))
1304 InlineDup(doc, NULL);
1305 else if (nodeHasCM(element, CM_INLINE))
1306 PushInline(doc, element);
1307
1308 if ( nodeIsNOBR(element) )
1309 doc->badLayout |= USING_NOBR;
1310 else if ( nodeIsFONT(element) )
1311 doc->badLayout |= USING_FONT;
1312
1313 /* Inline elements may or may not be within a preformatted element */
1314 if (mode != Preformatted)
1315 mode = MixedContent;
1316
1317 while ((node = GetToken(doc, mode)) != NULL)
1318 {
1319 /* end tag for current element */
1320 if (node->tag == element->tag && node->type == EndTag)
1321 {
1322 if (element->tag->model & CM_INLINE)
1323 PopInline( doc, node );
1324
1325 FreeNode( doc, node );
1326
1327 if (!(mode & Preformatted))
1328 TrimSpaces(doc, element);
1329
1330 /*
1331 if a font element wraps an anchor and nothing else
1332 then move the font element inside the anchor since
1333 otherwise it won't alter the anchor text color
1334 */
1335 if ( nodeIsFONT(element) &&
1336 element->content && element->content == element->last )
1337 {
1338 Node *child = element->content;
1339
1340 if ( nodeIsA(child) )
1341 {
1342 child->