Version:
~ [ 1.0 ] ~
** Warning: Cannot open xref database.
1 /* parser.c -- HTML Parser
2
3 (c) 1998-2005 (W3C) MIT, ERCIM, Keio University
4 See tidy.h for the copyright notice.
5
6 CVS Info :
7
8 $Author: arnaud02 $
9 $Date: 2005/10/21 12:54:15 $
10 $Revision: 1.150 $
11
12 */
13
14 #include "tidy-int.h"
15 #include "lexer.h"
16 #include "parser.h"
17 #include "message.h"
18 #include "clean.h"
19 #include "tags.h"
20 #include "tmbstr.h"
21
22 #ifdef AUTO_INPUT_ENCODING
23 #include "charsets.h"
24 #endif
25
26 Bool CheckNodeIntegrity(Node *node)
27 {
28 #ifndef NO_NODE_INTEGRITY_CHECK
29 if (node->prev)
30 {
31 if (node->prev->next != node)
32 return no;
33 }
34
35 if (node->next)
36 {
37 if (node->next->prev != node)
38 return no;
39 }
40
41 if (node->parent)
42 {
43 Node *child = NULL;
44 if (node->prev == NULL && node->parent->content != node)
45 return no;
46
47 if (node->next == NULL && node->parent->last != node)
48 return no;
49
50 for (child = node->parent->content; child; child = child->next)
51 {
52 if (child == node)
53 break;
54 }
55 if ( node != child )
56 return no;
57 }
58
59 for (node = node->content; node; node = node->next)
60 if ( !CheckNodeIntegrity(node) )
61 return no;
62
63 #endif
64 return yes;
65 }
66
67 /*
68 used to determine how attributes
69 without values should be printed
70 this was introduced to deal with
71 user defined tags e.g. Cold Fusion
72 */
73 Bool IsNewNode(Node *node)
74 {
75 if (node && node->tag)
76 {
77 return (node->tag->model & CM_NEW);
78 }
79 return yes;
80 }
81
82 void CoerceNode(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected)
83 {
84 const Dict* tag = LookupTagDef(tid);
85 Node* tmp = InferredTag(doc, tag->id);
86
87 if (obsolete)
88 ReportWarning(doc, node, tmp, OBSOLETE_ELEMENT);
89 else if (unexpected)
90 ReportError(doc, node, tmp, REPLACING_UNEX_ELEMENT);
91 else
92 ReportNotice(doc, node, tmp, REPLACING_ELEMENT);
93
94 MemFree(tmp->element);
95 MemFree(tmp);
96
97 node->was = node->tag;
98 node->tag = tag;
99 node->type = StartTag;
100 node->implicit = yes;
101 MemFree(node->element);
102 node->element = tmbstrdup(tag->name);
103 }
104
105 /* extract a node and its children from a markup tree */
106 Node *RemoveNode(Node *node)
107 {
108 if (node->prev)
109 node->prev->next = node->next;
110
111 if (node->next)
112 node->next->prev = node->prev;
113
114 if (node->parent)
115 {
116 if (node->parent->content == node)
117 node->parent->content = node->next;
118
119 if (node->parent->last == node)
120 node->parent->last = node->prev;
121 }
122
123 node->parent = node->prev = node->next = NULL;
124 return node;
125 }
126
127 /* remove node from markup tree and discard it */
128 Node *DiscardElement( TidyDocImpl* doc, Node *element )
129 {
130 Node *next = NULL;
131
132 if (element)
133 {
134 next = element->next;
135 RemoveNode(element);
136 FreeNode( doc, element);
137 }
138
139 return next;
140 }
141
142 /*
143 insert "node" into markup tree as the firt element
144 of content of "element"
145 */
146 void InsertNodeAtStart(Node *element, Node *node)
147 {
148 node->parent = element;
149
150 if (element->content == NULL)
151 element->last = node;
152 else
153 element->content->prev = node;
154
155 node->next = element->content;
156 node->prev = NULL;
157 element->content = node;
158 }
159
160 /*
161 insert "node" into markup tree as the last element
162 of content of "element"
163 */
164 void InsertNodeAtEnd(Node *element, Node *node)
165 {
166 node->parent = element;
167 node->prev = element->last;
168
169 if (element->last != NULL)
170 element->last->next = node;
171 else
172 element->content = node;
173
174 element->last = node;
175 }
176
177 /*
178 insert "node" into markup tree in place of "element"
179 which is moved to become the child of the node
180 */
181 static void InsertNodeAsParent(Node *element, Node *node)
182 {
183 node->content = element;
184 node->last = element;
185 node->parent = element->parent;
186 element->parent = node;
187
188 if (node->parent->content == element)
189 node->parent->content = node;
190
191 if (node->parent->last == element)
192 node->parent->last = node;
193
194 node->prev = element->prev;
195 element->prev = NULL;
196
197 if (node->prev)
198 node->prev->next = node;
199
200 node->next = element->next;
201 element->next = NULL;
202
203 if (node->next)
204 node->next->prev = node;
205 }
206
207 /* insert "node" into markup tree before "element" */
208 void InsertNodeBeforeElement(Node *element, Node *node)
209 {
210 Node *parent;
211
212 parent = element->parent;
213 node->parent = parent;
214 node->next = element;
215 node->prev = element->prev;
216 element->prev = node;
217
218 if (node->prev)
219 node->prev->next = node;
220
221 if (parent->content == element)
222 parent->content = node;
223 }
224
225 /* insert "node" into markup tree after "element" */
226 void InsertNodeAfterElement(Node *element, Node *node)
227 {
228 Node *parent;
229
230 parent = element->parent;
231 node->parent = parent;
232
233 /* AQ - 13 Jan 2000 fix for parent == NULL */
234 if (parent != NULL && parent->last == element)
235 parent->last = node;
236 else
237 {
238 node->next = element->next;
239 /* AQ - 13 Jan 2000 fix for node->next == NULL */
240 if (node->next != NULL)
241 node->next->prev = node;
242 }
243
244 element->next = node;
245 node->prev = element;
246 }
247
248 static Bool CanPrune( TidyDocImpl* doc, Node *element )
249 {
250 if ( nodeIsText(element) )
251 return yes;
252
253 if ( element->content )
254 return no;
255
256 if ( element->tag == NULL )
257 return no;
258
259 if ( element->tag->model & CM_BLOCK && element->attributes != NULL )
260 return no;
261
262 if ( nodeIsA(element) && element->attributes != NULL )
263 return no;
264
265 if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) )
266 return no;
267
268 if ( element->tag->model & CM_ROW )
269 return no;
270
271 if ( element->tag->model & CM_EMPTY )
272 return no;
273
274 if ( nodeIsAPPLET(element) )
275 return no;
276
277 if ( nodeIsOBJECT(element) )
278 return no;
279
280 if ( nodeIsSCRIPT(element) && attrGetSRC(element) )
281 return no;
282
283 if ( nodeIsTITLE(element) )
284 return no;
285
286 /* #433359 - fix by Randy Waki 12 Mar 01 */
287 if ( nodeIsIFRAME(element) )
288 return no;
289
290 /* fix for bug 770297 */
291 if (nodeIsTEXTAREA(element))
292 return no;
293
294 if ( attrGetID(element) || attrGetNAME(element) )
295 return no;
296
297 /* fix for bug 695408; a better fix would look for unknown and */
298 /* known proprietary attributes that make the element significant */
299 if (attrGetDATAFLD(element))
300 return no;
301
302 /* fix for bug 723772, don't trim new-...-tags */
303 if (element->tag->id == TidyTag_UNKNOWN)
304 return no;
305
306 if (nodeIsBODY(element))
307 return no;
308
309 if (nodeIsCOLGROUP(element))
310 return no;
311
312 return yes;
313 }
314
315 Node *TrimEmptyElement( TidyDocImpl* doc, Node *element )
316 {
317 if ( CanPrune(doc, element) )
318 {
319 if (element->type != TextNode)
320 ReportNotice(doc, element, NULL, TRIM_EMPTY_ELEMENT);
321
322 return DiscardElement(doc, element);
323 }
324 return element;
325 }
326
327 Node* DropEmptyElements(TidyDocImpl* doc, Node* node)
328 {
329 Node* next;
330
331 while (node)
332 {
333 next = node->next;
334
335 if (node->content)
336 DropEmptyElements(doc, node->content);
337
338 if (!nodeIsElement(node) &&
339 !(nodeIsText(node) && !(node->start < node->end)))
340 {
341 node = next;
342 continue;
343 }
344
345 next = TrimEmptyElement(doc, node);
346 node = node == next ? node->next : next;
347 }
348
349 return node;
350 }
351
352 /*
353 errors in positioning of form start or end tags
354 generally require human intervention to fix
355 */
356 static void BadForm( TidyDocImpl* doc )
357 {
358 doc->badForm = yes;
359 /* doc->errors++; */
360 }
361
362 /*
363 This maps
364 <em>hello </em><strong>world</strong>
365 to
366 <em>hello</em> <strong>world</strong>
367
368 If last child of element is a text node
369 then trim trailing white space character
370 moving it to after element's end tag.
371 */
372 static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last )
373 {
374 Lexer* lexer = doc->lexer;
375 byte c;
376
377 if (nodeIsText(last))
378 {
379 if (last->end > last->start)
380 {
381 c = (byte) lexer->lexbuf[ last->end - 1 ];
382
383 if ( c == ' '
384 #ifdef COMMENT_NBSP_FIX
385 || c == 160
386 #endif
387 )
388 {
389 #ifdef COMMENT_NBSP_FIX
390 /* take care with <td> </td> */
391 if ( c == 160 &&
392 ( element->tag == doc->tags.tag_td ||
393 element->tag == doc->tags.tag_th )
394 )
395 {
396 if (last->end > last->start + 1)
397 last->end -= 1;
398 }
399 else
400 #endif
401 {
402 last->end -= 1;
403 if ( (element->tag->model & CM_INLINE) &&
404 !(element->tag->model & CM_FIELD) )
405 lexer->insertspace = yes;
406 }
407 }
408 }
409 }
410 }
411
#if 0
/*
  Disabled code. Appends the literal text of a tag to the lexer
  buffer and returns a new node whose start/end span that text:
  '<', an optional '/', the element name (or "!DOCTYPE " followed
  by the node's own bytes for a doctype), an optional '/' for a
  start-end tag, and '>'.
*/
static Node *EscapeTag(Lexer *lexer, Node *element)
{
    Node *escaped = NewNode(lexer);

    escaped->start = lexer->lexsize;
    AddByte(lexer, '<');

    if (element->type == EndTag)
        AddByte(lexer, '/');

    if (element->element)
    {
        char *name = element->element;

        while (*name != '\0')
        {
            AddByte(lexer, *name);
            ++name;
        }
    }
    else if (element->type == DocTypeTag)
    {
        uint pos = element->start;

        AddStringLiteral( lexer, "!DOCTYPE " );
        while (pos < element->end)
        {
            AddByte(lexer, lexer->lexbuf[pos]);
            ++pos;
        }
    }

    if (element->type == StartEndTag)
        AddByte(lexer, '/');

    AddByte(lexer, '>');
    escaped->end = lexer->lexsize;

    return escaped;
}
#endif /* 0 */
446
447 /* Only true for text nodes. */
448 Bool IsBlank(Lexer *lexer, Node *node)
449 {
450 Bool isBlank = nodeIsText(node);
451 if ( isBlank )
452 isBlank = ( node->end == node->start || /* Zero length */
453 ( node->end == node->start+1 /* or one blank. */
454 && lexer->lexbuf[node->start] == ' ' ) );
455 return isBlank;
456 }
457
458 /*
459 This maps
460 <p>hello<em> world</em>
461 to
462 <p>hello <em>world</em>
463
464 Trims initial space, by moving it before the
465 start tag, or if this element is the first in
466 parent's content, then by discarding the space
467 */
468 static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text )
469 {
470 Lexer* lexer = doc->lexer;
471 Node *prev, *node;
472
473 if ( nodeIsText(text) &&
474 lexer->lexbuf[text->start] == ' ' &&
475 text->start < text->end )
476 {
477 if ( (element->tag->model & CM_INLINE) &&
478 !(element->tag->model & CM_FIELD) )
479 {
480 prev = element->prev;
481
482 if (nodeIsText(prev))
483 {
484 if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ')
485 lexer->lexbuf[(prev->end)++] = ' ';
486
487 ++(element->start);
488 }
489 else /* create new node */
490 {
491 node = NewNode(lexer);
492 node->start = (element->start)++;
493 node->end = element->start;
494 lexer->lexbuf[node->start] = ' ';
495 InsertNodeBeforeElement(element ,node);
496 }
497 }
498
499 /* discard the space in current node */
500 ++(text->start);
501 }
502 }
503
504 static Bool IsPreDescendant(Node* node)
505 {
506 Node *parent = node->parent;
507
508 while (parent)
509 {
510 if (parent->tag && parent->tag->parser == ParsePre)
511 return yes;
512
513 parent = parent->parent;
514 }
515
516 return no;
517 }
518
519 static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node)
520 {
521 Node* next;
522
523 if (!nodeIsText(node))
524 return no;
525
526 if (node->parent->type == DocTypeTag)
527 return no;
528
529 if (IsPreDescendant(node))
530 return no;
531
532 if (node->parent->tag->parser == ParseScript)
533 return no;
534
535 next = node->next;
536
537 /* <p>... </p> */
538 if (!next && !nodeHasCM(node->parent, CM_INLINE))
539 return yes;
540
541 /* <div><small>... </small><h3>...</h3></div> */
542 if (!next && node->parent->next && !nodeHasCM(node->parent->next, CM_INLINE))
543 return yes;
544
545 if (!next)
546 return no;
547
548 if (nodeIsBR(next))
549 return yes;
550
551 if (nodeHasCM(next, CM_INLINE))
552 return no;
553
554 /* <a href='/'>...</a> <p>...</p> */
555 if (next->type == StartTag)
556 return yes;
557
558 /* <strong>...</strong> <hr /> */
559 if (next->type == StartEndTag)
560 return yes;
561
562 /* evil adjacent text nodes, Tidy should not generate these :-( */
563 if (nodeIsText(next) && next->start < next->end
564 && IsWhite(doc->lexer->lexbuf[next->start]))
565 return yes;
566
567 return no;
568 }
569
570 static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node)
571 {
572 if (!nodeIsText(node))
573 return no;
574
575 if (node->parent->type == DocTypeTag)
576 return no;
577
578 if (IsPreDescendant(node))
579 return no;
580
581 if (node->parent->tag->parser == ParseScript)
582 return no;
583
584 /* <p>...<br> <em>...</em>...</p> */
585 if (nodeIsBR(node->prev))
586 return yes;
587
588 /* <p> ...</p> */
589 if (node->prev == NULL && !nodeHasCM(node->parent, CM_INLINE))
590 return yes;
591
592 /* <h4>...</h4> <em>...</em> */
593 if (node->prev && !nodeHasCM(node->prev, CM_INLINE) &&
594 nodeIsElement(node->prev))
595 return yes;
596
597 /* <p><span> ...</span></p> */
598 if (!node->prev && !node->parent->prev && !nodeHasCM(node->parent->parent, CM_INLINE))
599 return yes;
600
601 return no;
602 }
603
604 static void CleanSpaces(TidyDocImpl* doc, Node* node)
605 {
606 Node* next;
607
608 while (node)
609 {
610 next = node->next;
611
612 if (nodeIsText(node) && CleanLeadingWhitespace(doc, node))
613 while (node->start < node->end && IsWhite(doc->lexer->lexbuf[node->start]))
614 ++(node->start);
615
616 if (nodeIsText(node) && CleanTrailingWhitespace(doc, node))
617 while (node->end > node->start && IsWhite(doc->lexer->lexbuf[node->end - 1]))
618 --(node->end);
619
620 if (nodeIsText(node) && !(node->start < node->end))
621 {
622 RemoveNode(node);
623 FreeNode(doc, node);
624 node = next;
625
626 continue;
627 }
628
629 if (node->content)
630 CleanSpaces(doc, node->content);
631
632 node = next;
633 }
634 }
635
636 /*
637 Move initial and trailing space out.
638 This routine maps:
639
640 hello<em> world</em>
641 to
642 hello <em>world</em>
643 and
644 <em>hello </em><strong>world</strong>
645 to
646 <em>hello</em> <strong>world</strong>
647 */
648 static void TrimSpaces( TidyDocImpl* doc, Node *element)
649 {
650 Node* text = element->content;
651
652 if (nodeIsPRE(element) || IsPreDescendant(element))
653 return;
654
655 if (nodeIsText(text))
656 TrimInitialSpace(doc, element, text);
657
658 text = element->last;
659
660 if (nodeIsText(text))
661 TrimTrailingSpace(doc, element, text);
662 }
663
664 Bool DescendantOf( Node *element, TidyTagId tid )
665 {
666 Node *parent;
667 for ( parent = element->parent;
668 parent != NULL;
669 parent = parent->parent )
670 {
671 if ( TagIsId(parent, tid) )
672 return yes;
673 }
674 return no;
675 }
676
677 static Bool InsertMisc(Node *element, Node *node)
678 {
679 if (node->type == CommentTag ||
680 node->type == ProcInsTag ||
681 node->type == CDATATag ||
682 node->type == SectionTag ||
683 node->type == AspTag ||
684 node->type == JsteTag ||
685 node->type == PhpTag )
686 {
687 InsertNodeAtEnd(element, node);
688 return yes;
689 }
690
691 if ( node->type == XmlDecl )
692 {
693 Node* root = element;
694 while ( root && root->parent )
695 root = root->parent;
696 if ( root )
697 {
698 InsertNodeAtStart( root, node );
699 return yes;
700 }
701 }
702
703 /* Declared empty tags seem to be slipping through
704 ** the cracks. This is an experiment to figure out
705 ** a decent place to pick them up.
706 */
707 if ( node->tag &&
708 nodeIsElement(node) &&
709 nodeCMIsEmpty(node) && TagId(node) == TidyTag_UNKNOWN &&
710 (node->tag->versions & VERS_PROPRIETARY) != 0 )
711 {
712 InsertNodeAtEnd(element, node);
713 return yes;
714 }
715
716 return no;
717 }
718
719
720 static void ParseTag( TidyDocImpl* doc, Node *node, uint mode )
721 {
722 Lexer* lexer = doc->lexer;
723 /*
724 Fix by GLP 2000-12-21. Need to reset insertspace if this
725 is both a non-inline and empty tag (base, link, meta, isindex, hr, area).
726 */
727 if (node->tag->model & CM_EMPTY)
728 {
729 lexer->waswhite = no;
730 if (node->tag->parser == NULL)
731 return;
732 }
733 else if (!(node->tag->model & CM_INLINE))
734 lexer->insertspace = no;
735
736 if (node->tag->parser == NULL)
737 return;
738
739 if (node->type == StartEndTag)
740 return;
741
742 (*node->tag->parser)( doc, node, mode );
743 }
744
745 /*
746 the doctype has been found after other tags,
747 and needs moving to before the html element
748 */
749 static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype )
750 {
751 Node* existing = FindDocType( doc );
752 if ( existing )
753 {
754 ReportError(doc, element, doctype, DISCARDING_UNEXPECTED );
755 FreeNode( doc, doctype );
756 }
757 else
758 {
759 ReportError(doc, element, doctype, DOCTYPE_AFTER_TAGS );
760 while ( !nodeIsHTML(element) )
761 element = element->parent;
762 InsertNodeBeforeElement( element, doctype );
763 }
764 }
765
766 /*
767 move node to the head, where element is used as starting
768 point in hunt for head. normally called during parsing
769 */
770 static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node )
771 {
772 Node *head;
773
774 RemoveNode( node ); /* make sure that node is isolated */
775
776 if ( nodeIsElement(node) )
777 {
778 ReportError(doc, element, node, TAG_NOT_ALLOWED_IN );
779
780 head = FindHEAD(doc);
781 assert(head != NULL);
782
783 InsertNodeAtEnd(head, node);
784
785 if ( node->tag->parser )
786 ParseTag( doc, node, IgnoreWhitespace );
787 }
788 else
789 {
790 ReportError(doc, element, node, DISCARDING_UNEXPECTED);
791 FreeNode( doc, node );
792 }
793 }
794
795 /* moves given node to end of body element */
796 static void MoveNodeToBody( TidyDocImpl* doc, Node* node )
797 {
798 Node* body = FindBody( doc );
799 if ( body )
800 {
801 RemoveNode( node );
802 InsertNodeAtEnd( body, node );
803 }
804 }
805
806 /*
807 element is node created by the lexer
808 upon seeing the start tag, or by the
809 parser when the start tag is inferred

Parses the content of a block-level "element". Tokens are read
with GetToken() and are either appended to "element", handed to
their own parser through ParseTag(), moved to the head, discarded
with a report, or pushed back with UngetToken() when they imply
that "element" has ended.

doc     - document being parsed; doc->lexer supplies the tokens
element - open element whose content is being collected
mode    - overwritten with IgnoreWhitespace before the first token
          is read, so the value passed in has no effect

Returns nothing. Elements whose model has CM_EMPTY return at once.
For CM_OBJECT elements lexer->istackbase is saved on entry and put
back on those exits that test for CM_OBJECT.
810 */
811 void ParseBlock( TidyDocImpl* doc, Node *element, uint mode)
812 {
813 Lexer* lexer = doc->lexer;
814 Node *node;
/*
  checkstack stays set until the first text node or explicit inline
  child has been offered to InlineDup(); a non-inline child sets it
  again (see the "parse known element" section below)
*/
815 Bool checkstack = yes;
816 uint istackbase = 0;
817
818 if ( element->tag->model & CM_EMPTY )
819 return;
820
821 if ( nodeIsFORM(element) &&
822 DescendantOf(element, TidyTag_FORM) )
823 ReportError(doc, element, NULL, ILLEGAL_NESTING );
824
825 /*
826 InlineDup() asks the lexer to insert inline emphasis tags
827 currently pushed on the istack, but take care to avoid
828 propagating inline emphasis inside OBJECT or APPLET.
829 For these elements a fresh inline stack context is created
830 and disposed of upon reaching the end of the element.
831 They thus behave like table cells in this respect.
832 */
833 if (element->tag->model & CM_OBJECT)
834 {
835 istackbase = lexer->istackbase;
836 lexer->istackbase = lexer->istacksize;
837 }
838
839 if (!(element->tag->model & CM_MIXED))
840 InlineDup( doc, NULL );
841
/* the incoming mode is discarded: parsing always starts out ignoring white space */
842 mode = IgnoreWhitespace;
843
844 while ((node = GetToken(doc, mode /*MixedContent*/)) != NULL)
845 {
846 /* end tag for this element */
847 if (node->type == EndTag && node->tag &&
848 (node->tag == element->tag || element->was == node->tag))
849 {
850 FreeNode( doc, node );
851
852 if (element->tag->model & CM_OBJECT)
853 {
854 /* pop inline stack */
855 while (lexer->istacksize > lexer->istackbase)
856 PopInline( doc, NULL );
857 lexer->istackbase = istackbase;
858 }
859
860 element->closed = yes;
861 TrimSpaces( doc, element );
862 return;
863 }
864
865 if ( nodeIsBODY( node ) && DescendantOf( element, TidyTag_HEAD ))
866 {
867 /* If we're in the HEAD, close it before proceeding.
868 This is an extremely rare occurrence, but has been observed.
869 */
870 UngetToken( doc );
871 break;
872 }
873
874 if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) )
875 {
876 if ( nodeIsElement(node) )
877 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
878 FreeNode( doc, node );
879 continue;
880 }
881
882
883 if (node->type == EndTag)
884 {
885 if (node->tag == NULL)
886 {
887 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
888 FreeNode( doc, node );
889 continue;
890 }
891 else if ( nodeIsBR(node) )
892 node->type = StartTag;
893 else if ( nodeIsP(node) )
894 {
895 /* Cannot have a block inside a paragraph, so no checking
896 for an ancestor is necessary -- but we _can_ have
897 paragraphs inside a block, so change it to an implicit
898 empty paragraph, to be dealt with according to the user's
899 options
900 */
901 node->type = StartEndTag;
902 node->implicit = yes;
903 #if OBSOLETE
904 CoerceNode(doc, node, TidyTag_BR, no, no);
905 FreeAttrs( doc, node ); /* discard align attribute etc. */
906 InsertNodeAtEnd( element, node );
907 node = InferredTag(doc, TidyTag_BR);
908 #endif
909 }
910 else if (DescendantOf( element, node->tag->id ))
911 {
912 /*
913 if this is the end tag for an ancestor element
914 then infer end tag for this element
915 */
916 UngetToken( doc );
917 break;
/*
  NOTE(review): the OBSOLETE block below follows an unconditional
  break, so it could not execute even if OBSOLETE were enabled
*/
918 #if OBSOLETE
919 Node *parent;
920 for ( parent = element->parent;
921 parent != NULL;
922 parent = parent->parent )
923 {
924 if (node->tag == parent->tag)
925 {
926 if (!(element->tag->model & CM_OPT))
927 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE );
928
929 UngetToken( doc );
930
931 if (element->tag->model & CM_OBJECT)
932 {
933 /* pop inline stack */
934 while (lexer->istacksize > lexer->istackbase)
935 PopInline( doc, NULL );
936 lexer->istackbase = istackbase;
937 }
938
939 TrimSpaces( doc, element );
940 return;
941 }
942 }
943 #endif
944 }
945 else
946 {
947 /* special case </tr> etc. for stuff moved in front of table */
948 if ( lexer->exiled
949 && node->tag->model
950 && (node->tag->model & CM_TABLE) )
951 {
952 UngetToken( doc );
953 TrimSpaces( doc, element );
954 return;
955 }
956 }
957 }
958
959 /* mixed content model permits text */
960 if (nodeIsText(node))
961 {
962 if ( checkstack )
963 {
964 checkstack = no;
965 if (!(element->tag->model & CM_MIXED))
966 {
967 if ( InlineDup(doc, node) > 0 )
968 continue;
969 }
970 }
971
972 InsertNodeAtEnd(element, node);
973 mode = MixedContent;
974
975 /*
976 HTML4 strict doesn't allow mixed content for
977 elements with %block; as their content model
978 */
979 /*
980 But only body, map, blockquote, form and
981 noscript have content model %block;
982 */
983 if ( nodeIsBODY(element) ||
984 nodeIsMAP(element) ||
985 nodeIsBLOCKQUOTE(element) ||
986 nodeIsFORM(element) ||
987 nodeIsNOSCRIPT(element) )
988 ConstrainVersion( doc, ~VERS_HTML40_STRICT );
989 continue;
990 }
991
992 if ( InsertMisc(element, node) )
993 continue;
994
995 /* allow PARAM elements? */
996 if ( nodeIsPARAM(node) )
997 {
998 if ( nodeHasCM(element, CM_PARAM) && nodeIsElement(node) )
999 {
1000 InsertNodeAtEnd(element, node);
1001 continue;
1002 }
1003
1004 /* otherwise discard it */
1005 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1006 FreeNode( doc, node );
1007 continue;
1008 }
1009
1010 /* allow AREA elements? */
1011 if ( nodeIsAREA(node) )
1012 {
1013 if ( nodeIsMAP(element) && nodeIsElement(node) )
1014 {
1015 InsertNodeAtEnd(element, node);
1016 continue;
1017 }
1018
1019 /* otherwise discard it */
1020 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1021 FreeNode( doc, node );
1022 continue;
1023 }
1024
1025 /* ignore unknown start/end tags */
1026 if ( node->tag == NULL )
1027 {
1028 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1029 FreeNode( doc, node );
1030 continue;
1031 }
1032
1033 /*
1034 Allow CM_INLINE elements here.
1035
1036 Allow CM_BLOCK elements here unless
1037 lexer->excludeBlocks is yes.
1038
1039 LI and DD are special cased.
1040
1041 Otherwise infer end tag for this element.
1042 */
1043
1044 if ( !nodeHasCM(node, CM_INLINE) )
1045 {
1046 if ( !nodeIsElement(node) )
1047 {
1048 if ( nodeIsFORM(node) )
1049 BadForm( doc );
1050
1051 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1052 FreeNode( doc, node );
1053 continue;
1054 }
1055
1056 /* #427671 - Fix by Randy Waki - 10 Aug 00 */
1057 /*
1058 If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION
1059 start tag, discard the start tag and let the subsequent content get
1060 parsed as content of the enclosing LI. This seems to mimic IE and
1061 Netscape, and avoids an infinite loop: without this check,
1062 ParseBlock (which is parsing the LI's content) and ParseList (which
1063 is parsing the LI's parent's content) repeatedly defer to each
1064 other to parse the illegal start tag, each time inferring a missing
1065 </li> or <li> respectively.
1066
1067 NOTE: This check is a bit fragile. It specifically checks for the
1068 four tags that happen to weave their way through the current series
1069 of tests performed by ParseBlock and ParseList to trigger the
1070 infinite loop.
1071 */
1072 if ( nodeIsLI(element) )
1073 {
1074 if ( nodeIsFRAME(node) ||
1075 nodeIsFRAMESET(node) ||
1076 nodeIsOPTGROUP(node) ||
1077 nodeIsOPTION(node) )
1078 {
1079 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1080 FreeNode( doc, node ); /* DSR - 27Apr02 avoid memory leak */
1081 continue;
1082 }
1083 }
1084
1085 if ( nodeIsTD(element) || nodeIsTH(element) )
1086 {
1087 /* if parent is a table cell, avoid inferring the end of the cell */
1088
1089 if ( nodeHasCM(node, CM_HEAD) )
1090 {
1091 MoveToHead( doc, element, node );
1092 continue;
1093 }
1094
1095 if ( nodeHasCM(node, CM_LIST) )
1096 {
1097 UngetToken( doc );
1098 node = InferredTag(doc, TidyTag_UL);
1099 /* AddClass( doc, node, "noindent" ); */
1100 lexer->excludeBlocks = yes;
1101 }
1102 else if ( nodeHasCM(node, CM_DEFLIST) )
1103 {
1104 UngetToken( doc );
1105 node = InferredTag(doc, TidyTag_DL);
1106 lexer->excludeBlocks = yes;
1107 }
1108
1109 /* infer end of current table cell */
1110 if ( !nodeHasCM(node, CM_BLOCK) )
1111 {
1112 UngetToken( doc );
1113 TrimSpaces( doc, element );
1114 return;
1115 }
1116 }
1117 else if ( nodeHasCM(node, CM_BLOCK) )
1118 {
1119 if ( lexer->excludeBlocks )
1120 {
1121 if ( !nodeHasCM(element, CM_OPT) )
1122 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE );
1123
1124 UngetToken( doc );
1125
/*
  NOTE(review): unlike the other CM_OBJECT exits in this function,
  this one restores istackbase without first popping the inline
  stack -- confirm that this is intended
*/
1126 if ( nodeHasCM(element, CM_OBJECT) )
1127 lexer->istackbase = istackbase;
1128
1129 TrimSpaces( doc, element );
1130 return;
1131 }
1132 }
1133 else /* things like list items */
1134 {
1135 if (node->tag->model & CM_HEAD)
1136 {
1137 MoveToHead( doc, element, node );
1138 continue;
1139 }
1140
1141 /*
1142 special case where a form start tag
1143 occurs in a tr and is followed by td or th
1144 */
1145
1146 if ( nodeIsFORM(element) &&
1147 nodeIsTD(element->parent) &&
1148 element->parent->implicit )
1149 {
1150 if ( nodeIsTD(node) )
1151 {
1152 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1153 FreeNode( doc, node );
1154 continue;
1155 }
1156
1157 if ( nodeIsTH(node) )
1158 {
1159 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1160 FreeNode( doc, node );
/* the implicit td that holds the form is relabelled as a th */
1161 node = element->parent;
1162 MemFree(node->element);
1163 node->element = tmbstrdup("th");
1164 node->tag = LookupTagDef( TidyTag_TH );
1165 continue;
1166 }
1167 }
1168
1169 if ( !nodeHasCM(element, CM_OPT) && !element->implicit )
1170 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE );
1171
1172 UngetToken( doc );
1173
1174 if ( nodeHasCM(node, CM_LIST) )
1175 {
1176 if ( element->parent && element->parent->tag &&
1177 element->parent->tag->parser == ParseList )
1178 {
1179 TrimSpaces( doc, element );
1180 return;
1181 }
1182
1183 node = InferredTag(doc, TidyTag_UL);
1184 /* AddClass( doc, node, "noindent" ); */
1185 }
1186 else if ( nodeHasCM(node, CM_DEFLIST) )
1187 {
1188 if ( nodeIsDL(element->parent) )
1189 {
1190 TrimSpaces( doc, element );
1191 return;
1192 }
1193
1194 node = InferredTag(doc, TidyTag_DL);
1195 }
1196 else if ( nodeHasCM(node, CM_TABLE) || nodeHasCM(node, CM_ROW) )
1197 {
1198 node = InferredTag(doc, TidyTag_TABLE);
1199 }
1200 else if ( nodeHasCM(element, CM_OBJECT) )
1201 {
1202 /* pop inline stack */
1203 while ( lexer->istacksize > lexer->istackbase )
1204 PopInline( doc, NULL );
1205 lexer->istackbase = istackbase;
1206 TrimSpaces( doc, element );
1207 return;
1208
1209 }
1210 else
1211 {
1212 TrimSpaces( doc, element );
1213 return;
1214 }
1215 }
1216 }
1217
1218 /* parse known element */
1219 if (nodeIsElement(node))
1220 {
1221 if (node->tag->model & CM_INLINE)
1222 {
1223 if (checkstack && !node->implicit)
1224 {
1225 checkstack = no;
1226
1227 if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */
1228 {
1229 if ( InlineDup(doc, node) > 0 )
1230 continue;
1231 }
1232 }
1233
1234 mode = MixedContent;
1235 }
1236 else
1237 {
1238 checkstack = yes;
1239 mode = IgnoreWhitespace;
1240 }
1241
1242 /* trim white space before <br> */
1243 if ( nodeIsBR(node) )
1244 TrimSpaces( doc, element );
1245
1246 InsertNodeAtEnd(element, node);
1247
1248 if (node->implicit)
1249 ReportError(doc, element, node, INSERTING_TAG );
1250
1251 ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
1252 continue;
1253 }
1254
1255 /* discard unexpected tags */
1256 if (node->type == EndTag)
1257 PopInline( doc, node ); /* if inline end tag */
1258
1259 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1260 FreeNode( doc, node );
1261 continue;
1262 }
1263
/*
  reached when GetToken() returned NULL or when one of the break
  statements above left the loop: no end tag was seen for "element"
*/
1264 if (!(element->tag->model & CM_OPT))
1265 ReportError(doc, element, node, MISSING_ENDTAG_FOR);
1266
1267 if (element->tag->model & CM_OBJECT)
1268 {
1269 /* pop inline stack */
1270 while ( lexer->istacksize > lexer->istackbase )
1271 PopInline( doc, NULL );
1272 lexer->istackbase = istackbase;
1273 }
1274
1275 TrimSpaces( doc, element );
1276 }
1277
1278 void ParseInline( TidyDocImpl* doc, Node *element, uint mode )
1279 {
1280 Lexer* lexer = doc->lexer;
1281 Node *node, *parent;
1282
1283 if (element->tag->model & CM_EMPTY)
1284 return;
1285
1286 /*
1287 ParseInline is used for some block level elements like H1 to H6
1288 For such elements we need to insert inline emphasis tags currently
1289 on the inline stack. For Inline elements, we normally push them
1290 onto the inline stack provided they aren't implicit or OBJECT/APPLET.
1291 This test is carried out in PushInline and PopInline, see istack.c
1292
1293 InlineDup(...) is not called for elements with a CM_MIXED (inline and
1294 block) content model, e.g. <del> or <ins>, otherwise constructs like
1295
1296 <p>111<a name='foo'>222<del>333</del>444</a>555</p>
1297 <p>111<span>222<del>333</del>444</span>555</p>
1298 <p>111<em>222<del>333</del>444</em>555</p>
1299
1300 will get corrupted.
1301 */
1302 if ((nodeHasCM(element, CM_BLOCK) || nodeIsDT(element)) &&
1303 !nodeHasCM(element, CM_MIXED))
1304 InlineDup(doc, NULL);
1305 else if (nodeHasCM(element, CM_INLINE))
1306 PushInline(doc, element);
1307
1308 if ( nodeIsNOBR(element) )
1309 doc->badLayout |= USING_NOBR;
1310 else if ( nodeIsFONT(element) )
1311 doc->badLayout |= USING_FONT;
1312
1313 /* Inline elements may or may not be within a preformatted element */
1314 if (mode != Preformatted)
1315 mode = MixedContent;
1316
1317 while ((node = GetToken(doc, mode)) != NULL)
1318 {
1319 /* end tag for current element */
1320 if (node->tag == element->tag && node->type == EndTag)
1321 {
1322 if (element->tag->model & CM_INLINE)
1323 PopInline( doc, node );
1324
1325 FreeNode( doc, node );
1326
1327 if (!(mode & Preformatted))
1328 TrimSpaces(doc, element);
1329
1330 /*
1331 if a font element wraps an anchor and nothing else
1332 then move the font element inside the anchor since
1333 otherwise it won't alter the anchor text color
1334 */
1335 if ( nodeIsFONT(element) &&
1336 element->content && element->content == element->last )
1337 {
1338 Node *child = element->content;
1339
1340 if ( nodeIsA(child) )
1341 {
1342 child->parent = element->parent;
1343 child->next = element->next;
1344 child->prev = element->prev;
1345
1346 element->next = NULL;
1347 element->prev = NULL;
1348 element->parent = child;
1349
1350 element->content = child->content;
1351 element->last = child->last;
1352 child->content = element;
1353
1354 FixNodeLinks(child);
1355 FixNodeLinks(element);
1356 }
1357 }
1358
1359 element->closed = yes;
1360 TrimSpaces( doc, element );
1361 return;
1362 }
1363
1364 /* <u>...<u> map 2nd <u> to </u> if 1st is explicit */
1365 /* otherwise emphasis nesting is probably unintentional */
1366 /* big, small, sub, sup have cumulative effect to leave them alone */
1367 if ( node->type == StartTag
1368 && node->tag == element->tag
1369 && IsPushed( doc, node )
1370 && !node->implicit
1371 && !element->implicit
1372 && node->tag && (node->tag->model & CM_INLINE)
1373 && !nodeIsA(node)
1374 && !nodeIsFONT(node)
1375 && !nodeIsBIG(node)
1376 && !nodeIsSMALL(node)
1377 && !nodeIsSUB(node)
1378 && !nodeIsSUP(node)
1379 && !nodeIsQ(node)
1380 && !nodeIsSPAN(node)
1381 )
1382 {
1383 if (element->content != NULL && node->attributes == NULL)
1384 {
1385 ReportWarning(doc, element, node, COERCE_TO_ENDTAG_WARN);
1386 node->type = EndTag;
1387 UngetToken(doc);
1388 continue;
1389 }
1390
1391 if (node->attributes == NULL || element->attributes == NULL)
1392 ReportWarning(doc, element, node, NESTED_EMPHASIS);
1393 }
1394 else if ( IsPushed(doc, node) && node->type == StartTag &&
1395 nodeIsQ(node) )
1396 {
1397 ReportWarning(doc, element, node, NESTED_QUOTATION);
1398 }
1399
1400 if ( nodeIsText(node) )
1401 {
1402 /* only called for 1st child */
1403 if ( element->content == NULL && !(mode & Preformatted) )
1404 TrimSpaces( doc, element );
1405
1406 if ( node->start >= node->end )
1407 {
1408 FreeNode( doc, node );
1409 continue;
1410 }
1411
1412 InsertNodeAtEnd(element, node);
1413 continue;
1414 }
1415
1416 /* mixed content model so allow text */
1417 if (InsertMisc(element, node))
1418 continue;
1419
1420 /* deal with HTML tags */
1421 if ( nodeIsHTML(node) )
1422 {
1423 if ( nodeIsElement(node) )
1424 {
1425 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1426 FreeNode( doc, node );
1427 continue;
1428 }
1429
1430 /* otherwise infer end of inline element */
1431 UngetToken( doc );
1432
1433 if (!(mode & Preformatted))
1434 TrimSpaces(doc, element);
1435
1436 return;
1437 }
1438
1439 /* within <dt> or <pre> map <p> to <br> */
1440 if ( nodeIsP(node) &&
1441 node->type == StartTag &&
1442 ( (mode & Preformatted) ||
1443 nodeIsDT(element) ||
1444 DescendantOf(element, TidyTag_DT )
1445 )
1446 )
1447 {
1448 node->tag = LookupTagDef( TidyTag_BR );
1449 MemFree(node->element);
1450 node->element = tmbstrdup("br");
1451 TrimSpaces(doc, element);
1452 InsertNodeAtEnd(element, node);
1453 continue;
1454 }
1455
1456 /* <p> allowed within <address> in HTML 4.01 Transitional */
1457 if ( nodeIsP(node) &&
1458 node->type == StartTag &&
1459 nodeIsADDRESS(element) )
1460 {
1461 ConstrainVersion( doc, ~VERS_HTML40_STRICT );
1462 InsertNodeAtEnd(element, node);
1463 (*node->tag->parser)( doc, node, mode );
1464 continue;
1465 }
1466
1467 /* ignore unknown and PARAM tags */
1468 if ( node->tag == NULL || nodeIsPARAM(node) )
1469 {
1470 ReportError(doc, element, node, DISCARDING_UNEXPECTED);
1471 FreeNode( doc, node );
1472 continue;
1473 }
1474
1475 if ( nodeIsBR(node) && node->type == EndTag )
1476 node->type = StartTag;
1477
1478 if ( node->type == EndTag )
1479 {
1480 /* coerce </br> to <br> */
1481 if ( nodeIsBR(node) )
1482 node->type = StartTag;
1483 else if ( nodeIsP(node) )
1484 {
1485 /* coerce unmatched </p> to <br><br> */
1486 if ( !DescendantOf(element, TidyTag_P) )
1487 {
1488 CoerceNode(doc, node, TidyTag_BR, no, no);
1489 TrimSpaces( doc, element );
1490 InsertNodeAtEnd( element, node );
1491 node = InferredTag(doc, TidyTag_BR);
1492 InsertNodeAtEnd( element, node ); /* todo: check this */
1493 continue;
1494 }
1495 }
1496 else if ( nodeHasCM(node, CM_INLINE)
1497 && !nodeIsA(node)
1498 && !nodeHasCM(node, CM_OBJECT)
1499 && nodeHasCM(element, CM_INLINE) )
1500 {
1501 /* allow any inline end tag to end current element */
1502 PopInline( doc, element );
1503
1504 if ( !nodeIsA(element) )
1505 {
1506 if ( nodeIsA(node) && node->tag != element->tag )
1507 {
1508 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE );
1509 UngetToken( doc );
1510 }
1511 else
1512 {
1513 ReportError(doc, element, node, NON_MATCHING_ENDTAG);
1514 FreeNode( doc, node);
1515 }
1516
1517 if (!(mode & Preformatted))
1518 TrimSpaces(doc, element);
1519
1520 return;
1521 }
1522
1523 /* if parent is <a> then discard unexpected inline end tag */
1524 ReportError(doc, element, node, DISCARDING_UNEXPECTED);
1525 FreeNode( doc, node);
1526 continue;
1527 } /* special case </tr> etc. for stuff moved in front of table */
1528 else if ( lexer->exiled
1529 && node->tag->model
1530 && (node->tag->model & CM_TABLE) )
1531 {
1532 UngetToken( doc );
1533 TrimSpaces(doc, element);
1534 return;
1535 }
1536 }
1537
1538 /* allow any header tag to end current header */
1539 if ( nodeHasCM(node, CM_HEADING) && nodeHasCM(element, CM_HEADING) )
1540 {
1541
1542 if ( node->tag == element->tag )
1543 {
1544 ReportError(doc, element, node, NON_MATCHING_ENDTAG );
1545 FreeNode( doc, node);
1546 }
1547 else
1548 {
1549 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE );
1550 UngetToken( doc );
1551 }
1552
1553 if (!(mode & Preformatted))
1554 TrimSpaces(doc, element);
1555
1556 return;
1557 }
1558
1559 /*
1560 an <A> tag ends any open <A> element
1561 but <A href=...> is mapped to </A><A href=...>
1562 */
1563 /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
1564 /* if (node->tag == doc->tags.tag_a && !node->implicit && IsPushed(doc, node)) */
1565 if ( nodeIsA(node) && !node->implicit &&
1566 (nodeIsA(element) || DescendantOf(element, TidyTag_A)) )
1567 {
1568 /* coerce <a> to </a> unless it has some attributes */
1569 /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
1570 /* other fixes by Dave Raggett */
1571 /* if (node->attributes == NULL) */
1572 if (node->type != EndTag && node->attributes == NULL)
1573 {
1574 node->type = EndTag;
1575 ReportError(doc, element, node, COERCE_TO_ENDTAG);
1576 /* PopInline( doc, node ); */
1577 UngetToken( doc );
1578 continue;
1579 }
1580
1581 UngetToken( doc );
1582 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE);
1583 /* PopInline( doc, element ); */
1584
1585 if (!(mode & Preformatted))
1586 TrimSpaces(doc, element);
1587
1588 return;
1589 }
1590
1591 if (element->tag->model & CM_HEADING)
1592 {
1593 if ( nodeIsCENTER(node) || nodeIsDIV(node) )
1594 {
1595 if (!nodeIsElement(node))
1596 {
1597 ReportError(doc, element, node, DISCARDING_UNEXPECTED);
1598 FreeNode( doc, node);
1599 continue;
1600 }
1601
1602 ReportError(doc, element, node, TAG_NOT_ALLOWED_IN);
1603
1604 /* insert center as parent if heading is empty */
1605 if (element->content == NULL)
1606 {
1607 InsertNodeAsParent(element, node);
1608 continue;
1609 }
1610
1611 /* split heading and make center parent of 2nd part */
1612 InsertNodeAfterElement(element, node);
1613
1614 if (!(mode & Preformatted))
1615 TrimSpaces(doc, element);
1616
1617 element = CloneNode( doc, element );
1618 InsertNodeAtEnd(node, element);
1619 continue;
1620 }
1621
1622 if ( nodeIsHR(node) )
1623 {
1624 if ( !nodeIsElement(node) )
1625 {
1626 ReportError(doc, element, node, DISCARDING_UNEXPECTED);
1627 FreeNode( doc, node);
1628 continue;
1629 }
1630
1631 ReportError(doc, element, node, TAG_NOT_ALLOWED_IN);
1632
1633 /* insert hr before heading if heading is empty */
1634 if (element->content == NULL)
1635 {
1636 InsertNodeBeforeElement(element, node);
1637 continue;
1638 }
1639
1640 /* split heading and insert hr before 2nd part */
1641 InsertNodeAfterElement(element, node);
1642
1643 if (!(mode & Preformatted))
1644 TrimSpaces(doc, element);
1645
1646 element = CloneNode( doc, element );
1647 InsertNodeAfterElement(node, element);
1648 continue;
1649 }
1650 }
1651
1652 if ( nodeIsDT(element) )
1653 {
1654 if ( nodeIsHR(node) )
1655 {
1656 Node *dd;
1657 if ( !nodeIsElement(node) )
1658 {
1659 ReportError(doc, element, node, DISCARDING_UNEXPECTED);
1660 FreeNode( doc, node);
1661 continue;
1662 }
1663
1664 ReportError(doc, element, node, TAG_NOT_ALLOWED_IN);
1665 dd = InferredTag(doc, TidyTag_DD);
1666
1667 /* insert hr within dd before dt if dt is empty */
1668 if (element->content == NULL)
1669 {
1670 InsertNodeBeforeElement(element, dd);
1671 InsertNodeAtEnd(dd, node);
1672 continue;
1673 }
1674
1675 /* split dt and insert hr within dd before 2nd part */
1676 InsertNodeAfterElement(element, dd);
1677 InsertNodeAtEnd(dd, node);
1678
1679 if (!(mode & Preformatted))
1680 TrimSpaces(doc, element);
1681
1682 element = CloneNode( doc, element );
1683 InsertNodeAfterElement(dd, element);
1684 continue;
1685 }
1686 }
1687
1688
1689 /*
1690 if this is the end tag for an ancestor element
1691 then infer end tag for this element
1692 */
1693 if (node->type == EndTag)
1694 {
1695 for (parent = element->parent;
1696 parent != NULL; parent = parent->parent)
1697 {
1698 if (node->tag == parent->tag)
1699 {
1700 if (!(element->tag->model & CM_OPT) && !element->implicit)
1701 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE);
1702
1703 PopInline( doc, element );
1704 UngetToken( doc );
1705
1706 if (!(mode & Preformatted))
1707 TrimSpaces(doc, element);
1708
1709 return;
1710 }
1711 }
1712 }
1713
1714 /* block level tags end this element */
1715 if (!(node->tag->model & CM_INLINE) &&
1716 !(element->tag->model & CM_MIXED))
1717 {
1718 if ( !nodeIsElement(node) )
1719 {
1720 ReportError(doc, element, node, DISCARDING_UNEXPECTED);
1721 FreeNode( doc, node);
1722 continue;
1723 }
1724
1725 if (!(element->tag->model & CM_OPT))
1726 ReportError(doc, element, node, MISSING_ENDTAG_BEFORE);
1727
1728 if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK))
1729 {
1730 MoveToHead(doc, element, node);
1731 continue;
1732 }
1733
1734 /*
1735 prevent anchors from propagating into block tags
1736 except for headings h1 to h6
1737 */
1738 if ( nodeIsA(element) )
1739 {
1740 if (node->tag && !(node->tag->model & CM_HEADING))
1741 PopInline( doc, element );
1742 else if (!(element->content))
1743 {
1744 DiscardElement( doc, element );
1745 UngetToken( doc );
1746 return;
1747 }
1748 }
1749
1750 UngetToken( doc );
1751
1752 if (!(mode & Preformatted))
1753 TrimSpaces(doc, element);
1754
1755 return;
1756 }
1757
1758 /* parse inline element */
1759 if (nodeIsElement(node))
1760 {
1761 if (node->implicit)
1762 ReportError(doc, element, node, INSERTING_TAG);
1763
1764 /* trim white space before <br> */
1765 if ( nodeIsBR(node) )
1766 TrimSpaces(doc, element);
1767
1768 InsertNodeAtEnd(element, node);
1769 ParseTag(doc, node, mode);
1770 continue;
1771 }
1772
1773 /* discard unexpected tags */
1774 ReportError(doc, element, node, DISCARDING_UNEXPECTED);
1775 FreeNode( doc, node );
1776 continue;
1777 }
1778
1779 if (!(element->tag->model & CM_OPT))
1780 ReportError(doc, element, node, MISSING_ENDTAG_FOR);
1781
1782 }
1783
1784 void ParseEmpty(TidyDocImpl* doc, Node *element, uint mode)
1785 {
1786 Lexer* lexer = doc->lexer;
1787 if ( lexer->isvoyager )
1788 {
1789 Node *node = GetToken( doc, mode);
1790 if ( node )
1791 {
1792 if ( !(node->type == EndTag && node->tag == element->tag) )
1793 {
1794 ReportError(doc, element, node, ELEMENT_NOT_EMPTY);
1795 UngetToken( doc );
1796 }
1797 else
1798 {
1799 FreeNode( doc, node );
1800 }
1801 }
1802 }
1803 }
1804
1805 void ParseDefList(TidyDocImpl* doc, Node *list, uint mode)
1806 {
1807 Lexer* lexer = doc->lexer;
1808 Node *node, *parent;
1809
1810 if (list->tag->model & CM_EMPTY)
1811 return;
1812
1813 lexer->insert = NULL; /* defer implicit inline start tags */
1814
1815 while ((node = GetToken( doc, IgnoreWhitespace)) != NULL)
1816 {
1817 if (node->tag == list->tag && node->type == EndTag)
1818 {
1819 FreeNode( doc, node);
1820 list->closed = yes;
1821 return;
1822 }
1823
1824 /* deal with comments etc. */
1825 if (InsertMisc(list, node))
1826 continue;
1827
1828 if (nodeIsText(node))
1829 {
1830 UngetToken( doc );
1831 node = InferredTag(doc, TidyTag_DT);
1832 ReportError(doc, list, node, MISSING_STARTTAG);
1833 }
1834
1835 if (node->tag == NULL)
1836 {
1837 ReportError(doc, list, node, DISCARDING_UNEXPECTED);
1838 FreeNode( doc, node);
1839 continue;
1840 }
1841
1842 /*
1843 if this is the end tag for an ancestor element
1844 then infer end tag for this element
1845 */
1846 if (node->type == EndTag)
1847 {
1848 Bool discardIt = no;
1849 if ( nodeIsFORM(node) )
1850 {
1851 BadForm( doc );
1852 ReportError(doc, list, node, DISCARDING_UNEXPECTED);
1853 FreeNode( doc, node );
1854 continue;
1855 }
1856
1857 for (parent = list->parent;
1858 parent != NULL; parent = parent->parent)
1859 {
1860 /* Do not match across BODY to avoid infinite loop
1861 between ParseBody and this parser,
1862 See http://tidy.sf.net/bug/1098012. */
1863 if (nodeIsBODY(parent))
1864 {
1865 discardIt = yes;
1866 break;
1867 }
1868 if (node->tag == parent->tag)
1869 {
1870 ReportError(doc, list, node, MISSING_ENDTAG_BEFORE);
1871
1872 UngetToken( doc );
1873 return;
1874 }
1875 }
1876 if (discardIt)
1877 {
1878 ReportError(doc, list, node, DISCARDING_UNEXPECTED);
1879 FreeNode( doc, node);
1880 continue;
1881 }
1882 }
1883
1884 /* center in a dt or a dl breaks the dl list in two */
1885 if ( nodeIsCENTER(node) )
1886 {
1887 if (list->content)
1888 InsertNodeAfterElement(list, node);
1889 else /* trim empty dl list */
1890 {
1891 InsertNodeBeforeElement(list, node);
1892
1893 /* #540296 tidy dumps with empty definition list */
1894 #if 0
1895 DiscardElement(list);
1896 #endif
1897 }
1898
1899 /* #426885 - fix by Glenn Carroll 19 Apr 00, and
1900 Gary Dechaines 11 Aug 00 */
1901 /* ParseTag can destroy node, if it finds that
1902 * this <center> is followed immediately by </center>.
1903 * It's awkward but necessary to determine if this
1904 * has happened.
1905 */
1906 parent = node->parent;
1907
1908 /* and parse contents of center */
1909 lexer->excludeBlocks = no;
1910 ParseTag( doc, node, mode);
1911 lexer->excludeBlocks = yes;
1912
1913 /* now create a new dl element,
1914 * unless node has been blown away because the
1915 * center was empty, as above.
1916 */
1917 if (parent->last == node)
1918 {
1919 list = InferredTag(doc, TidyTag_DL);
1920 InsertNodeAfterElement(node, list);
1921 }
1922 continue;
1923 }
1924
1925 if ( !(nodeIsDT(node) || nodeIsDD(node)) )
1926 {
1927 UngetToken( doc );
1928
1929 if (!(node->tag->model & (CM_BLOCK | CM_INLINE)))
1930 {
1931 ReportError(doc, list, node, TAG_NOT_ALLOWED_IN);
1932 return;
1933 }
1934
1935 /* if DD appeared directly in BODY then exclude blocks */
1936 if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks)
1937 return;
1938
1939 node = InferredTag(doc, TidyTag_DD);
1940 ReportError(doc, list, node, MISSING_STARTTAG);
1941 }
1942
1943 if (node->type == EndTag)
1944 {
1945 ReportError(doc, list, node, DISCARDING_UNEXPECTED);
1946 FreeNode( doc, node);
1947 continue;
1948 }
1949
1950 /* node should be <DT> or <DD>*/
1951 InsertNodeAtEnd(list, node);
1952 ParseTag( doc, node, IgnoreWhitespace);
1953 }
1954
1955 ReportError(doc, list, node, MISSING_ENDTAG_FOR);
1956 }
1957
1958 void ParseList(TidyDocImpl* doc, Node *list, uint ARG_UNUSED(mode))
1959 {
1960 Lexer* lexer = doc->lexer;
1961 Node *node, *parent;
1962
1963 if (list->tag->model & CM_EMPTY)
1964 return;
1965
1966 lexer->insert = NULL; /* defer implicit inline start tags */
1967
1968 while ((node = GetToken( doc, IgnoreWhitespace)) != NULL)
1969 {
1970 if (node->tag == list->tag && node->type == EndTag)
1971 {
1972 FreeNode( doc, node);
1973 list->closed = yes;
1974 return;
1975 }
1976
1977 /* deal with comments etc. */
1978 if (InsertMisc(list, node))
1979 continue;
1980
1981 if (node->type != TextNode && node->tag == NULL)
1982 {
1983 ReportError(doc, list, node, DISCARDING_UNEXPECTED);
1984 FreeNode( doc, node);
1985 continue;
1986 }
1987
1988 /*
1989 if this is the end tag for an ancestor element
1990 then infer end tag for this element
1991 */
1992 if (node->type == EndTag)
1993 {
1994 if ( nodeIsFORM(node) )
1995 {
1996 BadForm( doc );
1997 ReportError(doc, list, node, DISCARDING_UNEXPECTED);
1998 FreeNode( doc, node );
1999 continue;
2000 }
2001
2002 if (node->tag && node->tag->model & CM_INLINE)
2003 {
2004 ReportError(doc, list, node, DISCARDING_UNEXPECTED);
2005 PopInline( doc, node );
2006 FreeNode( doc, node);
2007 continue;
2008 }
2009
2010 for ( parent = list->parent;
2011 parent != NULL; parent = parent->parent )
2012 {
2013 /* Do not match across BODY to avoid infinite loop
2014 between ParseBody and this parser,
2015 See http://tidy.sf.net/bug/1053626. */
2016 if (nodeIsBODY(parent))
2017 break;
2018 if (node->tag == parent->tag)
2019 {
2020 ReportError(doc, list, node, MISSING_ENDTAG_BEFORE);
2021 UngetToken( doc );
2022 return;
2023 }
2024 }
2025
2026 ReportError(doc, list, node, DISCARDING_UNEXPECTED);
2027 FreeNode( doc, node);
2028 continue;
2029 }
2030
2031 if ( !nodeIsLI(node) )
2032 {
2033 UngetToken( doc );
2034
2035 if (node->tag && (node->tag->model & CM_BLOCK) && lexer->excludeBlocks)
2036 {
2037 ReportError(doc, list, node, MISSING_ENDTAG_BEFORE);
2038 return;
2039 }
2040
2041 node = InferredTag(doc, TidyTag_LI);
2042 AddAttribute( doc, node, "style", "list-style: none" );
2043 ReportError(doc, list, node, MISSING_STARTTAG );
2044 }
2045
2046 /* node should be <LI> */
2047 InsertNodeAtEnd(list,node);
2048 ParseTag( doc, node, IgnoreWhitespace);
2049 }
2050
2051 ReportError(doc, list, node, MISSING_ENDTAG_FOR);
2052 }
2053
2054 /*
2055 unexpected content in table row is moved to just before
2056 the table in accordance with Netscape and IE. This code
2057 assumes that node hasn't been inserted into the row.
2058 */
2059 static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row,
2060 Node *node )
2061 {
2062 Node *table;
2063
2064 /* first find the table element */
2065 for (table = row->parent; table; table = table->parent)
2066 {
2067 if ( nodeIsTABLE(table) )
2068 {
2069 InsertNodeBeforeElement( table, node );
2070 return;
2071 }
2072 }
2073 /* No table element */
2074 InsertNodeBeforeElement( row->parent, node );
2075 }
2076
2077 /*
2078 if a table row is empty then insert an empty cell
2079 this practice is consistent with browser behavior
2080 and avoids potential problems with row spanning cells
2081 */
2082 static void FixEmptyRow(TidyDocImpl* doc, Node *row)
2083 {
2084 Node *cell;
2085
2086 if (row->content == NULL)
2087 {
2088 cell = InferredTag(doc, TidyTag_TD);
2089 InsertNodeAtEnd(row, cell);
2090 ReportError(doc, row, cell, MISSING_STARTTAG);
2091 }
2092 }
2093
2094 void ParseRow(TidyDocImpl* doc, Node *row, uint ARG_UNUSED(mode))
2095 {
2096 Lexer* lexer = doc->lexer;
2097 Node *node;
2098 Bool exclude_state;
2099
2100 if (row->tag->model & CM_EMPTY)
2101 return;
2102
2103 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL)
2104 {
2105 if (node->tag == row->tag)
2106 {
2107 if (node->type == EndTag)
2108 {
2109 FreeNode( doc, node);
2110 row->closed = yes;
2111 FixEmptyRow( doc, row);
2112 return;
2113 }
2114
2115 /* New row start implies end of current row */
2116 UngetToken( doc );
2117 FixEmptyRow( doc, row);
2118 return;
2119 }
2120
2121 /*
2122 if this is the end tag for an ancestor element
2123 then infer end tag for this element
2124 */
2125 if ( node->type == EndTag )
2126 {
2127 if ( DescendantOf(row, TagId(node)) )
2128 {
2129 UngetToken( doc );
2130 return;
2131 }
2132
2133 if ( nodeIsFORM(node) || nodeHasCM(node, CM_BLOCK|CM_INLINE) )
2134 {
2135 if ( nodeIsFORM(node) )
2136 BadForm( doc );
2137
2138 ReportError(doc, row, node, DISCARDING_UNEXPECTED);
2139 FreeNode( doc, node);
2140 continue;
2141 }
2142
2143 if ( nodeIsTD(node) || nodeIsTH(node) )
2144 {
2145 ReportError(doc, row, node, DISCARDING_UNEXPECTED);
2146 FreeNode( doc, node);
2147 continue;
2148 }
2149 }
2150
2151 /* deal with comments etc. */
2152 if (InsertMisc(row, node))
2153 continue;
2154
2155 /* discard unknown tags */
2156 if (node->tag == NULL && node->type != TextNode)
2157 {
2158 ReportError(doc, row, node, DISCARDING_UNEXPECTED);
2159 FreeNode( doc, node);
2160 continue;
2161 }
2162
2163 /* discard unexpected <table> element */
2164 if ( nodeIsTABLE(node) )
2165 {
2166 ReportError(doc, row, node, DISCARDING_UNEXPECTED);
2167 FreeNode( doc, node);
2168 continue;
2169 }
2170
2171 /* THEAD, TFOOT or TBODY */
2172 if ( nodeHasCM(node, CM_ROWGRP) )
2173 {
2174 UngetToken( doc );
2175 return;
2176 }
2177
2178 if (node->type == EndTag)
2179 {
2180 ReportError(doc, row, node, DISCARDING_UNEXPECTED);
2181 FreeNode( doc, node);
2182 continue;
2183 }
2184
2185 /*
2186 if text or inline or block move before table
2187 if head content move to head
2188 */
2189
2190 if (node->type != EndTag)
2191 {
2192 if ( nodeIsFORM(node) )
2193 {
2194 UngetToken( doc );
2195 node = InferredTag(doc, TidyTag_TD);
2196 ReportError(doc, row, node, MISSING_STARTTAG);
2197 }
2198 else if ( nodeIsText(node)
2199 || nodeHasCM(node, CM_BLOCK | CM_INLINE) )
2200 {
2201 MoveBeforeTable( doc, row, node );
2202 ReportError(doc, row, node, TAG_NOT_ALLOWED_IN);
2203 lexer->exiled = yes;
2204
2205 if (node->type != TextNode)
2206 ParseTag( doc, node, IgnoreWhitespace);
2207
2208 lexer->exiled = no;
2209 continue;
2210 }
2211 else if (node->tag->model & CM_HEAD)
2212 {
2213 ReportError(doc, row, node, TAG_NOT_ALLOWED_IN);
2214 MoveToHead( doc, row, node);
2215 continue;
2216 }
2217 }
2218
2219 if ( !(nodeIsTD(node) || nodeIsTH(node)) )
2220 {
2221 ReportError(doc, row, node, TAG_NOT_ALLOWED_IN);
2222 FreeNode( doc, node);
2223 continue;
2224 }
2225
2226 /* node should be <TD> or <TH> */
2227 InsertNodeAtEnd(row, node);
2228 exclude_state = lexer->excludeBlocks;
2229 lexer->excludeBlocks = no;
2230 ParseTag( doc, node, IgnoreWhitespace);
2231 lexer->excludeBlocks = exclude_state;
2232
2233 /* pop inline stack */
2234
2235 while ( lexer->istacksize > lexer->istackbase )
2236 PopInline( doc, NULL );
2237 }
2238
2239 }
2240
2241 void ParseRowGroup(TidyDocImpl* doc, Node *rowgroup, uint ARG_UNUSED(mode))
2242 {
2243 Lexer* lexer = doc->lexer;
2244 Node *node, *parent;
2245
2246 if (rowgroup->tag->model & CM_EMPTY)
2247 return;
2248
2249 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL)
2250 {
2251 if (node->tag == rowgroup->tag)
2252 {
2253 if (node->type == EndTag)
2254 {
2255 rowgroup->closed = yes;
2256 FreeNode( doc, node);
2257 return;
2258 }
2259
2260 UngetToken( doc );
2261 return;
2262 }
2263
2264 /* if </table> infer end tag */
2265 if ( nodeIsTABLE(node) && node->type == EndTag )
2266 {
2267 UngetToken( doc );
2268 return;
2269 }
2270
2271 /* deal with comments etc. */
2272 if (InsertMisc(rowgroup, node))
2273 continue;
2274
2275 /* discard unknown tags */
2276 if (node->tag == NULL && node->type != TextNode)
2277 {
2278 ReportError(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2279 FreeNode( doc, node);
2280 continue;
2281 }
2282
2283 /*
2284 if TD or TH then infer <TR>
2285 if text or inline or block move before table
2286 if head content move to head
2287 */
2288
2289 if (node->type != EndTag)
2290 {
2291 if ( nodeIsTD(node) || nodeIsTH(node) )
2292 {
2293 UngetToken( doc );
2294 node = InferredTag(doc, TidyTag_TR);
2295 ReportError(doc, rowgroup, node, MISSING_STARTTAG);
2296 }
2297 else if ( nodeIsText(node)
2298 || nodeHasCM(node, CM_BLOCK|CM_INLINE) )
2299 {
2300 MoveBeforeTable( doc, rowgroup, node );
2301 ReportError(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
2302 lexer->exiled = yes;
2303
2304 if (node->type != TextNode)
2305 ParseTag(doc, node, IgnoreWhitespace);
2306
2307 lexer->exiled = no;
2308 continue;
2309 }
2310 else if (node->tag->model & CM_HEAD)
2311 {
2312 ReportError(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
2313 MoveToHead(doc, rowgroup, node);
2314 continue;
2315 }
2316 }
2317
2318 /*
2319 if this is the end tag for ancestor element
2320 then infer end tag for this element
2321 */
2322 if (node->type == EndTag)
2323 {
2324 if ( nodeIsFORM(node) || nodeHasCM(node, CM_BLOCK|CM_INLINE) )
2325 {
2326 if ( nodeIsFORM(node) )
2327 BadForm( doc );
2328
2329 ReportError(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2330 FreeNode( doc, node);
2331 continue;
2332 }
2333
2334 if ( nodeIsTR(node) || nodeIsTD(node) || nodeIsTH(node) )
2335 {
2336 ReportError(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2337 FreeNode( doc, node);
2338 continue;
2339 }
2340
2341 for ( parent = rowgroup->parent;
2342 parent != NULL;
2343 parent = parent->parent )
2344 {
2345 if (node->tag == parent->tag)
2346 {
2347 UngetToken( doc );
2348 return;
2349 }
2350 }
2351 }
2352
2353 /*
2354 if THEAD, TFOOT or TBODY then implied end tag
2355
2356 */
2357 if (node->tag->model & CM_ROWGRP)
2358 {
2359 if (node->type != EndTag)
2360 {
2361 UngetToken( doc );
2362 return;
2363 }
2364 }
2365
2366 if (node->type == EndTag)
2367 {
2368 ReportError(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2369 FreeNode( doc, node);
2370 continue;
2371 }
2372
2373 if ( !nodeIsTR(node) )
2374 {
2375 node = InferredTag(doc, TidyTag_TR);
2376 ReportError(doc, rowgroup, node, MISSING_STARTTAG);
2377 UngetToken( doc );
2378 }
2379
2380 /* node should be <TR> */
2381 InsertNodeAtEnd(rowgroup, node);
2382 ParseTag(doc, node, IgnoreWhitespace);
2383 }
2384
2385 }
2386
2387 void ParseColGroup(TidyDocImpl* doc, Node *colgroup, uint ARG_UNUSED(mode))
2388 {
2389 Node *node, *parent;
2390
2391 if (colgroup->tag->model & CM_EMPTY)
2392 return;
2393
2394 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL)
2395 {
2396 if (node->tag == colgroup->tag && node->type == EndTag)
2397 {
2398 FreeNode( doc, node);
2399 colgroup->closed = yes;
2400 return;
2401 }
2402
2403 /*
2404 if this is the end tag for an ancestor element
2405 then infer end tag for this element
2406 */
2407 if (node->type == EndTag)
2408 {
2409 if ( nodeIsFORM(node) )
2410 {
2411 BadForm( doc );
2412 ReportError(doc, colgroup, node, DISCARDING_UNEXPECTED);
2413 FreeNode( doc, node);
2414 continue;
2415 }
2416
2417 for ( parent = colgroup->parent;
2418 parent != NULL;
2419 parent = parent->parent )
2420 {
2421 if (node->tag == parent->tag)
2422 {
2423 UngetToken( doc );
2424 return;
2425 }
2426 }
2427 }
2428
2429 if (nodeIsText(node))
2430 {
2431 UngetToken( doc );
2432 return;
2433 }
2434
2435 /* deal with comments etc. */
2436 if (InsertMisc(colgroup, node))
2437 continue;
2438
2439 /* discard unknown tags */
2440 if (node->tag == NULL)
2441 {
2442 ReportError(doc, colgroup, node, DISCARDING_UNEXPECTED);
2443 FreeNode( doc, node);
2444 continue;
2445 }
2446
2447 if ( !nodeIsCOL(node) )
2448 {
2449 UngetToken( doc );
2450 return;
2451 }
2452
2453 if (node->type == EndTag)
2454 {
2455 ReportError(doc, colgroup, node, DISCARDING_UNEXPECTED);
2456 FreeNode( doc, node);
2457 continue;
2458 }
2459
2460 /* node should be <COL> */
2461 InsertNodeAtEnd(colgroup, node);
2462 ParseTag(doc, node, IgnoreWhitespace);
2463 }
2464 }
2465
2466 void ParseTableTag(TidyDocImpl* doc, Node *table, uint ARG_UNUSED(mode))
2467 {
2468 Lexer* lexer = doc->lexer;
2469 Node *node, *parent;
2470 uint istackbase;
2471
2472 DeferDup( doc );
2473 istackbase = lexer->istackbase;
2474 lexer->istackbase = lexer->istacksize;
2475
2476 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL)
2477 {
2478 if (node->tag == table->tag && node->type == EndTag)
2479 {
2480 FreeNode( doc, node);
2481 lexer->istackbase = istackbase;
2482 table->closed = yes;
2483 return;
2484 }
2485
2486 /* deal with comments etc. */
2487 if (InsertMisc(table, node))
2488 continue;
2489
2490 /* discard unknown tags */
2491 if (node->tag == NULL && node->type != TextNode)
2492 {
2493 ReportError(doc, table, node, DISCARDING_UNEXPECTED);
2494 FreeNode( doc, node);
2495 continue;
2496 }
2497
2498 /* if TD or TH or text or inline or block then infer <TR> */
2499
2500 if (node->type != EndTag)
2501 {
2502 if ( nodeIsTD(node) || nodeIsTH(node) || nodeIsTABLE(node) )
2503 {
2504 UngetToken( doc );
2505 node = InferredTag(doc, TidyTag_TR);
2506 ReportError(doc, table, node, MISSING_STARTTAG);
2507 }
2508 else if ( nodeIsText(node) ||nodeHasCM(node,CM_BLOCK|CM_INLINE) )
2509 {
2510 InsertNodeBeforeElement(table, node);
2511 ReportError(doc, table, node, TAG_NOT_ALLOWED_IN);
2512 lexer->exiled = yes;
2513
2514 if (node->type != TextNode)
2515 ParseTag(doc, node, IgnoreWhitespace);
2516
2517 lexer->exiled = no;
2518 continue;
2519 }
2520 else if (node->tag->model & CM_HEAD)
2521 {
2522 MoveToHead(doc, table, node);
2523 continue;
2524 }
2525 }
2526
2527 /*
2528 if this is the end tag for an ancestor element
2529 then infer end tag for this element
2530 */
2531 if (node->type == EndTag)
2532 {
2533 if ( nodeIsFORM(node) )
2534 {
2535 BadForm( doc );
2536 ReportError(doc, table, node, DISCARDING_UNEXPECTED);
2537 FreeNode( doc, node);
2538 continue;
2539 }
2540
2541 /* best to discard unexpected block/inline end tags */
2542 if ( nodeHasCM(node, CM_TABLE|CM_ROW) ||
2543 nodeHasCM(node, CM_BLOCK|CM_INLINE) )
2544 {
2545 ReportError(doc, table, node, DISCARDING_UNEXPECTED);
2546 FreeNode( doc, node);
2547 continue;
2548 }
2549
2550 for ( parent = table->parent;
2551 parent != NULL;
2552 parent = parent->parent )
2553 {
2554 if (node->tag == parent->tag)
2555 {
2556 ReportError(doc, table, node, MISSING_ENDTAG_BEFORE );
2557 UngetToken( doc );
2558 lexer->istackbase = istackbase;
2559 return;
2560 }
2561 }
2562 }
2563
2564 if (!(node->tag->model & CM_TABLE))
2565 {
2566 UngetToken( doc );
2567 ReportError(doc, table, node, TAG_NOT_ALLOWED_IN);
2568 lexer->istackbase = istackbase;
2569 return;
2570 }
2571
2572 if (nodeIsElement(node))
2573 {
2574 InsertNodeAtEnd(table, node);
2575 ParseTag(doc, node, IgnoreWhitespace);
2576 continue;
2577 }
2578
2579 /* discard unexpected text nodes and end tags */
2580 ReportError(doc, table, node, DISCARDING_UNEXPECTED);
2581 FreeNode( doc, node);
2582 }
2583
2584 ReportError(doc, table, node, MISSING_ENDTAG_FOR);
2585 lexer->istackbase = istackbase;
2586 }
2587
2588 /* acceptable content for pre elements */
2589 Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node )
2590 {
2591 /* p is coerced to br's, Text OK too */
2592 if ( nodeIsP(node) || nodeIsText(node) )
2593 return yes;
2594
2595 if ( node->tag == NULL ||
2596 nodeIsPARAM(node) ||
2597 !nodeHasCM(node, CM_INLINE|CM_NEW) )
2598 return no;
2599
2600 return yes;
2601 }
2602
2603 void ParsePre( TidyDocImpl* doc, Node *pre, uint ARG_UNUSED(mode) )
2604 {
2605 Node *node;
2606
2607 if (pre->tag->model & CM_EMPTY)
2608 return;
2609
2610 InlineDup( doc, NULL ); /* tell lexer to insert inlines if needed */
2611
2612 while ((node = GetToken(doc, Preformatted)) != NULL)
2613 {
2614 if ( node->type == EndTag &&
2615 (node->tag == pre->tag || DescendantOf(pre, TagId(node))) )
2616 {
2617 if (nodeIsBODY(node) || nodeIsHTML(node))
2618 {
2619 ReportError(doc, pre, node, DISCARDING_UNEXPECTED);
2620 FreeNode(doc, node);
2621 continue;
2622 }
2623 if (node->tag == pre->tag)
2624 {
2625 FreeNode(doc, node);
2626 }
2627 else
2628 {
2629 ReportError(doc, pre, node, MISSING_ENDTAG_BEFORE );
2630 UngetToken( doc );
2631 }
2632 pre->closed = yes;
2633 TrimSpaces(doc, pre);
2634 return;
2635 }
2636
2637 if (nodeIsText(node))
2638 {
2639 InsertNodeAtEnd(pre, node);
2640 continue;
2641 }
2642
2643 /* deal with comments etc. */
2644 if (InsertMisc(pre, node))
2645 continue;
2646
2647 if (node->tag == NULL)
2648 {
2649 ReportError(doc, pre, node, DISCARDING_UNEXPECTED);
2650 FreeNode(doc, node);
2651 continue;
2652 }
2653
2654 /* strip unexpected tags */
2655 if ( !PreContent(doc, node) )
2656 {
2657 Node *newnode;
2658
2659 /* fix for http://tidy.sf.net/bug/772205 */
2660 if (node->type == EndTag)
2661 {
2662 ReportError(doc, pre, node, DISCARDING_UNEXPECTED);
2663 FreeNode(doc, node);
2664 continue;
2665 }
2666 /*
2667 This is basically what Tidy 04 August 2000 did and far more accurate
2668 with respect to browser behaviour than the code commented out above.
2669 Tidy could try to propagate the <pre> into each disallowed child where
2670 <pre> is allowed in order to replicate some browsers behaivour, but
2671 there are a lot of exceptions, e.g. Internet Explorer does not propagate
2672 <pre> into table cells while Mozilla does. Opera 6 never propagates
2673 <pre> into blocklevel elements while Opera 7 behaves much like Mozilla.
2674
2675 Tidy behaves thus mostly like Opera 6 except for nested <pre> elements
2676 which are handled like Mozilla takes them (Opera6 closes all <pre> after
2677 the first </pre>).
2678
2679 There are similar issues like replacing <p> in <pre> with <br>, for
2680 example
2681
2682 <pre>...<p>...</pre> (Input)
2683 <pre>...<br>...</pre> (Tidy)
2684 <pre>...<br>...</pre> (Opera 7 and Internet Explorer)
2685 <pre>...<br><br>...</pre> (Opera 6 and Mozilla)
2686
2687 <pre>...<p>...</p>...</pre> (Input)
2688 <pre>...<br>......</pre> (Tidy, BUG!)
2689 <pre>...<br>...<br>...</pre> (Internet Explorer)
2690 <pre>...<br><br>...<br><br>...</pre> (Mozilla, Opera 6)
2691 <pre>...<br>...<br><br>...</pre> (Opera 7)
2692
2693 or something similar, they could also be closing the <pre> and propagate
2694 the <pre> into the newly opened <p>.
2695
2696 Todo: IMG, OBJECT, APPLET, BIG, SMALL, SUB, SUP, FONT, and BASEFONT are
2697 dissallowed in <pre>, Tidy neither detects this nor does it perform any
2698 cleanup operation. Tidy should at least issue a warning if it encounters
2699 such constructs.
2700
2701 Todo: discarding </p> is abviously a bug, it should be replaced by <br>.
2702 */
2703 InsertNodeAfterElement(pre, node);
2704 ReportError(doc, pre, node, MISSING_ENDTAG_BEFORE);
2705 ParseTag(doc, node, IgnoreWhitespace);
2706
2707 newnode = InferredTag(doc, TidyTag_PRE);
2708 ReportError(doc, pre, newnode, INSERTING_TAG);
2709 pre = newnode;
2710 InsertNodeAfterElement(node, pre);
2711
2712 continue;
2713 }
2714
2715 if ( nodeIsP(node) )
2716 {
2717 if (node->type == StartTag)
2718 {
2719 ReportError(doc, pre, node, USING_BR_INPLACE_OF);
2720
2721 /* trim white space before <p> in <pre>*/
2722 TrimSpaces(doc, pre);
2723
2724 /* coerce both <p> and </p> to <br> */
2725 CoerceNode(doc, node, TidyTag_BR, no, no);
2726 FreeAttrs( doc, node ); /* discard align attribute etc. */
2727 InsertNodeAtEnd( pre, node );
2728 }
2729 else
2730 {
2731 ReportError(doc, pre, node, DISCARDING_UNEXPECTED);
2732 FreeNode( doc, node);
2733 }
2734 continue;
2735 }
2736
2737 if ( nodeIsElement(node) )
2738 {
2739 /* trim white space before <br> */
2740 if ( nodeIsBR(node) )
2741 TrimSpaces(doc, pre);
2742
2743 InsertNodeAtEnd(pre, node);
2744 ParseTag(doc, node, Preformatted);
2745 continue;
2746 }
2747
2748 /* discard unexpected tags */
2749 ReportError(doc, pre, node, DISCARDING_UNEXPECTED);
2750 FreeNode( doc, node);
2751 }
2752
2753 ReportError(doc, pre, node, MISSING_ENDTAG_FOR);
2754 }
2755
2756 void ParseOptGroup(TidyDocImpl* doc, Node *field, uint ARG_UNUSED(mode))
2757 {
2758 Lexer* lexer = doc->lexer;
2759 Node *node;
2760
2761 lexer->insert = NULL; /* defer implicit inline start tags */
2762
2763 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL)
2764 {
2765 if (node->tag == field->tag && node->type == EndTag)
2766 {
2767 FreeNode( doc, node);
2768 field->closed = yes;
2769 TrimSpaces(doc, field);
2770 return;
2771 }
2772
2773 /* deal with comments etc. */
2774 if (InsertMisc(field, node))
2775 continue;
2776
2777 if ( node->type == StartTag &&
2778 (nodeIsOPTION(node) || nodeIsOPTGROUP(node)) )
2779 {
2780 if ( nodeIsOPTGROUP(node) )
2781 ReportError(doc, field, node, CANT_BE_NESTED);
2782
2783 InsertNodeAtEnd(field, node);
2784 ParseTag(doc, node, MixedContent);
2785 continue;
2786 }
2787
2788 /* discard unexpected tags */
2789 ReportError(doc, field, node, DISCARDING_UNEXPECTED );
2790 FreeNode( doc, node);
2791 }
2792 }
2793
2794
2795 void ParseSelect(TidyDocImpl* doc, Node *field, uint ARG_UNUSED(mode))
2796 {
2797 Lexer* lexer = doc->lexer;
2798 Node *node;
2799
2800 lexer->insert = NULL; /* defer implicit inline start tags */
2801
2802 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL)
2803 {
2804 if (node->tag == field->tag && node->type == EndTag)
2805 {
2806 FreeNode( doc, node);
2807 field->closed = yes;
2808 TrimSpaces(doc, field);
2809 return;
2810 }
2811
2812 /* deal with comments etc. */
2813 if (InsertMisc(field, node))
2814 continue;
2815
2816 if ( node->type == StartTag &&
2817 ( nodeIsOPTION(node) ||
2818 nodeIsOPTGROUP(node) ||
2819 nodeIsSCRIPT(node))
2820 )
2821 {
2822 InsertNodeAtEnd(field, node);
2823 ParseTag(doc, node, IgnoreWhitespace);
2824 continue;
2825 }
2826
2827 /* discard unexpected tags */
2828 ReportError(doc, field, node, DISCARDING_UNEXPECTED);
2829 FreeNode( doc, node);
2830 }
2831
2832 ReportError(doc, field, node, MISSING_ENDTAG_FOR);
2833 }
2834
2835 void ParseText(TidyDocImpl* doc, Node *field, uint mode)
2836 {
2837 Lexer* lexer = doc->lexer;
2838 Node *node;
2839
2840 lexer->insert = NULL; /* defer implicit inline start tags */
2841
2842 if ( nodeIsTEXTAREA(field) )
2843 mode = Preformatted;
2844 else
2845 mode = MixedContent; /* kludge for font tags */
2846
2847 while ((node = GetToken(doc, mode)) != NULL)
2848 {
2849 if (node->tag == field->tag && node->type == EndTag)
2850 {
2851 FreeNode( doc, node);
2852 field->closed = yes;
2853 TrimSpaces(doc, field);
2854 return;
2855 }
2856
2857 /* deal with comments etc. */
2858 if (InsertMisc(field, node))
2859 continue;
2860
2861 if (nodeIsText(node))
2862 {
2863 /* only called for 1st child */
2864 if (field->content == NULL && !(mode & Preformatted))
2865 TrimSpaces(doc, field);
2866
2867 if (node->start >= node->end)
2868 {
2869 FreeNode( doc, node);
2870 continue;
2871 }
2872
2873 InsertNodeAtEnd(field, node);
2874 continue;
2875 }
2876
2877 /* for textarea should all cases of < and & be escaped? */
2878
2879 /* discard inline tags e.g. font */
2880 if ( node->tag
2881 && node->tag->model & CM_INLINE
2882 && !(node->tag->model & CM_FIELD)) /* #487283 - fix by Lee Passey 25 Jan 02 */
2883 {
2884 ReportError(doc, field, node, DISCARDING_UNEXPECTED);
2885 FreeNode( doc, node);
2886 continue;
2887 }
2888
2889 /* terminate element on other tags */
2890 if (!(field->tag->model & CM_OPT))
2891 ReportError(doc, field, node, MISSING_ENDTAG_BEFORE);
2892
2893 UngetToken( doc );
2894 TrimSpaces(doc, field);
2895 return;
2896 }
2897
2898 if (!(field->tag->model & CM_OPT))
2899 ReportError(doc, field, node, MISSING_ENDTAG_FOR);
2900 }
2901
2902
2903 void ParseTitle(TidyDocImpl* doc, Node *title, uint ARG_UNUSED(mode))
2904 {
2905 Node *node;
2906 while ((node = GetToken(doc, MixedContent)) != NULL)
2907 {
2908 if (node->tag == title->tag && node->type == StartTag)
2909 {
2910 ReportError(doc, title, node, COERCE_TO_ENDTAG);
2911 node->type = EndTag;
2912 UngetToken( doc );
2913 continue;
2914 }
2915 else if (node->tag == title->tag && node->type == EndTag)
2916 {
2917 FreeNode( doc, node);
2918 title->closed = yes;
2919 TrimSpaces(doc, title);
2920 return;
2921 }
2922
2923 if (nodeIsText(node))
2924 {
2925 /* only called for 1st child */
2926 if (title->content == NULL)
2927 TrimInitialSpace(doc, title, node);
2928
2929 if (node->start >= node->end)
2930 {
2931 FreeNode( doc, node);
2932 continue;
2933 }
2934
2935 InsertNodeAtEnd(title, node);
2936 continue;
2937 }
2938
2939 /* deal with comments etc. */
2940 if (InsertMisc(title, node))
2941 continue;
2942
2943 /* discard unknown tags */
2944 if (node->tag == NULL)
2945 {
2946 ReportError(doc, title, node, DISCARDING_UNEXPECTED);
2947 FreeNode( doc, node);
2948 continue;
2949 }
2950
2951 /* pushback unexpected tokens */
2952 ReportError(doc, title, node, MISSING_ENDTAG_BEFORE);
2953 UngetToken( doc );
2954 TrimSpaces(doc, title);
2955 return;
2956 }
2957
2958 ReportError(doc, title, node, MISSING_ENDTAG_FOR);
2959 }
2960
2961 /*
2962 This isn't quite right for CDATA content as it recognises
2963 tags within the content and parses them accordingly.
2964 This will unfortunately screw up scripts which include
2965 < + letter, < + !, < + ? or < + / + letter
2966 */
2967
2968 void ParseScript(TidyDocImpl* doc, Node *script, uint ARG_UNUSED(mode))
2969 {
2970 Node *node;
2971
2972 doc->lexer->parent = script;
2973 node = GetToken(doc, CdataContent);
2974 doc->lexer->parent = NULL;
2975
2976 if (node)
2977 {
2978 InsertNodeAtEnd(script, node);
2979 }
2980 else
2981 {
2982 /* handle e.g. a document like "<script>" */
2983 ReportError(doc, script, NULL, MISSING_ENDTAG_FOR);
2984 return;
2985 }
2986
2987 node = GetToken(doc, IgnoreWhitespace);
2988
2989 if (!(node && node->type == EndTag && node->tag &&
2990 node->tag->id == script->tag->id))
2991 {
2992 ReportError(doc, script, node, MISSING_ENDTAG_FOR);
2993
2994 if (node)
2995 UngetToken(doc);
2996 }
2997 else
2998 {
2999 FreeNode(doc, node);
3000 }
3001 }
3002
3003 Bool IsJavaScript(Node *node)
3004 {
3005 Bool result = no;
3006 AttVal *attr;
3007
3008 if (node->attributes == NULL)
3009 return yes;
3010
3011 for (attr = node->attributes; attr; attr = attr->next)
3012 {
3013 if ( (attrIsLANGUAGE(attr) || attrIsTYPE(attr))
3014 && AttrContains(attr, "javascript") )
3015 {
3016 result = yes;
3017 break;
3018 }
3019 }
3020
3021 return result;
3022 }
3023
3024 void ParseHead(TidyDocImpl* doc, Node *head, uint ARG_UNUSED(mode))
3025 {
3026 Lexer* lexer = doc->lexer;
3027 Node *node;
3028 int HasTitle = 0;
3029 int HasBase = 0;
3030
3031 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL)
3032 {
3033 if (node->tag == head->tag && node->type == EndTag)
3034 {
3035 FreeNode( doc, node);
3036 head->closed = yes;
3037 break;
3038 }
3039
3040 /* find and discard multiple <head> elements */
3041 /* find and discard <html> in <head> elements */
3042 if ((node->tag == head->tag || nodeIsHTML(node)) && node->type == StartTag)
3043 {
3044 ReportError(doc, head, node, DISCARDING_UNEXPECTED);
3045 FreeNode(doc, node);
3046 continue;
3047 }
3048
3049 if (nodeIsText(node))
3050 {
3051 ReportError(doc, head, node, TAG_NOT_ALLOWED_IN);
3052 UngetToken( doc );
3053 break;
3054 }
3055
3056 if (node->type == ProcInsTag && node->element &&
3057 tmbstrcmp(node->element, "xml-stylesheet") == 0)
3058 {
3059 ReportError(doc, head, node, TAG_NOT_ALLOWED_IN);
3060 InsertNodeBeforeElement(FindHTML(doc), node);
3061 continue;
3062 }
3063
3064 /* deal with comments etc. */
3065 if (InsertMisc(head, node))
3066 continue;
3067
3068 if (node->type == DocTypeTag)
3069 {
3070 InsertDocType(doc, head, node);
3071 continue;
3072 }
3073
3074 /* discard unknown tags */
3075 if (node->tag == NULL)
3076 {
3077 ReportError(doc, head, node, DISCARDING_UNEXPECTED);
3078 FreeNode( doc, node);
3079 continue;
3080 }
3081
3082 /*
3083 if it doesn't belong in the head then
3084 treat as implicit end of head and deal
3085 with as part of the body
3086 */
3087 if (!(node->tag->model & CM_HEAD))
3088 {
3089 /* #545067 Implicit closing of head broken - warn only for XHTML input */
3090 if ( lexer->isvoyager )
3091 ReportError(doc, head, node, TAG_NOT_ALLOWED_IN );
3092 UngetToken( doc );
3093 break;
3094 }
3095
3096 if (nodeIsElement(node))
3097 {
3098 if ( nodeIsTITLE(node) )
3099 {
3100 ++HasTitle;
3101
3102 if (HasTitle > 1)
3103 if (head)
3104 ReportError(doc, head, node, TOO_MANY_ELEMENTS_IN);
3105 else
3106 ReportError(doc, head, node, TOO_MANY_ELEMENTS);
3107 }
3108 else if ( nodeIsBASE(node) )
3109 {
3110 ++HasBase;
3111
3112 if (HasBase > 1)
3113 if (head)
3114 ReportError(doc, head, node, TOO_MANY_ELEMENTS_IN);
3115 else
3116 ReportError(doc, head, node, TOO_MANY_ELEMENTS);
3117 }
3118 else if ( nodeIsNOSCRIPT(node) )
3119 {
3120 ReportError(doc, head, node, TAG_NOT_ALLOWED_IN);
3121 }
3122
3123 #ifdef AUTO_INPUT_ENCODING
3124 else if (nodeIsMETA(node))
3125 {
3126 AttVal * httpEquiv = AttrGetById(node, TidyAttr_HTTP_EQUIV);
3127 AttVal * content = AttrGetById(node, TidyAttr_CONTENT);
3128 if (httpEquiv && AttrValueIs(httpEquiv, "Content-Type") && AttrHasValue(content))
3129 {
3130 tmbstr val, charset;
3131 uint end = 0;
3132 val = charset = tmbstrdup(content->value);
3133 val = tmbstrtolower(val);
3134 val = strstr(content->value, "charset");
3135
3136 if (val)
3137 val += 7;
3138
3139 while(val && *val && (IsWhite((tchar)*val) ||
3140 *val == '=' || *val == '"' || *val == '\''))
3141 ++val;
3142
3143 while(val && val[end] && !(IsWhite((tchar)val[end]) ||
3144 val[end] == '"' || val[end] == '\'' || val[end] == ';'))
3145 ++end;
3146
3147 if (val && end)
3148 {
3149 tmbstr encoding = tmbstrndup(val, end);
3150 uint id = GetEncodingIdFromName(encoding);
3151
3152 /* todo: detect mismatch with BOM/XMLDecl/declared */
3153 /* todo: error for unsupported encodings */
3154 /* todo: try to re-init transcoder */
3155 /* todo: change input/output encoding settings */
3156 /* todo: store id in StreamIn */
3157
3158 MemFree(encoding);
3159 }
3160
3161 MemFree(charset);
3162 }
3163 }
3164 #endif /* AUTO_INPUT_ENCODING */
3165
3166 InsertNodeAtEnd(head, node);
3167 ParseTag(doc, node, IgnoreWhitespace);
3168 continue;
3169 }
3170
3171 /* discard unexpected text nodes and end tags */
3172 ReportError(doc, head, node, DISCARDING_UNEXPECTED);
3173 FreeNode( doc, node);
3174 }
3175 }
3176
/*
  Parse the content of a <body> element.

  doc  - document being parsed.
  body - the <body> node that receives the content.
  mode - the incoming value is overwritten with IgnoreWhitespace. During
         parsing the local mode becomes MixedContent after text or an
         element that is CM_INLINE but not CM_MIXED has been accepted,
         and goes back to IgnoreWhitespace after any other element.

  The loop normally runs until the input is exhausted; </body> only
  sets lexer->seenEndBody so that later content can be reported as
  CONTENT_AFTER_BODY. It ends early (with the token pushed back) when
  the body is nested in <noframes> and a </noframes>, <frame> or
  <frameset> is seen, or when a token that is neither block nor inline
  fits none of the special cases handled below.
*/
3177 void ParseBody(TidyDocImpl* doc, Node *body, uint mode)
3178 {
3179 Lexer* lexer = doc->lexer;
3180 Node *node;
3181 Bool checkstack, iswhitenode;
3182
3183 mode = IgnoreWhitespace;
/* checkstack: whether InlineDup() still has to be tried for the next inline/text token */
3184 checkstack = yes;
3185
3186 BumpObject( doc, body->parent );
3187
3188 while ((node = GetToken(doc, mode)) != NULL)
3189 {
3190 /* find and discard multiple <body> elements */
3191 if (node->tag == body->tag && node->type == StartTag)
3192 {
3193 ReportError(doc, body, node, DISCARDING_UNEXPECTED);
3194 FreeNode(doc, node);
3195 continue;
3196 }
3197
3198 /* #538536 Extra endtags not detected */
3199 if ( nodeIsHTML(node) )
3200 {
/* the first non-element <html> token (i.e. </html>) is remembered silently; everything else is reported */
3201 if (nodeIsElement(node) || lexer->seenEndHtml)
3202 ReportError(doc, body, node, DISCARDING_UNEXPECTED);
3203 else
3204 lexer->seenEndHtml = 1;
3205
3206 FreeNode( doc, node);
3207 continue;
3208 }
3209
/* tags after </body> are reported but still processed below */
3210 if ( lexer->seenEndBody &&
3211 ( node->type == StartTag ||
3212 node->type == EndTag ||
3213 node->type == StartEndTag ) )
3214 {
3215 ReportError(doc, body, node, CONTENT_AFTER_BODY );
3216 }
3217
3218 if ( node->tag == body->tag && node->type == EndTag )
3219 {
3220 body->closed = yes;
3221 TrimSpaces(doc, body);
3222 FreeNode( doc, node);
3223 lexer->seenEndBody = 1;
3224 mode = IgnoreWhitespace;
3225
/* a body inside <noframes> really ends here; otherwise keep reading */
3226 if ( nodeIsNOFRAMES(body->parent) )
3227 break;
3228
3229 continue;
3230 }
3231
3232 if ( nodeIsNOFRAMES(node) )
3233 {
3234 if (node->type == StartTag)
3235 {
3236 InsertNodeAtEnd(body, node);
3237 ParseBlock(doc, node, mode);
3238 continue;
3239 }
3240
3241 if (node->type == EndTag && nodeIsNOFRAMES(body->parent) )
3242 {
3243 TrimSpaces(doc, body);
3244 UngetToken( doc );
3245 break;
3246 }
3247 }
3248
/* frame markup ends a body that lives inside <noframes> */
3249 if ( (nodeIsFRAME(node) || nodeIsFRAMESET(node))
3250 && nodeIsNOFRAMES(body->parent) )
3251 {
3252 TrimSpaces(doc, body);
3253 UngetToken( doc );
3254 break;
3255 }
3256
3257 iswhitenode = no;
3258
/* a text token of at most one character whose first buffer byte is a space */
3259 if ( nodeIsText(node) &&
3260 node->end <= node->start + 1 &&
3261 lexer->lexbuf[node->start] == ' ' )
3262 iswhitenode = yes;
3263
3264 /* deal with comments etc. */
3265 if (InsertMisc(body, node))
3266 continue;
3267
3268 /* #538536 Extra endtags not detected */
3269 #if 0
3270 if ( lexer->seenEndBody == 1 && !iswhitenode )
3271 {
3272 ++lexer->seenEndBody;
3273 ReportError(doc, body, node, CONTENT_AFTER_BODY);
3274 }
3275 #endif
3276
3277 /* mixed content model permits text */
3278 if (nodeIsText(node))
3279 {
3280 if (iswhitenode && mode == IgnoreWhitespace)
3281 {
3282 FreeNode( doc, node);
3283 continue;
3284 }
3285
3286 /* HTML 2 and HTML4 strict don't allow text here */
3287 ConstrainVersion(doc, ~(VERS_HTML40_STRICT | VERS_HTML20));
3288
3289 if (checkstack)
3290 {
3291 checkstack = no;
3292
/* a positive InlineDup() result means this token is not inserted in this pass */
3293 if ( InlineDup(doc, node) > 0 )
3294 continue;
3295 }
3296
3297 InsertNodeAtEnd(body, node);
3298 mode = MixedContent;
3299 continue;
3300 }
3301
3302 if (node->type == DocTypeTag)
3303 {
3304 InsertDocType(doc, body, node);
3305 continue;
3306 }
3307 /* discard unknown and PARAM tags */
3308 if ( node->tag == NULL || nodeIsPARAM(node) )
3309 {
3310 ReportError(doc, body, node, DISCARDING_UNEXPECTED);
3311 FreeNode( doc, node);
3312 continue;
3313 }
3314
3315 /*
3316 Netscape allows LI and DD directly in BODY
3317 We infer UL or DL respectively and use this
3318 Bool to exclude block-level elements so as
3319 to match Netscape's observed behaviour.
3320 */
3321 lexer->excludeBlocks = no;
3322
/* <input>, or a tag that is neither block nor inline: needs special placement */
3323 if ( nodeIsINPUT(node) ||
3324 (!nodeHasCM(node, CM_BLOCK) && !nodeHasCM(node, CM_INLINE))
3325 )
3326 {
3327 /* avoid this error message being issued twice */
3328 if (!(node->tag->model & CM_HEAD))
3329 ReportError(doc, body, node, TAG_NOT_ALLOWED_IN);
3330
3331 if (node->tag->model & CM_HTML)
3332 {
3333 /* copy body attributes if current body was inferred */
3334 if ( nodeIsBODY(node) && body->implicit
3335 && body->attributes == NULL )
3336 {
3337 body->attributes = node->attributes;
3338 node->attributes = NULL;
3339 }
3340
3341 FreeNode( doc, node);
3342 continue;
3343 }
3344
3345 if (node->tag->model & CM_HEAD)
3346 {
3347 MoveToHead(doc, body, node);
3348 continue;
3349 }
3350
/* in the branches below the token is pushed back and node is replaced by an inferred container element */
3351 if (node->tag->model & CM_LIST)
3352 {
3353 UngetToken( doc );
3354 node = InferredTag(doc, TidyTag_UL);
3355 /* AddClass( doc, node, "noindent" ); */
3356 lexer->excludeBlocks = yes;
3357 }
3358 else if (node->tag->model & CM_DEFLIST)
3359 {
3360 UngetToken( doc );
3361 node = InferredTag(doc, TidyTag_DL);
3362 lexer->excludeBlocks = yes;
3363 }
3364 else if (node->tag->model & (CM_TABLE | CM_ROWGRP | CM_ROW))
3365 {
3366 UngetToken( doc );
3367 node = InferredTag(doc, TidyTag_TABLE);
3368 lexer->excludeBlocks = yes;
3369 }
3370 else if ( nodeIsINPUT(node) )
3371 {
3372 UngetToken( doc );
3373 node = InferredTag(doc, TidyTag_FORM);
3374 lexer->excludeBlocks = yes;
3375 }
3376 else
3377 {
/* not row/field related either: hand the token back to the caller */
3378 if ( !nodeHasCM(node, CM_ROW | CM_FIELD) )
3379 {
3380 UngetToken( doc );
3381 return;
3382 }
3383
3384 /* ignore </td> </th> <option> etc. */
3385 FreeNode( doc, node );
3386 continue;
3387 }
3388 }
3389
3390 if (node->type == EndTag)
3391 {
/* </br> is treated as <br>; </p> becomes an implicit empty <p/> */
3392 if ( nodeIsBR(node) )
3393 node->type = StartTag;
3394 else if ( nodeIsP(node) )
3395 {
3396 node->type = StartEndTag;
3397 node->implicit = yes;
3398 #if OBSOLETE
3399 CoerceNode(doc, node, TidyTag_BR, no, no);
3400 FreeAttrs( doc, node ); /* discard align attribute etc. */
3401 InsertNodeAtEnd(body, node);
3402 node = InferredTag(doc, TidyTag_BR);
3403 #endif
3404 }
3405 else if ( nodeHasCM(node, CM_INLINE) )
3406 PopInline( doc, node );
3407 }
3408
3409 if (nodeIsElement(node))
3410 {
3411 if ( nodeHasCM(node, CM_INLINE) && !nodeHasCM(node, CM_MIXED) )
3412 {
3413 /* HTML4 strict doesn't allow inline content here */
3414 /* but HTML2 does allow img elements as children of body */
3415 if ( nodeIsIMG(node) )
3416 ConstrainVersion(doc, ~VERS_HTML40_STRICT);
3417 else
3418 ConstrainVersion(doc, ~(VERS_HTML40_STRICT|VERS_HTML20));
3419
3420 if (checkstack && !node->implicit)
3421 {
3422 checkstack = no;
3423
3424 if ( InlineDup(doc, node) > 0 )
3425 continue;
3426 }
3427
3428 mode = MixedContent;
3429 }
3430 else
3431 {
3432 checkstack = yes;
3433 mode = IgnoreWhitespace;
3434 }
3435
3436 if (node->implicit)
3437 ReportError(doc, body, node, INSERTING_TAG);
3438
3439 InsertNodeAtEnd(body, node);
3440 ParseTag(doc, node, mode);
3441 continue;
3442 }
3443
3444 /* discard unexpected tags */
3445 ReportError(doc, body, node, DISCARDING_UNEXPECTED);
3446 FreeNode( doc, node);
3447 }
3448 }
3449
3450 void ParseNoFrames(TidyDocImpl* doc, Node *noframes, uint mode)
3451 {
3452 Lexer* lexer = doc->lexer;
3453 Node *node;
3454
3455 if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
3456 {
3457 doc->badAccess |= USING_NOFRAMES;
3458 }
3459 mode = IgnoreWhitespace;
3460
3461 while ( (node = GetToken(doc, mode)) != NULL )
3462 {
3463 if ( node->tag == noframes->tag && node->type == EndTag )
3464 {
3465 FreeNode( doc, node);
3466 noframes->closed = yes;
3467 TrimSpaces(doc, noframes);
3468 return;
3469 }
3470
3471 if ( nodeIsFRAME(node) || nodeIsFRAMESET(node) )
3472 {
3473 TrimSpaces(doc, noframes);
3474 if (node->type == EndTag)
3475 {
3476 ReportError(doc, noframes, node, DISCARDING_UNEXPECTED);
3477 FreeNode( doc, node); /* Throw it away */
3478 }
3479 else
3480 {
3481 ReportError(doc, noframes, node, MISSING_ENDTAG_BEFORE);
3482 UngetToken( doc );
3483 }
3484 return;
3485 }
3486
3487 if ( nodeIsHTML(node) )
3488 {
3489 if (nodeIsElement(node))
3490 ReportError(doc, noframes, node, DISCARDING_UNEXPECTED);
3491
3492 FreeNode( doc, node);
3493 continue;
3494 }
3495
3496 /* deal with comments etc. */
3497 if (InsertMisc(noframes, node))
3498 continue;
3499
3500 if ( nodeIsBODY(node) && node->type == StartTag )
3501 {
3502 Bool seen_body = lexer->seenEndBody;
3503 InsertNodeAtEnd(noframes, node);
3504 ParseTag(doc, node, IgnoreWhitespace /*MixedContent*/);
3505
3506 /* fix for bug http://tidy.sf.net/bug/887259 */
3507 if (seen_body && FindBody(doc) != node)
3508 {
3509 CoerceNode(doc, node, TidyTag_DIV, no, no);
3510 MoveNodeToBody(doc, node);
3511 }
3512 continue;
3513 }
3514
3515 /* implicit body element inferred */
3516 if (nodeIsText(node) || (node->tag && node->type != EndTag))
3517 {
3518 if ( lexer->seenEndBody )
3519 {
3520 Node *body = FindBody( doc );
3521 if ( body == NULL )
3522 {
3523 ReportError(doc, noframes, node, DISCARDING_UNEXPECTED);
3524 FreeNode( doc, node);
3525 continue;
3526 }
3527 if ( nodeIsText(node) )
3528 {
3529 UngetToken( doc );
3530 node = InferredTag(doc, TidyTag_P);
3531 ReportError(doc, noframes, node, CONTENT_AFTER_BODY );
3532 }
3533 InsertNodeAtEnd( body, node );
3534 }
3535 else
3536 {
3537 UngetToken( doc );
3538 node = InferredTag(doc, TidyTag_BODY);
3539 if ( cfgBool(doc, TidyXmlOut) )
3540 ReportError(doc, noframes, node, INSERTING_TAG);
3541 InsertNodeAtEnd( noframes, node );
3542 }
3543
3544 ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
3545 continue;
3546 }
3547
3548 /* discard unexpected end tags */
3549 ReportError(doc, noframes, node, DISCARDING_UNEXPECTED);
3550 FreeNode( doc, node);
3551 }
3552
3553 ReportError(doc, noframes, node, MISSING_ENDTAG_FOR);
3554 }
3555
3556 void ParseFrameSet(TidyDocImpl* doc, Node *frameset, uint ARG_UNUSED(mode))
3557 {
3558 Lexer* lexer = doc->lexer;
3559 Node *node;
3560
3561 if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
3562 {
3563 doc->badAccess |= USING_FRAMES;
3564 }
3565
3566 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL)
3567 {
3568 if (node->tag == frameset->tag && node->type == EndTag)
3569 {
3570 FreeNode( doc, node);
3571 frameset->closed = yes;
3572 TrimSpaces(doc, frameset);
3573 return;
3574 }
3575
3576 /* deal with comments etc. */
3577 if (InsertMisc(frameset, node))
3578 continue;
3579
3580 if (node->tag == NULL)
3581 {
3582 ReportError(doc, frameset, node, DISCARDING_UNEXPECTED);
3583 FreeNode( doc, node);
3584 continue;
3585 }
3586
3587 if (nodeIsElement(node))
3588 {
3589 if (node->tag && node->tag->model & CM_HEAD)
3590 {
3591 MoveToHead(doc, frameset, node);
3592 continue;
3593 }
3594 }
3595
3596 if ( nodeIsBODY(node) )
3597 {
3598 UngetToken( doc );
3599 node = InferredTag(doc, TidyTag_NOFRAMES);
3600 ReportError(doc, frameset, node, INSERTING_TAG);
3601 }
3602
3603 if (node->type == StartTag && (node->tag->model & CM_FRAMES))
3604 {
3605 InsertNodeAtEnd(frameset, node);
3606 lexer->excludeBlocks = no;
3607 ParseTag(doc, node, MixedContent);
3608 continue;
3609 }
3610 else if (node->type == StartEndTag && (node->tag->model & CM_FRAMES))
3611 {
3612 InsertNodeAtEnd(frameset, node);
3613 continue;
3614 }
3615
3616 /* discard unexpected tags */
3617 ReportError(doc, frameset, node, DISCARDING_UNEXPECTED);
3618 FreeNode( doc, node);
3619 }
3620
3621 ReportError(doc, frameset, node, MISSING_ENDTAG_FOR);
3622 }
3623
/*
  Parse the content of the <html> element.

  doc  - document being parsed; the TidyXmlTags option is switched off
         first.
  html - the <html> node that receives <head>, <body> and/or <frameset>.
  mode - passed on unchanged to ParseHead, ParseBody and ParseTag.

  Phase 1 looks for <head>, inferring one when another token or the end
  of input comes first, and parses it. Phase 2 then dispatches on
  <body>, <frameset> and <noframes>; once a frameset has been seen,
  other content is redirected into a (possibly inferred) <noframes>
  element. If the loop is left via break, node is the explicit or
  inferred <body>, which is appended to html and parsed.
*/
3624 void ParseHTML(TidyDocImpl* doc, Node *html, uint mode)
3625 {
3626 Node *node, *head;
3627 Node *frameset = NULL;
3628 Node *noframes = NULL;
3629
3630 SetOptionBool( doc, TidyXmlTags, no );
3631
/* phase 1: locate or infer the <head> element */
3632 for (;;)
3633 {
3634 node = GetToken(doc, IgnoreWhitespace);
3635
3636 if (node == NULL)
3637 {
3638 node = InferredTag(doc, TidyTag_HEAD);
3639 break;
3640 }
3641
3642 if ( nodeIsHEAD(node) )
3643 break;
3644
3645 if (node->tag == html->tag && node->type == EndTag)
3646 {
3647 ReportError(doc, html, node, DISCARDING_UNEXPECTED);
3648 FreeNode( doc, node);
3649 continue;
3650 }
3651
3652 /* find and discard multiple <html> elements */
3653 if (node->tag == html->tag && node->type == StartTag)
3654 {
3655 ReportError(doc, html, node, DISCARDING_UNEXPECTED);
3656 FreeNode(doc, node);
3657 continue;
3658 }
3659
3660 /* deal with comments etc. */
3661 if (InsertMisc(html, node))
3662 continue;
3663
/* anything else: push it back and infer the missing <head> */
3664 UngetToken( doc );
3665 node = InferredTag(doc, TidyTag_HEAD);
3666 break;
3667 }
3668
3669 head = node;
3670 InsertNodeAtEnd(html, head);
3671 ParseHead(doc, head, mode);
3672
/* phase 2: body / frameset / noframes */
3673 for (;;)
3674 {
3675 node = GetToken(doc, IgnoreWhitespace);
3676
3677 if (node == NULL)
3678 {
3679 if (frameset == NULL) /* implied body */
3680 {
3681 node = InferredTag(doc, TidyTag_BODY);
3682 InsertNodeAtEnd(html, node);
3683 ParseBody(doc, node, mode);
3684 }
3685
3686 return;
3687 }
3688
3689 /* robustly handle html tags */
3690 if (node->tag == html->tag)
3691 {
3692 if (node->type != StartTag && frameset == NULL)
3693 ReportError(doc, html, node, DISCARDING_UNEXPECTED);
3694
3695 FreeNode( doc, node);
3696 continue;
3697 }
3698
3699 /* deal with comments etc. */
3700 if (InsertMisc(html, node))
3701 continue;
3702
3703 /* if frameset document coerce <body> to <noframes> */
3704 if ( nodeIsBODY(node) )
3705 {
3706 if (node->type != StartTag)
3707 {
3708 ReportError(doc, html, node, DISCARDING_UNEXPECTED);
3709 FreeNode( doc, node);
3710 continue;
3711 }
3712
/* the coercion is only applied when the accessibility check level is 0 */
3713 if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
3714 {
3715 if (frameset != NULL)
3716 {
3717 UngetToken( doc );
3718
3719 if (noframes == NULL)
3720 {
3721 noframes = InferredTag(doc, TidyTag_NOFRAMES);
3722 InsertNodeAtEnd(frameset, noframes);
3723 ReportError(doc, html, noframes, INSERTING_TAG);
3724 }
3725
3726 ParseTag(doc, noframes, mode);
3727 continue;
3728 }
3729 }
3730
3731 ConstrainVersion(doc, ~VERS_FRAMESET);
3732 break; /* to parse body */
3733 }
3734
3735 /* flag an error if we see more than one frameset */
3736 if ( nodeIsFRAMESET(node) )
3737 {
3738 if (node->type != StartTag)
3739 {
3740 ReportError(doc, html, node, DISCARDING_UNEXPECTED);
3741 FreeNode( doc, node);
3742 continue;
3743 }
3744
/* a duplicate is reported as fatal but still inserted and parsed; frameset keeps pointing at the first one */
3745 if (frameset != NULL)
3746 ReportFatal(doc, html, node, DUPLICATE_FRAMESET);
3747 else
3748 frameset = node;
3749
3750 InsertNodeAtEnd(html, node);
3751 ParseTag(doc, node, mode);
3752
3753 /*
3754 see if it includes a noframes element so
3755 that we can merge subsequent noframes elements
3756 */
3757
3758 for (node = frameset->content; node; node = node->next)
3759 {
3760 if ( nodeIsNOFRAMES(node) )
3761 noframes = node;
3762 }
3763 continue;
3764 }
3765
3766 /* if not a frameset document coerce <noframes> to <body> */
3767 if ( nodeIsNOFRAMES(node) )
3768 {
3769 if (node->type != StartTag)
3770 {
3771 ReportError(doc, html, node, DISCARDING_UNEXPECTED);
3772 FreeNode( doc, node);
3773 continue;
3774 }
3775
3776 if (frameset == NULL)
3777 {
3778 ReportError(doc, html, node, DISCARDING_UNEXPECTED);
3779 FreeNode( doc, node);
3780 node = InferredTag(doc, TidyTag_BODY);
3781 break;
3782 }
3783
/* the first <noframes> is attached to the frameset; later ones are freed and their content is parsed into the first */
3784 if (noframes == NULL)
3785 {
3786 noframes = node;
3787 InsertNodeAtEnd(frameset, noframes);
3788 }
3789 else
3790 FreeNode( doc, node);
3791
3792 ParseTag(doc, noframes, mode);
3793 continue;
3794 }
3795
3796 if (nodeIsElement(node))
3797 {
3798 if (node->tag && node->tag->model & CM_HEAD)
3799 {
3800 MoveToHead(doc, html, node);
3801 continue;
3802 }
3803
3804 /* discard illegal frame element following a frameset */
3805 if ( frameset != NULL && nodeIsFRAME(node) )
3806 {
3807 ReportError(doc, html, node, DISCARDING_UNEXPECTED);
3808 FreeNode(doc, node);
3809 continue;
3810 }
3811 }
3812
/* any other token is pushed back so that the element chosen below re-reads it */
3813 UngetToken( doc );
3814
3815 /* insert other content into noframes element */
3816
3817 if (frameset)
3818 {
3819 if (noframes == NULL)
3820 {
3821 noframes = InferredTag(doc, TidyTag_NOFRAMES);
3822 InsertNodeAtEnd(frameset, noframes);
3823 }
3824 else
3825 ReportError(doc, html, node, NOFRAMES_CONTENT);
3826
3827 ConstrainVersion(doc, VERS_FRAMESET);
3828 ParseTag(doc, noframes, mode);
3829 continue;
3830 }
3831
3832 node = InferredTag(doc, TidyTag_BODY);
3833 ConstrainVersion(doc, ~VERS_FRAMESET);
3834 break;
3835 }
3836
3837 /* node must be body */
3838
3839 InsertNodeAtEnd(html, node);
3840 ParseTag(doc, node, mode);
3841 }
3842
3843 static Bool nodeCMIsOnlyInline( Node* node )
3844 {
3845 return nodeHasCM( node, CM_INLINE ) && !nodeHasCM( node, CM_BLOCK );
3846 }
3847
3848 static void EncloseBodyText(TidyDocImpl* doc)
3849 {
3850 Node* node;
3851 Node* body = FindBody(doc);
3852
3853 if (!body)
3854 return;
3855
3856 node = body->content;
3857
3858 while (node)
3859 {
3860 if ((nodeIsText(node) && !IsBlank(doc->lexer, node)) ||
3861 (nodeIsElement(node) && nodeCMIsOnlyInline(node)))
3862 {
3863 Node* p = InferredTag(doc, TidyTag_P);
3864 InsertNodeBeforeElement(node, p);
3865 while (node && (!nodeIsElement(node) || nodeCMIsOnlyInline(node)))
3866 {
3867 Node* next = node->next;
3868 RemoveNode(node);
3869 InsertNodeAtEnd(p, node);
3870 node = next;
3871 }
3872 TrimSpaces(doc, p);
3873 continue;
3874 }
3875 node = node->next;
3876 }
3877 }
3878
3879 /* <form>, <blockquote> and <noscript> do not allow #PCDATA in
3880 HTML 4.01 Strict (%block; model instead of %flow;).
3881 When requested, text nodes in these elements are wrapped in <p>. */
3882 static void EncloseBlockText(TidyDocImpl* doc, Node* node)
3883 {
3884 Node *next;
3885 Node *block;
3886
3887 while (node)
3888 {
3889 next = node->next;
3890
3891 if (node->content)
3892 EncloseBlockText(doc, node->content);
3893
3894 if (!(nodeIsFORM(node) || nodeIsNOSCRIPT(node) ||
3895 nodeIsBLOCKQUOTE(node))
3896 || !node->content)
3897 {
3898 node = next;
3899 continue;
3900 }
3901
3902 block = node->content;
3903
3904 if ((nodeIsText(block) && !IsBlank(doc->lexer, block)) ||
3905 (nodeIsElement(block) && nodeCMIsOnlyInline(block)))
3906 {
3907 Node* p = InferredTag(doc, TidyTag_P);
3908 InsertNodeBeforeElement(block, p);
3909 while (block &&
3910 (!nodeIsElement(block) || nodeCMIsOnlyInline(block)))
3911 {
3912 Node* tempNext = block->next;
3913 RemoveNode(block);
3914 InsertNodeAtEnd(p, block);
3915 block = tempNext;
3916 }
3917 TrimSpaces(doc, p);
3918 continue;
3919 }
3920
3921 node = next;
3922 }
3923 }
3924
3925 static void ReplaceObsoleteElements(TidyDocImpl* doc, Node* node)
3926 {
3927 Node *next;
3928
3929 while (node)
3930 {
3931 next = node->next;
3932
3933 if (nodeIsDIR(node) || nodeIsMENU(node))
3934 CoerceNode(doc, node, TidyTag_UL, yes, yes);
3935
3936 if (nodeIsXMP(node) || nodeIsLISTING(node) ||
3937 (node->tag && node->tag->id == TidyTag_PLAINTEXT))
3938 CoerceNode(doc, node, TidyTag_PRE, yes, yes);
3939
3940 if (node->content)
3941 ReplaceObsoleteElements(doc, node->content);
3942
3943 node = next;
3944 }
3945 }
3946
3947 static void AttributeChecks(TidyDocImpl* doc, Node* node)
3948 {
3949 Node *next;
3950
3951 while (node)
3952 {
3953 next = node->next;
3954
3955 if (nodeIsElement(node))
3956 {
3957 if (node->tag->chkattrs)
3958 node->tag->chkattrs(doc, node);
3959 else
3960 CheckAttributes(doc, node);
3961 }
3962
3963 if (node->content)
3964 AttributeChecks(doc, node->content);
3965
3966 node = next;
3967 }
3968 }
3969
3970 /*
3971 HTML is the top level element
3972 */
3973 void ParseDocument(TidyDocImpl* doc)
3974 {
3975 Node *node, *html, *doctype = NULL;
3976
3977 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL)
3978 {
3979 if (node->type == XmlDecl)
3980 {
3981 if (FindXmlDecl(doc) && doc->root.content)
3982 {
3983 ReportError(doc, &doc->root, node, DISCARDING_UNEXPECTED);
3984 FreeNode(doc, node);
3985 continue;
3986 }
3987 if (node->line != 1 || (node->line == 1 && node->column != 1))
3988 {
3989 ReportError(doc, &doc->root, node, SPACE_PRECEDING_XMLDECL);
3990 }
3991 }
3992 #ifdef AUTO_INPUT_ENCODING
3993 if (node->type == XmlDecl)
3994 {
3995 AttVal* encoding = GetAttrByName(node, "encoding");
3996 if (AttrHasValue(encoding))
3997 {
3998 uint id = GetEncodingIdFromName(encoding->value);
3999
4000 /* todo: detect mismatch with BOM/XMLDecl/declared */
4001 /* todo: error for unsupported encodings */
4002 /* todo: try to re-init transcoder */
4003 /* todo: change input/output encoding settings */
4004 /* todo: store id in StreamIn */
4005 }
4006 }
4007 #endif /* AUTO_INPUT_ENCODING */
4008
4009 /* deal with comments etc. */
4010 if (InsertMisc( &doc->root, node ))
4011 continue;
4012
4013 if (node->type == DocTypeTag)
4014 {
4015 if (doctype == NULL)
4016 {
4017 InsertNodeAtEnd( &doc->root, node);
4018 doctype = node;
4019 }
4020 else
4021 {
4022 ReportError(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4023 FreeNode( doc, node);
4024 }
4025 continue;
4026 }
4027
4028 if (node->type == EndTag)
4029 {
4030 ReportError(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4031 FreeNode( doc, node);
4032 continue;
4033 }
4034
4035 if (node->type == StartTag && nodeIsHTML(node))
4036 {
4037 AttVal *xmlns;
4038
4039 xmlns = AttrGetById(node, TidyAttr_XMLNS);
4040
4041 if (AttrValueIs(xmlns, XHTML_NAMESPACE))
4042 {
4043 Bool htmlOut = cfgBool( doc, TidyHtmlOut );
4044 doc->lexer->isvoyager = yes; /* Unless plain HTML */
4045 SetOptionBool( doc, TidyXhtmlOut, !htmlOut ); /* is specified, output*/
4046 SetOptionBool( doc, TidyXmlOut, !htmlOut ); /* will be XHTML. */
4047
4048 /* adjust other config options, just as in config.c */
4049 if ( !htmlOut )
4050 {
4051 SetOptionBool( doc, TidyUpperCaseTags, no );
4052 SetOptionBool( doc, TidyUpperCaseAttrs, no );
4053 }
4054 }
4055 }
4056
4057 if ( node->type != StartTag || !nodeIsHTML(node) )
4058 {
4059 UngetToken( doc );
4060 html = InferredTag(doc, TidyTag_HTML);
4061 }
4062 else
4063 html = node;
4064
4065 if (!FindDocType(doc))
4066 ReportError(doc, NULL, NULL, MISSING_DOCTYPE);
4067
4068 InsertNodeAtEnd( &doc->root, html);
4069 ParseHTML( doc, html, IgnoreWhitespace );
4070 break;
4071 }
4072
4073 if (!FindHTML(doc))
4074 {
4075 /* a later check should complain if <body> is empty */
4076 html = InferredTag(doc, TidyTag_HTML);
4077 InsertNodeAtEnd( &doc->root, html);
4078 ParseHTML(doc, html, IgnoreWhitespace);
4079 }
4080
4081 if (!FindTITLE(doc))
4082 {
4083 Node* head = FindHEAD(doc);
4084 ReportError(doc, head, NULL, MISSING_TITLE_ELEMENT);
4085 InsertNodeAtEnd(head, InferredTag(doc, TidyTag_TITLE));
4086 }
4087
4088 AttributeChecks(doc, &doc->root);
4089 ReplaceObsoleteElements(doc, &doc->root);
4090 DropEmptyElements(doc, &doc->root);
4091 CleanSpaces(doc, &doc->root);
4092
4093 if (cfgBool(doc, TidyEncloseBodyText))
4094 EncloseBodyText(doc);
4095 if (cfgBool(doc, TidyEncloseBlockText))
4096 EncloseBlockText(doc, &doc->root);
4097 }
4098
4099 Bool XMLPreserveWhiteSpace( TidyDocImpl* doc, Node *element)
4100 {
4101 AttVal *attribute;
4102
4103 /* search attributes for xml:space */
4104 for (attribute = element->attributes; attribute; attribute = attribute->next)
4105 {
4106 if (AttrValueIs(attribute, "xml:space"))
4107 {
4108 if (AttrValueIs(attribute, "preserve"))
4109 return yes;
4110
4111 return no;
4112 }
4113 }
4114
4115 if (element->element == NULL)
4116 return no;
4117
4118 /* kludge for html docs without explicit xml:space attribute */
4119 if (nodeIsPRE(element) ||
4120 nodeIsSCRIPT(element) ||
4121 nodeIsSTYLE(element) ||
4122 FindParser(doc, element) == ParsePre)
4123 return yes;
4124
4125 /* kludge for XSL docs */
4126 if ( tmbstrcasecmp(element->element, "xsl:text") == 0 )
4127 return yes;
4128
4129 return no;
4130 }
4131
4132 /*
4133 XML documents
4134 */
4135 static void ParseXMLElement(TidyDocImpl* doc, Node *element, uint mode)
4136 {
4137 Lexer* lexer = doc->lexer;
4138 Node *node;
4139
4140 /* if node is pre or has xml:space="preserve" then do so */
4141
4142 if ( XMLPreserveWhiteSpace(doc, element) )
4143 mode = Preformatted;
4144
4145 while ((node = GetToken(doc, mode)) != NULL)
4146 {
4147 if (node->type == EndTag &&
4148 node->element && element->element &&
4149 tmbstrcmp(node->element, element->element) == 0)
4150 {
4151 FreeNode( doc, node);
4152 element->closed = yes;
4153 break;
4154 }
4155
4156 /* discard unexpected end tags */
4157 if (node->type == EndTag)
4158 {
4159 if (element)
4160 ReportFatal(doc, element, node, UNEXPECTED_ENDTAG_IN);
4161 else
4162 ReportFatal(doc, element, node, UNEXPECTED_ENDTAG);
4163
4164 FreeNode( doc, node);
4165 continue;
4166 }
4167
4168 /* parse content on seeing start tag */
4169 if (node->type == StartTag)
4170 ParseXMLElement( doc, node, mode );
4171
4172 InsertNodeAtEnd(element, node);
4173 }
4174
4175 /*
4176 if first child is text then trim initial space and
4177 delete text node if it is empty.
4178 */
4179
4180 node = element->content;
4181
4182 if (nodeIsText(node) && mode != Preformatted)
4183 {
4184 if ( lexer->lexbuf[node->start] == ' ' )
4185 {
4186 node->start++;
4187
4188 if (node->start >= node->end)
4189 DiscardElement( doc, node );
4190 }
4191 }
4192
4193 /*
4194 if last child is text then trim final space and
4195 delete the text node if it is empty
4196 */
4197
4198 node = element->last;
4199
4200 if (nodeIsText(node) && mode != Preformatted)
4201 {
4202 if ( lexer->lexbuf[node->end - 1] == ' ' )
4203 {
4204 node->end--;
4205
4206 if (node->start >= node->end)
4207 DiscardElement( doc, node );
4208 }
4209 }
4210 }
4211
4212 void ParseXMLDocument(TidyDocImpl* doc)
4213 {
4214 Node *node, *doctype = NULL;
4215
4216 SetOptionBool( doc, TidyXmlTags, yes );
4217
4218 while ((node = GetToken(doc, IgnoreWhitespace)) != NULL)
4219 {
4220 /* discard unexpected end tags */
4221 if (node->type == EndTag)
4222 {
4223 ReportError(doc, NULL, node, UNEXPECTED_ENDTAG);
4224 FreeNode( doc, node);
4225 continue;
4226 }
4227
4228 /* deal with comments etc. */
4229 if (InsertMisc( &doc->root, node))
4230 continue;
4231
4232 if (node->type == DocTypeTag)
4233 {
4234 if (doctype == NULL)
4235 {
4236 InsertNodeAtEnd( &doc->root, node);
4237 doctype = node;
4238 }
4239 else
4240 {
4241 ReportError(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4242 FreeNode( doc, node);
4243 }
4244 continue;
4245 }
4246
4247 if (node->type == StartEndTag)
4248 {
4249 InsertNodeAtEnd( &doc->root, node);
4250 continue;
4251 }
4252
4253 /* if start tag then parse element's content */
4254 if (node->type == StartTag)
4255 {
4256 InsertNodeAtEnd( &doc->root, node );
4257 ParseXMLElement( doc, node, IgnoreWhitespace );
4258 continue;
4259 }
4260
4261 ReportError(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4262 FreeNode( doc, node);
4263 }
4264
4265 /* ensure presence of initial <?xml version="1.0"?> */
4266 if ( cfgBool(doc, TidyXmlDecl) )
4267 FixXmlDecl( doc );
4268 }
4269
4270 /*
4271 * local variables:
4272 * mode: c
4273 * indent-tabs-mode: nil
4274 * c-basic-offset: 4
4275 * eval: (c-set-offset 'substatement-open 0)
4276 * end:
4277 */
4278
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more
information.