~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

TidyLib
tidy/src/parser.c

Version: ~ [ 1.0 ] ~

  1 /* parser.c -- HTML Parser
  2 
  3   (c) 1998-2005 (W3C) MIT, ERCIM, Keio University
  4   See tidy.h for the copyright notice.
  5   
  6   CVS Info :
  7 
  8     $Author: arnaud02 $ 
  9     $Date: 2005/10/21 12:54:15 $ 
 10     $Revision: 1.150 $ 
 11 
 12 */
 13 
 14 #include "tidy-int.h"
 15 #include "lexer.h"
 16 #include "parser.h"
 17 #include "message.h"
 18 #include "clean.h"
 19 #include "tags.h"
 20 #include "tmbstr.h"
 21 
 22 #ifdef AUTO_INPUT_ENCODING
 23 #include "charsets.h"
 24 #endif
 25 
 26 Bool CheckNodeIntegrity(Node *node)
 27 {
 28 #ifndef NO_NODE_INTEGRITY_CHECK
 29     if (node->prev)
 30     {
 31         if (node->prev->next != node)
 32             return no;
 33     }
 34 
 35     if (node->next)
 36     {
 37         if (node->next->prev != node)
 38             return no;
 39     }
 40 
 41     if (node->parent)
 42     {
 43         Node *child = NULL;
 44         if (node->prev == NULL && node->parent->content != node)
 45             return no;
 46 
 47         if (node->next == NULL && node->parent->last != node)
 48             return no;
 49 
 50         for (child = node->parent->content; child; child = child->next)
 51         {
 52             if (child == node)
 53                 break;
 54         }
 55         if ( node != child )
 56             return no;
 57     }
 58 
 59     for (node = node->content; node; node = node->next)
 60         if ( !CheckNodeIntegrity(node) )
 61             return no;
 62 
 63 #endif
 64     return yes;
 65 }
 66 
 67 /*
 68  used to determine how attributes
 69  without values should be printed
 70  this was introduced to deal with
 71  user defined tags e.g. Cold Fusion
 72 */
 73 Bool IsNewNode(Node *node)
 74 {
 75     if (node && node->tag)
 76     {
 77         return (node->tag->model & CM_NEW);
 78     }
 79     return yes;
 80 }
 81 
 82 void CoerceNode(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected)
 83 {
 84     const Dict* tag = LookupTagDef(tid);
 85     Node* tmp = InferredTag(doc, tag->id);
 86 
 87     if (obsolete)
 88         ReportWarning(doc, node, tmp, OBSOLETE_ELEMENT);
 89     else if (unexpected)
 90         ReportError(doc, node, tmp, REPLACING_UNEX_ELEMENT);
 91     else
 92         ReportNotice(doc, node, tmp, REPLACING_ELEMENT);
 93 
 94     MemFree(tmp->element);
 95     MemFree(tmp);
 96 
 97     node->was = node->tag;
 98     node->tag = tag;
 99     node->type = StartTag;
100     node->implicit = yes;
101     MemFree(node->element);
102     node->element = tmbstrdup(tag->name);
103 }
104 
105 /* extract a node and its children from a markup tree */
106 Node *RemoveNode(Node *node)
107 {
108     if (node->prev)
109         node->prev->next = node->next;
110 
111     if (node->next)
112         node->next->prev = node->prev;
113 
114     if (node->parent)
115     {
116         if (node->parent->content == node)
117             node->parent->content = node->next;
118 
119         if (node->parent->last == node)
120             node->parent->last = node->prev;
121     }
122 
123     node->parent = node->prev = node->next = NULL;
124     return node;
125 }
126 
127 /* remove node from markup tree and discard it */
128 Node *DiscardElement( TidyDocImpl* doc, Node *element )
129 {
130     Node *next = NULL;
131 
132     if (element)
133     {
134         next = element->next;
135         RemoveNode(element);
136         FreeNode( doc, element);
137     }
138 
139     return next;
140 }
141 
142 /*
143  insert "node" into markup tree as the firt element
144  of content of "element"
145 */
146 void InsertNodeAtStart(Node *element, Node *node)
147 {
148     node->parent = element;
149 
150     if (element->content == NULL)
151         element->last = node;
152     else
153         element->content->prev = node;
154 
155     node->next = element->content;
156     node->prev = NULL;
157     element->content = node;
158 }
159 
160 /*
161  insert "node" into markup tree as the last element
162  of content of "element"
163 */
164 void InsertNodeAtEnd(Node *element, Node *node)
165 {
166     node->parent = element;
167     node->prev = element->last;
168 
169     if (element->last != NULL)
170         element->last->next = node;
171     else
172         element->content = node;
173 
174     element->last = node;
175 }
176 
177 /*
178  insert "node" into markup tree in place of "element"
179  which is moved to become the child of the node
180 */
181 static void InsertNodeAsParent(Node *element, Node *node)
182 {
183     node->content = element;
184     node->last = element;
185     node->parent = element->parent;
186     element->parent = node;
187 
188     if (node->parent->content == element)
189         node->parent->content = node;
190 
191     if (node->parent->last == element)
192         node->parent->last = node;
193 
194     node->prev = element->prev;
195     element->prev = NULL;
196 
197     if (node->prev)
198         node->prev->next = node;
199 
200     node->next = element->next;
201     element->next = NULL;
202 
203     if (node->next)
204         node->next->prev = node;
205 }
206 
207 /* insert "node" into markup tree before "element" */
208 void InsertNodeBeforeElement(Node *element, Node *node)
209 {
210     Node *parent;
211 
212     parent = element->parent;
213     node->parent = parent;
214     node->next = element;
215     node->prev = element->prev;
216     element->prev = node;
217 
218     if (node->prev)
219         node->prev->next = node;
220 
221     if (parent->content == element)
222         parent->content = node;
223 }
224 
225 /* insert "node" into markup tree after "element" */
226 void InsertNodeAfterElement(Node *element, Node *node)
227 {
228     Node *parent;
229 
230     parent = element->parent;
231     node->parent = parent;
232 
233     /* AQ - 13 Jan 2000 fix for parent == NULL */
234     if (parent != NULL && parent->last == element)
235         parent->last = node;
236     else
237     {
238         node->next = element->next;
239         /* AQ - 13 Jan 2000 fix for node->next == NULL */
240         if (node->next != NULL)
241             node->next->prev = node;
242     }
243 
244     element->next = node;
245     node->prev = element;
246 }
247 
248 static Bool CanPrune( TidyDocImpl* doc, Node *element )
249 {
250     if ( nodeIsText(element) )
251         return yes;
252 
253     if ( element->content )
254         return no;
255 
256     if ( element->tag == NULL )
257         return no;
258 
259     if ( element->tag->model & CM_BLOCK && element->attributes != NULL )
260         return no;
261 
262     if ( nodeIsA(element) && element->attributes != NULL )
263         return no;
264 
265     if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) )
266         return no;
267 
268     if ( element->tag->model & CM_ROW )
269         return no;
270 
271     if ( element->tag->model & CM_EMPTY )
272         return no;
273 
274     if ( nodeIsAPPLET(element) )
275         return no;
276 
277     if ( nodeIsOBJECT(element) )
278         return no;
279 
280     if ( nodeIsSCRIPT(element) && attrGetSRC(element) )
281         return no;
282 
283     if ( nodeIsTITLE(element) )
284         return no;
285 
286     /* #433359 - fix by Randy Waki 12 Mar 01 */
287     if ( nodeIsIFRAME(element) )
288         return no;
289 
290     /* fix for bug 770297 */
291     if (nodeIsTEXTAREA(element))
292         return no;
293 
294     if ( attrGetID(element) || attrGetNAME(element) )
295         return no;
296 
297     /* fix for bug 695408; a better fix would look for unknown and    */
298     /* known proprietary attributes that make the element significant */
299     if (attrGetDATAFLD(element))
300         return no;
301 
302     /* fix for bug 723772, don't trim new-...-tags */
303     if (element->tag->id == TidyTag_UNKNOWN)
304         return no;
305 
306     if (nodeIsBODY(element))
307         return no;
308 
309     if (nodeIsCOLGROUP(element))
310         return no;
311 
312     return yes;
313 }
314 
315 Node *TrimEmptyElement( TidyDocImpl* doc, Node *element )
316 {
317     if ( CanPrune(doc, element) )
318     {
319        if (element->type != TextNode)
320             ReportNotice(doc, element, NULL, TRIM_EMPTY_ELEMENT);
321 
322         return DiscardElement(doc, element);
323     }
324     return element;
325 }
326 
327 Node* DropEmptyElements(TidyDocImpl* doc, Node* node)
328 {
329     Node* next;
330 
331     while (node)
332     {
333         next = node->next;
334 
335         if (node->content)
336             DropEmptyElements(doc, node->content);
337 
338         if (!nodeIsElement(node) &&
339             !(nodeIsText(node) && !(node->start < node->end)))
340         {
341             node = next;
342             continue;
343         }
344 
345         next = TrimEmptyElement(doc, node);
346         node = node == next ? node->next : next;
347     }
348 
349     return node;
350 }
351 
352 /* 
353   errors in positioning of form start or end tags
354   generally require human intervention to fix
355 */
356 static void BadForm( TidyDocImpl* doc )
357 {
358     doc->badForm = yes;
359     /* doc->errors++; */
360 }
361 
362 /*
363   This maps 
364        <em>hello </em><strong>world</strong>
365   to
366        <em>hello</em> <strong>world</strong>
367 
368   If last child of element is a text node
369   then trim trailing white space character
370   moving it to after element's end tag.
371 */
372 static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last )
373 {
374     Lexer* lexer = doc->lexer;
375     byte c;
376 
377     if (nodeIsText(last))
378     {
379         if (last->end > last->start)
380         {
381             c = (byte) lexer->lexbuf[ last->end - 1 ];
382 
383             if (   c == ' '
384 #ifdef COMMENT_NBSP_FIX
385                 || c == 160
386 #endif
387                )
388             {
389 #ifdef COMMENT_NBSP_FIX
390                 /* take care with <td>&nbsp;</td> */
391                 if ( c == 160 && 
392                      ( element->tag == doc->tags.tag_td || 
393                        element->tag == doc->tags.tag_th )
394                    )
395                 {
396                     if (last->end > last->start + 1)
397                         last->end -= 1;
398                 }
399                 else
400 #endif
401                 {
402                     last->end -= 1;
403                     if ( (element->tag->model & CM_INLINE) &&
404                          !(element->tag->model & CM_FIELD) )
405                         lexer->insertspace = yes;
406                 }
407             }
408         }
409     }
410 }
411 
#if 0
/*
  Disabled code. Builds a new node whose start/end range covers
  the literal markup of "element" appended to the lexer buffer:
  '<', an optional '/', the element name (or "!DOCTYPE " plus
  the element's own text for a doctype), an optional '/' for a
  start-end tag, and '>'.
*/
static Node *EscapeTag(Lexer *lexer, Node *element)
{
    Node *escaped = NewNode(lexer);

    escaped->start = lexer->lexsize;
    AddByte(lexer, '<');

    if (element->type == EndTag)
        AddByte(lexer, '/');

    if (element->element)
    {
        char *name = element->element;
        while (*name != '\0')
        {
            AddByte(lexer, *name);
            ++name;
        }
    }
    else if (element->type == DocTypeTag)
    {
        uint pos = element->start;
        AddStringLiteral( lexer, "!DOCTYPE " );
        while (pos < element->end)
        {
            AddByte(lexer, lexer->lexbuf[pos]);
            ++pos;
        }
    }

    if (element->type == StartEndTag)
        AddByte(lexer, '/');

    AddByte(lexer, '>');
    escaped->end = lexer->lexsize;

    return escaped;
}
#endif /* 0 */
446 
447 /* Only true for text nodes. */
448 Bool IsBlank(Lexer *lexer, Node *node)
449 {
450     Bool isBlank = nodeIsText(node);
451     if ( isBlank )
452         isBlank = ( node->end == node->start ||       /* Zero length */
453                     ( node->end == node->start+1      /* or one blank. */
454                       && lexer->lexbuf[node->start] == ' ' ) );
455     return isBlank;
456 }
457 
458 /*
459   This maps 
460        <p>hello<em> world</em>
461   to
462        <p>hello <em>world</em>
463 
464   Trims initial space, by moving it before the
465   start tag, or if this element is the first in
466   parent's content, then by discarding the space
467 */
468 static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text )
469 {
470     Lexer* lexer = doc->lexer;
471     Node *prev, *node;
472 
473     if ( nodeIsText(text) && 
474          lexer->lexbuf[text->start] == ' ' && 
475          text->start < text->end )
476     {
477         if ( (element->tag->model & CM_INLINE) &&
478              !(element->tag->model & CM_FIELD) )
479         {
480             prev = element->prev;
481 
482             if (nodeIsText(prev))
483             {
484                 if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ')
485                     lexer->lexbuf[(prev->end)++] = ' ';
486 
487                 ++(element->start);
488             }
489             else /* create new node */
490             {
491                 node = NewNode(lexer);
492                 node->start = (element->start)++;
493                 node->end = element->start;
494                 lexer->lexbuf[node->start] = ' ';
495                 InsertNodeBeforeElement(element ,node);
496             }
497         }
498 
499         /* discard the space in current node */
500         ++(text->start);
501     }
502 }
503 
504 static Bool IsPreDescendant(Node* node)
505 {
506     Node *parent = node->parent;
507 
508     while (parent)
509     {
510         if (parent->tag && parent->tag->parser == ParsePre)
511             return yes;
512 
513         parent = parent->parent;
514     }
515 
516     return no;
517 }
518 
519 static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node)
520 {
521     Node* next;
522 
523     if (!nodeIsText(node))
524         return no;
525 
526     if (node->parent->type == DocTypeTag)
527         return no;
528 
529     if (IsPreDescendant(node))
530         return no;
531 
532     if (node->parent->tag->parser == ParseScript)
533         return no;
534 
535     next = node->next;
536 
537     /* <p>... </p> */
538     if (!next && !nodeHasCM(node->parent, CM_INLINE))
539         return yes;
540 
541     /* <div><small>... </small><h3>...</h3></div> */
542     if (!next && node->parent->next && !nodeHasCM(node->parent->next, CM_INLINE))
543         return yes;
544 
545     if (!next)
546         return no;
547 
548     if (nodeIsBR(next))
549         return yes;
550 
551     if (nodeHasCM(next, CM_INLINE))
552         return no;
553 
554     /* <a href='/'>...</a> <p>...</p> */
555     if (next->type == StartTag)
556         return yes;
557 
558     /* <strong>...</strong> <hr /> */
559     if (next->type == StartEndTag)
560         return yes;
561 
562     /* evil adjacent text nodes, Tidy should not generate these :-( */
563     if (nodeIsText(next) && next->start < next->end
564         && IsWhite(doc->lexer->lexbuf[next->start]))
565         return yes;
566 
567     return no;
568 }
569 
570 static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node)
571 {
572     if (!nodeIsText(node))
573         return no;
574 
575     if (node->parent->type == DocTypeTag)
576         return no;
577 
578     if (IsPreDescendant(node))
579         return no;
580 
581     if (node->parent->tag->parser == ParseScript)
582         return no;
583 
584     /* <p>...<br> <em>...</em>...</p> */
585     if (nodeIsBR(node->prev))
586         return yes;
587 
588     /* <p> ...</p> */
589     if (node->prev == NULL && !nodeHasCM(node->parent, CM_INLINE))
590         return yes;
591 
592     /* <h4>...</h4> <em>...</em> */
593     if (node->prev && !nodeHasCM(node->prev, CM_INLINE) &&
594         nodeIsElement(node->prev))
595         return yes;
596 
597     /* <p><span> ...</span></p> */
598     if (!node->prev && !node->parent->prev && !nodeHasCM(node->parent->parent, CM_INLINE))
599         return yes;
600 
601     return no;
602 }
603 
604 static void CleanSpaces(TidyDocImpl* doc, Node* node)
605 {
606     Node* next;
607 
608     while (node)
609     {
610         next = node->next;
611 
612         if (nodeIsText(node) && CleanLeadingWhitespace(doc, node))
613             while (node->start < node->end && IsWhite(doc->lexer->lexbuf[node->start]))
614                 ++(node->start);
615 
616         if (nodeIsText(node) && CleanTrailingWhitespace(doc, node))
617             while (node->end > node->start && IsWhite(doc->lexer->lexbuf[node->end - 1]))
618                 --(node->end);
619 
620         if (nodeIsText(node) && !(node->start < node->end))
621         {
622             RemoveNode(node);
623             FreeNode(doc, node);
624             node = next;
625 
626             continue;
627         }
628 
629         if (node->content)
630             CleanSpaces(doc, node->content);
631 
632         node = next;
633     }
634 }
635 
636 /* 
637   Move initial and trailing space out.
638   This routine maps:
639 
640        hello<em> world</em>
641   to
642        hello <em>world</em>
643   and
644        <em>hello </em><strong>world</strong>
645   to
646        <em>hello</em> <strong>world</strong>
647 */
648 static void TrimSpaces( TidyDocImpl* doc, Node *element)
649 {
650     Node* text = element->content;
651 
652     if (nodeIsPRE(element) || IsPreDescendant(element))
653         return;
654 
655     if (nodeIsText(text))
656         TrimInitialSpace(doc, element, text);
657 
658     text = element->last;
659 
660     if (nodeIsText(text))
661         TrimTrailingSpace(doc, element, text);
662 }
663 
664 Bool DescendantOf( Node *element, TidyTagId tid )
665 {
666     Node *parent;
667     for ( parent = element->parent;
668           parent != NULL;
669           parent = parent->parent )
670     {
671         if ( TagIsId(parent, tid) )
672             return yes;
673     }
674     return no;
675 }
676 
677 static Bool InsertMisc(Node *element, Node *node)
678 {
679     if (node->type == CommentTag ||
680         node->type == ProcInsTag ||
681         node->type == CDATATag ||
682         node->type == SectionTag ||
683         node->type == AspTag ||
684         node->type == JsteTag ||
685         node->type == PhpTag )
686     {
687         InsertNodeAtEnd(element, node);
688         return yes;
689     }
690 
691     if ( node->type == XmlDecl )
692     {
693         Node* root = element;
694         while ( root && root->parent )
695             root = root->parent;
696         if ( root )
697         {
698           InsertNodeAtStart( root, node );
699           return yes;
700         }
701     }
702 
703     /* Declared empty tags seem to be slipping through
704     ** the cracks.  This is an experiment to figure out
705     ** a decent place to pick them up.
706     */
707     if ( node->tag &&
708          nodeIsElement(node) &&
709          nodeCMIsEmpty(node) && TagId(node) == TidyTag_UNKNOWN &&
710          (node->tag->versions & VERS_PROPRIETARY) != 0 )
711     {
712         InsertNodeAtEnd(element, node);
713         return yes;
714     }
715 
716     return no;
717 }
718 
719 
720 static void ParseTag( TidyDocImpl* doc, Node *node, uint mode )
721 {
722     Lexer* lexer = doc->lexer;
723     /*
724        Fix by GLP 2000-12-21.  Need to reset insertspace if this 
725        is both a non-inline and empty tag (base, link, meta, isindex, hr, area).
726     */
727     if (node->tag->model & CM_EMPTY)
728     {
729         lexer->waswhite = no;
730         if (node->tag->parser == NULL)
731             return;
732     }
733     else if (!(node->tag->model & CM_INLINE))
734         lexer->insertspace = no;
735 
736     if (node->tag->parser == NULL)
737         return;
738 
739     if (node->type == StartEndTag)
740         return;
741 
742     (*node->tag->parser)( doc, node, mode );
743 }
744 
745 /*
746  the doctype has been found after other tags,
747  and needs moving to before the html element
748 */
749 static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype )
750 {
751     Node* existing = FindDocType( doc );
752     if ( existing )
753     {
754         ReportError(doc, element, doctype, DISCARDING_UNEXPECTED );
755         FreeNode( doc, doctype );
756     }
757     else
758     {
759         ReportError(doc, element, doctype, DOCTYPE_AFTER_TAGS );
760         while ( !nodeIsHTML(element) )
761             element = element->parent;
762         InsertNodeBeforeElement( element, doctype );
763     }
764 }
765 
766 /*
767  move node to the head, where element is used as starting
768  point in hunt for head. normally called during parsing
769 */
770 static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node )
771 {
772     Node *head;
773 
774     RemoveNode( node );  /* make sure that node is isolated */
775 
776     if ( nodeIsElement(node) )
777     {
778         ReportError(doc, element, node, TAG_NOT_ALLOWED_IN );
779 
780         head = FindHEAD(doc);
781         assert(head != NULL);
782 
783         InsertNodeAtEnd(head, node);
784 
785         if ( node->tag->parser )
786             ParseTag( doc, node, IgnoreWhitespace );
787     }
788     else
789     {
790         ReportError(doc, element, node, DISCARDING_UNEXPECTED);
791         FreeNode( doc, node );
792     }
793 }
794 
795 /* moves given node to end of body element */
796 static void MoveNodeToBody( TidyDocImpl* doc, Node* node )
797 {
798     Node* body = FindBody( doc );
799     if ( body )
800     {
801         RemoveNode( node );
802         InsertNodeAtEnd( body, node );
803     }
804 }
805 
806 /*
807    element is node created by the lexer
808    upon seeing the start tag, or by the
809    parser when the start tag is inferred
810 */
811 void ParseBlock( TidyDocImpl* doc, Node *element, uint mode)
812 {
813     Lexer* lexer = doc->lexer;
814     Node *node;
815     Bool checkstack = yes;
816     uint istackbase = 0;
817 
818     if ( element->tag->model & CM_EMPTY )
819         return;
820 
821     if ( nodeIsFORM(element) && 
822          DescendantOf(element, TidyTag_FORM) )
823         ReportError(doc, element, NULL, ILLEGAL_NESTING );
824 
825     /*
826      InlineDup() asks the lexer to insert inline emphasis tags
827      currently pushed on the istack, but take care to avoid
828      propagating inline emphasis inside OBJECT or APPLET.
829      For these elements a fresh inline stack context is created
830      and disposed of upon reaching the end of the element.
831      They thus behave like table cells in this respect.
832     */
833     if (element->tag->model & CM_OBJECT)
834     {
835         istackbase = lexer->istackbase;
836         lexer->istackbase = lexer->istacksize;
837     }
838 
839     if (!(element->tag->model & CM_MIXED))
840         InlineDup( doc, NULL );
841 
842     mode = IgnoreWhitespace;
843 
844     while ((node = GetToken(doc, mode /*MixedContent*/)) != NULL)
845     {
846         /* end tag for this element */
847         if (node->type == EndTag && node->tag &&
848             (node->tag == element->tag || element->was == node->tag))
849         {
850             FreeNode( doc, node );
851 
852             if (element->tag->model & CM_OBJECT)
853             {
854                 /* pop inline stack */
855                 while (lexer->istacksize > lexer->istackbase)
856                     PopInline( doc, NULL );
857                 lexer->istackbase = istackbase;
858             }
859 
860             element->closed = yes;
861             TrimSpaces( doc, element );
862             return;
863         }
864 
865         if ( nodeIsBODY( node ) && DescendantOf( element, TidyTag_HEAD ))
866         {
867             /*  If we're in the HEAD, close it before proceeding.
868                 This is an extremely rare occurance, but has been observed.
869             */
870             UngetToken( doc );
871             break;
872         }
873 
874         if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) )
875         {
876             if ( nodeIsElement(node) )
877                 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
878             FreeNode( doc, node );
879             continue;
880         }
881 
882 
883         if (node->type == EndTag)
884         {
885             if (node->tag == NULL)
886             {
887                 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
888                 FreeNode( doc, node );
889                 continue;
890             }
891             else if ( nodeIsBR(node) )
892                 node->type = StartTag;
893             else if ( nodeIsP(node) )
894             {
895                 /* Cannot have a block inside a paragraph, so no checking
896                    for an ancestor is necessary -- but we _can_ have
897                    paragraphs inside a block, so change it to an implicit
898                    empty paragraph, to be dealt with according to the user's
899                    options
900                 */
901                 node->type = StartEndTag;
902                 node->implicit = yes;
903 #if OBSOLETE
904                 CoerceNode(doc, node, TidyTag_BR, no, no);
905                 FreeAttrs( doc, node ); /* discard align attribute etc. */
906                 InsertNodeAtEnd( element, node );
907                 node = InferredTag(doc, TidyTag_BR);
908 #endif
909             }
910             else if (DescendantOf( element, node->tag->id ))
911             {
912                 /* 
913                   if this is the end tag for an ancestor element
914                   then infer end tag for this element
915                 */
916                 UngetToken( doc );
917                 break;
918 #if OBSOLETE
919                 Node *parent;
920                 for ( parent = element->parent;
921                       parent != NULL; 
922                       parent = parent->parent )
923                 {
924                     if (node->tag == parent->tag)
925                     {
926                         if (!(element->tag->model & CM_OPT))
927                             ReportError(doc, element, node, MISSING_ENDTAG_BEFORE );
928 
929                         UngetToken( doc );
930 
931                         if (element->tag->model & CM_OBJECT)
932                         {
933                             /* pop inline stack */
934                             while (lexer->istacksize > lexer->istackbase)
935                                 PopInline( doc, NULL );
936                             lexer->istackbase = istackbase;
937                         }
938 
939                         TrimSpaces( doc, element );
940                         return;
941                     }
942                 }
943 #endif
944             }
945             else
946             {
947                 /* special case </tr> etc. for stuff moved in front of table */
948                 if ( lexer->exiled
949                      && node->tag->model
950                      && (node->tag->model & CM_TABLE) )
951                 {
952                     UngetToken( doc );
953                     TrimSpaces( doc, element );
954                     return;
955                 }
956             }
957         }
958 
959         /* mixed content model permits text */
960         if (nodeIsText(node))
961         {
962             if ( checkstack )
963             {
964                 checkstack = no;
965                 if (!(element->tag->model & CM_MIXED))
966                 {
967                     if ( InlineDup(doc, node) > 0 )
968                         continue;
969                 }
970             }
971 
972             InsertNodeAtEnd(element, node);
973             mode = MixedContent;
974 
975             /*
976               HTML4 strict doesn't allow mixed content for
977               elements with %block; as their content model
978             */
979             /*
980               But only body, map, blockquote, form and
981               noscript have content model %block;
982             */
983             if ( nodeIsBODY(element)       ||
984                  nodeIsMAP(element)        ||
985                  nodeIsBLOCKQUOTE(element) ||
986                  nodeIsFORM(element)       ||
987                  nodeIsNOSCRIPT(element) )
988                 ConstrainVersion( doc, ~VERS_HTML40_STRICT );
989             continue;
990         }
991 
992         if ( InsertMisc(element, node) )
993             continue;
994 
995         /* allow PARAM elements? */
996         if ( nodeIsPARAM(node) )
997         {
998             if ( nodeHasCM(element, CM_PARAM) && nodeIsElement(node) )
999             {
1000                 InsertNodeAtEnd(element, node);
1001                 continue;
1002             }
1003 
1004             /* otherwise discard it */
1005             ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1006             FreeNode( doc, node );
1007             continue;
1008         }
1009 
1010         /* allow AREA elements? */
1011         if ( nodeIsAREA(node) )
1012         {
1013             if ( nodeIsMAP(element) && nodeIsElement(node) )
1014             {
1015                 InsertNodeAtEnd(element, node);
1016                 continue;
1017             }
1018 
1019             /* otherwise discard it */
1020             ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1021             FreeNode( doc, node );
1022             continue;
1023         }
1024 
1025         /* ignore unknown start/end tags */
1026         if ( node->tag == NULL )
1027         {
1028             ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1029             FreeNode( doc, node );
1030             continue;
1031         }
1032 
1033         /*
1034           Allow CM_INLINE elements here.
1035 
1036           Allow CM_BLOCK elements here unless
1037           lexer->excludeBlocks is yes.
1038 
1039           LI and DD are special cased.
1040 
1041           Otherwise infer end tag for this element.
1042         */
1043 
1044         if ( !nodeHasCM(node, CM_INLINE) )
1045         {
1046             if ( !nodeIsElement(node) )
1047             {
1048                 if ( nodeIsFORM(node) )
1049                     BadForm( doc );
1050 
1051                 ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1052                 FreeNode( doc, node );
1053                 continue;
1054             }
1055 
1056             /* #427671 - Fix by Randy Waki - 10 Aug 00 */
1057             /*
1058              If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION
1059              start tag, discard the start tag and let the subsequent content get
1060              parsed as content of the enclosing LI.  This seems to mimic IE and
1061              Netscape, and avoids an infinite loop: without this check,
1062              ParseBlock (which is parsing the LI's content) and ParseList (which
1063              is parsing the LI's parent's content) repeatedly defer to each
1064              other to parse the illegal start tag, each time inferring a missing
1065              </li> or <li> respectively.
1066 
1067              NOTE: This check is a bit fragile.  It specifically checks for the
1068              four tags that happen to weave their way through the current series
1069              of tests performed by ParseBlock and ParseList to trigger the
1070              infinite loop.
1071             */
1072             if ( nodeIsLI(element) )
1073             {
1074                 if ( nodeIsFRAME(node)    ||
1075                      nodeIsFRAMESET(node) ||
1076                      nodeIsOPTGROUP(node) ||
1077                      nodeIsOPTION(node) )
1078                 {
1079                     ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1080                     FreeNode( doc, node );  /* DSR - 27Apr02 avoid memory leak */
1081                     continue;
1082                 }
1083             }
1084 
1085             if ( nodeIsTD(element) || nodeIsTH(element) )
1086             {
1087                 /* if parent is a table cell, avoid inferring the end of the cell */
1088 
1089                 if ( nodeHasCM(node, CM_HEAD) )
1090                 {
1091                     MoveToHead( doc, element, node );
1092                     continue;
1093                 }
1094 
1095                 if ( nodeHasCM(node, CM_LIST) )
1096                 {
1097                     UngetToken( doc );
1098                     node = InferredTag(doc, TidyTag_UL);
1099                     /* AddClass( doc, node, "noindent" ); */
1100                     lexer->excludeBlocks = yes;
1101                 }
1102                 else if ( nodeHasCM(node, CM_DEFLIST) )
1103                 {
1104                     UngetToken( doc );
1105                     node = InferredTag(doc, TidyTag_DL);
1106                     lexer->excludeBlocks = yes;
1107                 }
1108 
1109                 /* infer end of current table cell */
1110                 if ( !nodeHasCM(node, CM_BLOCK) )
1111                 {
1112                     UngetToken( doc );
1113                     TrimSpaces( doc, element );
1114                     return;
1115                 }
1116             }
1117             else if ( nodeHasCM(node, CM_BLOCK) )
1118             {
1119                 if ( lexer->excludeBlocks )
1120                 {
1121                     if ( !nodeHasCM(element, CM_OPT) )
1122                         ReportError(doc, element, node, MISSING_ENDTAG_BEFORE );
1123 
1124                     UngetToken( doc );
1125 
1126                     if ( nodeHasCM(element, CM_OBJECT) )
1127                         lexer->istackbase = istackbase;
1128 
1129                     TrimSpaces( doc, element );
1130                     return;
1131                 }
1132             }
1133             else /* things like list items */
1134             {
1135                 if (node->tag->model & CM_HEAD)
1136                 {
1137                     MoveToHead( doc, element, node );
1138                     continue;
1139                 }
1140 
1141                 /*
1142                  special case where a form start tag
1143                  occurs in a tr and is followed by td or th
1144                 */
1145 
1146                 if ( nodeIsFORM(element) &&
1147                      nodeIsTD(element->parent) &&
1148                      element->parent->implicit )
1149                 {
1150                     if ( nodeIsTD(node) )
1151                     {
1152                         ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1153                         FreeNode( doc, node );
1154                         continue;
1155                     }
1156 
1157                     if ( nodeIsTH(node) )
1158                     {
1159                         ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1160                         FreeNode( doc, node );
1161                         node = element->parent;
1162                         MemFree(node->element);
1163                         node->element = tmbstrdup("th");
1164                         node->tag = LookupTagDef( TidyTag_TH );
1165                         continue;
1166                     }
1167                 }
1168 
1169                 if ( !nodeHasCM(element, CM_OPT) && !element->implicit )
1170                     ReportError(doc, element, node, MISSING_ENDTAG_BEFORE );
1171 
1172                 UngetToken( doc );
1173 
1174                 if ( nodeHasCM(node, CM_LIST) )
1175                 {
1176                     if ( element->parent && element->parent->tag &&
1177                          element->parent->tag->parser == ParseList )
1178                     {
1179                         TrimSpaces( doc, element );
1180                         return;
1181                     }
1182 
1183                     node = InferredTag(doc, TidyTag_UL);
1184                     /* AddClass( doc, node, "noindent" ); */
1185                 }
1186                 else if ( nodeHasCM(node, CM_DEFLIST) )
1187                 {
1188                     if ( nodeIsDL(element->parent) )
1189                     {
1190                         TrimSpaces( doc, element );
1191                         return;
1192                     }
1193 
1194                     node = InferredTag(doc, TidyTag_DL);
1195                 }
1196                 else if ( nodeHasCM(node, CM_TABLE) || nodeHasCM(node, CM_ROW) )
1197                 {
1198                     node = InferredTag(doc, TidyTag_TABLE);
1199                 }
1200                 else if ( nodeHasCM(element, CM_OBJECT) )
1201                 {
1202                     /* pop inline stack */
1203                     while ( lexer->istacksize > lexer->istackbase )
1204                         PopInline( doc, NULL );
1205                     lexer->istackbase = istackbase;
1206                     TrimSpaces( doc, element );
1207                     return;
1208 
1209                 }
1210                 else
1211                 {
1212                     TrimSpaces( doc, element );
1213                     return;
1214                 }
1215             }
1216         }
1217 
1218         /* parse known element */
1219         if (nodeIsElement(node))
1220         {
1221             if (node->tag->model & CM_INLINE)
1222             {
1223                 if (checkstack && !node->implicit)
1224                 {
1225                     checkstack = no;
1226 
1227                     if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */
1228                     {
1229                         if ( InlineDup(doc, node) > 0 )
1230                             continue;
1231                     }
1232                 }
1233 
1234                 mode = MixedContent;
1235             }
1236             else
1237             {
1238                 checkstack = yes;
1239                 mode = IgnoreWhitespace;
1240             }
1241 
1242             /* trim white space before <br> */
1243             if ( nodeIsBR(node) )
1244                 TrimSpaces( doc, element );
1245 
1246             InsertNodeAtEnd(element, node);
1247             
1248             if (node->implicit)
1249                 ReportError(doc, element, node, INSERTING_TAG );
1250 
1251             ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
1252             continue;
1253         }
1254 
1255         /* discard unexpected tags */
1256         if (node->type == EndTag)
1257             PopInline( doc, node );  /* if inline end tag */
1258 
1259         ReportError(doc, element, node, DISCARDING_UNEXPECTED );
1260         FreeNode( doc, node );
1261         continue;
1262     }
1263 
1264     if (!(element->tag->model & CM_OPT))
1265         ReportError(doc, element, node, MISSING_ENDTAG_FOR);
1266 
1267     if (element->tag->model & CM_OBJECT)
1268     {
1269         /* pop inline stack */
1270         while ( lexer->istacksize > lexer->istackbase )
1271             PopInline( doc, NULL );
1272         lexer->istackbase = istackbase;
1273     }
1274 
1275     TrimSpaces( doc, element );
1276 }
1277 
1278 void ParseInline( TidyDocImpl* doc, Node *element, uint mode )
1279 {
1280     Lexer* lexer = doc->lexer;
1281     Node *node, *parent;
1282 
1283     if (element->tag->model & CM_EMPTY)
1284         return;
1285 
1286     /*
1287      ParseInline is used for some block level elements like H1 to H6.
1288      For such elements we need to insert inline emphasis tags currently
1289      on the inline stack. For Inline elements, we normally push them
1290      onto the inline stack provided they aren't implicit or OBJECT/APPLET.
1291      This test is carried out in PushInline and PopInline, see istack.c
1292 
1293      InlineDup(...) is not called for elements with a CM_MIXED (inline and
1294      block) content model, e.g. <del> or <ins>, otherwise constructs like 
1295 
1296        <p>111<a name='foo'>222<del>333</del>444</a>555</p>
1297        <p>111<span>222<del>333</del>444</span>555</p>
1298        <p>111<em>222<del>333</del>444</em>555</p>
1299 
1300      will get corrupted.
1301     */
1302     if ((nodeHasCM(element, CM_BLOCK) || nodeIsDT(element)) &&
1303         !nodeHasCM(element, CM_MIXED))
1304         InlineDup(doc, NULL);
1305     else if (nodeHasCM(element, CM_INLINE))
1306         PushInline(doc, element);
1307 
1308     if ( nodeIsNOBR(element) )
1309         doc->badLayout |= USING_NOBR;
1310     else if ( nodeIsFONT(element) )
1311         doc->badLayout |= USING_FONT;
1312 
1313     /* Inline elements may or may not be within a preformatted element */
1314     if (mode != Preformatted)
1315         mode = MixedContent;
1316 
1317     while ((node = GetToken(doc, mode)) != NULL)
1318     {
1319         /* end tag for current element */
1320         if (node->tag == element->tag && node->type == EndTag)
1321         {
1322             if (element->tag->model & CM_INLINE)
1323                 PopInline( doc, node );
1324 
1325             FreeNode( doc, node );
1326 
1327             if (!(mode & Preformatted))
1328                 TrimSpaces(doc, element);
1329 
1330             /*
1331              if a font element wraps an anchor and nothing else
1332              then move the font element inside the anchor since
1333              otherwise it won't alter the anchor text color
1334             */
1335             if ( nodeIsFONT(element) && 
1336                  element->content && element->content == element->last )
1337             {
1338                 Node *child = element->content;
1339 
1340                 if ( nodeIsA(child) )
1341                 {
1342                     child->