~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

TidyLib
tidy/src/clean.c

Version: ~ [ 1.0 ] ~

  1 /*
  2   clean.c -- clean up misuse of presentation markup
  3 
  4   (c) 1998-2005 (W3C) MIT, ERCIM, Keio University
  5   See tidy.h for the copyright notice.
  6 
  7   CVS Info :
  8 
  9     $Author: arnaud02 $ 
 10     $Date: 2005/08/03 18:06:59 $ 
 11     $Revision: 1.98 $ 
 12 
 13   Filters from other formats such as Microsoft Word
 14   often make excessive use of presentation markup such
 15   as font tags, B, I, and the align attribute. By applying
 16   a set of production rules, it is straightforward to
 17   transform this to use CSS.
 18 
 19   Some rules replace some of the children of an element by
 20   style properties on the element, e.g.
 21 
 22   <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
 23 
 24   Such rules are applied to the element's content and then
 25   to the element itself until none of the rules apply any more.
 26   Having applied all the rules to an element, it will have
 27   a style attribute with one or more properties. 
 28 
 29   Other rules strip the element they apply to, replacing
 30   it by style properties on the contents, e.g.
 31   
 32   <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
 33       
 34   These rules are applied to an element before processing
 35   its content and replace the current element by the first
 36   element in the exposed content.
 37 
 38   After applying both sets of rules, you can replace the
 39   style attribute by a class value and style rule in the
 40   document head. To support this, an association of styles
 41   and class names is built.
 42 
 43   A naive approach is to rely on string matching to test
 44   when two property lists are the same. A better approach
 45   would be to first sort the properties before matching.
 46 
 47 */
 48 
 49 #include <stdio.h>
 50 #include <stdlib.h>
 51 #include <string.h>
 52 
 53 #include "tidy-int.h"
 54 #include "clean.h"
 55 #include "lexer.h"
 56 #include "parser.h"
 57 #include "attrs.h"
 58 #include "message.h"
 59 #include "tmbstr.h"
 60 #include "utf8.h"
 61 
 62 void RenameElem( Node* node, TidyTagId tid )
 63 {
 64     const Dict* dict = LookupTagDef( tid );
 65     MemFree( node->element );
 66     node->element = tmbstrdup( dict->name );
 67     node->tag = dict;
 68 }
 69 
 70 static void FreeStyleProps(StyleProp *props)
 71 {
 72     StyleProp *next;
 73 
 74     while (props)
 75     {
 76         next = props->next;
 77         MemFree(props->name);
 78         MemFree(props->value);
 79         MemFree(props);
 80         props = next;
 81     }
 82 }
 83 
 84 static StyleProp *InsertProperty( StyleProp* props, ctmbstr name, ctmbstr value )
 85 {
 86     StyleProp *first, *prev, *prop;
 87     int cmp;
 88 
 89     prev = NULL;
 90     first = props;
 91 
 92     while (props)
 93     {
 94         cmp = tmbstrcmp(props->name, name);
 95 
 96         if (cmp == 0)
 97         {
 98             /* this property is already defined, ignore new value */
 99             return first;
100         }
101 
102         if (cmp > 0)
103         {
104             /* insert before this */
105 
106             prop = (StyleProp *)MemAlloc(sizeof(StyleProp));
107             prop->name = tmbstrdup(name);
108             prop->value = tmbstrdup(value);
109             prop->next = props;
110 
111             if (prev)
112                 prev->next = prop;
113             else
114                 first = prop;
115 
116             return first;
117         }
118 
119         prev = props;
120         props = props->next;
121     }
122 
123     prop = (StyleProp *)MemAlloc(sizeof(StyleProp));
124     prop->name = tmbstrdup(name);
125     prop->value = tmbstrdup(value);
126     prop->next = NULL;
127 
128     if (prev)
129         prev->next = prop;
130     else
131         first = prop;
132 
133     return first;
134 }
135 
136 /*
137  Create sorted linked list of properties from style string
138  It temporarily places nulls in place of ':' and ';' to
139  delimit the strings for the property name and value.
140  Some systems don't allow you to NULL literal strings,
141  so to avoid this, a copy is made first.
142 */
143 static StyleProp* CreateProps( StyleProp* prop, ctmbstr style )
144 {
145     tmbstr name, value = NULL, name_end, value_end, line;
146     Bool more;
147 
148     line = tmbstrdup(style);
149     name = line;
150 
151     while (*name)
152     {
153         while (*name == ' ')
154             ++name;
155 
156         name_end = name;
157 
158         while (*name_end)
159         {
160             if (*name_end == ':')
161             {
162                 value = name_end + 1;
163                 break;
164             }
165 
166             ++name_end;
167         }
168 
169         if (*name_end != ':')
170             break;
171 
172         while ( value && *value == ' ')
173             ++value;
174 
175         value_end = value;
176         more = no;
177 
178         while (*value_end)
179         {
180             if (*value_end == ';')
181             {
182                 more = yes;
183                 break;
184             }
185 
186             ++value_end;
187         }
188 
189         *name_end = '\0';
190         *value_end = '\0';
191 
192         prop = InsertProperty(prop, name, value);
193         *name_end = ':';
194 
195         if (more)
196         {
197             *value_end = ';';
198             name = value_end + 1;
199             continue;
200         }
201 
202         break;
203     }
204 
205     MemFree(line);  /* free temporary copy */
206     return prop;
207 }
208 
209 static tmbstr CreatePropString(StyleProp *props)
210 {
211     tmbstr style, p, s;
212     uint len;
213     StyleProp *prop;
214 
215     /* compute length */
216 
217     for (len = 0, prop = props; prop; prop = prop->next)
218     {
219         len += tmbstrlen(prop->name) + 2;
220         if (prop->value)
221             len += tmbstrlen(prop->value) + 2;
222     }
223 
224     style = (tmbstr) MemAlloc(len+1);
225     style[0] = '\0';
226 
227     for (p = style, prop = props; prop; prop = prop->next)
228     {
229         s = prop->name;
230 
231         while((*p++ = *s++))
232             continue;
233 
234         if (prop->value)
235         {
236             *--p = ':';
237             *++p = ' ';
238             ++p;
239 
240             s = prop->value;
241             while((*p++ = *s++))
242                 continue;
243         }
244         if (prop->next == NULL)
245             break;
246 
247         *--p = ';';
248         *++p = ' ';
249         ++p;
250     }
251 
252     return style;
253 }
254 
255 /*
256   create string with merged properties
257 static tmbstr AddProperty( ctmbstr style, ctmbstr property )
258 {
259     tmbstr line;
260     StyleProp *prop;
261 
262     prop = CreateProps(NULL, style);
263     prop = CreateProps(prop, property);
264     line = CreatePropString(prop);
265     FreeStyleProps(prop);
266     return line;
267 }
268 */
269 
270 void FreeStyles( TidyDocImpl* doc )
271 {
272     Lexer* lexer = doc->lexer;
273     if ( lexer )
274     {
275         TagStyle *style, *next;
276         for ( style = lexer->styles; style; style = next )
277         {
278             next = style->next;
279             MemFree( style->tag );
280             MemFree( style->tag_class );
281             MemFree( style->properties );
282             MemFree( style );
283         }
284     }
285 }
286 
287 static tmbstr GensymClass( TidyDocImpl* doc )
288 {
289     tmbchar buf[512];  /* CSSPrefix is limited to 256 characters */
290     ctmbstr pfx = cfgStr(doc, TidyCSSPrefix);
291     if ( pfx == NULL || *pfx == 0 )
292       pfx = "c";
293 
294     tmbsnprintf(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId );
295     return tmbstrdup(buf);
296 }
297 
298 static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties )
299 {
300     Lexer* lexer = doc->lexer;
301     TagStyle* style;
302 
303     for (style = lexer->styles; style; style=style->next)
304     {
305         if (tmbstrcmp(style->tag, tag) == 0 &&
306             tmbstrcmp(style->properties, properties) == 0)
307             return style->tag_class;
308     }
309 
310     style = (TagStyle *)MemAlloc( sizeof(TagStyle) );
311     style->tag = tmbstrdup(tag);
312     style->tag_class = GensymClass( doc );
313     style->properties = tmbstrdup( properties );
314     style->next = lexer->styles;
315     lexer->styles = style;
316     return style->tag_class;
317 }
318 
319 /*
320  Add class="foo" to node
321 */
322 void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname )
323 {
324     AttVal *classattr = AttrGetById(node, TidyAttr_CLASS);;
325 
326     /*
327      if there already is a class attribute
328      then append class name after a space.
329     */
330     if (classattr)
331     {
332         uint len = tmbstrlen(classattr->value) +
333                   tmbstrlen(classname) + 2;
334         tmbstr s = (tmbstr) MemAlloc( len );
335         tmbstrcpy( s, classattr->value );
336         tmbstrcat( s, " " );
337         tmbstrcat( s, classname );
338         MemFree( classattr->value );
339         classattr->value = s;
340     }
341     else /* create new class attribute */
342         AddAttribute( doc, node, "class", classname );
343 }
344 
345 
346 /*
347  Find style attribute in node, and replace it
348  by corresponding class attribute. Search for
349  class in style dictionary otherwise gensym
350  new class and add to dictionary.
351 
352  Assumes that node doesn't have a class attribute
353 */
354 static void Style2Rule( TidyDocImpl* doc, Node *node)
355 {
356     AttVal *styleattr, *classattr;
357     ctmbstr classname;
358 
359     styleattr = AttrGetById(node, TidyAttr_STYLE);
360 
361     if (styleattr)
362     {
363         /* fix for http://tidy.sf.net/bug/850215 */
364         if (!styleattr->value)
365         {
366             RemoveAttribute(doc, node, styleattr);
367             return;
368         }
369 
370         classname = FindStyle( doc, node->element, styleattr->value );
371         classattr = AttrGetById(node, TidyAttr_CLASS);
372 
373         /*
374          if there already is a class attribute
375          then append class name after an underscore
376         */
377         if (classattr)
378         {
379             uint len = tmbstrlen(classattr->value) +
380                       tmbstrlen(classname) + 2;
381             tmbstr s = (tmbstr) MemAlloc( len );
382             s[0] = '\0';
383             if (classattr->value)
384             {
385                 tmbstrcpy(s, classattr->value);
386                 tmbstrcat(s, " ");
387             }
388             tmbstrcat(s, classname);
389             if (classattr->value)
390                 MemFree(classattr->value);
391             classattr->value = s;
392             RemoveAttribute( doc, node, styleattr );
393         }
394         else /* reuse style attribute for class attribute */
395         {
396             MemFree(styleattr->attribute);
397             MemFree(styleattr->value);
398             styleattr->attribute = tmbstrdup("class");
399             styleattr->value = tmbstrdup(classname);
400         }
401     }
402 }
403 
404 static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color )
405 {
406     if ( selector && color )
407     {
408         AddStringLiteral(lexer, selector);
409         AddStringLiteral(lexer, " { color: ");
410         AddStringLiteral(lexer, color);
411         AddStringLiteral(lexer, " }\n");
412     }
413 }
414 
415 /*
416  move presentation attribs from body to style element
417 
418  background="foo" ->  body { background-image: url(foo) }
419  bgcolor="foo"    ->  body { background-color: foo }
420  text="foo"       ->  body { color: foo }
421  link="foo"       ->  :link { color: foo }
422  vlink="foo"      ->  :visited { color: foo }
423  alink="foo"      ->  :active { color: foo }
424 */
425 static void CleanBodyAttrs( TidyDocImpl* doc, Node* body )
426 {
427     Lexer* lexer  = doc->lexer;
428     tmbstr bgurl   = NULL;
429     tmbstr bgcolor = NULL;
430     tmbstr color   = NULL;
431     AttVal* attr;
432     
433     if (NULL != (attr = AttrGetById(body, TidyAttr_BACKGROUND)))
434     {
435         bgurl = attr->value;
436         attr->value = NULL;
437         RemoveAttribute( doc, body, attr );
438     }
439 
440     if (NULL != (attr = AttrGetById(body, TidyAttr_BGCOLOR)))
441     {
442         bgcolor = attr->value;
443         attr->value = NULL;
444         RemoveAttribute( doc, body, attr );
445     }
446 
447     if (NULL != (attr = AttrGetById(body, TidyAttr_TEXT)))
448     {
449         color = attr->value;
450         attr->value = NULL;
451         RemoveAttribute( doc, body, attr );
452     }
453 
454     if ( bgurl || bgcolor || color )
455     {
456         AddStringLiteral(lexer, " body {\n");
457         if (bgurl)
458         {
459             AddStringLiteral(lexer, "  background-image: url(");
460             AddStringLiteral(lexer, bgurl);
461             AddStringLiteral(lexer, ");\n");
462             MemFree(bgurl);
463         }
464         if (bgcolor)
465         {
466             AddStringLiteral(lexer, "  background-color: ");
467             AddStringLiteral(lexer, bgcolor);
468             AddStringLiteral(lexer, ";\n");
469             MemFree(bgcolor);
470         }
471         if (color)
472         {
473             AddStringLiteral(lexer, "  color: ");
474             AddStringLiteral(lexer, color);
475             AddStringLiteral(lexer, ";\n");
476             MemFree(color);
477         }
478 
479         AddStringLiteral(lexer, " }\n");
480     }
481 
482     if (NULL != (attr = AttrGetById(body, TidyAttr_LINK)))
483     {
484         AddColorRule(lexer, " :link", attr->value);
485         RemoveAttribute( doc, body, attr );
486     }
487 
488     if (NULL != (attr = AttrGetById(body, TidyAttr_VLINK)))
489     {
490         AddColorRule(lexer, " :visited", attr->value);
491         RemoveAttribute( doc, body, attr );
492     }
493 
494     if (NULL != (attr = AttrGetById(body, TidyAttr_ALINK)))
495     {
496         AddColorRule(lexer, " :active", attr->value);
497         RemoveAttribute( doc, body, attr );
498     }
499 }
500 
501 static Bool NiceBody( TidyDocImpl* doc )
502 {
503     Node* node = FindBody(doc);
504     if (node)
505     {
506         if (AttrGetById(node, TidyAttr_BACKGROUND) ||
507             AttrGetById(node, TidyAttr_BGCOLOR)    ||
508             AttrGetById(node, TidyAttr_TEXT)       ||
509             AttrGetById(node, TidyAttr_LINK)       ||
510             AttrGetById(node, TidyAttr_VLINK)      ||
511             AttrGetById(node, TidyAttr_ALINK))
512         {
513             doc->badLayout |= USING_BODY;
514             return no;
515         }
516     }
517 
518     return yes;
519 }
520 
521 /* create style element using rules from dictionary */
522 static void CreateStyleElement( TidyDocImpl* doc )
523 {
524     Lexer* lexer = doc->lexer;
525     Node *node, *head, *body;
526     TagStyle *style;
527     AttVal *av;
528 
529     if ( lexer->styles == NULL && NiceBody(doc) )
530         return;
531 
532     node = NewNode( lexer );
533     node->type = StartTag;
534     node->implicit = yes;
535     node->element = tmbstrdup("style");
536     FindTag( doc, node );
537 
538     /* insert type attribute */
539     av = NewAttributeEx( doc, "type", "text/css", '"' );
540     InsertAttributeAtStart( node, av );
541 
542     body = FindBody( doc );
543     lexer->txtstart = lexer->lexsize;
544     if ( body )
545         CleanBodyAttrs( doc, body );
546 
547     for (style = lexer->styles; style; style = style->next)
548     {
549         AddCharToLexer(lexer, ' ');
550         AddStringLiteral(lexer, style->tag);
551         AddCharToLexer(lexer, '.');
552         AddStringLiteral(lexer, style->tag_class);
553         AddCharToLexer(lexer, ' ');
554         AddCharToLexer(lexer, '{');
555         AddStringLiteral(lexer, style->properties);
556         AddCharToLexer(lexer, '}');
557         AddCharToLexer(lexer, '\n');
558     }
559 
560     lexer->txtend = lexer->lexsize;
561 
562     InsertNodeAtEnd( node, TextToken(lexer) );
563 
564     /*
565      now insert style element into document head
566 
567      doc is root node. search its children for html node
568      the head node should be first child of html node
569     */
570     if ( NULL != (head = FindHEAD( doc )) )
571         InsertNodeAtEnd( head, node );
572 }
573 
574 
575 /* ensure bidirectional links are consistent */
576 void FixNodeLinks(Node *node)
577 {
578     Node *child;
579 
580     if (node->prev)
581         node->prev->next = node;
582     else
583         node->parent->content = node;
584 
585     if (node->next)
586         node->next->prev = node;
587     else
588         node->parent->last = node;
589 
590     for (child = node->content; child; child = child->next)
591         child->parent = node;
592 }
593 
594 /*
595  used to strip child of node when
596  the node has one and only one child
597 */
598 static void StripOnlyChild(TidyDocImpl* doc, Node *node)
599 {
600     Node *child;
601 
602     child = node->content;
603     node->content = child->content;
604     node->last = child->last;
605     child->content = NULL;
606     FreeNode(doc, child);
607 
608     for (child = node->content; child; child = child->next)
609         child->parent = node;
610 }
611 
612 /*
613   used to strip font start and end tags.
614   Extricate "element", replace it by its content and delete it.
615 */
616 static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
617 {
618     if (element->content)
619     {
620         Node *node, *parent = element->parent;
621 
622         element->last->next = element->next;
623 
624         if (element->next)
625         {
626             element->next->prev = element->last;
627         }
628         else
629             parent->last = element->last;
630 
631         if (element->prev)
632         {
633             element->content->prev = element->prev;
634             element->prev->next = element->content;
635         }
636         else
637             parent->content = element->content;
638 
639         for (node = element->content; node; node = node->next)
640             node->parent = parent;
641 
642         *pnode = element->content;
643 
644         element->next = element->content = NULL;
645         FreeNode(doc, element);
646     }
647     else
648     {
649         *pnode = DiscardElement(doc, element);
650     }
651 }
652 
653 /*
654   Create new string that consists of the
655   combined style properties in s1 and s2
656 
657   To merge property lists, we build a linked
658   list of property/values and insert properties
659   into the list in order, merging values for
660   the same property name.
661 */
662 static tmbstr MergeProperties( ctmbstr s1, ctmbstr s2 )
663 {
664     tmbstr s;
665     StyleProp *prop;
666 
667     prop = CreateProps(NULL, s1);
668     prop = CreateProps(prop, s2);
669     s = CreatePropString(prop);
670     FreeStyleProps(prop);
671     return s;
672 }
673 
674 /*
675  Add style property to element, creating style
676  attribute as needed and adding ; delimiter
677 */
678 static void AddStyleProperty(TidyDocImpl* doc, Node *node, ctmbstr property )
679 {
680     AttVal *av = AttrGetById(node, TidyAttr_STYLE);
681 
682     /* if style attribute already exists then insert property */
683 
684     if ( av )
685     {
686         if (av->value != NULL)
687         {
688             tmbstr s = MergeProperties( av->value, property );
689             MemFree( av->value );
690             av->value = s;
691         }
692         else
693         {
694             av->value = tmbstrdup( property );
695         }
696     }
697     else /* else create new style attribute */
698     {
699         av = NewAttributeEx( doc, "style", property, '"' );
700         InsertAttributeAtStart( node, av );
701     }
702 }
703 
704 static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child)
705 {
706     AttVal *av;
707     tmbstr s1, s2, names;
708 
709     for (s2 = NULL, av = child->attributes; av; av = av->next)
710     {
711         if (attrIsCLASS(av))
712         {
713             s2 = av->value;
714             break;
715         }
716     }
717 
718     for (s1 = NULL, av = node->attributes; av; av = av->next)
719     {
720         if (attrIsCLASS(av))
721         {
722             s1 = av->value;
723             break;
724         }
725     }
726 
727     if (s1)
728     {
729         if (s2)  /* merge class names from both */
730         {
731             uint l1, l2;
732             l1 = tmbstrlen(s1);
733             l2 = tmbstrlen(s2);
734             names = (tmbstr) MemAlloc(l1 + l2 + 2);
735             tmbstrcpy(names, s1);
736             names[l1] = ' ';
737             tmbstrcpy(names+l1+1, s2);
738             MemFree(av->value);
739             av->value = names;
740         }
741     }
742     else if (s2)  /* copy class names from child */
743     {
744         av = NewAttributeEx( doc, "class", s2, '"' );
745         InsertAttributeAtStart( node, av );
746     }
747 }
748 
749 static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child)
750 {
751     AttVal *av;
752     tmbstr s1, s2, style;
753 
754     /*
755        the child may have a class attribute used
756        for attaching styles, if so the class name
757        needs to be copied to node's class
758     */
759     MergeClasses(doc, node, child);
760 
761     for (s2 = NULL, av = child->attributes; av; av = av->next)
762     {
763         if (attrIsSTYLE(av))
764         {
765             s2 = av->value;
766             break;
767         }
768     }
769 
770     for (s1 = NULL, av = node->attributes; av; av = av->next)
771     {
772         if (attrIsSTYLE(av))
773         {
774             s1 = av->value;
775             break;
776         }
777     }
778 
779     if (s1)
780     {
781         if (s2)  /* merge styles from both */
782         {
783             style = MergeProperties(s1, s2);
784             MemFree(av->value);
785             av->value = style;
786         }
787     }
788     else if (s2)  /* copy style of child */
789     {
790         av = NewAttributeEx( doc, "style", s2, '"' );
791         InsertAttributeAtStart( node, av );
792     }
793 }
794 
795 static ctmbstr FontSize2Name(ctmbstr size)
796 {
797     static const ctmbstr sizes[7] =
798     {
799         "60%", "70%", "80%", NULL,
800         "120%", "150%", "200%"
801     };
802 
803     /* increment of 0.8 */
804     static const ctmbstr minussizes[] =
805     {
806         "100%", "80%", "64%", "51%",
807         "40%", "32%", "26%"
808     };
809 
810     /* increment of 1.2 */
811     static const ctmbstr plussizes[] =
812     {
813         "100%", "120%", "144%", "172%",
814         "207%", "248%", "298%"
815     };
816 
817     if (size[0] == '\0')
818         return NULL;
819 
820     if ('' <= size[0] && size[0] <= '6')
821     {
822         int n = size[0] - '';
823         return sizes[n];
824     }
825 
826     if (size[0] == '-')
827     {
828         if ('' <= size[1] && size[1] <= '6')
829         {
830             int n = size[1] - '';
831             return minussizes[n];
832         }
833         return "smaller"; /*"70%"; */
834     }
835 
836     if ('' <= size[1] && size[1] <= '6')
837     {
838         int n = size[1] - '';
839         return plussizes[n];
840     }
841 
842     return "larger"; /* "140%" */
843 }
844 
845 static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face )
846 {
847     tmbchar buf[256];
848     tmbsnprintf(buf, sizeof(buf), "font-family: %s", face );
849     AddStyleProperty( doc, node, buf );
850 }
851 
852 static void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size )
853 {
854     ctmbstr value = NULL;
855 
856     if (nodeIsP(node))
857     {
858         if (tmbstrcmp(size, "6") == 0)
859             value = "h1";
860         else if (tmbstrcmp(size, "5") == 0)
861             value = "h2";
862         else if (tmbstrcmp(size, "4") == 0)
863             value = "h3";
864 
865         if (value)
866         {
867             MemFree(node->element);
868             node->element = tmbstrdup(value);
869             FindTag(doc, node);
870             return;
871         }
872     }
873 
874     value = FontSize2Name(size);
875 
876     if (value)
877     {
878         tmbchar buf[64];
879         tmbsnprintf(buf, sizeof(buf), "font-size: %s", value);
880         AddStyleProperty( doc, node, buf );
881     }
882 }
883 
884 static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color)
885 {
886     tmbchar buf[128];
887     tmbsnprintf(buf, sizeof(buf), "color: %s", color);
888     AddStyleProperty( doc, node, buf );
889 }
890 
891 /* force alignment value to lower case */
892 static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align )
893 {
894     uint i;
895     tmbchar buf[128];
896 
897     tmbstrcpy( buf, "text-align: " );
898     for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i )
899     {
900         if ( (buf[i] = (tmbchar)ToLower(*align++)) == '\0' )
901             break;
902     }
903     buf[i] = '\0';
904     AddStyleProperty( doc, node, buf );
905 }
906 
907 /*
908  add style properties to node corresponding to
909  the font face, size and color attributes
910 */
911 static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av)
912 {
913     while (av)
914     {
915         if (AttrHasValue(av))
916         {
917             if (attrIsFACE(av))
918                 AddFontFace( doc, node, av->value );
919             else if (attrIsSIZE(av))
920                 AddFontSize( doc, node, av->value );
921             else if (attrIsCOLOR(av))
922                 AddFontColor( doc, node, av->value );
923         }
924         av = av->next;
925     }
926 }
927 
928 /*
929     Symptom: <p align=center>
930     Action: <p style="text-align: center">
931 */
932 static void TextAlign( TidyDocImpl* doc, Node* node )
933 {
934     AttVal *av, *prev;
935 
936     prev = NULL;
937 
938     for (av = node->attributes; av; av = av->next)
939     {
940         if (attrIsALIGN(av))
941         {
942             if (prev)
943                 prev->next = av->next;
944             else
945                 node->attributes = av->next;
946 
947             if (av->value)
948                 AddAlign( doc, node, av->value );
949 
950             FreeAttribute(doc, av);
951             break;
952         }
953 
954         prev = av;
955     }
956 }
957 
958 /*
959    The clean up rules use the pnode argument to return the
960    next node when the original node has been deleted
961 */
962 
963 /*
964     Symptom: <dir> <li> where <li> is only child
965     Action: coerce <dir> <li> to <div> with indent.
966 */
967 
968 static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode))
969 {
970     Node *child;
971 
972     if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) )
973     {
974         child = node->content;
975 
976         if (child == NULL)
977             return no;
978 
979         /* check child has no peers */
980 
981         if (child->next)
982             return no;
983 
984         if ( !nodeIsLI(child) )
985             return no;
986 
987         if ( !child->implicit )
988             return no;
989 
990         /* coerce dir to div */
991         node->tag = LookupTagDef( TidyTag_DIV );
992         MemFree( node->element );
993         node->element = tmbstrdup("div");
994         AddStyleProperty( doc, node, "margin-left: 2em" );
995         StripOnlyChild( doc, node );
996         return yes;
997     }
998 
999     return no;
1000 }
1001 
1002 /*
1003     Symptom: <center>
1004     Action: replace <center> by <div style="text-align: center">
1005 */
1006 
1007 static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)
1008 {
1009     if ( nodeIsCENTER(node) )
1010     {
1011         if ( cfgBool(doc, TidyDropFontTags) )
1012         {
1013             if (node->content)
1014             {
1015                 Node *last = node->last;
1016                 DiscardContainer( doc, node, pnode );
1017 
1018                 node = InferredTag(doc, TidyTag_BR);
1019                 InsertNodeAfterElement(last, node);
1020             }
1021             else
1022             {
1023                 Node *prev = node->prev, *next = node->next,
1024                      *parent = node->parent;
1025                 DiscardContainer( doc, node, pnode );
1026 
1027                 node = InferredTag(doc, TidyTag_BR);
1028                 if (next)
1029                     InsertNodeBeforeElement(next, node);
1030                 else if (prev)
1031                     InsertNodeAfterElement(prev, node);
1032                 else
1033                     InsertNodeAtStart(parent, node);
1034             }
1035 
1036             return yes;
1037         }
1038 
1039         RenameElem( node, TidyTag_DIV );
1040         AddStyleProperty( doc, node, "text-align: center" );
1041         return yes;
1042     }
1043 
1044     return no;
1045 }
1046 
1047 /* Copy child attributes to node. Duplicate attributes are overwritten.
1048    Unique attributes (such as ID) disable the action.
1049    Attributes style and class are not dealt with. A call to MergeStyles
1050    will do that.
1051 */
1052 static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child)
1053 {
1054     AttVal *av1, *av2;
1055     TidyAttrId id;
1056 
1057     /* Detect attributes that cannot be merged or overwritten. */
1058     if (AttrGetById(child, TidyAttr_ID) != NULL
1059         && AttrGetById(node, TidyAttr_ID) != NULL)
1060         return no;
1061 
1062     /* Move child attributes to node. Attributes in node
1063      can be overwritten or merged. */
1064     for (av2 = child->attributes; av2; )
1065     {
1066         /* Dealt by MergeStyles. */
1067         if (attrIsSTYLE(av2) || attrIsCLASS(av2))
1068         {
1069             av2 = av2->next;
1070             continue;
1071         }
1072         /* Avoid duplicates in node */
1073         if ((id=AttrId(av2)) != TidyAttr_UNKNOWN
1074             && (av1=AttrGetById(node, id))!= NULL)
1075             RemoveAttribute( doc, node, av1 );
1076 
1077         /* Move attribute from child to node */
1078         DetachAttribute( child, av2 );
1079         av1 = av2;
1080         av2 = av2->next;
1081         av1->next = NULL;
1082         InsertAttributeAtEnd( node, av1 );
1083     }
1084 
1085     return yes;
1086 }
1087 
1088 /*
1089     Symptom <XX><XX>...</XX></XX>
1090     Action: merge the two XXs
1091 
1092   For instance, this is useful after nested <dir>s used by Word
1093   for indenting have been converted to <div>s
1094 
1095   If state is "no", no merging.
1096   If state is "yes", inner element is discarded. Only Style and Class
1097   attributes are merged using MergeStyles().
1098   If state is "auto", atttibutes are merged as described in CopyAttrs().
1099   Style and Class attributes are merged using MergeStyles().
1100 */
1101 static Bool MergeNestedElements( TidyDocImpl* doc,
1102                                  TidyTagId Id, TidyTriState state, Node *node,
1103                                  Node **ARG_UNUSED(pnode))
1104 {
1105     Node *child;
1106 
1107     if ( state == TidyNoState
1108          || !TagIsId(node, Id) )
1109         return no;
1110 
1111     child = node->content;
1112 
1113     if ( child == NULL
1114          || child->next != NULL
1115          || !TagIsId(child, Id) )
1116         return no;
1117 
1118     if ( state == TidyAutoState
1119          && CopyAttrs(doc, node, child) == no )
1120         return no;
1121 
1122     MergeStyles( doc, node, child );
1123     StripOnlyChild( doc, node );
1124     return yes;
1125 }
1126 
1127 /*
1128     Symptom: <ul><li><ul>...</ul></li></ul>
1129     Action: discard outer list
1130 */
1131 
1132 static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode )
1133 {
1134     Node *child, *list;
1135 
1136     if ( nodeIsUL(node) || nodeIsOL(node) )
1137     {
1138         child = node->content;
1139 
1140         if (child == NULL)
1141             return no;
1142 
1143         /* check child has no peers */
1144 
1145         if (child->next)
1146             return no;
1147 
1148         list = child->content;
1149 
1150         if (!list)
1151             return no;
1152 
1153         if (list->tag != node->tag)
1154             return no;
1155 
1156         /* check list has no peers */
1157         if (list->next)
1158             return no;
1159 
1160         *pnode = list;  /* Set node to resume iteration */
1161 
1162         /* move inner list node into position of outer node */
1163         list->prev = node->prev;
1164         list->next = node->next;
1165         list->parent = node->parent;
1166         FixNodeLinks(list);
1167 
1168         /* get rid of outer ul and its li */
1169         child->content = NULL;
1170         FreeNode( doc, child ); /* See test #427841. */
1171         child = NULL;
1172         node->content = NULL;
1173         node->next = NULL;
1174         FreeNode( doc, node );
1175         node = NULL;
1176 
1177         /*
1178           If prev node was a list the chances are this node
1179           should be appended to that list. Word has no way of
1180           recognizing nested lists and just uses indents
1181         */
1182 
1183         if (list->prev)
1184         {
1185             if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev))
1186                  && list->prev->last )
1187             {
1188                 node = list;
1189                 list = node->prev;
1190 
1191                 child = list->last;  /* <li> */
1192 
1193                 list->next = node->next;
1194                 FixNodeLinks(list);
1195 
1196                 node->parent = child;
1197                 node->next = NULL;
1198                 node->prev = child->last;
1199                 FixNodeLinks(node);
1200                 CleanNode( doc, node );
1201             }
1202         }
1203 
1204         return yes;
1205     }
1206 
1207     return no;
1208 }
1209 
1210 /*
1211   Some necessary conditions to apply BlockStyle().
1212  */
1213 
1214 static Bool CanApplyBlockStyle( Node *node )
1215 {
1216     if (node->tag->model & (CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE)
1217         && !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) )
1218     {
1219         return yes;
1220     }
1221     return no;
1222 }
1223 
1224 /*
1225   Symptom: the only child of a block-level element is a
1226   presentation element such as B, I or FONT
1227 
1228   Action: add style "font-weight: bold" to the block and
1229   strip the <b> element, leaving its children.
1230 
1231   example:
1232 
1233     <p>
1234       <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1235     </p>
1236 
1237   becomes:
1238 
1239       <p style="font-weight: bold; font-family: Arial; font-size: 6">
1240         Draft Recommended Practice
1241       </p>
1242 
1243   This code also replaces the align attribute by a style attribute.
1244   However, to avoid CSS problems with Navigator 4, this isn't done
1245   for the elements: caption, tr and table
1246 */
1247 static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1248 {
1249     Node *child;
1250 
1251     if (CanApplyBlockStyle(node))
1252     {
1253         /* check for align attribute */
1254         if ( !nodeIsCAPTION(node) )
1255             TextAlign( doc, node );
1256 
1257         child = node->content;
1258         if (child == NULL)
1259             return no;
1260 
1261         /* check child has no peers */
1262         if (child->next)
1263             return no;
1264 
1265         if ( nodeIsB(child) )
1266         {
1267             MergeStyles( doc, node, child );
1268             AddStyleProperty( doc, node, "font-weight: bold" );
1269             StripOnlyChild( doc, node );
1270             return yes;
1271         }
1272 
1273         if ( nodeIsI(child) )
1274         {
1275             MergeStyles( doc, node, child );
1276             AddStyleProperty( doc, node, "font-style: italic" );
1277             StripOnlyChild( doc, node );
1278             return yes;
1279         }
1280 
1281         if ( nodeIsFONT(child) )
1282         {
1283             MergeStyles( doc, node, child );
1284             AddFontStyles( doc, node, child->attributes );
1285             StripOnlyChild( doc, node );
1286             return yes;
1287         }
1288     }
1289 
1290     return no;
1291 }
1292 
1293 /* the only child of table cell or an inline element such as em */
1294 static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1295 {
1296     Node *child;
1297 
1298     if ( !nodeIsFONT(node) && nodeHasCM(node, CM_INLINE|CM_ROW) )
1299     {
1300         child = node->content;
1301 
1302         if (child == NULL)
1303             return no;
1304 
1305         /* check child has no peers */
1306 
1307         if (child->next)
1308             return no;
1309 
1310         if ( nodeIsB(child) && cfgBool(doc, TidyLogicalEmphasis) )
1311         {
1312             MergeStyles( doc, node, child );
1313             AddStyleProperty( doc, node, "font-weight: bold" );
1314             StripOnlyChild( doc, node );
1315             return yes;
1316         }
1317 
1318         if ( nodeIsI(child) && cfgBool(doc, TidyLogicalEmphasis) )
1319         {
1320             MergeStyles( doc, node, child );
1321             AddStyleProperty( doc, node, "font-style: italic" );
1322             StripOnlyChild( doc, node );
1323             return yes;
1324         }
1325 
1326         if ( nodeIsFONT(child) )
1327         {
1328             MergeStyles( doc, node, child );
1329             AddFontStyles( doc, node, child->attributes );
1330             StripOnlyChild( doc, node );
1331             return yes;
1332         }
1333     }
1334 
1335     return no;
1336 }
1337 
1338 /*
1339   Replace font elements by span elements, deleting
1340   the font element's attributes and replacing them
1341   by a single style attribute.
1342 */
1343 static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode )
1344 {
1345     AttVal *av, *style, *next;
1346 
1347     if ( nodeIsFONT(node) )
1348     {
1349         if ( cfgBool(doc, TidyDropFontTags) )
1350         {
1351             DiscardContainer( doc, node, pnode );
1352             return yes;
1353         }
1354 
1355         /* if FONT is only child of parent element then leave alone
1356           Do so only if BlockStyle may be succesful. */
1357         if ( node->parent->content == node && node->next == NULL &&
1358              CanApplyBlockStyle(node->parent) )
1359             return no;
1360 
1361         AddFontStyles( doc, node, node->attributes );
1362 
1363         /* extract style attribute and free the rest */
1364         av = node->attributes;
1365         style = NULL;
1366 
1367         while (av)
1368         {
1369             next = av->next;
1370 
1371             if (attrIsSTYLE(av))
1372             {
1373                 av->next = NULL;
1374                 style = av;
1375             }
1376             else
1377             {
1378                 FreeAttribute( doc, av );
1379             }
1380             av = next;
1381         }
1382 
1383         node->attributes = style;
1384         RenameElem( node, TidyTag_SPAN );
1385         return yes;
1386     }
1387 
1388     return no;
1389 }
1390 
1391 /*
1392   Applies all matching rules to a node.
1393 */
1394 Node* CleanNode( TidyDocImpl* doc, Node *node )
1395 {
1396     Node *next = NULL;
1397     TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs);
1398 
1399     for (next = node; nodeIsElement(node); node = next)
1400     {
1401         if ( Dir2Div(doc, node, &next) )
1402             continue;
1403 
1404         /* Special case: true result means
1405         ** that arg node and its parent no longer exist.
1406         ** So we must jump back up the CreateStyleProperties()
1407         ** call stack until we have a valid node reference.
1408         */
1409         if ( NestedList(doc, node, &next) )
1410             return next;
1411 
1412         if ( Center2Div(doc, node, &next) )
1413             continue;
1414 
1415         if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) )
1416             continue;
1417 
1418         if ( BlockStyle(doc, node, &next) )
1419             continue;
1420 
1421         if ( InlineStyle(doc, node, &next) )
1422             continue;
1423 
1424         if ( Font2Span(doc, node, &next) )
1425             continue;
1426 
1427         break;
1428     }